# In [None]:
# 📊 Data Science Starter Project

## Project: Titanic Survival Prediction (Classification)
# Goal: Predict whether a passenger survived based on features like age, class, and gender

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the Titanic passenger table (CSV hosted in the datasciencedojo GitHub repo)
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(TITANIC_CSV_URL)

# Show the first few rows as a sanity check on the columns that were loaded
print(data.head())

# Select features and target
# Keep only the model inputs plus the label. The explicit .copy() makes `data`
# an independent frame, so the column assignments below modify it directly
# rather than a temporary derived from the full table.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
data = data[features + ['Survived']].copy()

# Handle missing data
print("Missing values before:", data.isnull().sum())
# Assign the filled column back instead of calling fillna(inplace=True) on
# data['Age']: the in-place form operates on an intermediate Series, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
# NOTE: the median is taken over every row, including rows that the later
# train/test split places in the test set.
data['Age'] = data['Age'].fillna(data['Age'].median())
print("Missing values after:", data.isnull().sum())

# Convert categorical variables
# One-hot encode 'Sex'; drop_first=True drops the first category's indicator
# column so the remaining column(s) are not redundant.
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)

# Separate the label from the predictors
# 'Survived' is the value to predict; every other remaining column is an input.
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the logistic-regression classifier (solver iteration limit set to
# 1000) and fit it on the training portion
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Report each metric in turn; every metric function takes (true labels, predictions)
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(label, metric(y_test, y_pred))

# Plot 1: how passenger age is distributed within each survival outcome
age_axes = sns.boxplot(data=data, x='Survived', y='Age')
age_axes.set_title('Age distribution by survival')
plt.show()

# Plot 2: mean of 'Survived' (i.e. the survival rate) for each passenger class
class_axes = sns.barplot(data=data, x='Pclass', y='Survived')
class_axes.set_title('Survival rate by class')
plt.show()
