In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("../data/train.csv")
df["Age"].fillna(df["Age"].median(), inplace=True)

Data Cleaning

In [2]:
# 1. Extract Titles from Name
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Simplify titles
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Officer', 'Rev': 'Officer', 'Col': 'Officer', 'Major': 'Officer', 'Mlle': 'Miss',
    'Mme': 'Mrs', 'Don': 'Royalty', 'Dona': 'Royalty', 'Lady': 'Royalty', 
    'Countess': 'Royalty', 'Jonkheer': 'Royalty', 'Sir': 'Royalty', 'Capt': 'Officer', 'Ms': 'Miss'
}
df['Title'] = df['Title'].map(title_mapping)

# 2. Family Features
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# 3. Grouped Imputation for Age
# Fill missing Age based on the median age of the Title group
df['Age'] = df['Age'].fillna(df.groupby('Title')['Age'].transform('median'))

# 4. Impute Embarked with Mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 5. Drop Unnecessary Columns
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

df.head()

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,0,3,male,22.0,1,0,7.25,S,Mr,2,0
1,1,1,female,38.0,1,0,71.2833,C,Mrs,2,0
2,1,3,female,26.0,0,0,7.925,S,Miss,1,1
3,1,1,female,35.0,1,0,53.1,S,Mrs,2,0
4,0,3,male,35.0,0,0,8.05,S,Mr,1,1


Encode Categorical Variables

In [3]:
# Manual map for binary Sex
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# One-Hot Encode remaining categorical features
df = pd.get_dummies(df, columns=['Embarked', 'Title'], drop_first=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0,3,0,22.0,1,0,7.25,2,0,False,True,False,True,False,False,False
1,1,1,1,38.0,1,0,71.2833,2,0,False,False,False,False,True,False,False
2,1,3,1,26.0,0,0,7.925,1,1,False,True,True,False,False,False,False
3,1,1,1,35.0,1,0,53.1,2,0,False,True,False,False,True,False,False
4,0,3,0,35.0,0,0,8.05,1,1,False,True,False,True,False,False,False


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop("Survived", axis=1)
y = df["Survived"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (important for Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

First Model: Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [6]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_val_scaled)
accuracy_score(y_val, y_pred)

0.8100558659217877

In [7]:
from sklearn.metrics import classification_report

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       105
           1       0.77      0.77      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [8]:
coeffs = pd.Series(model.coef_[0], index=X.columns).sort_values()
coeffs

Title_Mr        -1.593481
Pclass          -0.718690
Title_Officer   -0.437972
SibSp           -0.405723
FamilySize      -0.366237
Age             -0.234447
Embarked_S      -0.199813
Title_Miss      -0.186457
Parch           -0.175462
IsAlone         -0.148158
Embarked_Q      -0.046443
Sex              0.008829
Title_Royalty    0.161767
Title_Mrs        0.206721
Fare             0.206786
dtype: float64

Second Model: Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_val)
accuracy_score(y_val, rf_pred)


0.8212290502793296

In [10]:
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85       105
           1       0.78      0.80      0.79        74

    accuracy                           0.82       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.82      0.82      0.82       179



In [11]:
importances = pd.Series(
    rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importances

Fare             0.237178
Age              0.220759
Title_Mr         0.120394
Sex              0.117318
Pclass           0.073574
FamilySize       0.047204
Title_Mrs        0.044664
Title_Miss       0.034457
SibSp            0.031472
Parch            0.023230
Embarked_S       0.021075
Embarked_Q       0.010034
IsAlone          0.009864
Title_Officer    0.008323
Title_Royalty    0.000455
dtype: float64

Cross-Validation

In [12]:
from sklearn.model_selection import cross_val_score

# Cross-validate Random Forest
rf_scores = cross_val_score(rf, X, y, cv=5)
print(f"Random Forest CV Accuracy: {rf_scores.mean():.4f} (+/- {rf_scores.std() * 2:.4f})")

# Cross-validate Logistic Regression (on scaled data)
X_scaled = StandardScaler().fit_transform(X)
lr_scores = cross_val_score(LogisticRegression(max_iter=1000), X_scaled, y, cv=5)
print(f"Logistic Regression CV Accuracy: {lr_scores.mean():.4f} (+/- {lr_scores.std() * 2:.4f})")

Random Forest CV Accuracy: 0.7980 (+/- 0.0582)
Logistic Regression CV Accuracy: 0.8204 (+/- 0.0471)
