In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 72
import numpy as np
import pandas as pd
import re




In [2]:
# Load test data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Store PassengerId for test set
test_ids = df_test['PassengerId']

# Drop PassengerId from both datasets
df_train = df_train.drop('PassengerId', axis=1)
df_test = df_test.drop('PassengerId', axis=1)

print("Training data shape:", df_train.shape)
print("Test data shape:", df_test.shape)
print("\n" + "="*50)

Training data shape: (891, 11)
Test data shape: (418, 10)



In [3]:
# explore training data
display(df_train.head)
print('data types')
display(df_train.dtypes)

print('Do we have NaN in our dataset?')
display(df_train.isnull().any())

print("Missing value counts:")
display(df_train.isnull().sum())

print("\nMissing value percentages:")
display(df_train.isnull().sum() / len(df_train) * 100)

<bound method NDFrame.head of      Survived  Pclass                                               Name  \
0           0       3                            Braund, Mr. Owen Harris   
1           1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2           1       3                             Heikkinen, Miss. Laina   
3           1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4           0       3                           Allen, Mr. William Henry   
..        ...     ...                                                ...   
886         0       2                              Montvila, Rev. Juozas   
887         1       1                       Graham, Miss. Margaret Edith   
888         0       3           Johnston, Miss. Catherine Helen "Carrie"   
889         1       1                              Behr, Mr. Karl Howell   
890         0       3                                Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch            Ticket     Far

data types


Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

Do we have NaN in our dataset?


Survived    False
Pclass      False
Name        False
Sex         False
Age          True
SibSp       False
Parch       False
Ticket      False
Fare        False
Cabin        True
Embarked     True
dtype: bool

Missing value counts:


Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


Missing value percentages:


Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [4]:
# explore test data
display(df_test.head)
print('data types')
display(df_test.dtypes)

print('Do we have NaN in our dataset?')
display(df_test.isnull().any())

print("Missing value counts:")
display(df_test.isnull().sum())

print("\nMissing value percentages:")
display(df_test.isnull().sum() / len(df_test) * 100)

<bound method NDFrame.head of      Pclass                                          Name     Sex   Age  \
0         3                              Kelly, Mr. James    male  34.5   
1         3              Wilkes, Mrs. James (Ellen Needs)  female  47.0   
2         2                     Myles, Mr. Thomas Francis    male  62.0   
3         3                              Wirz, Mr. Albert    male  27.0   
4         3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0   
..      ...                                           ...     ...   ...   
413       3                            Spector, Mr. Woolf    male   NaN   
414       1                  Oliva y Ocana, Dona. Fermina  female  39.0   
415       3                  Saether, Mr. Simon Sivertsen    male  38.5   
416       3                           Ware, Mr. Frederick    male   NaN   
417       3                      Peter, Master. Michael J    male   NaN   

     SibSp  Parch              Ticket      Fare Cabin Embarked  
0   

data types


Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

Do we have NaN in our dataset?


Pclass      False
Name        False
Sex         False
Age          True
SibSp       False
Parch       False
Ticket      False
Fare         True
Cabin        True
Embarked    False
dtype: bool

Missing value counts:


Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64


Missing value percentages:


Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         20.574163
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.239234
Cabin       78.229665
Embarked     0.000000
dtype: float64

In [5]:
print("\nImputing Age")

age_mean = df_train['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(age_mean)

# IMPORTANT: Use training mean for test set too (avoid data leakage)

df_test['Age'] = df_test['Age'].fillna(age_mean)

print(f"Filled missing Age with mean: {age_mean:.2f}")


Imputing Age
Filled missing Age with mean: 29.70


In [6]:
# Cabin Feature is missing 77% of data - Feature Engineer a HasCabin Feature
# The presence of cabin info could be an indicator of surviving

print("\nAdd HasCabin feature")
df_train['HasCabin'] = df_train['Cabin'].notna().astype(int)
df_test['HasCabin'] = df_test['Cabin'].notna().astype(int)

print(f"Created HasCabin feature (train): {df_train['HasCabin'].sum()} had cabin info")
print(f"Created HasCabin feature (test): {df_test['HasCabin'].sum()} had cabin info")

# Now drop the original Cabin column
df_train = df_train.drop('Cabin', axis=1)
df_test = df_test.drop('Cabin', axis=1)



Add HasCabin feature
Created HasCabin feature (train): 204 had cabin info
Created HasCabin feature (test): 91 had cabin info


In [7]:
print("\nImputing Fare")
if df_test['Fare'].isnull().any():
    fare_median = df_train['Fare'].median()
    df_test['Fare'] = df_test['Fare'].fillna(fare_median)
    print(f"Filled missing test Fare with median: {fare_median:.2f}")


Imputing Fare
Filled missing test Fare with median: 14.45


In [8]:
# Replace 2 missing values in Embarked feature with the mode of the feature
embarked_mode = df_train['Embarked'].mode()[0]
print("Imputing Embarked")
df_train['Embarked'] = df_train['Embarked'].fillna(embarked_mode)
print(f'Filled missing train Embarked with mode: {embarked_mode}')

Imputing Embarked
Filled missing train Embarked with mode: S


In [9]:
print("\n" + "="*50)
print("AFTER IMPUTATION - Missing values check:")
print("\nTraining set:")
print(df_train.isnull().sum())
print("\nTest set:")
print(df_test.isnull().sum())


AFTER IMPUTATION - Missing values check:

Training set:
Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
HasCabin    0
dtype: int64

Test set:
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
HasCabin    0
dtype: int64


In [10]:
# Individual names do not provide much information.
# But a person's title could indicate status - Feature engineer 'Title' column
# Use regex function to extract title from 'Name' feature
def extract_title(name):
    """Extract title from name string"""
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""


In [11]:
df_train['Title'] = df_train['Name'].apply(extract_title)
df_test['Title'] = df_test['Name'].apply(extract_title)

print("Unique titles in training set:")
print(df_train['Title'].value_counts())
print("\nUnique titles in test set:")
print(df_test['Title'].value_counts())

Unique titles in training set:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: count, dtype: int64

Unique titles in test set:
Title
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64


In [12]:

# Group rare titles into categories

def simplify_title(title):
    """Group rare titles into common categories"""
    if title in ['Mr']:
        return 'Mr'
    elif title in ['Miss', 'Mlle', 'Ms']:  # Mlle = Mademoiselle (French Miss)
        return 'Miss'
    elif title in ['Mrs', 'Mme']:  # Mme = Madame (French Mrs)
        return 'Mrs'
    elif title in ['Master']:
        return 'Master'
    elif title in ['Dr', 'Rev', 'Col', 'Major', 'Capt']:  # Professional/Military
        return 'Officer'
    else:  # Rare nobility titles: Lady, Sir, Countess, Don, Dona, Jonkheer
        return 'Royalty'

In [13]:

# Now simplify
df_train['Title'] = df_train['Title'].apply(simplify_title)
df_test['Title'] = df_test['Title'].apply(simplify_title)

print("Updated value counts:")
print(df_train['Title'].value_counts())
print("\nSurvival by simplified Title:")
print(df_train.groupby('Title')['Survived'].mean().sort_values(ascending=False))

Updated value counts:
Title
Mr         517
Miss       185
Mrs        126
Master      40
Officer     18
Royalty      5
Name: count, dtype: int64

Survival by simplified Title:
Title
Mrs        0.793651
Miss       0.702703
Royalty    0.600000
Master     0.575000
Officer    0.277778
Mr         0.156673
Name: Survived, dtype: float64


In [14]:
# People have different survival behaviors when they are alone vs in a tribe (family).
# Feature engineer FamilySize and 
# FamilySize = number of siblings/spouses + number of parents/children + self

# sibsp: The dataset defines family relations in this way...
# Sibling = brother, sister, stepbrother, stepsister
# Spouse = husband, wife (mistresses and fiancÃ©s were ignored)

# parch: The dataset defines family relations in this way...
# Parent = mother, father
# Child = daughter, son, stepdaughter, stepson

df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

# IsAlone = 1 if traveling alone, 0 otherwise
df_train['IsAlone'] = (df_train['FamilySize'] == 1).astype(int)
df_test['IsAlone'] = (df_test['FamilySize'] == 1).astype(int)

print("\n" + "="*50)
print("Family size distribution (train):")
print(df_train['FamilySize'].value_counts().sort_index())
print(f"\nNumber traveling alone: {df_train['IsAlone'].sum()}")
print(f"Number with family: {(df_train['IsAlone'] == 0).sum()}")


Family size distribution (train):
FamilySize
1     537
2     161
3     102
4      29
5      15
6      22
7      12
8       6
11      7
Name: count, dtype: int64

Number traveling alone: 537
Number with family: 354


In [15]:
# Explore survival rates using key features

print("\n" + "="*50)
print("SURVIVAL ANALYSIS:")
print("\nSurvival rate by Title:")
print(df_train.groupby('Title')['Survived'].mean().sort_values(ascending=False))

print("\nSurvival rate by FamilySize:")
print(df_train.groupby('FamilySize')['Survived'].mean().sort_values(ascending=False))

print("\nSurvival rate by IsAlone:")
print(df_train.groupby('IsAlone')['Survived'].mean())

print("\nSurvival rate by Sex:")
print(df_train.groupby('Sex')['Survived'].mean())

print("\nSurvival rate by Pclass:")
print(df_train.groupby('Pclass')['Survived'].mean())


SURVIVAL ANALYSIS:

Survival rate by Title:
Title
Mrs        0.793651
Miss       0.702703
Royalty    0.600000
Master     0.575000
Officer    0.277778
Mr         0.156673
Name: Survived, dtype: float64

Survival rate by FamilySize:
FamilySize
4     0.724138
3     0.578431
2     0.552795
7     0.333333
1     0.303538
5     0.200000
6     0.136364
8     0.000000
11    0.000000
Name: Survived, dtype: float64

Survival rate by IsAlone:
IsAlone
0    0.505650
1    0.303538
Name: Survived, dtype: float64

Survival rate by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

Survival rate by Pclass:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64


This analyis tells us the "women and children first" policy was implemented. Females had a 74% survival rate compared to males at 19%. Characteristic of a chivalrous culture.
The Pclass indicates wealth. 1st class passengers chance of survival was 63%, 2nd class: 47%, 3rd class: 24%. Having wealth increased chances of survival. Not much has changed regarding this fact.
Those with Family_sizes of 3-4 had the best chance of survival over larger families and those traveling solo or with one other person. Large families, likely were caught in the chaos of the event.

In [16]:
# Drop useless features
print("Before dropping features:")
print("Train columns:", df_train.columns.tolist())

# Drop Name and Ticket (Titles were extracted)
df_train = df_train.drop(['Name', 'Ticket'], axis=1)
df_test = df_test.drop(['Name', 'Ticket'], axis=1)

print("\nAfter dropping features:")
print("Train columns:", df_train.columns.tolist())


Before dropping features:
Train columns: ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked', 'HasCabin', 'Title', 'FamilySize', 'IsAlone']

After dropping features:
Train columns: ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'HasCabin', 'Title', 'FamilySize', 'IsAlone']


In [17]:
# Sex: male=1, female=0
df_train['Sex'] = df_train['Sex'].map({'male': 1, 'female': 0})
df_test['Sex'] = df_test['Sex'].map({'male': 1, 'female': 0})

# Embarked: S=0, C=1, Q=2
df_train['Embarked'] = df_train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
df_test['Embarked'] = df_test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Title: Use Label Encoding
from sklearn.preprocessing import LabelEncoder
le_title = LabelEncoder()
df_train['Title'] = le_title.fit_transform(df_train['Title'])
df_test['Title'] = le_title.transform(df_test['Title'])

print("ENCODING COMPLETE")
print("\nTitle encoding mapping:")
for i, title in enumerate(le_title.classes_):
    print(f"  {title} -> {i}")

print("FINAL DATASET PREVIEW:")
print("\nTraining set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

print("\nTraining set data types:")
print(df_train.dtypes)

print("\nFirst few rows:")
print(df_train.head(10))

print("\n" + "="*50)
print("FINAL VERIFICATION:")
print("Missing values in train:", df_train.isnull().sum().sum())
print("Missing values in test:", df_test.isnull().sum().sum())

# Show final column list
print("\nFinal features for modeling:")
print([col for col in df_train.columns if col != 'Survived'])

ENCODING COMPLETE

Title encoding mapping:
  Master -> 0
  Miss -> 1
  Mr -> 2
  Mrs -> 3
  Officer -> 4
  Royalty -> 5
FINAL DATASET PREVIEW:

Training set shape: (891, 12)
Test set shape: (418, 11)

Training set data types:
Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked        int64
HasCabin        int64
Title           int64
FamilySize      int64
IsAlone         int64
dtype: object

First few rows:
   Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Embarked  \
0         0       3    1  22.000000      1      0   7.2500         0   
1         1       1    0  38.000000      1      0  71.2833         1   
2         1       3    0  26.000000      0      0   7.9250         0   
3         1       1    0  35.000000      1      0  53.1000         0   
4         0       3    1  35.000000      0      0   8.0500         0   
5         0       3    1  29.699118      0     

In [18]:
# IMPORT MODULES and PREPARE X,y TRAINING SPLITS 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Training data
X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived']

# Test data (no Survived column)
X_test = df_test.copy()

print("Training features shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Test features shape:", X_test.shape)

# Split training data for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, 
    test_size=0.2, 
    random_state=42,
    stratify=y_train  # Keep same survival ratio in train/val
)

print(f"\nSplit data:")
print(f"  Training: {X_tr.shape[0]} samples")
print(f"  Validation: {X_val.shape[0]} samples")
print(f"  Survival rate in train: {y_tr.mean():.3f}")
print(f"  Survival rate in val: {y_val.mean():.3f}")

Training features shape: (891, 11)
Training labels shape: (891,)
Test features shape: (418, 11)

Split data:
  Training: 712 samples
  Validation: 179 samples
  Survival rate in train: 0.383
  Survival rate in val: 0.385


In [19]:
# MODEL 1: Logistic Regression (baseline)

print("\n" + "="*50)
print("MODEL 1: LOGISTIC REGRESSION")
print("="*50)

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_tr, y_tr)

# Predictions
y_pred_lr = lr_model.predict(X_val)

# Evaluation
lr_accuracy = accuracy_score(y_val, y_pred_lr)
print(f"\nValidation Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_lr, target_names=['Died', 'Survived']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_lr))

# Cross-validation score
cv_scores_lr = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")


MODEL 1: LOGISTIC REGRESSION

Validation Accuracy: 0.8101 (81.01%)

Classification Report:
              precision    recall  f1-score   support

        Died       0.83      0.86      0.85       110
    Survived       0.77      0.72      0.75        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179


Confusion Matrix:
[[95 15]
 [19 50]]

5-Fold CV Accuracy: 0.7946 (+/- 0.0160)


In [20]:
# MODEL 2: Random Forest

print("\n" + "="*50)
print("MODEL 2: RANDOM FOREST")
print("="*50)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
rf_model.fit(X_tr, y_tr)

# Predictions
y_pred_rf = rf_model.predict(X_val)

# Evaluation
rf_accuracy = accuracy_score(y_val, y_pred_rf)
print(f"\nValidation Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_rf, target_names=['Died', 'Survived']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_rf))

# Cross-validation score
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std():.4f})")

# Feature importance
print("\nTop 10 Feature Importances (Random Forest):")
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)



MODEL 2: RANDOM FOREST

Validation Accuracy: 0.7933 (79.33%)

Classification Report:
              precision    recall  f1-score   support

        Died       0.82      0.85      0.84       110
    Survived       0.75      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179


Confusion Matrix:
[[94 16]
 [21 48]]

5-Fold CV Accuracy: 0.8339 (+/- 0.0357)

Top 10 Feature Importances (Random Forest):
       feature  importance
1          Sex    0.261650
5         Fare    0.176449
2          Age    0.130602
8        Title    0.126027
0       Pclass    0.083110
7     HasCabin    0.080176
9   FamilySize    0.053623
3        SibSp    0.030839
6     Embarked    0.026811
4        Parch    0.019468
10     IsAlone    0.011246


Attempting gradient boosting to try to mitigate some of the class imbalances, 
(i.e. women and children survival: high vs. men survival: low)

In [21]:
# MODEL 3: XGBoost 

from xgboost import XGBClassifier

print("\n" + "="*50)
print("MODEL 3: XGBoost (Gradient Boosting)")
print("="*50)

# Calculate scale_pos_weight to handle imbalance
# scale_pos_weight = (# negative samples) / (# positive samples)
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\nClass imbalance ratio: {scale_pos_weight:.2f}")
print(f"  Died (0): {(y_train == 0).sum()} samples ({(y_train == 0).sum()/len(y_train)*100:.1f}%)")
print(f"  Survived (1): {(y_train == 1).sum()} samples ({(y_train == 1).sum()/len(y_train)*100:.1f}%)")

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,  # Handle imbalance
    random_state=42,
    eval_metric='logloss'
)

xgb_model.fit(X_tr, y_tr)

# Predictions
y_pred_xgb = xgb_model.predict(X_val)

# Evaluation
xgb_accuracy = accuracy_score(y_val, y_pred_xgb)
print(f"\nValidation Accuracy: {xgb_accuracy:.4f} ({xgb_accuracy*100:.2f}%)")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_xgb, target_names=['Died', 'Survived']))

print("\nConfusion Matrix:")
cm_xgb = confusion_matrix(y_val, y_pred_xgb)
print(cm_xgb)
print(f"  True Negatives:  {cm_xgb[0,0]} (correctly predicted deaths)")
print(f"  False Positives: {cm_xgb[0,1]} (predicted survived, actually died)")
print(f"  False Negatives: {cm_xgb[1,0]} (predicted died, actually survived)")
print(f"  True Positives:  {cm_xgb[1,1]} (correctly predicted survivals)")

# Cross-validation score
cv_scores_xgb = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")

# Feature importance
print("\nTop Feature Importances (XGBoost):")
feature_importance_xgb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance_xgb)



MODEL 3: XGBoost (Gradient Boosting)

Class imbalance ratio: 1.61
  Died (0): 549 samples (61.6%)
  Survived (1): 342 samples (38.4%)

Validation Accuracy: 0.8101 (81.01%)

Classification Report:
              precision    recall  f1-score   support

        Died       0.85      0.84      0.84       110
    Survived       0.75      0.77      0.76        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


Confusion Matrix:
[[92 18]
 [16 53]]
  True Negatives:  92 (correctly predicted deaths)
  False Positives: 18 (predicted survived, actually died)
  False Negatives: 16 (predicted died, actually survived)
  True Positives:  53 (correctly predicted survivals)

5-Fold CV Accuracy: 0.8361 (+/- 0.0217)

Top Feature Importances (XGBoost):
       feature  importance
1          Sex    0.552948
7     HasCabin    0.122148
0       Pclass    0.121143
8        Title    0.037144
5    

In [22]:
# MODEL 4: LightGBM

from lightgbm import LGBMClassifier

print("\n" + "="*50)
print("MODEL 4: LightGBM")
print("="*50)

lgbm_model = LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    num_leaves=31,
    scale_pos_weight=scale_pos_weight,  # Handle imbalance
    random_state=42,
    verbose=-1  # Suppress warnings
)

lgbm_model.fit(X_tr, y_tr)

# Predictions
y_pred_lgbm = lgbm_model.predict(X_val)

# Evaluation
lgbm_accuracy = accuracy_score(y_val, y_pred_lgbm)
print(f"\nValidation Accuracy: {lgbm_accuracy:.4f} ({lgbm_accuracy*100:.2f}%)")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_lgbm, target_names=['Died', 'Survived']))

print("\nConfusion Matrix:")
cm_lgbm = confusion_matrix(y_val, y_pred_lgbm)
print(cm_lgbm)
print(f"  True Negatives:  {cm_lgbm[0,0]} (correctly predicted deaths)")
print(f"  False Positives: {cm_lgbm[0,1]} (predicted survived, actually died)")
print(f"  False Negatives: {cm_lgbm[1,0]} (predicted died, actually survived)")
print(f"  True Positives:  {cm_lgbm[1,1]} (correctly predicted survivals)")

# Cross-validation score
cv_scores_lgbm = cross_val_score(lgbm_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_lgbm.mean():.4f} (+/- {cv_scores_lgbm.std():.4f})")

# Feature importance
print("\nTop Feature Importances (LightGBM):")
feature_importance_lgbm = pd.DataFrame({
    'feature': X_train.columns,
    'importance': lgbm_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance_lgbm)

# ============================================
# COMPLETE MODEL COMPARISON
# ============================================

print("\n" + "="*50)
print("COMPLETE MODEL COMPARISON")
print("="*50)
print(f"Logistic Regression CV Accuracy: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")
print(f"Random Forest CV Accuracy:       {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std():.4f})")
print(f"XGBoost CV Accuracy:             {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")
print(f"LightGBM CV Accuracy:            {cv_scores_lgbm.mean():.4f} (+/- {cv_scores_lgbm.std():.4f})")

# Find best model
results = {
    'Logistic Regression': cv_scores_lr.mean(),
    'Random Forest': cv_scores_rf.mean(),
    'XGBoost': cv_scores_xgb.mean(),
    'LightGBM': cv_scores_lgbm.mean()
}

# Sort by accuracy
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

print("\n" + "="*50)
print("LEADERBOARD:")
print("="*50)
for rank, (model_name, score) in enumerate(sorted_results, 1):
    print(f"#{rank}: {model_name:<25} {score:.4f}")

best_model_name = sorted_results[0][0]
best_score = sorted_results[0][1]
print(f"\n Champion: {best_model_name} with {best_score:.4f} accuracy")

# Compare top 2
if len(sorted_results) > 1:
    gap = sorted_results[0][1] - sorted_results[1][1]
    print(f"\nMargin of victory: {gap:.4f} ({gap*100:.2f}%)")
    if gap < 0.005:  # Less than 0.5% difference
        print("Models are statistically tied - difference is likely noise")


MODEL 4: LightGBM

Validation Accuracy: 0.8101 (81.01%)

Classification Report:
              precision    recall  f1-score   support

        Died       0.84      0.85      0.85       110
    Survived       0.76      0.74      0.75        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


Confusion Matrix:
[[94 16]
 [18 51]]
  True Negatives:  94 (correctly predicted deaths)
  False Positives: 16 (predicted survived, actually died)
  False Negatives: 18 (predicted died, actually survived)
  True Positives:  51 (correctly predicted survivals)

5-Fold CV Accuracy: 0.8362 (+/- 0.0327)

Top Feature Importances (LightGBM):
       feature  importance
5         Fare         512
2          Age         357
9   FamilySize          75
8        Title          58
0       Pclass          48
1          Sex          47
6     Embarked          47
3        SibSp          36
4        Par

The feature importance ranking for LightGBM is interesting but not surprising. Ticket 'Fare' is indicative of wealth. Those who paid the most money were likely weatlthy and therefore priority for life boats. 

Also, 'Title' ranking higher in importance than 'Sex' makes sense because it captures gender along with the social status making 'Sex' somewhat redundant information.

In [23]:
print("Training final LightGBM model on all data")

# Retrain the model
final_model = LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    num_leaves=31,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    verbose=-1
)

final_model.fit(X_train, y_train)
print("Model trained")

# Generate predictions
test_predictions = final_model.predict(X_test)
print(f"Generated {len(test_predictions)} predictions")

print("\nMaking predictions on test set")
test_predictions = final_model.predict(X_test)

print(f"\nTest set predictions: {len(test_predictions)} samples")
print(f"Predicted survivors: {test_predictions.sum()} ({test_predictions.sum()/len(test_predictions)*100:.1f}%)")
print(f"Predicted deaths: {(test_predictions == 0).sum()} ({(test_predictions == 0).sum()/len(test_predictions)*100:.1f}%)")

# Compare with training set survival rate
print(f"\nTraining set survival rate: {y_train.mean()*100:.1f}%")
print(f"Test set predicted survival rate: {test_predictions.mean()*100:.1f}%")

Training final LightGBM model on all data
Model trained
Generated 418 predictions

Making predictions on test set

Test set predictions: 418 samples
Predicted survivors: 164 (39.2%)
Predicted deaths: 254 (60.8%)

Training set survival rate: 38.4%
Test set predicted survival rate: 39.2%


In [24]:
def save_preds(fn, y_pred, passenger_ids):
    import csv
    with open(fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['PassengerId', 'Survived'])
        for pid, pred in zip(passenger_ids, y_pred):
            writer.writerow([pid, pred])

save_preds('titanic_survival predictions_bruce.csv', test_predictions, test_ids)
print("Submission CSV file created")

Submission CSV file created


My submission results %74.64
This tells me my model learned patterns specific to the training set that did not generalize well to the test  data, overfitting. The Maybe I could have dropped Cabin information, and it was added noise.