In [92]:
# import statements
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier  # use classifier for Titanic
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin



In [93]:
# read the data
train_data = pd.read_csv('titanic/train.csv', index_col='PassengerId')

# seperate target from predictors
y = train_data.Survived
x = train_data.drop(['Survived',], axis=1)
# x['Deck'] = x['Cabin'].str[0].fillna('Unknown')

# stratify = yes so that it has the same amount of suriviors and deaths as the original data set
x_train_full, x_valid_full, y_train, y_valid = train_test_split(x, y, train_size = 0.8, stratify=y)



In [94]:
# For a single column
def print_nan_per(col_name):
  missing_count = x_train_full[col_name].isnull().sum()
  missing_ratio = x_train_full[col_name].isnull().mean() * 100  # percent
  print(f"{col_name}: {missing_count} / {len(x_train_full)} missing, {missing_ratio:.2f}% missing")
  


In [95]:
# does it have missing values

# Get names of columns with missing values
cols_with_missing = [col for col in x_train_full.columns
                     if x_train_full[col].isnull().any()]

print(cols_with_missing)
x_train_full

for col in cols_with_missing:
  print_nan_per(col) 

# new col deck based on cabin
# removed age because it has its own imputer now 
numerical_cols   = ['Is_Alone'] #['SibSp', 'Parch', 'Is_Alone'] # numeric columns
categorical_cols = ['Pclass', 'Sex']  # categorical columns
cabin_col        = ['Cabin']


# Drop columns that are not useful for prediction
# Name and Ticket are high cardinality and typically not useful as-is
# not explicity dropped just never passed through
cols_to_drop = ['Name', 'Ticket']

['Age', 'Cabin', 'Embarked']
Age: 138 / 712 missing, 19.38% missing
Cabin: 548 / 712 missing, 76.97% missing
Embarked: 1 / 712 missing, 0.14% missing


Built a transformer to fill the cabin column Nans with 'Unknown' instead

In [96]:
cabin_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

Lets build something to extract titles from names we need to inclue it at the beginnng of our pipline and then one hot encode it

In [97]:
def create_feature(df):
    df.head()
    df = df.copy()
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    # Create Title_Age: for age imputation (group by demographics)
    df['Title_Age'] = df['Title'].copy()
    df['Title_Age'] = df['Title_Age'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title_Age'] = df['Title_Age'].replace(['Mme', 'Lady', 'Countess', 'Dona'], 'Mrs')
    df['Title_Age'] = df['Title_Age'].replace(['Don', 'Sir', 'Jonkheer', 'Capt', 'Col', 'Major', 'Rev', 'Dr'], 'Mr')
    
    # Create Title_Group: for prediction (preserve meaningful distinctions)
    df['Title_Group'] = df['Title'].copy()
    df['Title_Group'] = df['Title_Group'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title_Group'] = df['Title_Group'].replace(['Mme'], 'Mrs')
    df['Title_Group'] = df['Title_Group'].replace(['Capt', 'Col', 'Major'], 'Military')  # Keep separate!
    df['Title_Group'] = df['Title_Group'].replace(['Lady', 'Countess', 'Dona', 'Don', 'Sir', 'Jonkheer'], 'Noble')
    df['Title_Group'] = df['Title_Group'].replace(['Rev'], 'Clergy')
    # Dr, Master, Mr, Miss, Mrs stay as is

    # Create a family Size column including themselves
    df['Family_Size'] = df['Parch'] + df['SibSp'] + 1
    df['Is_Alone'] = (df['Family_Size'] == 1).astype(int)
    return df

create_feature_extractor = FunctionTransformer(create_feature)

Lets build a pipline

- 3 categories have missing values
- Age, Cabin, Embarked

Strategies
- Age -> impute with median
- Cabin -> fill with 'Unknown'
- Embarked -> One hot encode, don't mark those without Embarked



In [98]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data e.g. embark
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # if it encounters a category its not famialr with it will just mark all other columns zero
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [99]:
class AgeByTitleImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        df = pd.DataFrame(X, columns=['Age', 'Title_Age'])
        self.medians_ = df.groupby('Title_Age')['Age'].median()
        return self
    
    def transform(self, X):
        df = pd.DataFrame(X, columns=['Age', 'Title_Age'])
        df['Age'] = df.apply(
            lambda row: self.medians_[row['Title_Age']] 
                        if pd.isna(row['Age']) 
                        else row['Age'],
            axis=1
        )
        # return just AGE as numeric array
        return df[['Age']].values
    
    def get_feature_names_out(self, input_features=None):
        """Return feature names for output features"""
        return ['Age']

In [100]:
# lets build the preprocessor

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        # ('cabin', cabin_transformer, cabin_col),
        ('age_imputer', AgeByTitleImputer(), ['Age', 'Title_Age']), # created in order to impute age based on name title i.e. master is a kid
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
)

In [101]:
# Create the pipeline
classifier_pipeline = Pipeline(steps=[
    ('engineered_features', create_feature_extractor),
    ('preprocessor', preprocessor),  # your ColumnTransformer
    ('classifier', XGBClassifier(
        n_estimators=150,
        learning_rate=0.06,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.4,
        random_state=0
    ))
])

In [111]:
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = Pipeline(steps=[
    ('engineered_features', create_feature_extractor),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=0
    ))
])

rf_pipeline.fit(x_train_full, y_train)

In [104]:
# After running create_feature
x_train_with_title = create_feature(x_train_full)

# Find rows where Title extraction failed
missing_titles = x_train_with_title[x_train_with_title['Title'].isna()]

print(f"Found {len(missing_titles)} rows with missing titles:")
print(missing_titles[['Name', 'Title']])

Found 0 rows with missing titles:
Empty DataFrame
Columns: [Name, Title]
Index: []


In [105]:
# Preprocessing of training data, fit model 
classifier_pipeline.fit(x_train_full, y_train)

In [106]:
# Get validation accuracy
def print_val_accuracy(my_pipeline, name='n/a'):
  print("pipeline_name:", name)
  preds = my_pipeline.predict(x_valid_full)
  accuracy = (preds == y_valid).mean()
  print(f"Validation Accuracy: {accuracy:.4f}")
  print("\n")
  print(classification_report(y_valid, preds, target_names=['Died', 'Survived']))

In [112]:
print_val_accuracy(classifier_pipeline, 'classifer')
print_val_accuracy(rf_pipeline, 'rf_pipeline')

pipeline_name: classifer
Validation Accuracy: 0.8492


              precision    recall  f1-score   support

        Died       0.85      0.92      0.88       110
    Survived       0.85      0.74      0.79        69

    accuracy                           0.85       179
   macro avg       0.85      0.83      0.84       179
weighted avg       0.85      0.85      0.85       179

pipeline_name: rf_pipeline
Validation Accuracy: 0.8380


              precision    recall  f1-score   support

        Died       0.85      0.89      0.87       110
    Survived       0.81      0.75      0.78        69

    accuracy                           0.84       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179



In [108]:
# ========================================
# TODO TOMORROW MORNING: RUN CROSS-VALIDATION
# ========================================

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    classifier_pipeline, 
    x_train_full, 
    y_train, 
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

print(f"Mean CV Score: {cv_scores.mean():.4f}")
print(f"Std Dev: {cv_scores.std():.4f}")

Mean CV Score: 0.8076
Std Dev: 0.0258


In [109]:
# Get the preprocessor
prep = classifier_pipeline.named_steps['preprocessor']

# Get feature names - this shows what each column represents
feature_names = prep.get_feature_names_out()
print("Feature names:", feature_names)
print("Number of features:", len(feature_names))

# Now transform and create a DataFrame
X_transformed = prep.transform(
    classifier_pipeline.named_steps['engineered_features'].transform(x_train_full)
)

# Create a DataFrame with proper column names
df_transformed = pd.DataFrame(X_transformed, columns=feature_names, index=x_train_full.index)
print("\nTransformed DataFrame:")
print(df_transformed.head(1))

Feature names: ['age_imputer__Age' 'num__Is_Alone' 'cat__Pclass_1' 'cat__Pclass_2'
 'cat__Pclass_3' 'cat__Sex_female' 'cat__Sex_male']
Number of features: 7

Transformed DataFrame:
             age_imputer__Age  num__Is_Alone  cat__Pclass_1  cat__Pclass_2  \
PassengerId                                                                  
184                       1.0            0.0            0.0            1.0   

             cat__Pclass_3  cat__Sex_female  cat__Sex_male  
PassengerId                                                 
184                    0.0              0.0            1.0  


fit to all train data

In [114]:
final_pipeline = classifier_pipeline
final_pipeline.fit(x, y)

In [115]:
# to create a submission on test data
test_data = pd.read_csv('titanic/test.csv', index_col='PassengerId')
# test_data['Deck'] = test_data['Cabin'].str[0].fillna('Unknown')
predictions = final_pipeline.predict(test_data)

output = pd.DataFrame(predictions, index=test_data.index, columns=['Survived'])
output.index.name = 'PassengerId'
output.to_csv('submission.csv')

Lets try different model types

In [117]:
# Test different models using the split
from sklearn.linear_model import LogisticRegression


print("DEVELOPMENT PHASE - Comparing models with validation split")
print("="*60)

models_to_test = {
    'Random Forest': Pipeline(steps=[
        ('engineered_features', create_feature_extractor),
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=0))
    ]),
    'XGBoost': Pipeline(steps=[
        ('engineered_features', create_feature_extractor),
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=150, learning_rate=0.06, max_depth=5,
            subsample=0.8, colsample_bytree=0.4, random_state=0
        ))
    ]),
    'Logistic Regression': Pipeline(steps=[
        ('engineered_features', create_feature_extractor),
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=0))
    ])
}

DEVELOPMENT PHASE - Comparing models with validation split


Print out presicion

In [118]:
est_model_name = None
best_accuracy = 0

for name, pipeline in models_to_test.items():
    pipeline.fit(x_train_full, y_train)
    preds = pipeline.predict(x_valid_full)
    accuracy = (preds == y_valid).mean()
    print(f"{name:20s}: {accuracy:.4f}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name

print(f"\nBest model: {best_model_name} ({best_accuracy:.4f})")

Random Forest       : 0.8156
XGBoost             : 0.8492
Logistic Regression : 0.8436

Best model: XGBoost (0.8492)


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  ret = a @ b
  ret = a @ b
  ret = a @ b
