In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif, SelectKBest
import optuna
import shap

In [3]:
class SMOTETransformer(BaseEstimator, TransformerMixin):
    """Thin wrapper around ``SMOTE`` with an identity ``fit``/``transform``.

    Parameters
    ----------
    random_state : int, default=42
        Seed forwarded to the wrapped ``SMOTE`` sampler.

    Attributes
    ----------
    smote : SMOTE
        The wrapped sampler, created in ``__init__``.

    Notes
    -----
    ``fit`` and ``transform`` perform no resampling: ``fit`` returns the
    instance unchanged and ``transform`` returns its input as-is. Data is
    only oversampled when ``fit_resample`` is called explicitly.
    NOTE(review): whether any pipeline in this notebook actually invokes
    ``fit_resample`` is not visible from this cell -- confirm at the call site.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.smote = SMOTE(random_state=self.random_state)

    def fit_resample(self, X, y):
        """Oversample ``X``/``y`` with the wrapped sampler and return its result."""
        # ``set_params(random_state=...)`` on this wrapper only rewrites
        # ``self.random_state``; push the current value into the sampler so
        # it never runs with the stale seed captured in ``__init__``.
        self.smote.set_params(random_state=self.random_state)
        return self.smote.fit_resample(X, y)

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Identity transform; returns ``X`` unchanged."""
        return X

In [4]:
# Load the data
train_data = pd.read_csv('input/train.csv')
test_data = pd.read_csv('input/test.csv')

# Impute missing ages with the TRAINING median in both frames. This mirrors
# the Embarked handling below, so no imputation statistic is derived from
# the test set.
train_age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(train_age_median)
test_data['Age'] = test_data['Age'].fillna(train_age_median)

# Impute missing embarkments with the most frequent training value
most_frequent_embarked = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_frequent_embarked)
test_data['Embarked'] = test_data['Embarked'].fillna(most_frequent_embarked)

# NOTE(review): only 'Age' and 'Embarked' are imputed in this cell -- confirm
# that other columns used later (e.g. 'Fare') contain no missing values.

# Combine train and test for preprocessing; the index is rebuilt so the
# combined frame has a single 0..n-1 index with the training rows first.
all_data = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)

In [5]:
# Feature Engineering

# Extract titles from names: the run of letters that follows a space and is
# immediately followed by a period.
all_data['Title'] = all_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse uncommon titles into 'Rare' and fold spelling variants into their
# common form, in a single replace pass.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {title: 'Rare' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
all_data['Title'] = all_data['Title'].replace(title_map)

# Create family size feature (siblings/spouses + parents/children + self)
all_data['FamilySize'] = all_data['SibSp'] + all_data['Parch'] + 1

# Create is_alone feature: 1 when FamilySize is exactly 1, else 0
all_data['IsAlone'] = (all_data['FamilySize'] == 1).astype('int64')

# Extract deck from cabin: first character of Cabin, 'U' when Cabin is missing
all_data['Deck'] = all_data['Cabin'].str.slice(0, 1).fillna('U')

# Bin age. include_lowest=True closes the first interval on the left, so an
# age of exactly 0 lands in 'Child' instead of silently becoming NaN.
all_data['AgeBin'] = pd.cut(
    all_data['Age'],
    bins=[0, 12, 20, 40, 60, 100],
    labels=['Child', 'Teen', 'Adult', 'Senior', 'Elderly'],
    include_lowest=True,
)

# Bin fare into quartiles computed over the combined frame
all_data['FareBin'] = pd.qcut(all_data['Fare'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# Create interaction features
all_data['Age*Class'] = all_data['Age'] * all_data['Pclass']
all_data['Fare*Class'] = all_data['Fare'] * all_data['Pclass']

In [6]:
# Feature Selection
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck', 'AgeBin', 'FareBin', 'IsAlone']

# Split back into train and test: rows with a Survived value are training
# rows, rows where Survived is missing are test rows.
has_label = all_data['Survived'].notna()
train_data = all_data[has_label].copy()
test_data = all_data[~has_label].copy()

X = train_data[numeric_features + categorical_features]
# Survived shares a column with missing values in all_data, so it is stored
# as float there; the training rows have no missing labels, so cast them
# back to integer class labels.
y = train_data['Survived'].astype(int)

X.head(5)

Unnamed: 0,Age,Fare,FamilySize,Pclass,Sex,Embarked,Title,Deck,AgeBin,FareBin,IsAlone
0,22.0,7.25,2,3,male,S,Mr,U,Adult,Low,0
1,38.0,71.2833,2,1,female,C,Mrs,C,Adult,Very High,0
2,26.0,7.925,1,3,female,S,Miss,U,Adult,Medium,1
3,35.0,53.1,2,1,female,S,Mrs,C,Adult,Very High,0
4,35.0,8.05,1,3,male,S,Mr,U,Adult,Medium,1


In [7]:
# One-hot encode the categorical columns (first level of each dropped,
# dense array output)
categorical_encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = categorical_encoder.fit_transform(X[categorical_features])
encoded_names = categorical_encoder.get_feature_names_out(categorical_features)
X_encoded = pd.DataFrame(encoded_values, columns=encoded_names)

# Put the numeric columns on a fresh 0..n-1 index so they line up row-for-row
# with the encoded frame, then join the two side by side
X_numeric = X[numeric_features].reset_index(drop=True)
X_preprocessed = pd.concat([X_numeric, X_encoded], axis=1)