In [5]:
# -------------------------------
# Load libraries and data
# -------------------------------

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.inspection import permutation_importance
import re

# specify input paths
train_file_path = '/kaggle/input/titanic/train.csv'
test_file_path = '/kaggle/input/titanic/test.csv'

# load files as pandas dataframes
train_data = pd.read_csv(train_file_path)  
test_data = pd.read_csv(test_file_path)

train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [6]:
# -------------------------------
# Feature Engineering
# -------------------------------

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    
    # Family features
    df['FamilySize'] = df['SibSp'].fillna(0) + df['Parch'].fillna(0) + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Title
    df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\\.", expand=False)
    df['Title'] = df['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Capt',
                                      'Don', 'Lady', 'Sir', 'Countess', 'Jonkheer'], 'Rare')
    
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    # Cabin
    df['IsCabin'] = df['Cabin'].isna().astype(int)

    # Rank Fare within each class
    df['FareRankWithinClass'] = df['Fare'].groupby(df['Pclass']).rank()

    # Use an interaction for Sex x Pclass
    df['Sex_Pclass'] = df["Sex"].astype(str) + "_P" + df["Pclass"].astype(str)

    # Use an interaction for Sex x Embarked
    df['Sex_Embarked'] = df["Sex"].astype(str) + "_P" + df["Embarked"].astype(str)

    bins = [0, 5, 12, 18, 120]
    labels = ['0–5', '6-12', '13–18', '19+']

    df['AgeBin'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

    bins = [0, 20, 40, 60, 100, 200, 700]
    labels = ['0–20', '21-40', '41–60', '61-100', '101-200', '201+']

    df['FareBin'] = pd.cut(df['Fare'], bins=bins, labels=labels, right=False)
    
    return df

## Add fare and age binned variables, ticket number (without prefix), ethnicity

train_data = add_features(train_data)
test_data = add_features(test_data)

In [7]:
# -------------------------------
# Pipeline set-up
# -------------------------------

# Choose features to process
numerical_features = ['Age', 'SibSp', 'FamilySize', 'Parch', 'FareRankWithinClass']
categorical_features = ['Pclass', 'Sex', 'Title', 'Sex_Pclass', 'Embarked', 'Sex_Embarked', 'AgeBin', 'FareBin']
binary_features = ['IsAlone', 'IsCabin']

# Preprocessing numerical features (insert median for missing values and z-score scaling)
numerical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
                                       ('scaler', StandardScaler())])

# Preprocessing categorical features (insert most frequent for missing values)
categorical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent')),
                                           ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

# Preprocessing binary features (insert most frequent for missing values)
binary_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent'))])

preprocessor = ColumnTransformer(transformers = [('num', numerical_transformer, numerical_features),
                                                 ('cat', categorical_transformer, categorical_features),
                                                 ('bin', binary_transformer, binary_features)])

In [8]:
# -------------------------------
# Prepare data
# -------------------------------
# train dataset
y = train_data.Survived
X = train_data[numerical_features + categorical_features + binary_features]


# test dataset
X_test = test_data[numerical_features + categorical_features + binary_features]

In [13]:
# -------------------------------
# Model fit and cross-validation
# -------------------------------
model = Pipeline(steps = [('preprocessor', preprocessor),
                       ('model', LogisticRegression(max_iter = 5000, C = 0.5, penalty = 'l1', solver = 'liblinear'))])

#
#GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42) 

scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring="accuracy"
)

print("model mean:", scores.mean())
print("model std:", scores.std())

model mean: 0.824913690289373
model std: 0.027163751309070044


In [12]:
# -------------------------------
# Hyperparameter tuning
# -------------------------------
C_grid = [0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0]

tune_rows = []

for C in C_grid:
    model_c = Pipeline(steps = [('preprocessor', preprocessor), ('model', LogisticRegression(max_iter = 5000, C = C, penalty = 'l1', solver = 'liblinear'))])
    m = cross_val_score(model_c, X, y, cv=5, scoring="accuracy")
    tune_rows.append({"C": C, "mean": m.mean(), "std": m.std()})

tune_df = pd.DataFrame(tune_rows).sort_values("mean", ascending=False)
tune_df

Unnamed: 0,C,mean,std
6,3.0,0.824933,0.033119
4,1.0,0.824926,0.027311
3,0.5,0.824914,0.027164
5,2.0,0.823809,0.031
7,5.0,0.820438,0.033451
8,10.0,0.819321,0.03387
2,0.2,0.814795,0.016859
1,0.1,0.805813,0.015173
0,0.05,0.795719,0.01468


In [14]:
# -------------------------------
# Model fit and save
# -------------------------------
model.fit(X,y)

y_pred = model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": y_pred
})

submission.to_csv("submission.csv", index=False)

In [117]:
# -------------------------------
# Feature selection & errors
# -------------------------------

In [15]:
# Errors
# -------------------------------
oof_pred = cross_val_predict(model, X, y, cv=5, method="predict")
oof_prob = cross_val_predict(model, X, y, cv=5, method="predict_proba")[:, 1]

errors = X.copy()
errors["true"] = y.values
errors["pred"] = oof_pred
errors["prob_survived"] = oof_prob
errors["error"] = (errors["true"] != errors["pred"])

print("Error rate by Sex:\n", errors.groupby("Sex")["error"].mean().sort_values(ascending=False))
print("\nError rate by Pclass:\n", errors.groupby("Pclass")["error"].mean().sort_values(ascending=False))
print("\nError rate by Title:\n", errors.groupby("Title")["error"].mean().sort_values(ascending=False))

# By interactions (Sex x Pclass)
group_err = errors.groupby(["Sex", "Pclass"])["error"].mean().unstack()
print("\nError rate by Sex x Pclass:\n", group_err)

# By interactions (Sex x Pclass)
group_err = errors.groupby(["Sex", "Embarked"])["error"].mean().unstack()
print("\nError rate by Sex x Pclass:\n", group_err)

Error rate by Sex:
 Sex
female    0.197452
male      0.162912
Name: error, dtype: float64

Error rate by Pclass:
 Pclass
1    0.236111
3    0.185336
2    0.076087
Name: error, dtype: float64

Error rate by Title:
 Title
Rare      0.304348
Miss      0.210811
Mrs       0.182540
Mr        0.164410
Master    0.050000
Name: error, dtype: float64

Error rate by Sex x Pclass:
 Pclass         1         2         3
Sex                                 
female  0.031915  0.078947  0.368056
male    0.393443  0.074074  0.109510

Error rate by Sex x Pclass:
 Embarked         C         Q         S
Sex                                   
female    0.123288  0.250000  0.216749
male      0.284211  0.073171  0.145125


In [16]:
#model.fit(X, y)

feat_names = list(X)

perm = permutation_importance(
    model,
    X, y,
    n_repeats=20,
    random_state=42,
    scoring="accuracy"
)

perm_df = pd.DataFrame({
    "expanded_feature": feat_names,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)

perm_df.head(30)

Unnamed: 0,expanded_feature,importance_mean,importance_std
7,Title,0.070314,0.007276
8,Sex_Pclass,0.045791,0.006123
6,Sex,0.043322,0.006343
2,FamilySize,0.034512,0.005508
0,Age,0.021437,0.006798
10,Sex_Embarked,0.01431,0.001874
13,IsAlone,0.007015,0.003009
14,IsCabin,0.006004,0.003518
1,SibSp,0.004938,0.003894
5,Pclass,0.004602,0.003635
