In [301]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [302]:
# Load the training CSV (path is relative to the notebook's working directory)
# and preview the first rows.
train_df = pd.read_csv("./data/transformed_train.csv")
train_df.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None,Cabin_T,Survived
0,1,0,22.0,1,0,2.110213,1,0,1,0,0,0,0,0,0,1,0,0
1,2,2,38.0,1,0,4.280593,0,0,0,0,1,0,0,0,0,0,0,1
2,3,0,26.0,0,0,2.188856,0,0,1,0,0,0,0,0,0,1,0,1
3,4,2,35.0,1,0,3.990834,0,0,1,0,1,0,0,0,0,0,0,1
4,5,0,35.0,0,0,2.202765,1,0,1,0,0,0,0,0,0,1,0,0


In [303]:
# Load the test CSV (path is relative to the notebook's working directory)
# and preview the first rows.
test_df = pd.read_csv("./data/transformed_test.csv")
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None
0,892,0,34.5,0,0,2.178064,1,1,0,0,0,0,0,0,0,1
1,893,0,47.0,1,0,2.079442,0,0,1,0,0,0,0,0,0,1
2,894,1,62.0,0,0,2.369075,1,1,0,0,0,0,0,0,0,1
3,895,0,27.0,0,0,2.268252,1,0,1,0,0,0,0,0,0,1
4,896,0,22.0,1,1,2.586824,0,0,1,0,0,0,0,0,0,1


In [304]:
# Feature subset used for every model below. The name and the thresholds in the
# comments suggest the columns were picked by mutual-information score against
# the target -- TODO confirm where those scores were computed.
# Alternative, looser cut-off (score > 0.01), kept for reference:
# high_mi_scores = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_S', 'Cabin_B', 'Cabin_D', 'Cabin_E', 'Cabin_None'] # > 0.01
# Active selection: stricter cut-off (score > 0.04).
high_mi_scores = ['Cabin_None', 'Pclass', 'Fare', 'Sex_male'] # > 0.04

In [305]:
# Name of the label column in train_df that the models predict.
target = "Survived"

In [306]:
# Labels are read straight from the target column.
y = train_df[target]
# Features: remove the label and the passenger identifier, then keep only the
# columns listed in high_mi_scores (in that order).
X = train_df.drop(columns=[target, "PassengerId"]).loc[:, high_mi_scores]
# Show both shapes as a quick row-count consistency check.
(X.shape, y.shape)

((891, 4), (891,))

In [307]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [308]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no statistics from the hold-out
# rows are used.
# NOTE: X_train is overwritten with the scaled values here; the unscaled
# training split is no longer available under this name afterwards.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [309]:
# Apply the already-fitted scaler to the hold-out split (transform only, no
# refit). X_test is overwritten with the scaled values.
X_test = scaler.transform(X_test)

In [310]:
from sklearn.metrics import classification_report, accuracy_score

def report_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Print train/test accuracy and a test-set classification report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features. Defaults to the notebook-level ``X_train``
            as it existed when this ``def`` was executed (Python binds default
            values at definition time), so re-run this cell after changing the
            split or scaling.
        y_train: Training labels; same default rule as ``X_train``.
        X_test: Hold-out features; same default rule as ``X_train``.
        y_test: Hold-out labels; same default rule as ``X_train``.

    Returns:
        None. All results are written to stdout.
    """
    # Predict the training split first, then the hold-out split.
    fitted_on_train = model.predict(X_train)
    fitted_on_test = model.predict(X_test)

    # Compute both scores before printing anything.
    train_score = accuracy_score(y_train, fitted_on_train)
    test_score = accuracy_score(y_test, fitted_on_test)

    print(f"training accuracy: {train_score}")
    print(f"testing accuracy: {test_score}")
    # Per-class precision/recall/F1 on the hold-out split only.
    print(classification_report(y_test, fitted_on_test))

In [311]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters, built and fitted
# in one expression (fit returns the estimator itself).
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Print train/test accuracy and the per-class report for the baseline.
report_model(lr_model)

training accuracy: 0.7879213483146067
testing accuracy: 0.7821229050279329
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       105
           1       0.75      0.70      0.73        74

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [312]:
from sklearn.model_selection import GridSearchCV

In [313]:
# Candidate values of C (inverse regularisation strength) to search over.
param_grid = dict(C=[0.01, 0.1, 1, 10, 100])

# Exhaustive search using GridSearchCV's default cross-validation and scoring.
optim_lr_model = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
)
optim_lr_model.fit(X_train, y_train)

In [314]:
# Hyper-parameter combination selected by the logistic-regression grid search.
optim_lr_model.best_params_

{'C': 0.1}

In [315]:
# Evaluate the tuned logistic regression on the train and hold-out splits.
report_model(optim_lr_model)

training accuracy: 0.7907303370786517
testing accuracy: 0.7821229050279329
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       105
           1       0.75      0.70      0.73        74

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [316]:
# Sanity check: (rows, columns) of the scaled training matrix.
X_train.shape

(712, 4)

In [317]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest: forest size and whether each tree is
# trained on a bootstrap sample.
param_grid = dict(
    n_estimators=[10, 100, 200],
    bootstrap=[True, False],
)

# Exhaustive search using GridSearchCV's default cross-validation and scoring.
optim_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
)
optim_rfc.fit(X_train, y_train)

In [318]:
# Hyper-parameter combination selected by the random-forest grid search.
optim_rfc.best_params_

{'bootstrap': True, 'n_estimators': 10}

In [319]:
# Evaluate the tuned random forest on the train and hold-out splits.
report_model(optim_rfc)

training accuracy: 0.9101123595505618
testing accuracy: 0.8212290502793296
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [320]:
from sklearn.ensemble import AdaBoostClassifier

# Search space for AdaBoost: number of boosting rounds only.
param_grid = dict(n_estimators=[2, 5, 10, 15, 20])

# Build and fit the search in one expression (fit returns the search object),
# then print train/test accuracy and the per-class report.
optim_ada_model = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=param_grid,
).fit(X_train, y_train)
report_model(optim_ada_model)

training accuracy: 0.7879213483146067
testing accuracy: 0.7821229050279329
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       105
           1       0.75      0.70      0.73        74

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [321]:
from sklearn.svm import SVC

# Search space for the SVC: penalty strength C and kernel coefficient gamma,
# three values each.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 1, 10],
)

# Exhaustive search using GridSearchCV's default cross-validation and scoring.
optim_svc = GridSearchCV(estimator=SVC(random_state=42), param_grid=param_grid)
optim_svc.fit(X_train, y_train)

In [322]:
# Hyper-parameter combination selected by the SVC grid search.
optim_svc.best_params_

{'C': 1, 'gamma': 1}

In [323]:
# Evaluate the tuned SVC on the train and hold-out splits.
report_model(optim_svc)

training accuracy: 0.8146067415730337
testing accuracy: 0.7877094972067039
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       105
           1       0.79      0.66      0.72        74

    accuracy                           0.79       179
   macro avg       0.79      0.77      0.77       179
weighted avg       0.79      0.79      0.78       179



In [324]:
from sklearn.ensemble import GradientBoostingClassifier

# Search space for gradient boosting.
param_grid = {
    # Odd numbers of boosting stages from 1 to 49.
    "n_estimators": np.arange(1, 50, 2),
    # A split cannot consider more features than the matrix has columns, so
    # candidates above X_train.shape[1] are dropped. With the current
    # 4-feature selection this removes 5, which is at best a duplicate of 4
    # (and may be rejected outright, depending on the scikit-learn version).
    "max_features": [m for m in (2, 3, 4, 5) if m <= X_train.shape[1]],
}

# Seeded like every other estimator in this notebook: with max_features below
# the column count the per-split feature subsampling is random, so an unseeded
# model makes best_params_ and the reported scores change between runs.
optim_gbc = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid)
optim_gbc.fit(X_train, y_train)

In [325]:
# Hyper-parameter combination selected by the gradient-boosting grid search.
optim_gbc.best_params_

{'max_features': 4, 'n_estimators': np.int64(33)}

In [326]:
# Evaluate the tuned gradient-boosting model on the train and hold-out splits.
report_model(optim_gbc)

training accuracy: 0.8426966292134831
testing accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       105
           1       0.82      0.66      0.73        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.79       179



In [327]:
# Labels for the final fit: every labelled row, no hold-out.
y_train_final = train_df[target]
# Final training features: drop label and identifier, keep the selected columns.
X_train_final = train_df.drop(columns=[target, "PassengerId"]).loc[:, high_mi_scores]
# Same column selection on the test frame, so both matrices line up.
X_test_final = test_df.drop(columns="PassengerId").loc[:, high_mi_scores]
# Display the three shapes as a consistency check.
(X_train_final.shape, X_test_final.shape, y_train_final.shape)

((891, 4), (418, 4), (891,))

In [328]:
# Rebuild the chosen model (random forest) and train it on the entire labelled
# dataset.
# A fresh scaler is fitted on all training rows and then applied, without
# refitting, to the test rows. Both matrices are overwritten with their scaled
# versions.
final_scaler = StandardScaler()
X_train_final = final_scaler.fit_transform(X_train_final)
X_test_final = final_scaler.transform(X_test_final)

# Take the hyper-parameters from the random-forest grid search instead of
# hard-coding copies of them, so this cell cannot drift out of sync when the
# search is re-run with a different grid or feature set.
rfc = RandomForestClassifier(random_state=42, **optim_rfc.best_params_)
rfc.fit(X_train_final, y_train_final)

In [329]:
# Predict labels for the scaled test rows with the final random forest, then
# show the shape to confirm there is one prediction per test row.
y_final_pred = rfc.predict(X_test_final)
y_final_pred.shape

(418,)

In [330]:
# Submission frame: the test identifiers with the predicted label attached as a
# second column (rows keep test_df's order and index).
final_pred = test_df[["PassengerId"]].assign(Survived=y_final_pred)
final_pred.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [331]:
# Write the submission file without the row index. `index` is a boolean flag,
# so pass False explicitly rather than relying on None being falsy.
final_pred.to_csv("./data/submission.csv", index=False)