In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [70]:
# Read the transformed training CSV and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer="./data/transformed_train.csv")
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None,Cabin_T,FamilyMembers
0,1,0,0,22.0,2.110213,1,0,1,0,0,0,0,0,0,1,0,1
1,2,1,2,38.0,4.280593,0,0,0,0,1,0,0,0,0,0,0,1
2,3,1,0,26.0,2.188856,0,0,1,0,0,0,0,0,0,1,0,0
3,4,1,2,35.0,3.990834,0,0,1,0,1,0,0,0,0,0,0,1
4,5,0,0,35.0,2.202765,1,0,1,0,0,0,0,0,0,1,0,0


In [71]:
# Read the transformed test CSV (it has no Survived column) and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer="./data/transformed_test.csv")
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None,Cabin_T,FamilyMembers
0,892,0,34.5,2.178064,1,1,0,0,0,0,0,0,0,1,0.0,0
1,893,0,47.0,2.079442,0,0,1,0,0,0,0,0,0,1,0.0,1
2,894,1,62.0,2.369075,1,1,0,0,0,0,0,0,0,1,0.0,0
3,895,0,27.0,2.268252,1,0,1,0,0,0,0,0,0,1,0.0,0
4,896,0,22.0,2.586824,0,0,1,0,0,0,0,0,0,1,0.0,2


In [72]:
# Feature columns kept for modelling (selected upstream with mi_score > 0.04).
high_mi_scores = [
    'Cabin_None',
    'Pclass',
    'Fare',
    'Sex_male',
    'FamilyMembers',
]

In [73]:
# Name of the label column; later cells use it to separate labels from features.
target = "Survived"

In [74]:
# Labels come straight from the target column; features are the selected
# columns of the frame with the label and passenger id removed.
y = train_df[target]
X = train_df.drop(columns=[target, "PassengerId"])[high_mi_scores]
(X.shape, y.shape)

((891, 5), (891,))

In [75]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [76]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then rebind
# X_train to its scaled version.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [77]:
# Scale the held-out split with the scaler fitted on the training split (no refit).
# NOTE: X_test is rebound to its scaled version, so running this cell a second
# time without re-running the split cell would scale the data twice.
X_test = scaler.transform(X_test)

In [78]:
from sklearn.metrics import classification_report, accuracy_score

def report_model(model, X_train=None, y_train=None, X_test=None, y_test=None):
    """Print train/test accuracy and a test-set classification report for ``model``.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X_train, y_train, X_test, y_test : array-like, optional
        Data to score against. Any argument left as ``None`` is read from the
        notebook-level variable of the same name at call time.

    Returns
    -------
    None
        The results are printed, not returned.

    Raises
    ------
    KeyError
        If a split is omitted and no notebook-level variable of that name exists.
    """
    # Resolve omitted splits here rather than in the signature: a signature
    # default is captured once, when the `def` cell runs, so re-running the
    # split or scaling cells afterwards would leave this function silently
    # scoring stale arrays.
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars["X_train"]
    if y_train is None:
        y_train = notebook_vars["y_train"]
    if X_test is None:
        X_test = notebook_vars["X_test"]
    if y_test is None:
        y_test = notebook_vars["y_test"]

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    print(f"training accuracy: {train_acc}")
    print(f"testing accuracy: {test_acc}")
    print(classification_report(y_test, y_test_pred))

In [79]:
from sklearn.linear_model import LogisticRegression

# Baseline: an untuned logistic regression fitted on the scaled training split.
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)

report_model(model=lr_model)

training accuracy: 0.7991573033707865
testing accuracy: 0.770949720670391
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       105
           1       0.74      0.69      0.71        74

    accuracy                           0.77       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.77      0.77      0.77       179



In [80]:
from sklearn.model_selection import GridSearchCV

In [81]:
# Grid-search the C hyper-parameter of logistic regression over five powers of ten.
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}

optim_lr_model = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
)
optim_lr_model.fit(X_train, y_train)

In [82]:
# Show the value of C selected by the logistic-regression grid search.
optim_lr_model.best_params_

{'C': 0.1}

In [83]:
# Report train/test accuracy and the test classification report for the tuned logistic regression.
report_model(optim_lr_model)

training accuracy: 0.8019662921348315
testing accuracy: 0.776536312849162
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       105
           1       0.75      0.69      0.72        74

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.77       179



In [84]:
# Sanity check: dimensions of the scaled training split.
X_train.shape

(712, 5)

In [85]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: search over the number of trees and whether bootstrap sampling is used.
param_grid = {"n_estimators": [10, 100, 200], "bootstrap": [True, False]}

optim_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
)
optim_rfc.fit(X_train, y_train)

In [86]:
# Show the random-forest settings selected by the grid search.
optim_rfc.best_params_

{'bootstrap': True, 'n_estimators': 10}

In [87]:
# Report train/test accuracy and the test classification report for the tuned random forest.
report_model(optim_rfc)

training accuracy: 0.9185393258426966
testing accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179



In [88]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost: search over a handful of small ensemble sizes, then report the result.
param_grid = {"n_estimators": [2, 5, 10, 15, 20]}

optim_ada_model = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=param_grid,
)
optim_ada_model.fit(X_train, y_train)
report_model(model=optim_ada_model)

training accuracy: 0.8132022471910112
testing accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [89]:
from sklearn.svm import SVC

# Support vector classifier: search a 3x3 grid over C and gamma.
param_grid = {"C": [0.1, 1, 10], "gamma": [0.1, 1, 10]}

optim_svc = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid,
)
optim_svc.fit(X_train, y_train)

In [90]:
# Show the C / gamma pair selected by the grid search.
optim_svc.best_params_

{'C': 1, 'gamma': 0.1}

In [91]:
# Report train/test accuracy and the test classification report for the tuned SVC.
report_model(optim_svc)

training accuracy: 0.8132022471910112
testing accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [92]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: search over the number of boosting stages and the number
# of features considered per split.
param_grid = {
    # Plain Python ints (1, 3, ..., 49) instead of np.arange, so best_params_
    # reports ordinary ints rather than NumPy scalars.
    "n_estimators": list(range(1, 50, 2)),
    "max_features": [2, 3, 4, 5]
}

# Seeded like every other estimator in this notebook. Without a fixed
# random_state the fitted trees, and therefore the selected hyper-parameters
# and reported scores, can change from one run to the next.
optim_gbc = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid)
optim_gbc.fit(X_train, y_train)

In [93]:
# Show the gradient-boosting settings selected by the grid search.
optim_gbc.best_params_

{'max_features': 5, 'n_estimators': np.int64(35)}

In [94]:
# Report train/test accuracy and the test classification report for the tuned gradient boosting model.
report_model(optim_gbc)

training accuracy: 0.8497191011235955
testing accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       105
           1       0.82      0.68      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.79      0.79       179
weighted avg       0.81      0.80      0.80       179



In [95]:
# Assemble the data for the final fit: every labelled row for training and the
# unlabelled test rows for prediction, both restricted to the selected features.
y_train_final = train_df[target]
X_train_final = train_df.drop(columns=[target, "PassengerId"])[high_mi_scores]
X_test_final = test_df.drop(columns="PassengerId")[high_mi_scores]
(X_train_final.shape, X_test_final.shape, y_train_final.shape)

((891, 5), (418, 5), (891,))

In [96]:
# Rebuilding the best model and training on the entire dataset
# A fresh scaler is fitted on all labelled rows; the test rows are only transformed.
final_scaler = StandardScaler()
X_train_final = final_scaler.fit_transform(X_train_final)
X_test_final = final_scaler.transform(X_test_final)

# Take the hyper-parameters from the random-forest search instead of retyping
# them by hand, so the final model cannot drift out of sync if the grid or
# the data changes.
rfc = RandomForestClassifier(random_state=42, **optim_rfc.best_params_)
rfc.fit(X_train_final, y_train_final)

In [97]:
# Predict a label for every row of the scaled test set.
y_final_pred = rfc.predict(X_test_final)
# Sanity check: one prediction per test row.
y_final_pred.shape

(418,)

In [98]:
# Pair each test passenger id with its predicted label in a two-column frame.
final_pred = pd.DataFrame(
    data={"PassengerId": test_df["PassengerId"], "Survived": y_final_pred},
)
final_pred.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [99]:
# Write the predictions without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
final_pred.to_csv("./data/submission.csv", index=False)