# Kaggle Titanic Dataset Challenge – Train

_created by Austin Poor_

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
import sys
from pathlib import Path

WORK_DIR = Path("..")
SCRIPT_DIR = WORK_DIR / "scripts"
DATA_DIR = WORK_DIR / "data"
MODEL_DIR = WORK_DIR / "models"

# Add the script directory to python's path
# so I can import code from `scripts/`
sys.path.append(str(SCRIPT_DIR.resolve()))

list(DATA_DIR.glob("*"))

[PosixPath('../data/test.csv'),
 PosixPath('../data/train.csv'),
 PosixPath('../data/.ipynb_checkpoints'),
 PosixPath('../data/gender_submission.csv'),
 PosixPath('../data/.gitinclude')]

In [3]:
import preprocess

In [4]:
df = preprocess.load_df(DATA_DIR / "train.csv")
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,IsCabinNA,IsAgeNA,Sex_male,Embarked_Q,...,CabinGroup_D,CabinGroup_E,CabinGroup_F,CabinGroup_G,CabinGroup_T,TicketBase_0,TicketBase_1,TicketBase_2,TicketBase_3,TicketBase_4
0,0,3,22.0,1,0,7.25,True,False,1,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,38.0,1,0,71.2833,False,False,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,3,26.0,0,0,7.925,True,False,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,False,False,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,3,35.0,0,0,8.05,True,False,1,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Age           891 non-null    float64
 3   SibSp         891 non-null    int64  
 4   Parch         891 non-null    int64  
 5   Fare          891 non-null    float64
 6   IsCabinNA     891 non-null    bool   
 7   IsAgeNA       891 non-null    bool   
 8   Sex_male      891 non-null    uint8  
 9   Embarked_Q    891 non-null    uint8  
 10  Embarked_S    891 non-null    uint8  
 11  CabinGroup_B  891 non-null    uint8  
 12  CabinGroup_C  891 non-null    uint8  
 13  CabinGroup_D  891 non-null    uint8  
 14  CabinGroup_E  891 non-null    uint8  
 15  CabinGroup_F  891 non-null    uint8  
 16  CabinGroup_G  891 non-null    uint8  
 17  CabinGroup_T  891 non-null    uint8  
 18  TicketBase_0  891 non-null    

In [6]:
from sklearn.metrics import roc_auc_score, accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [7]:
models = {}
param_grids = {}
results = {}

In [8]:
auc_scorer = make_scorer(roc_auc_score)

In [9]:
X, y = df.drop("Survived",1), df.Survived
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [10]:
scale = StandardScaler()
Xs_train = scale.fit_transform(X_train)
Xs_test = scale.fit_transform(X_test)

## Model Training – Logistic Regression

In [11]:
param_grids["logistic_regression"] = {
    "penalty": ["l1","l2"],
    "C": [1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3],
    "class_weight": [None,"balanced"]
}
models["logistic_regression"] = GridSearchCV(
    estimator=LogisticRegression(verbose=0,n_jobs=1),
    param_grid=param_grids["logistic_regression"],
    verbose=5,
    cv=5,
    scoring=auc_scorer,
    n_jobs=-1
)
models["logistic_regression"].fit(Xs_train,y_train)
results["logistic_regression"] = models['logistic_regression'].best_score_

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    2.3s finished


## Model Training – Random Forest

In [12]:
param_grids["random_forest"] = {
    "n_estimators": [120,300,500,800,1200],
    "max_depth": [5,8,15,25,30,None],
    "min_samples_split": [1,2,5,10,15,100],
    "min_samples_leaf": [1,2,5,10],
    "max_features": ["log2","sqrt",None],
}
models["random_forest"] = GridSearchCV(
    estimator=RandomForestClassifier(verbose=0,n_jobs=1),
    param_grid=param_grids["random_forest"],
    verbose=5,
    cv=5,
    scoring=auc_scorer,
    n_jobs=-1
)
models["random_forest"].fit(X_train,y_train)
results["random_forest"] = models['random_forest'].best_score_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 466 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 898 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1168 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1474 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1816 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 2194 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 2608 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 3058 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 3544 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 4066 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 4624 tasks      | elapsed: 13.4min

## Model Training – XGBoost

In [13]:
param_grids["xgboost"] = {
    "eta": [.01,.05,.1,.2,.3],
    "gamma": [.05,.075,.1,.3,.5,.7,.9,1.],
    "max_depth": [3,5,9,12,15,20],
    "min_child_weight": [1,3,5,7],
    "subsample": [.6,.8,1.],
    "colsample_bytree": [.6,.8,1.],
    "lambda": [.01,.05,.1,1.],
    "alpha": [0,.1,.5,1.],
}
models["xgboost"] = GridSearchCV(
    estimator=XGBClassifier(verbose=0,n_jobs=1),
    param_grid=param_grids["xgboost"],
    verbose=5,
    cv=3,
    scoring=auc_scorer,
    n_jobs=-1
)
models["xgboost"].fit(X_train,y_train)
results["xgboost"] = models['xgboost'].best_score_

Fitting 3 folds for each of 138240 candidates, totalling 414720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 1036 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 2188 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 2872 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 3628 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 4456 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 5356 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 6328 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 7372 tasks      | el

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




## Model Training Results

In [14]:
for model_name, best_score in results.items():
    print(f'{model_name:>20s} %> {best_score:.4f}')

 logistic_regression %> 0.7850
       random_forest %> 0.8270
             xgboost %> 0.8393


## Model Saving

In [15]:
round(results["logistic_regression"],2)

0.79

In [16]:
import joblib
from datetime import datetime

In [17]:
for model_name, model in models.items():
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    auc_val = results.get(model_name)
    filename = MODEL_DIR / f"{model_name}_{timestamp}_auc{auc_val:.02f}.gz"
    with open(filename,"wb") as f:
        joblib.dump(model.best_estimator_,f)