In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from titanic_preprocess import *

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

In [2]:
original_df = pd.read_csv('data/titanic.csv')

In [3]:
original_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
X, y = preprocess_titanic_no_tts(original_df)

In [6]:
X.head()

Unnamed: 0,Embarked_Q,Embarked_S,Sex_male,Pclass,Age,SibSp,Parch,Fare
0,0.0,1.0,1.0,1.0,0.271174,0.125,0.0,0.014151
1,0.0,0.0,0.0,0.0,0.472229,0.125,0.0,0.139136
2,0.0,1.0,0.0,1.0,0.321438,0.0,0.0,0.015469
3,0.0,1.0,0.0,0.0,0.434531,0.125,0.0,0.103644
4,0.0,1.0,1.0,1.0,0.434531,0.0,0.0,0.015713


In [7]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [8]:
x_kfold, x_val, y_kfold, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
y_kfold.value_counts()

0    341
1    242
Name: Survived, dtype: int64

## Base Model - Decision Tree Classifier

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
k_fold = StratifiedKFold(n_splits = 3, random_state = 42)

training_splits = k_fold.split(x_kfold, y_kfold)

In [12]:
from sklearn.metrics import roc_auc_score

roc_auc_results = []

for train_ind, test_ind in training_splits:
    dtc = DecisionTreeClassifier()
    x_train = x_kfold.iloc[train_ind]
    x_test = x_kfold.iloc[test_ind]
    y_train = y_kfold.iloc[train_ind]
    y_test = y_kfold.iloc[test_ind]
    
    dtc.fit(x_train, y_train)
    
    y_hat = dtc.predict(x_test)
    y_hat_proba = dtc.predict_proba(x_test)[:,1]
    roc_auc_results.append(roc_auc_score(y_test, y_hat_proba))

In [13]:
np.mean(roc_auc_results)

0.7176896911056548

## Let's Stop Writing For Loops
We'll start using the GridSearchCV for the easiest method of parameter selection and cross validation.

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [15]:
dtc = DecisionTreeClassifier(random_state=42)

In [16]:
dtc.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 42,
 'splitter': 'best'}

In [17]:
param_grid = {'criterion': ['gini', 'entropy'],
             'max_depth': np.arange(2, 20, 1)}

In [18]:
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
dtc_gs = GridSearchCV(dtc, param_grid, n_jobs=-1, cv=5, scoring=roc_auc_scorer)

In [19]:
dtc_gs.fit(x_kfold, y_kfold)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 

In [20]:
dtc_gs.best_score_

0.8454159555536735

In [21]:
best_dtc = dtc_gs.best_estimator_

In [22]:
y_kfold_hat = best_dtc.predict(x_kfold)
y_kfold_hat_proba = best_dtc.predict_proba(x_kfold)[:,1]

In [23]:
y_val_hat = best_dtc.predict(x_val)
y_val_hat_proba = best_dtc.predict_proba(x_val)[:,1]

In [24]:
roc_auc_score(y_val, y_val_hat_proba)

0.8259916721455184

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc = RandomForestClassifier(random_state=42)

In [27]:
rfc.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [28]:
param_grid = {'criterion': ['gini', 'entropy'],
             'max_depth': np.arange(2, 20, 1),
             'n_estimators': np.arange(2, 20, 1),
             'max_features': np.arange(2, 6, 1)}

In [29]:
rfc_gs = GridSearchCV(rfc, param_grid, scoring=roc_auc_scorer, n_jobs=-1)

In [30]:
rfc_gs.fit(x_kfold, y_kfold)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_sco...
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 2,  3,  4,  5,  6

In [31]:
rfc_gs.best_score_

0.8508246689016696

In [32]:
best_rfc = rfc_gs.best_estimator_

In [33]:
best_rfc.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 3,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 13,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

# AdaBoost

In [34]:
from sklearn.ensemble import AdaBoostClassifier

In [35]:
abc = AdaBoostClassifier(random_state=42)

In [36]:
abc.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': 42}

In [37]:
param_grid = {'algorithm': ['SAMME', 'SAMME.R'],
             'learning_rate': 10**np.arange(-3.0, 3.0, 1.0),
             'n_estimators': np.arange(10, 100, 10)}

In [38]:
abc_gs = GridSearchCV(abc, param_grid, n_jobs=-1, scoring=roc_auc_scorer, cv=5)

In [39]:
abc_gs.fit(x_kfold, y_kfold)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## SVM

In [40]:
from sklearn.svm import SVC

In [46]:
svc = SVC(random_state=42, probability=True)

In [47]:
svc.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [48]:
param_grid = {'C': 10**np.arange(-3.0, 3.0, 1.0),
             'degree': np.arange(1, 8, 1),
             'kernel': ['rbf', 'poly', 'linear']}

In [49]:
svc_gs = GridSearchCV(svc, param_grid, n_jobs=-1, scoring=roc_auc_scorer, cv=5)

In [50]:
svc_gs.fit(x_kfold, y_kfold)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=True, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'degree': array([1, 2, 3, 4, 5, 6, 7]),
                         'kernel': ['rbf', 'poly', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(roc_auc_score, needs_proba=True), verbose=0)

In [51]:
svc_gs.best_score_

0.8416037969740809