In [1]:
import pandas as pd
import numpy as np

import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook rendering, passing the flag by name
# (`connected`) rather than positionally so the call is self-describing.
py.init_notebook_mode(connected=True)

from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [6]:
# Load the engineered feature table, discard the 'TransformedCabinDeck'
# column, and keep only rows without missing values -- expressed as a single
# chain so that re-running the cell always starts again from the CSV.
df = (
    pd.read_csv('df_after_engineering.csv')
    .drop('TransformedCabinDeck', axis=1)
    .dropna()
)

# (rows, columns) remaining after the cleanup; shown as the cell's output.
df.shape

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')

# Repeated hold-out evaluation of a random-forest classifier.
#
# For each of `n_trials` trials:
#   1. split off 20% of the rows as a validation set,
#   2. tune a RandomForestClassifier on the remaining 80% with a randomized
#      search scored by 5-fold stratified CV accuracy,
#   3. score the refit best model on the rows it was fit on and on the
#      held-out validation rows.
# The per-trial CV and validation accuracies are collected, averaged, and
# compared in a histogram.

# Every stochastic step in trial k is seeded with BASE_SEED + k, so the trials
# differ from each other but a Restart & Run All reproduces every number.
BASE_SEED = 42

data = df.drop('PassengerId', axis=1)

# lets see if we can identify a subset where the model does very well
#data = data.where(data['Sex'] == 1).dropna()
#print(data)
X_total = data.drop('Survived', axis=1)
#X_total = X_total.drop('IsAlone', axis=1)
y_total = data['Survived']

# lets keep a validation set as a sanity check
# NOTE: despite its name, training_acc_list stores the best mean
# cross-validation accuracy of each trial (random_grid.best_score_), not the
# accuracy measured on the training rows.
training_acc_list = []
valid_acc_list = []
n_trials = 20

param_opt_list = []

feature_importances_list = []

# The CV splitter and the search space do not depend on the trial, so they
# are built once instead of on every iteration.
skf = StratifiedKFold(n_splits=5)
#skf = KFold(n_splits=5)

steps = 100

# RandomForestClassifier repr kept from the original notebook for reference:
#
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#         max_depth=None, max_features='auto', max_leaf_nodes=None,
#         min_impurity_decrease=0.0, min_impurity_split=None,
#         min_samples_leaf=1, min_samples_split=2,
#         min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
#         oob_score=False, random_state=None, verbose=0,
#         warm_start=False)

param_dict = {
    'max_leaf_nodes' : list(range(50, 100)),
    'min_samples_leaf' : list(range(1, 100)),
    'max_depth' : list(range(5, 20)),
    'min_samples_split' : list(range(2, 40)),
    #'n_jobs' : [-1],
    #'min_impurity_decrease' : np.linspace(0.1, 0.3, steps),
    'min_weight_fraction_leaf' : np.linspace(0, 0.3, steps)
}

for i in range(n_trials):  # should be doing hypothesis testing maybe, or maybe that's a bad idea
    trial_seed = BASE_SEED + i

    # Stratify the hold-out split on the target so the validation set has the
    # same class balance as the stratified CV folds used for tuning.
    X, X_validation, y, y_validation = train_test_split(
        X_total,
        y_total,
        test_size=0.2,
        shuffle=True,
        stratify=y_total,
        random_state=trial_seed,
    )

    random_grid = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=trial_seed),
        param_distributions=param_dict,
        scoring='accuracy',
        # Pass the splitter itself rather than a one-shot `skf.split(X, y)`
        # generator; the search calls `split` on the data given to `fit`.
        cv=skf,
        n_jobs=-1,
        n_iter=20,
        verbose=0,
        random_state=trial_seed,
    )

    random_grid.fit(X, y)

    #print('results from test', i+1)
    print('best accuracy on cv set was', random_grid.best_score_)
    training_acc_list.append(random_grid.best_score_)

    # With the default refit=True the search has already refit the winning
    # configuration on all of (X, y); reuse it instead of fitting a second
    # forest from best_params_.
    rfc = random_grid.best_estimator_
    pred = rfc.predict(X_validation)

    # check for overfitting (?)
    overfit_pred = rfc.predict(X)
    print('acc on training set is', accuracy_score(y, overfit_pred))

    valid_acc = accuracy_score(y_validation, pred)
    print('accuracy on validation set was', valid_acc)
    valid_acc_list.append(valid_acc)
    print('f1_score for validation set is', f1_score(y_validation, pred))
    print('precision score for validation set is', precision_score(y_validation, pred))
    # what fraction of our selected items actually survived?
    print('recall score for validation set is', recall_score(y_validation, pred))
    # fraction of how many of the correctly predicted surviving / all of the actual surviving
    print('best_params_ are', random_grid.best_params_)
    print('test', i, 'complete')
    print()

    #opt_param = 'max_depth'
    #param_opt_list.append((random_grid.best_params_[opt_param], valid_acc_list[-1]))

# "training accuracy" here is the mean of the per-trial best CV accuracies
# (see the note on training_acc_list above); the label is kept as-is.
print('average training accuracy was', sum(training_acc_list) / n_trials)
print('average validation accuracy was', sum(valid_acc_list) / n_trials)

# Distribution of the two accuracy estimates across trials.
trace0 = go.Histogram(x=training_acc_list, opacity=0.5, name='train')
trace1 = go.Histogram(x=valid_acc_list, opacity=0.5, name='valid')

layout = go.Layout(
    barmode='stack',
    title='CV vs. validation accuracy across trials',
    xaxis=dict(title='accuracy'),
    yaxis=dict(title='number of trials'),
)

py.iplot(go.Figure(data=[trace0, trace1], layout=layout))

best accuracy on cv set was 0.8283712784588442
acc on training set is 0.8476357267950964
accuracy on validation set was 0.7552447552447552
f1_score for validation set is 0.7154471544715447
precision score for validation set is 0.7857142857142857
recall score for validation set is 0.6567164179104478
best_params_ are {'min_weight_fraction_leaf': 0.015151515151515152, 'min_samples_split': 14, 'min_samples_leaf': 10, 'max_leaf_nodes': 97, 'max_depth': 7}
test 0 complete

best accuracy on cv set was 0.809106830122592
acc on training set is 0.8528896672504378
accuracy on validation set was 0.8181818181818182
f1_score for validation set is 0.7968749999999999
precision score for validation set is 0.8095238095238095
recall score for validation set is 0.7846153846153846
best_params_ are {'min_weight_fraction_leaf': 0.03333333333333333, 'min_samples_split': 2, 'min_samples_leaf': 7, 'max_leaf_nodes': 62, 'max_depth': 9}
test 1 complete

best accuracy on cv set was 0.8038528896672504
acc on traini

best accuracy on cv set was 0.7863397548161121
acc on training set is 0.8231173380035026
accuracy on validation set was 0.7902097902097902
f1_score for validation set is 0.7222222222222223
precision score for validation set is 0.75
recall score for validation set is 0.6964285714285714
best_params_ are {'min_weight_fraction_leaf': 0.012121212121212121, 'min_samples_split': 39, 'min_samples_leaf': 21, 'max_leaf_nodes': 50, 'max_depth': 11}
test 18 complete

best accuracy on cv set was 0.8161120840630472
acc on training set is 0.8546409807355516
accuracy on validation set was 0.7902097902097902
f1_score for validation set is 0.7500000000000001
precision score for validation set is 0.8181818181818182
recall score for validation set is 0.6923076923076923
best_params_ are {'min_weight_fraction_leaf': 0.030303030303030304, 'min_samples_split': 12, 'min_samples_leaf': 11, 'max_leaf_nodes': 66, 'max_depth': 18}
test 19 complete

average training accuracy was 0.8055166374781086
average validatio