In [1]:
import pandas as pd
import numpy as np
import pickle as pkl

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(True)

from xgboost import XGBClassifier, XGBRegressor
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
df = pd.read_csv('df_after_engineering.csv')
df.shape

(891, 21)

In [7]:
import warnings
warnings.filterwarnings('ignore')

data = df.drop('PassengerId', axis=1)
#data = data.dropna(subset=['TransformedCabinDeck'])
#data = data.drop('TransformedCabinDeck', axis=1)
#display(data)

## TESTING
# data = data.where(data['Sex'] == 0).dropna(subset=['Sex'])
# data = data.where(data['Pclass'] == 0).dropna(subset=['Pclass'])
print(data.shape)

X_total = data.drop('Survived', axis=1)
# seems like gb trees are extremely robust to poor features so I won't bother dropping anything
y_total = data['Survived']

# lets keep a validation set as a sanity check
training_acc_list = []
valid_acc_list = []

poor_women_acc_list = []
middle_women_acc_list = []
rich_women_acc_list = []

men_acc_list = []
n_trials = 30 

steps=1
param_dict = {
    'learning_rate' : np.linspace(0.1, 0.75, steps),
    'gamma' : np.linspace(1, 4.5, steps),
    'max_depth' : [x for x in range(8, 20)],
    'reg_lambda' : np.linspace(2.2, 10, steps),
    'reg_alpha' : np.linspace(7, 10, steps),
    'scale_pos_weight' : np.linspace(1, 1.4, steps)
}

param_opt_dict = {param: [] for param in param_dict.keys()}

feature_importances_list = []

def get_nan_cabin_valid(X, y, pred):
    y = list(y)
    pred = list(pred)
    pred_true = []
    y_true = []
    i = 0
    for row in X.iterrows():
        if pd.isnull(row[1]['TransformedCabinDeck']):
            pred_true.append(pred[i])
            y_true.append(y[i])
        i += 1
    return accuracy_score(y_true, pred_true)

def get_sex_acc(X, y, pred, sex, class_list):
    y = list(y)
    pred = list(pred)
    y_women = []
    pred_women = []
    i = 0
    for row in X.iterrows():
        if row[1]['Sex'] == sex and row[1]['Pclass'] in class_list:
            y_women.append(y[i])
            pred_women.append(pred[i])
        i += 1
    return accuracy_score(y_women, pred_women)

best_score = 0
best_params = None

for i in range(n_trials):  # should be doing hypothesis testing maybe, or maybe that's a bad idea
    X, X_validation, y, y_validation = train_test_split(X_total, y_total, test_size=0.2, shuffle=True)

    skf = StratifiedKFold(n_splits=10)
    #skf = KFold(n_splits=5)

    steps = 80

    param_dict = {
        'learning_rate' : np.linspace(0.1, 0.9, steps),
        'gamma' : np.linspace(0.1, 3.5, steps),
        'max_depth' : [x for x in range(4, 20)],
        'reg_lambda' : np.linspace(.5, 10, steps),
        'reg_alpha' : np.linspace(0.1, 10, steps),
        'scale_pos_weight' : np.linspace(1, 1.35, steps)
    }

    random_grid = RandomizedSearchCV(
        estimator=XGBClassifier(),
        param_distributions=param_dict,
        scoring='accuracy',
        cv=skf.split(X, y),
        n_jobs=-1,
        n_iter=20,
        verbose=0
    )

    random_grid.fit(X, y)


    #print('results from test', i+1)
    print('best accuracy on cv set was', random_grid.best_score_)
    training_acc_list.append(random_grid.best_score_)

    params = random_grid.best_params_
    params['objective'] = 'binary:logistic'
    xgb_class = XGBRegressor(**params)
    fit = xgb_class.fit(X, y)
    pred = xgb_class.predict(X_validation).round()
    
    
    if not np.isnan(fit.feature_importances_).any():
        feature_importances_list.append(fit.feature_importances_)
    else:
        feature_importances_list.append(None)

    # check for overfitting (?)
    overfit_pred = xgb_class.predict(X_total).round()
    print('acc on training set is', accuracy_score(y_total, overfit_pred))
    # overfitting slightly
    
    print('accuracy on validation set was', accuracy_score(y_validation, pred))
    valid_acc_list.append(accuracy_score(y_validation, pred))
    
    if accuracy_score(y_validation, pred) > best_score:
        best_score = accuracy_score(y_validation, pred)
        best_params = random_grid.best_params_
    
    #print(get_nan_cabin_valid(X_validation, y_validation, pred))
    poor_women_acc_list.append(get_sex_acc(X_validation, y_validation, pred, 1, [0]))
    print('poor_women_acc is', poor_women_acc_list[-1])
    middle_women_acc_list.append(get_sex_acc(X_validation, y_validation, pred, 1, [1]))
    print('middle_women_acc is', middle_women_acc_list[-1])
    rich_women_acc_list.append(get_sex_acc(X_validation, y_validation, pred, 1, [2]))
    print('rich_women_acc is', rich_women_acc_list[-1])
    
    men_acc_list.append(get_sex_acc(X_validation, y_validation, pred, 0, [0, 10, 100]))
    
    print('f1_score for validation set is', f1_score(y_validation, pred))
    print('precision score for validation set is', precision_score(y_validation, pred))
    # what fraction of our selected items actually survived?
    print('recall score for validation set is', recall_score(y_validation, pred))
    #print(pred)
    # fraction of how many of the correctly predicted surviving / all of the actual surviving
    
    for param in random_grid.best_params_.keys():
        if param != 'objective':
            param_opt_dict[param].append((random_grid.best_params_[param], training_acc_list[-1]))

    print('best_params_ are', random_grid.best_params_)
    print('test', i, 'complete')
    print()
    
    
    
print('average training accuracy was', sum(training_acc_list) / n_trials)
# print('average validation accuracy was', sum(valid_acc_list) / n_trials)
# print('average men accuracy was', sum(men_acc_list) / n_trials)
# print('average poor women accuracy was', sum(poor_women_acc_list) / n_trials)
# print('average middle women accuracy was', sum(middle_women_acc_list) / n_trials)
# print('average rich women accuracy was', sum(rich_women_acc_list) / n_trials)

trace0 = go.Histogram(x=training_acc_list, opacity=0.5, name='train')
trace1 = go.Histogram(x=valid_acc_list, opacity=0.5, name='valid')

layout = go.Layout(barmode='stack')

py.iplot(go.Figure(data=[trace0, trace1], layout=layout))


# seems like I'm getting around 83% validation accuracy now...
# this is good enough I'll take it

(891, 20)
best accuracy on cv set was 0.8426966292134831
acc on training set is 0.8742985409652076
accuracy on validation set was 0.8044692737430168
poor_women_acc is 0.6774193548387096
middle_women_acc is 0.875
rich_women_acc is 1.0
f1_score for validation set is 0.7517730496453902
precision score for validation set is 0.7910447761194029
recall score for validation set is 0.7162162162162162
best_params_ are {'scale_pos_weight': 1.3101265822784811, 'reg_lambda': 7.3544303797468356, 'reg_alpha': 1.979746835443038, 'max_depth': 8, 'learning_rate': 0.1607594936708861, 'gamma': 0.7025316455696202, 'objective': 'binary:logistic'}
test 0 complete

best accuracy on cv set was 0.8384831460674157
acc on training set is 0.8462401795735129
accuracy on validation set was 0.776536312849162
poor_women_acc is 0.5
middle_women_acc is 0.9333333333333333
rich_women_acc is 1.0
f1_score for validation set is 0.7058823529411764
precision score for validation set is 0.7741935483870968
recall score for valid

best accuracy on cv set was 0.8497191011235955
acc on training set is 0.8518518518518519
accuracy on validation set was 0.7653631284916201
poor_women_acc is 0.59375
middle_women_acc is 0.75
rich_women_acc is 0.875
f1_score for validation set is 0.65
precision score for validation set is 0.7090909090909091
recall score for validation set is 0.6
best_params_ are {'scale_pos_weight': 1.2037974683544304, 'reg_lambda': 2.0632911392405067, 'reg_alpha': 7.243037974683544, 'max_depth': 11, 'learning_rate': 0.839240506329114, 'gamma': 1.089873417721519, 'objective': 'binary:logistic'}
test 13 complete

best accuracy on cv set was 0.8258426966292135
acc on training set is 0.8294051627384961
accuracy on validation set was 0.8324022346368715
poor_women_acc is 0.6071428571428571
middle_women_acc is 0.875
rich_women_acc is 0.9230769230769231
f1_score for validation set is 0.7413793103448276
precision score for validation set is 0.7543859649122807
recall score for validation set is 0.7288135593220338

best accuracy on cv set was 0.8356741573033708
acc on training set is 0.8507295173961841
accuracy on validation set was 0.8324022346368715
poor_women_acc is 0.6333333333333333
middle_women_acc is 0.8571428571428571
rich_women_acc is 1.0
f1_score for validation set is 0.7794117647058824
precision score for validation set is 0.7571428571428571
recall score for validation set is 0.803030303030303
best_params_ are {'scale_pos_weight': 1.2082278481012658, 'reg_lambda': 1.9430379746835444, 'reg_alpha': 3.984810126582279, 'max_depth': 6, 'learning_rate': 0.20126582278481014, 'gamma': 0.7455696202531645, 'objective': 'binary:logistic'}
test 26 complete

best accuracy on cv set was 0.8342696629213483
acc on training set is 0.8383838383838383
accuracy on validation set was 0.8044692737430168
poor_women_acc is 0.6451612903225806
middle_women_acc is 0.9333333333333333
rich_women_acc is 0.9444444444444444
f1_score for validation set is 0.7517730496453902
precision score for validation set is 0.7361

NameError: name 'ignore_below_65' is not defined

In [4]:
pkl_params = True
if pkl_params:
    pkl.dump(best_params, open('xgb_best_params.pkl', 'wb'))

In [13]:
for param in param_opt_dict:
    x = []
    y = []
    param_opt_list = param_opt_dict[param]
    for (xs, ys) in param_opt_list:
        x.append(xs)
        y.append(ys)

    print(param)

    py.iplot([go.Scatter(x=x, y=y, mode='markers')])

learning_rate


gamma


max_depth


reg_lambda


reg_alpha


scale_pos_weight


In [8]:
cols = [col for col in X_total.columns]

for i in range(len(cols)):
    print(cols[i], ':\t\t\t', round(sum([row[i] for row in feature_importances_list]) * 100 / n_trials, 3))

Pclass :			 12.411
Sex :			 7.184
Age :			 8.095
SibSp :			 0.603
Parch :			 0.145
Fare :			 14.27
Salutation :			 10.197
NumRelatives :			 6.341
HasFamily :			 0.0
LastNameOccurance :			 3.057
TicketOccurance :			 6.132
IsBritish :			 0.537
TransformedCabinDeck :			 10.555
BoardedAtSouthamption :			 1.636
BoardedAtQueenstown :			 0.743
BoardedAtCherbourg :			 2.092
IsAlone :			 0.0
AgeLogFare :			 12.879
YouthFamilyScore :			 3.122
