In [2]:
from xgboost import XGBRegressor
from sklearn.svm import SVC

import pandas as pd
import numpy as np
import pickle as pkl

import plotly.offline as py
import plotly.graph_objs as go
# Enable plotly's offline notebook mode so that later py.iplot() calls render inline.
# NOTE(review): the positional True is presumably the `connected` flag (load plotly.js
# from a CDN instead of embedding it) -- confirm against the installed plotly version.
py.init_notebook_mode(True)

In [58]:
# Read the engineered feature table and discard the 'TransformedCabinDeck' column
# in a single chained expression.
df = pd.read_csv('df_after_engineering.csv').drop('TransformedCabinDeck', axis='columns')

In [59]:
# Load the previously tuned hyper-parameters for both base models.
# Context managers make sure the file handles are closed once the pickles are read.
# NOTE(review): pickle.load can execute arbitrary code -- only load .pkl files that
# were produced by a trusted run of this project.
with open('svc_best_params.pkl', 'rb') as params_file:
    svc_best_params = pkl.load(params_file)
with open('xgb_best_params.pkl', 'rb') as params_file:
    xgb_best_params = pkl.load(params_file)

print(svc_best_params)
print()
print(xgb_best_params)

# Build the two base models from the tuned settings.
# NOTE(review): the printed xgboost params carry objective='binary:logistic', so the
# XGBRegressor presumably emits a probability-like score rather than a hard label
# (the plot below treats its prediction as continuous) -- confirm this is intended.
svc = SVC(**svc_best_params)
xgb = XGBRegressor(**xgb_best_params)

{'kernel': 'rbf', 'gamma': 0.014556962025316457, 'coef0': 1.7721518987341773, 'C': 10.126582278481013}

{'scale_pos_weight': 1.0575949367088608, 'reg_lambda': 5.189873417721519, 'reg_alpha': 9.498734177215189, 'max_depth': 15, 'learning_rate': 0.11012658227848102, 'gamma': 1.2620253164556963, 'objective': 'binary:logistic'}


In [113]:
# Let's look at what both base models have to say and visualize it; maybe the right
# way to combine them will be obvious.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import RBF  # unused in this cell; kept in case other cells rely on it

SEED = 0  # fixed seed so the split, the jitter and therefore the plot are reproducible

data = df.dropna()
print(data.shape)
data = data.drop(['IsAlone', 'NumRelatives'], axis=1)

X = data.drop('Survived', axis=1)
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SEED
)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
# Scaling the full table before splitting would leak test-set statistics into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

xgb.fit(X_train, y_train)
svc.fit(X_train, y_train)

xgb_predict = xgb.predict(X_test)
svc_predict = svc.predict(X_test)

# Add a little uniform jitter (at most 0.02) to both coordinates so that points with
# identical predictions do not sit exactly on top of each other.
rng = np.random.RandomState(SEED)
n_test = len(xgb_predict)
xgb_jittered = xgb_predict + rng.rand(n_test) / 50
svc_jittered = svc_predict + rng.rand(n_test) / 50

# Positional boolean mask over the test rows: True where the passenger survived.
y_test = np.asarray(y_test)
survived = y_test == 1

alive_trace = go.Scatter(x=xgb_jittered[survived], y=svc_jittered[survived], mode='markers', name='alive')
dead_trace = go.Scatter(x=xgb_jittered[~survived], y=svc_jittered[~survived], mode='markers', name='dead')

py.iplot({
    'data': [alive_trace, dead_trace],
    'layout': {
        'xaxis': {'title': 'xgboost prediction (jittered)'},
        'yaxis': {'title': 'SVC prediction (jittered)'},
    },
})
# knn? A knn meta-model should be able to clean up any obviously wrong points, but I'm
# worried about the center cluster.
# An alternative _might_ be knn classification trees, which would account for that.

(714, 20)


If I exclude most of the middle region, where xgboost is uncertain, I should achieve high accuracy on the remaining points.

I think the middle region is the poor women that xgboost was struggling to predict. I need some way of creating a feature that is tuned to do well on the poor women only, and then either add it as a feature to this meta-model or create some kind of decision-tree-like structure for that group.

In [115]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split

skf = StratifiedKFold(n_splits=5)

# NOTE(review): the previous cell standardizes the features before fitting the base
# models, whereas this cell feeds them unscaled -- confirm which one the tuned SVC
# hyper-parameters were selected for.
X = data.drop('Survived', axis=1)
y = data['Survived']

# In actual use, the base models are fit on the given data and predict on the test data.

n_trials = 30
param_list = []
acc_list = []

# Search space of the logistic-regression meta-learner. It does not change between
# trials, so it is built once instead of on every iteration.
# Alternative meta-learner to try: KNeighborsClassifier with n_neighbors in 1..9 and
# p in {1, 2}.
param_dict = {
    'penalty': ['l1', 'l2'],
    'C': np.linspace(0.1, 2, 100),
}

for trial in range(n_trials):

    # A different but reproducible split for every trial.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=trial
    )

    # Level 0: fit both base models on the training split and predict the held-out rows.
    xgb.fit(X_train, y_train)
    svc.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)
    svc_pred = svc.predict(X_test)

    # Level 1 inputs: one column per base model. Stored under its own name so the
    # held-out feature matrix X_test is not silently replaced.
    X_meta = np.c_[xgb_pred, svc_pred]

    random_grid = RandomizedSearchCV(
        # liblinear handles both the 'l1' and the 'l2' penalty; the default solver of
        # recent scikit-learn releases (lbfgs) rejects 'l1', which would make every
        # sampled l1 candidate fail.
        estimator=LogisticRegression(solver='liblinear'),
        param_distributions=param_dict,
        scoring='accuracy',
        # Pass the splitter itself; the search stratifies on y_test with the same
        # five folds a pre-computed skf.split(...) generator would have produced.
        cv=skf,
        n_iter=10,
        random_state=trial,
    )

    # The meta-learner is cross-validated on the held-out predictions only.
    random_grid.fit(X_meta, y_test)

    print('best score was', random_grid.best_score_)
    acc_list.append(random_grid.best_score_)
    print('associated params were', random_grid.best_params_)
    param_list.append(random_grid.best_params_)

print('average acc was', sum(acc_list) / len(acc_list))

best score was 0.8041958041958042
associated params were {'penalty': 'l2', 'C': 0.598989898989899}
best score was 0.8531468531468531
associated params were {'penalty': 'l2', 'C': 2.0}
best score was 0.8181818181818182
associated params were {'penalty': 'l2', 'C': 0.7717171717171717}
best score was 0.8321678321678322
associated params were {'penalty': 'l1', 'C': 0.23434343434343433}
best score was 0.8251748251748252
associated params were {'penalty': 'l1', 'C': 1.6737373737373737}
best score was 0.8181818181818182
associated params were {'penalty': 'l1', 'C': 1.002020202020202}
best score was 0.8181818181818182
associated params were {'penalty': 'l2', 'C': 1.385858585858586}
best score was 0.8461538461538461
associated params were {'penalty': 'l2', 'C': 0.81010101010101}
best score was 0.8461538461538461
associated params were {'penalty': 'l2', 'C': 1.4626262626262627}
best score was 0.7902097902097902
associated params were {'penalty': 'l1', 'C': 0.503030303030303}
best score was 0.846

In [116]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(True)

# One scatter plot per searched hyper-parameter: the value chosen in each trial on
# the x axis against the accuracy reached in that trial on the y axis.
for param in param_list[0]:

    print(param)
    chosen_values = [trial_params[param] for trial_params in param_list]
    accuracies = list(acc_list)

    scatter = go.Scatter(x=chosen_values, y=accuracies, mode='markers')
    py.iplot([scatter])
    

penalty


C
