In [2]:
import pandas as pd
import numpy as np
import pickle as pkl

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(True)

from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:
df = pd.read_csv('df_after_engineering.csv')
df = df.drop('TransformedCabinDeck', axis=1)

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

data = df.drop('PassengerId', axis=1)
data = data.dropna()
#data = data.where(np.logical_and(data['Sex'] == 1, data['Pclass'] == 0)).dropna()
data = data.drop(['IsAlone', 'NumRelatives'], axis=1)
print(data.columns)

steps=1
param_dict = {
        'C' : np.linspace(0.5, 20, steps),
        'kernel' : ['rbf'],
        'gamma' : np.linspace(0.01, 0.1, steps),
        'coef0' : np.linspace(1, 10, steps),
    }

param_opt_dict = {param: [] for param in param_dict.keys()}

# lets see if we can identify a subset where the model does very well
#data = data.where(data['Sex'] == 1).dropna()
#data = data.where(data['Pclass'] == 0).dropna()
print(data.shape)
X_total = data.drop('Survived', axis=1)
X_total = StandardScaler().fit_transform(X_total)
y_total = data['Survived']

# lets keep a validation set as a sanity check
training_acc_list = []
valid_acc_list = []
n_trials = 30

param_opt_list = []

feature_importances_list = []

best_params = None
best_score = 0

for i in range(n_trials):  # should be doing hypothesis testing maybe, or maybe that's a bad idea
    X, X_validation, y, y_validation = train_test_split(X_total, y_total, test_size=0.2, shuffle=True)

    skf = StratifiedKFold(n_splits=5)
    #skf = KFold(n_splits=5)

    steps = 80


    param_dict = {
        'C' : np.linspace(0.5, 20, steps),
        'kernel' : ['rbf'],
        'gamma' : np.linspace(0.01, 0.04, steps),
        'coef0' : np.linspace(0, 10, steps),
    }

    random_grid = RandomizedSearchCV(
        estimator=SVC(),
        param_distributions=param_dict,
        scoring='accuracy',
        cv=skf.split(X, y),
        n_jobs=-1,
        n_iter=10,
        verbose=0
    )

    random_grid.fit(X, y)

    #print('results from test', i+1)
    print('best accuracy on cv set was', random_grid.best_score_)
    training_acc_list.append(random_grid.best_score_)
       
    svc = SVC(**random_grid.best_params_)
    svc.fit(X, y)
    pred = svc.predict(X_validation)

    # check for overfitting (?)
    overfit_pred = svc.predict(X)
    print('acc on training set is', accuracy_score(y, overfit_pred))
    
    print('accuracy on validation set was', accuracy_score(y_validation, pred))
    valid_acc_list.append(accuracy_score(y_validation, pred))
    
    if accuracy_score(y_validation, pred) > best_score:
        best_score = accuracy_score(y_validation, pred)
        best_params = random_grid.best_params_
    
    print('f1_score for validation set is', f1_score(y_validation, pred))
    print('precision score for validation set is', precision_score(y_validation, pred))
    # what fraction of our selected items actually survived?
    print('recall score for validation set is', recall_score(y_validation, pred))
    # fraction of how many of the correctly predicted surviving / all of the actual surviving
    print('best_params_ are', random_grid.best_params_)
    print('test', i, 'complete')
    print()
    
    for param in param_dict:
        param_opt_dict[param].append((random_grid.best_params_[param], valid_acc_list[-1]))
    
print('average training accuracy was', sum(training_acc_list) / n_trials)
print('average validation accuracy was', sum(valid_acc_list) / n_trials)

trace0 = go.Histogram(x=training_acc_list, opacity=0.5, name='train')
trace1 = go.Histogram(x=valid_acc_list, opacity=0.5, name='valid')

layout = go.Layout(barmode='stack')

py.iplot(go.Figure(data=[trace0, trace1], layout=layout))

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Salutation', 'HasFamily', 'LastNameOccurance', 'TicketOccurance',
       'IsBritish', 'BoardedAtSouthamption', 'BoardedAtQueenstown',
       'BoardedAtCherbourg', 'AgeLogFare', 'YouthFamilyScore'],
      dtype='object')
(714, 17)
best accuracy on cv set was 0.8318739054290718
acc on training set is 0.840630472854641
accuracy on validation set was 0.7972027972027972
f1_score for validation set is 0.7387387387387387
precision score for validation set is 0.7735849056603774
recall score for validation set is 0.7068965517241379
best_params_ are {'kernel': 'rbf', 'gamma': 0.022531645569620253, 'coef0': 5.189873417721519, 'C': 2.227848101265823}
test 0 complete

best accuracy on cv set was 0.8073555166374781
acc on training set is 0.8546409807355516
accuracy on validation set was 0.8461538461538461
f1_score for validation set is 0.7843137254901962
precision score for validation set is 0.8163265306122449
recall score 

best accuracy on cv set was 0.8161120840630472
acc on training set is 0.851138353765324
accuracy on validation set was 0.8461538461538461
f1_score for validation set is 0.7962962962962963
precision score for validation set is 0.9148936170212766
recall score for validation set is 0.7049180327868853
best_params_ are {'kernel': 'rbf', 'gamma': 0.014556962025316457, 'coef0': 4.4303797468354436, 'C': 8.645569620253164}
test 19 complete

best accuracy on cv set was 0.8231173380035026
acc on training set is 0.861646234676007
accuracy on validation set was 0.8111888111888111
f1_score for validation set is 0.7610619469026548
precision score for validation set is 0.8113207547169812
recall score for validation set is 0.7166666666666667
best_params_ are {'kernel': 'rbf', 'gamma': 0.021772151898734177, 'coef0': 1.139240506329114, 'C': 6.670886075949367}
test 20 complete

best accuracy on cv set was 0.8371278458844134
acc on training set is 0.8493870402802102
accuracy on validation set was 0.7972027

In [9]:
save_best_params = True

if save_best_params:
    pkl.dump(best_params, open('svc_best_params.pkl', 'wb'))

In [24]:
for param in param_opt_dict:
    x = []
    y = []
    param_opt_list = param_opt_dict[param]
    for (xs, ys) in param_opt_list:
        x.append(xs)
        y.append(ys)

    print(param)

    py.iplot([go.Scatter(x=x, y=y, mode='markers')])

C


kernel


gamma


coef0
