In [1]:
import pandas as pd
import numpy as np

import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode so that py.iplot(...) renders figures
# inline; the positional True is passed as the function's first argument
# (presumably the `connected` flag -- confirm against the installed plotly version).
py.init_notebook_mode(True)

In [7]:
# Read the engineered feature table, discard the cabin-deck column and every row
# with a missing value, then keep only the rows where Sex == 1 and Pclass == 0.
df = (
    pd.read_csv('df_after_engineering.csv')
    .drop('TransformedCabinDeck', axis=1)
    .dropna()
    .loc[lambda frame: (frame['Sex'] == 1) & (frame['Pclass'] == 0)]
)
# Last expression of the cell: shows (rows, columns) of the filtered frame.
df.shape

(102, 20)

In [4]:
# Display the column labels of df.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Salutation', 'NumRelatives', 'HasFamily', 'LastNameOccurance',
       'TicketOccurance', 'IsBritish', 'BoardedAtSouthamption',
       'BoardedAtQueenstown', 'BoardedAtCherbourg', 'IsAlone', 'AgeLogFare',
       'YouthFamilyScore'],
      dtype='object')

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning category for the rest of the session. No category or
# module filter is given, so this applies to all libraries used below.
warnings.filterwarnings('ignore')

# Names of the KNN hyperparameters being tuned, plus a placeholder map from each
# name to an empty list (param_dict is rebound to the real search space below).
params = ['n_neighbors', 'p']
param_dict = dict((name, []) for name in params)

data = df.drop('PassengerId', axis=1)

# 5-fold stratified splitter; later cells reuse it.
skf = StratifiedKFold(n_splits=5)

# Feature matrix: remove the target and three engineered columns, then keep the
# eight columns listed here. Scaling is left disabled.
# Smaller alternative: ['NumRelatives', 'Fare', 'Age', 'IsBritish', 'BoardedAtSouthamption']
X = df.drop(['Survived', 'Salutation', 'IsAlone', 'YouthFamilyScore'], axis=1)[
    ['NumRelatives', 'Fare', 'Age', 'IsBritish', 'BoardedAtSouthamption',
     'AgeLogFare', 'SibSp', 'BoardedAtCherbourg']
]
#X.loc[:, 'IsBritish'] = 10 * df['IsBritish']
#X = StandardScaler().fit_transform(X)
y = df['Survived']

n_trials = 10

acc_list, param_list = [], []

# The search space is the same for every trial, so it is built once up front.
param_dict = {
    'n_neighbors': list(range(1, 4)),
    'p': [1, 2],
}

for trial in range(n_trials):

    # Each trial samples 2 candidate settings and scores them by accuracy on
    # the stratified folds.
    random_grid = RandomizedSearchCV(
        estimator=KNeighborsClassifier(),
        param_distributions=param_dict,
        scoring='accuracy',
        cv=skf.split(X, y),
        n_jobs=-1,
        n_iter=2,
    )
    random_grid.fit(X, y)

    round_acc = random_grid.best_score_
    round_params = random_grid.best_params_

    acc_list.append(round_acc)
    print('acc this round was', round_acc)
    param_list.append(round_params)
    print('best_params_ was', round_params)

print('average acc was', sum(acc_list) / n_trials)

acc this round was 0.7352941176470589
best_params_ was {'p': 1, 'n_neighbors': 1}
acc this round was 0.7156862745098039
best_params_ was {'p': 2, 'n_neighbors': 1}
acc this round was 0.7156862745098039
best_params_ was {'p': 1, 'n_neighbors': 2}
acc this round was 0.7058823529411765
best_params_ was {'p': 2, 'n_neighbors': 3}
acc this round was 0.7156862745098039
best_params_ was {'p': 1, 'n_neighbors': 2}
acc this round was 0.7156862745098039
best_params_ was {'p': 2, 'n_neighbors': 1}
acc this round was 0.7352941176470589
best_params_ was {'p': 1, 'n_neighbors': 1}
acc this round was 0.7156862745098039
best_params_ was {'p': 2, 'n_neighbors': 1}
acc this round was 0.7352941176470589
best_params_ was {'p': 1, 'n_neighbors': 1}
acc this round was 0.7352941176470589
best_params_ was {'p': 1, 'n_neighbors': 1}
average acc was 0.7225490196078432


In [37]:
# Pick the winning KNN trial: walk the trials in order and keep the first one
# whose accuracy is strictly higher than everything seen before it.

best_score, knn_best_params = 0, None
for trial_acc, trial_params in zip(acc_list, param_list):
    if trial_acc > best_score:
        best_score, knn_best_params = trial_acc, trial_params

print('top score was', best_score)

top score was 0.7352941176470589


In [38]:
# For every tuned hyperparameter, scatter the value chosen in each trial against
# that trial's best cross-validated accuracy.
for param in random_grid.best_params_:
    print(param)

    # Cell-local names on purpose: binding `x` / `y` here would overwrite the
    # notebook-level target vector `y` with a list of accuracies.
    chosen_values = [trial_params[param] for trial_params in param_list]
    trial_scores = [acc_list[k] for k in range(len(param_list))]

    figure = go.Figure(
        data=[go.Scatter(x=chosen_values, y=trial_scores, mode='markers')],
        layout=go.Layout(
            title=param,
            xaxis=dict(title=param),
            yaxis=dict(title='best CV accuracy of the trial'),
        ),
    )
    py.iplot(figure)

p


n_neighbors


In [41]:
# Now let's do the same for a hyperparameter-tuned SVM classifier; it looked like it
# might be able to give good results.

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Feature matrix and target for the SVC search: remove the target and three
# engineered columns, then keep the eight listed columns. Scaling is left disabled.
X = df.drop(['Survived', 'Salutation', 'IsAlone', 'YouthFamilyScore'], axis=1)

X = X[['NumRelatives', 'Fare', 'Age', 'IsBritish', 'BoardedAtSouthamption', 'AgeLogFare', 'SibSp', 'BoardedAtCherbourg']]
#X = StandardScaler().fit_transform(X)

y = df['Survived']
# usually svc performs better with higher dimensionality data

# use same skf from before

n_trials = 30
param_acc_list = []

# The search space is identical for every trial, so it is built once.
#   C     : 100 log-spaced values from 10**0 to 10**1.
#   gamma : the 100-point even grid on [0, 1/n_features] WITHOUT its first point.
#           gamma == 0 is not a usable RBF width: exp(-0 * d**2) is 1 for every
#           pair of samples, and some scikit-learn versions reject it outright
#           (TODO confirm for the installed version). With warnings silenced a
#           failed candidate could otherwise slip into the results unnoticed.
param_dict = {
    'C': np.logspace(0, 1, num=100, base=10.),
    'kernel': ['rbf'],
    'gamma': np.linspace(0, 1/X.shape[1], num=100)[1:],
}

for t in range(n_trials):

    # Each trial samples 2 candidate settings and scores them by accuracy on
    # the stratified folds from `skf`.
    random_grid = RandomizedSearchCV(
        estimator=SVC(),
        param_distributions = param_dict,
        scoring='accuracy',
        cv = skf.split(X, y),
        n_jobs=-1,
        n_iter=2
    )

    random_grid.fit(X, y)

    print(random_grid.best_score_, 'was the best score')
    print(random_grid.best_params_, 'were the params')
    param_acc_list.append((random_grid.best_params_, random_grid.best_score_))

# Ascending by score, so the best (params, score) pair ends up last.
param_acc_list = sorted(param_acc_list, key = lambda tup: tup[-1])

0.7352941176470589 was the best score
{'kernel': 'rbf', 'gamma': 0.10101010101010102, 'C': 4.641588833612779} were the params
0.7450980392156863 was the best score
{'kernel': 'rbf', 'gamma': 0.06060606060606061, 'C': 1.2328467394420661} were the params
0.7156862745098039 was the best score
{'kernel': 'rbf', 'gamma': 0.011363636363636364, 'C': 9.326033468832199} were the params
0.7254901960784313 was the best score
{'kernel': 'rbf', 'gamma': 0.09974747474747475, 'C': 1.2045035402587823} were the params
0.7352941176470589 was the best score
{'kernel': 'rbf', 'gamma': 0.0744949494949495, 'C': 3.0538555088334154} were the params
0.7058823529411765 was the best score
{'kernel': 'rbf', 'gamma': 0.125, 'C': 3.274549162877729} were the params
0.7156862745098039 was the best score
{'kernel': 'rbf', 'gamma': 0.06565656565656566, 'C': 1.1497569953977358} were the params
0.7450980392156863 was the best score
{'kernel': 'rbf', 'gamma': 0.09217171717171718, 'C': 1.5922827933410924} were the params
0

In [65]:
# param_acc_list is sorted ascending by score, so its last entry is the best trial.
svc_best_params, best_score = param_acc_list[-1]
print('best score was', best_score)
print('params that go with that is', svc_best_params)

# Mean of the per-trial best scores.
print(
    'average score was',
    sum([trial_score for _trial_params, trial_score in param_acc_list]) / len(param_acc_list),
)

best score was 0.7549019607843137
params that go with that is {'kernel': 'rbf', 'gamma': 0.02777777777777778, 'C': 3.1992671377973836}
average score was 0.7333333333333333


In [61]:
# Let's graph the outputs of both models, each using its best parameters, and take a look;
# we can then try a couple of model-stacking methods and see.

import plotly.offline as py
import plotly.graph_objs as go
# Re-initialise plotly's offline notebook mode (same call as in the first cell)
# so py.iplot(...) below renders inline.
py.init_notebook_mode(True)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Feature matrix: remove the target and three engineered columns, then keep the
# eight listed columns. Scaling is left disabled.
X = df.drop(['Survived', 'Salutation', 'IsAlone', 'YouthFamilyScore'], axis=1)

X = X[['NumRelatives', 'Fare', 'Age', 'IsBritish', 'BoardedAtSouthamption', 'AgeLogFare', 'SibSp', 'BoardedAtCherbourg']]
#X = StandardScaler().fit_transform(X)

y = list(df['Survived'])

# Build both models from the best parameters found by the searches. KNN is
# instantiated as a regressor here, so its prediction is a numeric value rather
# than a class label.
svc = SVC(**svc_best_params)
knn = KNeighborsRegressor(**knn_best_params)

# Shuffled 50/50 split: row i of X_test corresponds to y_test[i], NOT to y[i].
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, shuffle=True)

svc.fit(X_train, y_train)
knn.fit(X_train, y_train)

svc_output = svc.predict(X_test)
knn_output = knn.predict(X_test)

# Ground truth for the plotted points must be the held-out labels; indexing the
# full, unshuffled `y` would colour each point with an unrelated passenger's label.
actual = y_test

alive = []
dead = []

for svc_pred, knn_pred, label in zip(svc_output, knn_output, actual):
    # Small random jitter so identical predictions do not sit on top of each other.
    tup = (svc_pred + np.random.rand()/5, knn_pred + np.random.rand()/5)
    if label == 1:
        alive.append(tup)
    else:
        dead.append(tup)

alive_trace = go.Scatter(x = [tup[0] for tup in alive], y = [tup[1] for tup in alive], mode='markers', name='alive')
dead_trace = go.Scatter(x = [tup[0] for tup in dead], y = [tup[1] for tup in dead], mode='markers', name='dead')

py.iplot(go.Figure(
    data=[alive_trace, dead_trace],
    layout=go.Layout(
        xaxis=dict(title='SVC prediction (jittered)'),
        yaxis=dict(title='KNN prediction (jittered)'),
    ),
))

In [None]:
# the models seem to predict the same output, but a 10% increase using either is pretty good...