In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import PIL

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

%matplotlib inline

## Data preprocessing

In [2]:
train_df = pd.read_csv('./train.csv')
train_df = train_df.drop('PassengerId', 1)
train_df = train_df.drop('Name', 1)
train_df = train_df.drop('Ticket', 1)
train_df = train_df.drop('Cabin', 1)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
train_df['IsAlone'] = 1 #initialize to yes/1 is alone
train_df['IsAlone'].loc[train_df['FamilySize'] > 1] = 0

encode_features = ['Sex', 'Embarked']
for enc in encode_features:
    one_hot = pd.get_dummies(train_df[enc], prefix=enc)
    train_df = train_df.drop(enc,axis = 1)
    train_df = train_df.join(one_hot)
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,2,0,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,2,0,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,1,1,0,0,0,1
3,1,1,35.0,1,0,53.1,2,0,1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1,0,0,1


In [4]:
train_df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,1.904602,0.602694,0.352413,0.647587,0.188552,0.08642,0.722783
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,1.613459,0.489615,0.47799,0.47799,0.391372,0.281141,0.447876
min,0.0,1.0,0.42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,1.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,11.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
real_features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'SibSp', 'Parch']
cat_features = list(set(train_df.columns.values.tolist()) - set(real_features) )
cat_features.remove('Survived')
print (cat_features)

['IsAlone', 'Embarked_C', 'Embarked_Q', 'Sex_female', 'Embarked_S', 'Sex_male']


In [6]:
print (train_df.shape)

(891, 13)


In [7]:
train_df['Age'] = train_df['Age'].fillna(0)
##m = train_df['Age'].median()
#train_df['Age'] = train_df['Age'].fillna(m)

In [8]:
y = train_df['Survived'].to_numpy()
train_df = train_df.drop('Survived', 1)

X_real = train_df[real_features].to_numpy()
X_cat = train_df[cat_features].to_numpy()

print ("X_real: {} ".format(X_real[0]))
print ("X_cat: {} ".format(X_cat[0]))
print ("y: {} ".format(y[0]))

X_real: [ 3.   22.    7.25  2.    1.    0.  ] 
X_cat: [0 0 0 0 1 1] 
y: 0 


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(X_real)

In [10]:
#selected_features = (0, 1, 2, 5, 7, 9, 10, 12, 14, 16, 17, 18, 19, 21, 22)

In [11]:
X = np.hstack((X_real, X_cat))
#X = X[:,selected_features]

print (X.shape, y.shape)

(891, 12) (891,)


In [12]:
X.shape[1]

12

In [13]:
feat_labels = train_df.columns
feat_labels

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

## Modeling

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier  

## Random Forest feature selection

In [16]:
rft = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, verbose=3)
rft.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    0.2s


building tree 1 of 1000building tree 2 of 1000
building tree 3 of 1000

building tree 4 of 1000
building tree 5 of 1000
building tree 6 of 1000building tree 7 of 1000

building tree 8 of 1000
building tree 9 of 1000
building tree 10 of 1000
building tree 11 of 1000
building tree 12 of 1000building tree 13 of 1000
building tree 14 of 1000

building tree 15 of 1000
building tree 16 of 1000
building tree 17 of 1000
building tree 18 of 1000
building tree 19 of 1000building tree 20 of 1000

building tree 21 of 1000building tree 22 of 1000

building tree 23 of 1000
building tree 24 of 1000
building tree 25 of 1000
building tree 26 of 1000
building tree 27 of 1000building tree 28 of 1000

building tree 29 of 1000
building tree 30 of 1000building tree 31 of 1000

building tree 32 of 1000
building tree 33 of 1000
building tree 34 of 1000
building tree 35 of 1000building tree 36 of 1000building tree 37 of 1000


building tree 38 of 1000
building tree 39 of 1000building tree 40 of 1000

building 

building tree 331 of 1000
building tree 332 of 1000building tree 333 of 1000

building tree 334 of 1000
building tree 335 of 1000
building tree 336 of 1000
building tree 337 of 1000building tree 338 of 1000

building tree 339 of 1000building tree 340 of 1000

building tree 341 of 1000
building tree 342 of 1000
building tree 343 of 1000
building tree 344 of 1000
building tree 345 of 1000
building tree 346 of 1000
building tree 347 of 1000building tree 348 of 1000

building tree 349 of 1000
building tree 350 of 1000
building tree 351 of 1000building tree 352 of 1000

building tree 353 of 1000
building tree 354 of 1000
building tree 355 of 1000
building tree 356 of 1000
building tree 357 of 1000
building tree 358 of 1000building tree 359 of 1000building tree 360 of 1000building tree 361 of 1000



building tree 362 of 1000
building tree 363 of 1000
building tree 364 of 1000building tree 365 of 1000
building tree 366 of 1000
building tree 367 of 1000

building tree 368 of 1000
building tre

[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.5s


building tree 659 of 1000
building tree 660 of 1000
building tree 661 of 1000
building tree 662 of 1000
building tree 663 of 1000
building tree 664 of 1000
building tree 665 of 1000
building tree 666 of 1000
building tree 667 of 1000
building tree 668 of 1000building tree 669 of 1000

building tree 670 of 1000
building tree 671 of 1000
building tree 672 of 1000
building tree 673 of 1000
building tree 674 of 1000
building tree 675 of 1000
building tree 676 of 1000building tree 677 of 1000
building tree 678 of 1000
building tree 679 of 1000building tree 680 of 1000


building tree 681 of 1000building tree 682 of 1000

building tree 683 of 1000
building tree 684 of 1000
building tree 685 of 1000
building tree 686 of 1000building tree 687 of 1000building tree 688 of 1000


building tree 689 of 1000
building tree 690 of 1000
building tree 691 of 1000
building tree 692 of 1000
building tree 693 of 1000
building tree 694 of 1000
building tree 695 of 1000
building tree 696 of 1000
building tre

building tree 976 of 1000
building tree 977 of 1000building tree 978 of 1000

building tree 979 of 1000
building tree 980 of 1000
building tree 981 of 1000
building tree 982 of 1000
building tree 983 of 1000
building tree 984 of 1000building tree 985 of 1000

building tree 986 of 1000
building tree 987 of 1000
building tree 988 of 1000
building tree 989 of 1000
building tree 990 of 1000
building tree 991 of 1000
building tree 992 of 1000building tree 993 of 1000

building tree 994 of 1000
building tree 995 of 1000building tree 996 of 1000
building tree 997 of 1000

building tree 998 of 1000
building tree 999 of 1000building tree 1000 of 1000



[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=3,
                       warm_start=False)

In [17]:
importances = rft.feature_importances_
indices = np.argsort(importances)[::-1]

print (importances)

[0.07897347 0.24164684 0.2528763  0.04572948 0.02809894 0.0267874
 0.01141125 0.01277153 0.00676998 0.14364248 0.01419072 0.13710163]


In [18]:
rft_important_features = []
for f in range(X.shape[1]):    
    print ("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    if importances[indices[f]] >= 0.01:
        rft_important_features.append(indices[f])

 1) SibSp                          0.252876
 2) Age                            0.241647
 3) Embarked_C                     0.143642
 4) Embarked_S                     0.137102
 5) Pclass                         0.078973
 6) Parch                          0.045729
 7) Fare                           0.028099
 8) FamilySize                     0.026787
 9) Embarked_Q                     0.014191
10) Sex_female                     0.012772
11) IsAlone                        0.011411
12) Sex_male                       0.006770


In [19]:
print ("RF important features: {}".format(rft_important_features))

RF important features: [2, 1, 9, 11, 0, 3, 4, 5, 10, 7, 6]


## L1 feature selection

In [20]:
param_grid = {
    'C': [1.0, 10.0, 50.0, 100.0],
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'max_iter': [20, 50, 100, 1000, 2000, 10000]
}

estimator = LogisticRegression()

lr1_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
lr1_model.fit(X, y)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    2.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0],
                         'max_iter': [20, 50, 100, 1000, 2000, 10000],
                         'penalty': ['l1'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [21]:
print (lr1_model.best_estimator_)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=20,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


In [22]:
print (lr1_model.best_score_)
print (lr1_model.best_estimator_.coef_)

0.7912457912457912
[[-9.21734221e-01 -1.72311155e-02  2.03735714e-03  4.08439840e-02
  -5.32088573e-01 -2.65464249e-01 -8.21471662e-01  2.21831056e-01
   1.61138010e-02  3.16269914e+00 -6.08224605e-02  5.11870723e-01]]


In [23]:
c = 0
l1_important_features = []
for cf in lr1_model.best_estimator_.coef_[0]:
    if cf != 0:
        l1_important_features.append(c)
    c += 1
print (len(l1_important_features))    
print (l1_important_features)    

12
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [24]:
importances = abs(lr1_model.best_estimator_.coef_[0])
indices = np.argsort(importances)[::-1]

In [25]:
for f in range(X.shape[1]):    
    print ("%2d) %-*s %f" % (f + 1, 30, feat_labels[f], importances[indices[f]]))

 1) Pclass                         3.162699
 2) Age                            0.921734
 3) SibSp                          0.821472
 4) Parch                          0.532089
 5) Fare                           0.511871
 6) FamilySize                     0.265464
 7) IsAlone                        0.221831
 8) Sex_female                     0.060822
 9) Sex_male                       0.040844
10) Embarked_C                     0.017231
11) Embarked_Q                     0.016114
12) Embarked_S                     0.002037


In [26]:
print ("L1 important features: {}".format(l1_important_features))

L1 important features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


## Selecting features

In [27]:
X = X[:, rft_important_features]
print (X.shape)

(891, 11)


### L2

In [28]:
param_grid = {
    'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 5.0, 10.0, 20.0, 30.0],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['l2'],
    'max_iter': [50, 100, 1000]
}

estimator = LogisticRegression()

lr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
lr2_model.fit(X, y)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 270 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1936 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:   14.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5,
                               0.6, 0.7, 0.8, 0.9, 1.0, 5.0, 10.0, 20.0, 30.0],
                         'max_iter': [50, 100, 1000], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                 

In [29]:
print (lr2_model.best_estimator_)

LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [30]:
print (lr2_model.best_score_)

0.8035914702581369


## SVM

In [31]:
from sklearn.svm import SVC

In [32]:
svm_model = SVC(gamma='auto')

score = cross_val_score(svm_model, X, y, cv=10)
score_mean = score.mean()

In [33]:
print (score_mean)

0.7263117126319373


In [34]:
param_grid = {
    'kernel': ['linear'],
    'degree': [1],
    'C': [30.0, 50.0, 100.0, 150.0, 200.0, 250.0 ]
    
}
estimator = SVC()

svm2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
svm2_model.fit(X, y)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 22.7min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [30.0, 50.0, 100.0, 150.0, 200.0, 250.0],
                         'degree': [1], 'kernel': ['linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [35]:
print (svm2_model.best_estimator_)

SVC(C=150.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [36]:
print (svm2_model.best_score_)

0.797979797979798


## Guassian Process

In [37]:
from sklearn.gaussian_process import GaussianProcessClassifier

In [38]:
gp_model = GaussianProcessClassifier()

score = cross_val_score(gp_model, X, y, cv=10)
score_mean = score.mean()

In [39]:
print (score_mean)

0.7352755078878676


In [40]:
param_grid = {
    'max_iter_predict': [200, 300, 500, 1000],
    'multi_class': ['one_vs_rest', 'one_vs_one'],    
}
estimator = GaussianProcessClassifier()

gp2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
gp2_model.fit(X, y)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   40.8s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GaussianProcessClassifier(copy_X_train=True, kernel=None,
                                                 max_iter_predict=100,
                                                 multi_class='one_vs_rest',
                                                 n_jobs=None,
                                                 n_restarts_optimizer=0,
                                                 optimizer='fmin_l_bfgs_b',
                                                 random_state=None,
                                                 warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_iter_predict': [200, 300, 500, 1000],
                         'multi_class': ['one_vs_rest', 'one_vs_one']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [41]:
print (gp2_model.best_estimator_)

GaussianProcessClassifier(copy_X_train=True, kernel=None, max_iter_predict=200,
                          multi_class='one_vs_rest', n_jobs=None,
                          n_restarts_optimizer=0, optimizer='fmin_l_bfgs_b',
                          random_state=None, warm_start=False)


In [42]:
print (gp2_model.best_score_)

0.7351290684624018


## KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
kn_model = KNeighborsClassifier()

score = cross_val_score(kn_model, X, y, cv=10)
score_mean = score.mean()

In [45]:
print (score_mean)

0.7117049710589036


In [46]:
param_grid = {
    'n_neighbors': [5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25],
    'weights': ['uniform', 'distance'],    
    'leaf_size': [30, 40, 50]
}
estimator = KNeighborsClassifier()

kn2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
kn2_model.fit(X, y)

Fitting 10 folds for each of 66 candidates, totalling 660 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 660 out of 660 | elapsed:    0.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'leaf_size': [30, 40, 50],
                         'n_neighbors': [5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
                                         25],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [47]:
print (kn2_model.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                     weights='distance')


In [48]:
print (kn2_model.best_score_)

0.7317620650953984


## Discriminant Analysis

In [49]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [50]:
ld_model = LinearDiscriminantAnalysis()

score = cross_val_score(ld_model, X, y, cv=10)
score_mean = score.mean()



In [51]:
print (score_mean)

0.7867611508341845


In [52]:
param_grid = {
    'solver': ['svd', 'lsqr']
}
estimator = LinearDiscriminantAnalysis()

ld2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
ld2_model.fit(X, y)

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  20 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LinearDiscriminantAnalysis(n_components=None,
                                                  priors=None, shrinkage=None,
                                                  solver='svd',
                                                  store_covariance=False,
                                                  tol=0.0001),
             iid='warn', n_jobs=-1, param_grid={'solver': ['svd', 'lsqr']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [53]:
print (ld2_model.best_estimator_)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)


In [54]:
print (ld2_model.best_score_)

0.7867564534231201


In [55]:
qd_model = QuadraticDiscriminantAnalysis()

score = cross_val_score(qd_model, X, y, cv=10)
score_mean = score.mean()



In [56]:
print (score_mean)

0.5185767790262172


In [57]:
param_grid = {
    'reg_param': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]
}
estimator = QuadraticDiscriminantAnalysis()

qd2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
qd2_model.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  60 | elapsed:    0.0s remaining:    0.2s


Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                                                     store_covariance=False,
                                                     tol=0.0001),
             iid='warn', n_jobs=-1,
             param_grid={'reg_param': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [58]:
print (qd2_model.best_estimator_)

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.1,
                              store_covariance=False, tol=0.0001)


In [59]:
print (qd2_model.best_score_)

0.7890011223344556


## Decision tree

In [60]:
tree = DecisionTreeClassifier()

score = cross_val_score(tree, X, y, cv=10)
score_mean = score.mean()


In [61]:
print (score_mean)

0.7801580410850075


In [62]:
param_grid = {
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 10], 
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 10],
}
estimator = DecisionTreeClassifier()

tr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
tr2_model.fit(X, y)

Fitting 10 folds for each of 336 candidates, totalling 3360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2224 out of 3360 | elapsed:    1.2s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done 3360 out of 3360 | elapsed:    1.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 10],
   

In [63]:
print (tr2_model.best_estimator_)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [64]:
print (tr2_model.best_score_)

0.835016835016835


## Random Forest

In [65]:
rf_model = RandomForestClassifier(n_estimators=100)

score = cross_val_score(rf_model, X, y, cv=10)
score_mean = score.mean()


In [66]:
print (score_mean)

0.8059760526614459


In [67]:
param_grid = {
    'n_estimators': [100, 200, 300, 400], 
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [3, 4],
}
estimator = RandomForestClassifier()

rf2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
rf2_model.fit(X, y)

Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:   12.9s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [68]:
print (rf2_model.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [69]:
print (rf2_model.best_score_)

0.8260381593714927


### Bagging

In [70]:
bc1_model = BaggingClassifier(n_estimators=100)

score = cross_val_score(bc1_model, X, y, cv=10)
score_mean = score.mean()


In [71]:
print (score_mean)

0.809321870389286


In [72]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],    
}
estimator = BaggingClassifier()

bc2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
bc2_model.fit(X, y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
                                         bootstrap_features=False,
                                         max_features=1.0, max_samples=1.0,
                                         n_estimators=10, n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'n_estimators': [100, 200, 300, 400]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [73]:
print (bc2_model.best_estimator_)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=300,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)


In [74]:
print (bc2_model.best_score_)

0.8159371492704826


In [75]:
mf = int(np.sqrt(X.shape[1]))
print (X.shape[1], mf)

tree = DecisionTreeClassifier(max_features=mf)

param_grid = {
    'base_estimator': [tree],
    'n_estimators': [100, 200, 300, 400],    
}
estimator = BaggingClassifier()

bc3_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
bc3_model.fit(X, y)

11 3
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
                                         bootstrap_features=False,
                                         max_features=1.0, max_samples=1.0,
                                         n_estimators=10, n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'base_estimator': [DecisionTreeClassifier(class_weight=N...
                                                                   criterion='gini',
                                                                   max_depth=None,
                                                                   max_features=3,
                                                                   max_leaf_nodes=None,
                                               

In [76]:
print (bc3_model.best_estimator_)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=3,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
       

In [77]:
print (bc3_model.best_score_)

0.8047138047138047


## ExtraTrees

In [78]:
etr_model = ExtraTreesClassifier()

score = cross_val_score(etr_model, X, y, cv=10)
score_mean = score.mean()



In [79]:
print (score_mean)

0.7879227669958008


In [80]:
param_grid = {    
    'n_estimators': [20, 30, 40, 50, 100],    
    'max_depth': [2, 3, 4, 5, 10, 15],
    'min_samples_split': [2, 3, 4, 5],
}
estimator = ExtraTreesClassifier()

etr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
etr2_model.fit(X, y)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:    8.8s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                            criterion='gini', max_depth=None,
                                            max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators='warn', n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 

In [81]:
print (etr2_model.best_estimator_)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=15, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=5,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)


In [82]:
print (etr2_model.best_score_)

0.8260381593714927


## AdaBoost

In [83]:
ada_model = AdaBoostClassifier()

score = cross_val_score(ada_model, X, y, cv=10)
score_mean = score.mean()

In [84]:
print (score_mean)

0.8070238905913063


In [85]:
param_grid = {        
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 0.6, 0.7, 0.8, 1.],
    'n_estimators': [ 40, 50, 60, 70, 80, 90, 100]
}
estimator = AdaBoostClassifier()

ada2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
ada2_model.fit(X, y)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:    8.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 0.5, 0.6, 0.7, 0.8,
                                           1.0],
                         'n_estimators': [40, 50, 60, 70, 80, 90, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [86]:
print (ada2_model.best_estimator_)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=40, random_state=None)


In [87]:
print (ada2_model.best_score_)

0.8080808080808081


## GradientBoosting

In [88]:
gb_model = GradientBoostingClassifier()

score = cross_val_score(gb_model, X, y, cv=10)
score_mean = score.mean()

In [89]:
print (score_mean)

0.8317188741346045


In [90]:
param_grid = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [20, 22, 25, 27, 28, 30],
    'n_estimators': [400, 450, 500, 550, 600, 650, 700, 750]
}
estimator = GradientBoostingClassifier()

gb2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
gb2_model.fit(X, y) 

Fitting 10 folds for each of 864 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   57.9s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                                                  subsample=1.0, tol

In [91]:
print (gb2_model.best_estimator_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.03, loss='deviance', max_depth=4,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=25,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


In [92]:
print (gb2_model.best_score_)

0.8383838383838383


## XGBoost

In [93]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

In [94]:
xgb_model = XGBClassifier()

score = cross_val_score(xgb_model, X, y, cv=10)
score_mean = score.mean()


In [95]:
print (score_mean)

0.8227545114061968


In [96]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [4, 5, 6, 7, 8],
    'min_child_weight': [2, 3, 4],
    'gamma': [0.01, 0.06, 0.1, 0.2],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7],
    'n_estimators': [400, 500, 600, 700, 800, 900]
}
estimator = XGBClassifier()

xgb2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
xgb2_model.fit(X, y)

Fitting 10 folds for each of 7200 candidates, totalling 72000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.4, 0.5, 0.6, 0.7],
                         'gamma': [0.01, 0.06, 0.1, 0.2],
                         'learning_rate': [0.01, 

In [97]:
print (xgb2_model.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.06,
              learning_rate=0.01, max_delta_step=0, max_depth=8,
              min_child_weight=2, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [98]:
print (xgb2_model.best_score_)

0.8473625140291807


## Voting Classifier

In [99]:
from sklearn.ensemble import VotingClassifier

In [100]:
best_models = [
    #('lr1', lr1_model), 
    #('lr2', lr2_model),
    #('svm', svm2_model),  
    #('gp', gp2_model),  
    #('knn', kn2_model),  
    #('ld', ld2_model),  
    #('qd', qd2_model),  
    ('tr2', tr2_model), 
    ('rf2', rf2_model), 
    ('bc2', bc2_model),  
    #('bc3', bc3_model),
    ('etr', etr2_model),
    ('ada', ada2_model),
    ('gb', gb2_model),
    ('xgb', xgb2_model),
]
best_estimators = []
for md in best_models:
    print ("{}_model: {}".format(md[0], md[1].best_score_))
    best_estimators.append((md[0], md[1].best_estimator_))

tr2_model: 0.835016835016835
rf2_model: 0.8260381593714927
bc2_model: 0.8159371492704826
etr_model: 0.8260381593714927
ada_model: 0.8080808080808081
gb_model: 0.8383838383838383
xgb_model: 0.8473625140291807


In [101]:
vt_model = VotingClassifier(best_estimators, n_jobs=-1)

print (vt_model)

VotingClassifier(estimators=[('tr2',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=10,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=6,
                                                     min_samples_split=5,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
       

In [102]:
score = cross_val_score(vt_model, X, y, cv=10)
score_mean = score.mean()

In [103]:
print (score_mean)

0.8440412552491205


In [104]:
vt_model.fit(X, y)

VotingClassifier(estimators=[('tr2',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=10,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=6,
                                                     min_samples_split=5,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
       

In [105]:
print (len(vt_model.estimators))

7


## VT submission

In [106]:
df = pd.read_csv('./test.csv')
passenger_ids = df['PassengerId'].to_numpy()
df = df.drop('PassengerId', 1)
df = df.drop('Name', 1)
df = df.drop('Ticket', 1)
df = df.drop('Cabin', 1)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [107]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = 1 #initialize to yes/1 is alone
df['IsAlone'].loc[df['FamilySize'] > 1] = 0

encode_features = ['Sex', 'Embarked']
for enc in encode_features:
    one_hot = pd.get_dummies(df[enc], prefix=enc)
    df = df.drop(enc,axis = 1)
    df = df.join(one_hot)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,1,1,0,1,0,1,0
1,3,47.0,1,0,7.0,2,0,1,0,0,0,1
2,2,62.0,0,0,9.6875,1,1,0,1,0,1,0
3,3,27.0,0,0,8.6625,1,1,0,1,0,0,1
4,3,22.0,1,1,12.2875,3,0,1,0,0,0,1


In [108]:
df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
count,418.0,332.0,418.0,418.0,417.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,30.27259,0.447368,0.392344,35.627188,1.839713,0.605263,0.363636,0.636364,0.244019,0.110048,0.645933
std,0.841838,14.181209,0.89676,0.981429,55.907576,1.519072,0.48938,0.481622,0.481622,0.430019,0.313324,0.478803
min,1.0,0.17,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,21.0,0.0,0.0,7.8958,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,27.0,0.0,0.0,14.4542,1.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,3.0,39.0,1.0,0.0,31.5,2.0,1.0,1.0,1.0,0.0,0.0,1.0
max,3.0,76.0,8.0,9.0,512.3292,11.0,1.0,1.0,1.0,1.0,1.0,1.0


In [109]:
df['Age'] = df['Age'].fillna(0)
df['Fare'] = df['Fare'].fillna(0)

In [110]:
X_testing_real_zeros = df[real_features].to_numpy()
X_testing_cat_zeros = df[cat_features].to_numpy()

X_testing_real_zeros_scaled = scaler.transform(X_testing_real_zeros)

print ("X_real: {} ".format(X_testing_real_zeros[0]))
print ("X_cat: {} ".format(X_testing_cat_zeros[0]))
print ("y: {} ".format(y[0]))

X_real: [ 3.     34.5     7.8292  1.      0.      0.    ] 
X_cat: [1 0 1 0 0 1] 
y: 0 


In [111]:
print (df.shape)

(418, 12)


In [113]:
X_testing = np.hstack((X_testing_real_zeros_scaled, X_testing_cat_zeros))
#X_testing = X_testing[:,selected_features]
X_testing = X_testing[:, rft_important_features]
print (X.shape)

predicted = vt_model.predict(X_testing)

print (predicted)
print (passenger_ids)

(891, 11)
[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
[ 892  893  894  895  896  897  898  899  900  901  902  903  904  905
  906  907  908  909  910  911  912  913  914  915  916  917  918  919

In [114]:
with open("submission_vt8.txt", "w") as fout:
    fout.write("PassengerId,Survived\n")
    for val in zip(passenger_ids, predicted):
        fout.write("{},{}\n".format(str(int(val[0])), str(int(val[1]))))