In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import PIL

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

%matplotlib inline

## Data preprocessing

In [2]:
train_df = pd.read_csv('./train.csv')
train_df = train_df.drop('PassengerId', 1)
train_df = train_df.drop('Name', 1)
train_df = train_df.drop('Ticket', 1)
train_df = train_df.drop('Cabin', 1)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
encode_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for enc in encode_features:
    one_hot = pd.get_dummies(train_df[enc], prefix=enc)
    train_df = train_df.drop(enc,axis = 1)
    train_df = train_df.join(one_hot)
train_df.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [4]:
train_df.describe()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
count,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,29.699118,32.204208,0.242424,0.20651,0.551066,0.352413,0.647587,0.682379,0.234568,...,0.760943,0.132435,0.089787,0.005612,0.004489,0.005612,0.001122,0.188552,0.08642,0.722783
std,0.486592,14.526497,49.693429,0.42879,0.405028,0.497665,0.47799,0.47799,0.465813,0.423966,...,0.426747,0.339154,0.286037,0.074743,0.06689,0.074743,0.033501,0.391372,0.281141,0.447876
min,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,20.125,7.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,28.0,14.4542,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,38.0,31.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,80.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
real_features = ['Age', 'Fare']
cat_features = list(set(train_df.columns.values.tolist()) - set(real_features) )
cat_features.remove('Survived')
print (cat_features)

['SibSp_0', 'Parch_1', 'Sex_female', 'Pclass_3', 'Parch_5', 'Embarked_S', 'SibSp_2', 'SibSp_4', 'SibSp_3', 'Parch_2', 'Sex_male', 'Pclass_1', 'SibSp_5', 'SibSp_1', 'SibSp_8', 'Parch_6', 'Embarked_Q', 'Parch_4', 'Embarked_C', 'Parch_0', 'Pclass_2', 'Parch_3']


In [6]:
print (train_df.shape)

(891, 25)


In [7]:
train_df['Age'] = train_df['Age'].fillna(0)

In [8]:
y = train_df['Survived'].to_numpy()
train_df = train_df.drop('Survived', 1)

X_real = train_df[real_features].to_numpy()
X_cat = train_df[cat_features].to_numpy()

print ("X_real: {} ".format(X_real[0]))
print ("X_cat: {} ".format(X_cat[0]))
print ("y: {} ".format(y[0]))

X_real: [22.    7.25] 
X_cat: [0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0] 
y: 0 


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(X_real)

In [10]:
#selected_features = (0, 1, 2, 5, 7, 9, 10, 12, 14, 16, 17, 18, 19, 21, 22)

In [11]:
X = np.hstack((X_real, X_cat))
#X = X[:,selected_features]

print (X.shape, y.shape)

(891, 24) (891,)


In [12]:
X.shape[1]

24

In [13]:
feat_labels = train_df.columns
feat_labels

Index(['Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

## Modeling

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier  

## Random Forest feature selection

In [16]:
rft = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, verbose=3)
rft.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    0.2s


building tree 1 of 1000
building tree 2 of 1000
building tree 3 of 1000
building tree 4 of 1000
building tree 5 of 1000building tree 6 of 1000
building tree 7 of 1000
building tree 8 of 1000

building tree 9 of 1000building tree 10 of 1000

building tree 11 of 1000
building tree 12 of 1000
building tree 13 of 1000building tree 14 of 1000
building tree 15 of 1000

building tree 16 of 1000
building tree 17 of 1000
building tree 18 of 1000
building tree 19 of 1000
building tree 20 of 1000
building tree 21 of 1000
building tree 22 of 1000building tree 23 of 1000

building tree 24 of 1000
building tree 25 of 1000
building tree 26 of 1000
building tree 27 of 1000building tree 28 of 1000building tree 29 of 1000


building tree 30 of 1000building tree 31 of 1000building tree 32 of 1000


building tree 33 of 1000building tree 34 of 1000

building tree 35 of 1000
building tree 36 of 1000building tree 37 of 1000building tree 38 of 1000

building tree 39 of 1000

building tree 40 of 1000
building 

[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.5s


building tree 597 of 1000
building tree 598 of 1000
building tree 599 of 1000
building tree 600 of 1000
building tree 601 of 1000
building tree 602 of 1000
building tree 603 of 1000
building tree 604 of 1000
building tree 605 of 1000building tree 606 of 1000

building tree 607 of 1000building tree 608 of 1000

building tree 609 of 1000
building tree 610 of 1000
building tree 611 of 1000
building tree 612 of 1000building tree 613 of 1000

building tree 614 of 1000
building tree 615 of 1000
building tree 616 of 1000
building tree 617 of 1000
building tree 618 of 1000
building tree 619 of 1000
building tree 620 of 1000
building tree 621 of 1000
building tree 622 of 1000
building tree 623 of 1000
building tree 624 of 1000
building tree 625 of 1000
building tree 626 of 1000
building tree 627 of 1000
building tree 628 of 1000
building tree 629 of 1000
building tree 630 of 1000
building tree 631 of 1000
building tree 632 of 1000
building tree 633 of 1000
building tree 634 of 1000
building tre

[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.7s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=3,
                       warm_start=False)

In [17]:
importances = rft.feature_importances_
indices = np.argsort(importances)[::-1]

print (importances)

[0.24346936 0.24872231 0.01513297 0.01355654 0.14223941 0.04993958
 0.00210012 0.01544778 0.00522335 0.00377442 0.00560164 0.01096884
 0.13200634 0.02985901 0.00112361 0.01671948 0.00228341 0.00037914
 0.00801883 0.0018879  0.01439253 0.01922166 0.01688877 0.00104301]


In [18]:
rft_important_features = []
for f in range(X.shape[1]):    
    print ("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    if importances[indices[f]] >= 0.01:
        rft_important_features.append(indices[f])

 1) Fare                           0.248722
 2) Age                            0.243469
 3) Pclass_3                       0.142239
 4) SibSp_5                        0.132006
 5) Sex_female                     0.049940
 6) SibSp_8                        0.029859
 7) Embarked_C                     0.019222
 8) Embarked_Q                     0.016889
 9) Parch_1                        0.016719
10) SibSp_0                        0.015448
11) Pclass_1                       0.015133
12) Parch_6                        0.014393
13) Pclass_2                       0.013557
14) SibSp_4                        0.010969
15) Parch_4                        0.008019
16) SibSp_3                        0.005602
17) SibSp_1                        0.005223
18) SibSp_2                        0.003774
19) Parch_2                        0.002283
20) Sex_male                       0.002100
21) Parch_5                        0.001888
22) Parch_0                        0.001124
23) Embarked_S                  

In [19]:
print ("RF important features: {}".format(rft_important_features))

RF important features: [1, 0, 4, 12, 5, 13, 21, 22, 15, 7, 2, 20, 3, 11]


### L1

In [20]:
param_grid = {
    'C': [1.0, 10.0, 50.0, 100.0],
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'max_iter': [20, 50, 100, 1000, 2000, 10000]
}

estimator = LogisticRegression()

lr1_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
lr1_model.fit(X, y)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    2.0s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0],
                         'max_iter': [20, 50, 100, 1000, 2000, 10000],
                         'penalty': ['l1'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [21]:
print (lr1_model.best_estimator_)

LogisticRegression(C=50.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


In [22]:
print (lr1_model.best_score_)
print (lr1_model.best_estimator_.coef_)

0.7901234567901234
[[-1.59874682e-02  2.81144866e-03  3.41635923e-01  5.22775817e-01
   2.24476139e+00 -9.00276384e-01 -1.34366039e+00 -1.01359422e+00
   2.14197494e-01 -9.98369314e-01 -1.79987361e+00  2.82605060e-01
  -4.32762983e-01  8.83011457e-01 -4.65936048e+00  4.72506422e-01
  -5.90170930e+00 -4.13958918e+00 -8.65198983e-01 -5.08669137e+00
  -7.27191261e-01  0.00000000e+00  1.31348859e-01  1.77110675e-01]]


In [23]:
c = 0
l1_important_features = []
for cf in lr1_model.best_estimator_.coef_[0]:
    if cf != 0:
        l1_important_features.append(c)
    c += 1
print (len(l1_important_features))    
print (l1_important_features)    

23
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23]


In [24]:
importances = abs(lr1_model.best_estimator_.coef_[0])
indices = np.argsort(importances)[::-1]

In [25]:
for f in range(X.shape[1]):    
    print ("%2d) %-*s %f" % (f + 1, 30, feat_labels[f], importances[indices[f]]))

 1) Age                            5.901709
 2) Fare                           5.086691
 3) Pclass_1                       4.659360
 4) Pclass_2                       4.139589
 5) Pclass_3                       2.244761
 6) Sex_female                     1.799874
 7) Sex_male                       1.343660
 8) SibSp_0                        1.013594
 9) SibSp_1                        0.998369
10) SibSp_2                        0.900276
11) SibSp_3                        0.883011
12) SibSp_4                        0.865199
13) SibSp_5                        0.727191
14) SibSp_8                        0.522776
15) Parch_0                        0.472506
16) Parch_1                        0.432763
17) Parch_2                        0.341636
18) Parch_3                        0.282605
19) Parch_4                        0.214197
20) Parch_5                        0.177111
21) Parch_6                        0.131349
22) Embarked_C                     0.015987
23) Embarked_Q                  

In [26]:
print ("L1 important features: {}".format(l1_important_features))

L1 important features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23]


In [27]:
X = X[:, l1_important_features]
print (X.shape)

(891, 23)


### L2

In [28]:
param_grid = {
    'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 5.0, 10.0, 20.0, 30.0],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['l2'],
    'max_iter': [50, 100, 1000]
}

estimator = LogisticRegression()

lr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
lr2_model.fit(X, y)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 270 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 1523 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 2091 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:  1.7min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5,
                               0.6, 0.7, 0.8, 0.9, 1.0, 5.0, 10.0, 20.0, 30.0],
                         'max_iter': [50, 100, 1000], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                 

In [29]:
print (lr2_model.best_estimator_)

LogisticRegression(C=0.6, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)


In [30]:
print (lr2_model.best_score_)

0.7968574635241302


## SVM

In [31]:
from sklearn.svm import SVC

In [32]:
svm_model = SVC(gamma='auto')

score = cross_val_score(svm_model, X, y, cv=10)
score_mean = score.mean()

In [33]:
print (score_mean)

0.7264376915219612


In [34]:
param_grid = {
    'kernel': ['linear'],
    'degree': [1],
    'C': [30.0, 50.0, 100.0, 150.0, 200.0, 250.0 ]
    
}
estimator = SVC()

svm2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
svm2_model.fit(X, y)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 31.8min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [30.0, 50.0, 100.0, 150.0, 200.0, 250.0],
                         'degree': [1], 'kernel': ['linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [35]:
print (svm2_model.best_estimator_)

SVC(C=30.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [36]:
print (svm2_model.best_score_)

0.797979797979798


## Decision tree

In [37]:
tree = DecisionTreeClassifier()

score = cross_val_score(tree, X, y, cv=10)
score_mean = score.mean()


In [38]:
print (score_mean)

0.7733906480535694


In [39]:
param_grid = {
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 10], 
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 10],
}
estimator = DecisionTreeClassifier()

tr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
tr2_model.fit(X, y)

Fitting 10 folds for each of 336 candidates, totalling 3360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 3360 out of 3360 | elapsed:    3.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 10],
   

In [40]:
print (tr2_model.best_estimator_)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [41]:
print (tr2_model.best_score_)

0.8260381593714927


## Random Forest

In [42]:
rf_model = RandomForestClassifier(n_estimators=100)

score = cross_val_score(rf_model, X, y, cv=10)
score_mean = score.mean()


In [43]:
print (score_mean)

0.791343774826921


In [44]:
param_grid = {
    'n_estimators': [100, 200, 300, 400], 
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [3, 4],
}
estimator = RandomForestClassifier()

rf2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
rf2_model.fit(X, y)

Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:   11.8s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [45]:
print (rf2_model.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [46]:
print (rf2_model.best_score_)

0.8181818181818182


### Bagging

In [47]:
bc1_model = BaggingClassifier(n_estimators=100)

score = cross_val_score(bc1_model, X, y, cv=10)
score_mean = score.mean()


In [48]:
print (score_mean)

0.7992716490750199


In [49]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],    
}
estimator = BaggingClassifier()

bc2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
bc2_model.fit(X, y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.0s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
                                         bootstrap_features=False,
                                         max_features=1.0, max_samples=1.0,
                                         n_estimators=10, n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'n_estimators': [100, 200, 300, 400]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [50]:
print (bc2_model.best_estimator_)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=100,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)


In [51]:
print (bc2_model.best_score_)

0.8024691358024691


In [52]:
mf = int(np.sqrt(X.shape[1]))
print (X.shape[1], mf)

tree = DecisionTreeClassifier(max_features=mf)

param_grid = {
    'base_estimator': [tree],
    'n_estimators': [100, 200, 300, 400],    
}
estimator = BaggingClassifier()

bc3_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
bc3_model.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


23 4
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
                                         bootstrap_features=False,
                                         max_features=1.0, max_samples=1.0,
                                         n_estimators=10, n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'base_estimator': [DecisionTreeClassifier(class_weight=N...
                                                                   criterion='gini',
                                                                   max_depth=None,
                                                                   max_features=4,
                                                                   max_leaf_nodes=None,
                                               

In [53]:
print (bc3_model.best_estimator_)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=4,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
       

In [54]:
print (bc3_model.best_score_)

0.792368125701459


## ExtraTrees

In [55]:
etr_model = ExtraTreesClassifier()

score = cross_val_score(etr_model, X, y, cv=10)
score_mean = score.mean()



In [56]:
print (score_mean)

0.7756503234593122


In [57]:
param_grid = {    
    'n_estimators': [20, 30, 40, 50, 100],    
    'max_depth': [2, 3, 4, 5, 10, 15],
    'min_samples_split': [2, 3, 4, 5],
}
estimator = ExtraTreesClassifier()

etr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
etr2_model.fit(X, y)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:    8.9s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                            criterion='gini', max_depth=None,
                                            max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators='warn', n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 

In [58]:
print (etr2_model.best_estimator_)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=15, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=5,
                     min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)


In [59]:
print (etr2_model.best_score_)

0.8181818181818182


## AdaBoost

In [60]:
ada_model = AdaBoostClassifier()

score = cross_val_score(ada_model, X, y, cv=10)
score_mean = score.mean()

In [61]:
print (score_mean)

0.8014564181137216


In [62]:
param_grid = {        
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 0.6, 0.7, 0.8, 1.],
    'n_estimators': [ 40, 50, 60, 70, 80, 90, 100]
}
estimator = AdaBoostClassifier()

ada2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
ada2_model.fit(X, y)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:    8.9s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 0.5, 0.6, 0.7, 0.8,
                                           1.0],
                         'n_estimators': [40, 50, 60, 70, 80, 90, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [63]:
print (ada2_model.best_estimator_)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.6,
                   n_estimators=80, random_state=None)


In [64]:
print (ada2_model.best_score_)

0.8069584736251403


## GradientBoosting

In [65]:
gb_model = GradientBoostingClassifier()

score = cross_val_score(gb_model, X, y, cv=10)
score_mean = score.mean()

In [66]:
print (score_mean)

0.8261383497900352


In [67]:
param_grid = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [20, 25, 27, 28, 30, 32, 35],
    'n_estimators': [400, 450, 500, 550, 600, 650, 700]
}
estimator = GradientBoostingClassifier()

gb2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
gb2_model.fit(X, y) 

Fitting 10 folds for each of 756 candidates, totalling 7560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                                                  random_state=None,

In [68]:
print (gb2_model.best_estimator_)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.03, loss='deviance', max_depth=4,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=30,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


In [69]:
print (gb2_model.best_score_)

0.8327721661054994


## XGBoost

In [70]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

In [71]:
xgb_model = XGBClassifier()

score = cross_val_score(xgb_model, X, y, cv=10)
score_mean = score.mean()


In [72]:
print (score_mean)

0.8160886959482465


In [73]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [2, 3, 4, 5],
    'min_child_weight': [2, 3],
    'gamma': [0.01, 0.06, 0.1, 0.2],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'n_estimators': [400, 500, 600, 700, 800, 900]
}
estimator = XGBClassifier()

xgb2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
xgb2_model.fit(X, y)

Fitting 10 folds for each of 2304 candidates, totalling 23040 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.5, 0.6, 0.7],
                         'gamma': [0.01, 0.06, 0.1, 0.2],
                         'learning_rate': [0.01, 0.05,

In [74]:
print (xgb2_model.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=3, missing=None, n_estimators=700, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [75]:
print (xgb2_model.best_score_)

0.8361391694725028


## Voting Classifier

In [76]:
from sklearn.ensemble import VotingClassifier

In [104]:
best_models = [     
    #('lr2', lr2_model),
    #('svm', svm2_model),  
    ('tr2', tr2_model), 
    ('rf2', rf2_model), 
    ('bc2', bc2_model),  
    #('bc3', bc3_model),
    ('etr', etr2_model),
    ('ada', ada2_model),
    ('gb', gb2_model),
    ('xgb', xgb2_model),
]
best_estimators = []
for md in best_models:
    print ("{}_model: {}".format(md[0], md[1].best_score_))
    best_estimators.append((md[0], md[1].best_estimator_))

tr2_model: 0.8260381593714927
rf2_model: 0.8181818181818182
bc2_model: 0.8024691358024691
etr_model: 0.8181818181818182
ada_model: 0.8069584736251403
gb_model: 0.8327721661054994
xgb_model: 0.8361391694725028


In [105]:
vt_model = VotingClassifier(best_estimators, n_jobs=-1)

print (vt_model)

VotingClassifier(estimators=[('tr2',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=9,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=6,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
        

In [106]:
score = cross_val_score(vt_model, X, y, cv=10)
score_mean = score.mean()

In [107]:
print (score_mean)

0.8395959595959596


In [108]:
vt_model.fit(X, y)

VotingClassifier(estimators=[('tr2',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=9,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=6,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best')),
        

In [109]:
print (len(vt_model.estimators))

7


## VT submission

In [83]:
df = pd.read_csv('./test.csv')
passenger_ids = df['PassengerId'].to_numpy()
df = df.drop('PassengerId', 1)
df = df.drop('Name', 1)
df = df.drop('Ticket', 1)
df = df.drop('Cabin', 1)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [84]:
encode_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for enc in encode_features:
    one_hot = pd.get_dummies(df[enc], prefix=enc)
    df = df.drop(enc,axis = 1)
    df = df.join(one_hot)
df.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_C,Embarked_Q,Embarked_S
0,34.5,7.8292,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,47.0,7.0,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,62.0,9.6875,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,27.0,8.6625,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,22.0,12.2875,0,0,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1


In [85]:
df.describe()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_C,Embarked_Q,Embarked_S
count,332.0,417.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,...,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,30.27259,35.627188,0.255981,0.222488,0.521531,0.363636,0.636364,0.677033,0.263158,0.033493,...,0.124402,0.078947,0.007177,0.004785,0.002392,0.002392,0.004785,0.244019,0.110048,0.645933
std,14.181209,55.907576,0.436934,0.416416,0.500135,0.481622,0.481622,0.46817,0.440875,0.180135,...,0.330435,0.26998,0.084514,0.069088,0.048912,0.048912,0.069088,0.430019,0.313324,0.478803
min,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,14.4542,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,39.0,31.5,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,76.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [86]:
df['Age'] = df['Age'].fillna(0)
df['Fare'] = df['Fare'].fillna(0)

In [87]:
X_testing_real_zeros = df[real_features].to_numpy()
X_testing_cat_zeros = df[cat_features].to_numpy()

X_testing_real_zeros_scaled = scaler.transform(X_testing_real_zeros)

print ("X_real: {} ".format(X_testing_real_zeros[0]))
print ("X_cat: {} ".format(X_testing_cat_zeros[0]))
print ("y: {} ".format(y[0]))

X_real: [34.5     7.8292] 
X_cat: [1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0] 
y: 0 


In [88]:
print (df.shape)

(418, 25)


In [110]:
X_testing = np.hstack((X_testing_real_zeros_scaled, X_testing_cat_zeros))
#X_testing = X_testing[:,selected_features]
X_testing = X_testing[:, l1_important_features]
print (X.shape)

predicted = vt_model.predict(X_testing)

print (predicted)
print (passenger_ids)

(891, 23)
[0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0
 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 0 0 0 1 0 0 1 0 0 0]
[ 892  893  894  895  896  897  898  899  900  901  902  903  904  905
  906  907  908  909  910  911  912  913  914  915  916  917  918  919

In [111]:
with open("submission_vt5.txt", "w") as fout:
    fout.write("PassengerId,Survived\n")
    for val in zip(passenger_ids, predicted):
        fout.write("{},{}\n".format(str(int(val[0])), str(int(val[1]))))