In [194]:
# data analysis and wrangling
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# visualization
import pydotplus
from sklearn.tree import export_graphviz
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import FactorAnalysis

#preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OrdinalEncoder


import warnings
warnings.filterwarnings('ignore')


# I. Grid Search

In [195]:
train_data = pd.read_csv('.\\titanic\\train.csv')
test_data = pd.read_csv('.\\titanic\\test.csv')

In [196]:
train_df = train_data.copy()
test_df = test_data.copy()

#Preprocessing : with mean for train
train_df['Age'].fillna(train_df['Age'].mean(),inplace=True)
train_df['Embarked'].fillna('X',inplace=True)
train_df['Cabin'].fillna('XX',inplace=True)
train_df['Sex'] = train_df['Sex'].map({'female':0,'male':1})
train_df['Random'] = np.random.rand(train_df.shape[0])

#Preprocessing : with mean for test (don't use the statistic of the test in the train!!!!!!!)
test_df['Age'].fillna(test_df['Age'].mean(),inplace=True)
test_df['Embarked'].fillna('X',inplace=True)
test_df['Cabin'].fillna('XX',inplace=True)
test_df['Sex'] = test_df['Sex'].map({'female':0,'male':1})
test_df['Fare'].fillna(test_df['Fare'].mean(),inplace=True)
test_df['Random'] = np.random.rand(test_df.shape[0])


encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(train_df[['Embarked']])    # Assume for simplicity all features are categorical.
# Apply the encoder for train
a = encoder.transform(train_df[['Embarked']])
other = pd.DataFrame(data=a.toarray(),columns=['Embarked_1','Embarked_2','Embarked_3','Embarked_4'])
train_df = train_df.join(other,lsuffix='_caller', rsuffix='_other')

# Apply the encoder for test
a = encoder.transform(test_df[['Embarked']])
other = pd.DataFrame(data=a.toarray(),columns=['Embarked_1','Embarked_2','Embarked_3','Embarked_4'])
test_df = test_df.join(other,lsuffix='_caller', rsuffix='_other')

In [207]:
X = train_df[['Pclass', 'Sex','Age','SibSp','Parch','Fare',
              'Embarked_1','Embarked_2','Embarked_3','Embarked_4','Random']]

X_sub = test_df[['Pclass', 'Sex','Age','SibSp','Parch','Fare',
              'Embarked_1','Embarked_2','Embarked_3','Embarked_4','Random']]

In [208]:
# Preprcessing of train
scl = MinMaxScaler()
X_scaled_minmax = scl.fit_transform(X)

scl = StandardScaler()
X_scaled_std = scl.fit_transform(X)

#preprocessing of test
scl = MinMaxScaler()
X_sub_scaled_minmax = scl.fit_transform(X_sub)

scl = StandardScaler()
X_sub_scaled_std = scl.fit_transform(X_sub)

In [209]:
# Feature Selection
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
clf = RandomForestClassifier()

parameters_dc = { 'max_depth' : range(1,5),
                  'min_samples_leaf': range(2,5),
                'criterion':['entropy','gini']}

gs = GridSearchCV(estimator=clf, param_grid=parameters_dc, cv=cv, n_jobs=-1 )
_ = gs.fit(X,y)


In [210]:
gs.best_params_

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 3}

In [211]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [212]:
#Estimator return by Grid search
sorted(gs.best_estimator_.feature_importances_)
list(zip(X.columns,gs.best_estimator_.feature_importances_))

[('Pclass', 0.12906535356000592),
 ('Sex', 0.44129993797057815),
 ('Age', 0.07555547969686437),
 ('SibSp', 0.04927503939729419),
 ('Parch', 0.037124053488259944),
 ('Fare', 0.17057425681042052),
 ('Embarked_1', 0.015590811567474995),
 ('Embarked_2', 0.009959373336185039),
 ('Embarked_3', 0.022221851764423338),
 ('Embarked_4', 0.0),
 ('Random', 0.049333842408493506)]

In [173]:
#It is ready for use for prediction
gs.best_estimator_.predict(X_sub)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [174]:
#if you need probability if they are implemented by the classifier 
gs.best_estimator_.predict_proba(X_sub)

array([[0.74149757, 0.25850243],
       [0.57091136, 0.42908864],
       [0.74149757, 0.25850243],
       [0.7203724 , 0.2796276 ],
       [0.64159233, 0.35840767],
       [0.7203724 , 0.2796276 ],
       [0.39161274, 0.60838726],
       [0.83821466, 0.16178534],
       [0.38719556, 0.61280444],
       [0.82652627, 0.17347373],
       [0.7203724 , 0.2796276 ],
       [0.7203724 , 0.2796276 ],
       [0.57091136, 0.42908864],
       [0.82652627, 0.17347373],
       [0.57091136, 0.42908864],
       [0.5023917 , 0.4976083 ],
       [0.74149757, 0.25850243],
       [0.72932169, 0.27067831],
       [0.57091136, 0.42908864],
       [0.38719556, 0.61280444],
       [0.8095837 , 0.1904163 ],
       [0.73708151, 0.26291849],
       [0.43381614, 0.56618386],
       [0.74567576, 0.25432424],
       [0.54258683, 0.45741317],
       [0.82652627, 0.17347373],
       [0.45073545, 0.54926455],
       [0.72932169, 0.27067831],
       [0.7203724 , 0.2796276 ],
       [0.8095837 , 0.1904163 ],
       [0.

In [175]:
#Dict of how grid shows the best parameters
sorted(gs.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_criterion',
 'param_max_depth',
 'param_min_samples_leaf',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [176]:
gs.cv_results_

{'mean_fit_time': array([0.23437791, 0.25531826, 0.25048647, 0.22739167, 0.23896108,
        0.25895853, 0.24663401, 0.24235601, 0.22892833, 0.25795436,
        0.228409  , 0.24816613, 0.21738577, 0.26362953, 0.26582503,
        0.26255608, 0.24940805, 0.27808952, 0.2748569 , 0.25313292,
        0.27717705, 0.27362156, 0.25583811, 0.20480752]),
 'std_fit_time': array([0.01895597, 0.0265372 , 0.02745972, 0.01576914, 0.02587399,
        0.03032495, 0.02422329, 0.01277545, 0.02167638, 0.02287248,
        0.00725093, 0.03056918, 0.01953626, 0.02767128, 0.06261286,
        0.04645128, 0.03967807, 0.05098158, 0.05500806, 0.042641  ,
        0.04659947, 0.05020499, 0.02506307, 0.02303993]),
 'mean_score_time': array([0.01515918, 0.01436143, 0.01635609, 0.0157578 , 0.01416206,
        0.01618562, 0.01558547, 0.01628962, 0.01568456, 0.01610227,
        0.0162025 , 0.0166069 , 0.01789594, 0.0195899 , 0.01999383,
        0.02717571, 0.0200768 , 0.02147517, 0.02070656, 0.02099113,
        0.023383

In [177]:
# Best score
i = np.argmin(gs.cv_results_['rank_test_score']) #index of min
print('Best estimation score :', gs.best_score_)
print('Best estimation variance',gs.cv_results_['std_test_score'][i])

Best estimation score : 0.8260184545853996
Best estimation variance 0.031072862802004695


In [178]:
# In cas you need only parameters
a = RandomForestClassifier(**gs.best_params_)
_ = a.fit(X,y)
print(list(zip(X.columns.values,a.feature_importances_)))

[('Pclass', 0.15604610239186528), ('Sex', 0.44369248141816064), ('Age', 0.08826883357377147), ('SibSp', 0.05031086423831988), ('Parch', 0.030235796222136273), ('Fare', 0.1824206869558377), ('Embarked_1', 0.022903039987620104), ('Embarked_2', 0.008433744386521221), ('Embarked_3', 0.017679272465906764), ('Embarked_4', 9.178359860684269e-06)]


# II. Ensembling 

## 1. Voting 


In [213]:
# Decision Trees Parameters
dt_params = {
    'max_depth': range(1,5),
    'min_samples_leaf': range(2,5),
    'min_samples_split':range(2,5),
}
# Extra Trees Parameters
et_params = {
    'max_depth': range(1,5),
    'min_samples_leaf': range(2,5),
    'min_samples_split':range(2,5),
}
# Support Vector Classifier parameters 
svc_params = {
    'C' : [0,1, 0.05,0.025]
    }


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

clf_dt = DecisionTreeClassifier()
clf_et = ExtraTreesClassifier()
clf_svc = SVC(probability=True)

dic_clf = {
    'DT' : [clf_dt, dt_params],
    'ET' : [clf_et, et_params], 
    'SVC': [clf_svc, svc_params]
}

gs = {}

for key in dic_clf:
    gs[key] = GridSearchCV(estimator=dic_clf[key][0], param_grid=dic_clf[key][1], cv=cv, n_jobs=-1 )
    _ = gs[key].fit(X,y)
    print('Classifier:',key)
    print('Best params are:',gs[key].best_params_)
    i = np.argmin(gs[key].cv_results_['rank_test_score']) #index of min
    print('Best estimation score :', gs[key].best_score_)
    print('Best estimation variance',gs[key].cv_results_['std_test_score'][i],'\n')


Classifier: DT
Best params are: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best estimation score : 0.8181658401858014
Best estimation variance 0.032567502513050496 

Classifier: ET
Best params are: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 3}
Best estimation score : 0.8192706044818279
Best estimation variance 0.02414906009903531 

Classifier: SVC
Best params are: {'C': 1}
Best estimation score : 0.6812880547360491
Best estimation variance 0.028346439925249878 



### Hard voting

In [225]:
pred_test_vote = pd.DataFrame(data=np.zeros(X_sub.shape[0]*len(gs.keys())).reshape(X_sub.shape[0],len(gs.keys())),columns=gs.keys())
pred_test_vote

Unnamed: 0,DT,ET,SVC
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
...,...,...,...
413,0.0,0.0,0.0
414,0.0,0.0,0.0
415,0.0,0.0,0.0
416,0.0,0.0,0.0


In [226]:
for key in gs:
    pred_test_vote[key] = gs[key].predict(X_sub)
pred_test_vote

Unnamed: 0,DT,ET,SVC
0,0,0,0
1,1,0,0
2,0,0,0
3,0,0,0
4,1,0,0
...,...,...,...
413,0,0,0
414,1,1,1
415,0,0,0
416,0,0,0


In [227]:
# Soft voting using a threshold: à.5

pred_test_vote['Prediction_Hard'] = (pred_test_vote.mean(axis=1) >=0.5).astype(int)
pred_test_vote.sample(100)

Unnamed: 0,DT,ET,SVC,Prediction_Hard
179,1,1,1,1
106,0,0,0,0
375,1,1,1,1
60,0,0,0,0
119,1,1,0,1
...,...,...,...,...
122,1,1,1,1
74,1,1,1,1
86,1,1,0,1
404,0,0,0,0


### Soft voting

In [228]:
for key in gs:
    pred_test_vote[key+'_soft'] = gs[key].predict_proba(X_sub)[:,1]
pred_test_vote

Unnamed: 0,DT,ET,SVC,Prediction_Hard,DT_soft,ET_soft,SVC_soft
0,0,0,0,0,0.115473,0.224091,0.286749
1,1,0,0,0,0.589744,0.445650,0.290644
2,0,0,0,0,0.115473,0.297253,0.292685
3,0,0,0,0,0.115473,0.196907,0.286125
4,1,0,0,0,0.589744,0.448295,0.296335
...,...,...,...,...,...,...,...
413,0,0,0,0,0.115473,0.196381,0.283110
414,1,1,1,1,0.980000,0.785377,0.804601
415,0,0,0,0,0.115473,0.193454,0.283578
416,0,0,0,0,0.115473,0.196381,0.283282


In [230]:
pred_test_vote['Prediction_soft'] = (pred_test_vote[['DT_soft','ET_soft','SVC_soft']].mean(axis=1) >=0.5).astype(int)
pred_test_vote.sample(100)

Unnamed: 0,DT,ET,SVC,Prediction_Hard,DT_soft,ET_soft,SVC_soft,Prediction_soft
163,0,0,0,0,0.115473,0.196718,0.283413,0
117,1,0,0,0,0.589744,0.478386,0.400929,0
121,0,0,0,0,0.115473,0.230971,0.279854,0
306,0,0,1,0,0.358333,0.405110,0.768422,1
271,0,0,0,0,0.115473,0.222058,0.287090,0
...,...,...,...,...,...,...,...,...
321,0,0,0,0,0.115473,0.264708,0.291650,0
364,1,1,1,1,0.980000,0.809693,0.605840,1
412,1,0,0,0,0.589744,0.448301,0.293761,0
201,1,0,0,0,1.000000,0.264297,0.397746,1


## 2. Stacking

In [231]:
# Based on the best param we keep the following
X = train_df[['Pclass', 'Sex','Age','SibSp','Parch','Fare']]
y = train_df['Survived']
X_sub = test_df[['Pclass', 'Sex','Age','SibSp','Parch','Fare']]

In [187]:
# Random Forest parameters
rf_params = {
    'n_estimators': range(49,50),
    'max_depth': range(1,5),
    'min_samples_leaf': range(2,5),
    'min_samples_split':range(2,5),
}

#LG Parameters
lg_params = {
    'C':[0.01,.1,1,2,5]
}

# Decision Trees Parameters
dt_params = {
    'max_depth': range(1,5),
    'min_samples_leaf': range(2,5),
    'min_samples_split':range(2,5),
}
# Extra Trees Parameters
et_params = {
    'max_depth': range(1,5),
    'min_samples_leaf': range(2,5),
    'min_samples_split':range(2,5),
}

# AdaBoost parameters
ada_params = {
    'n_estimators': range(49,50),
    'learning_rate':[0.01,0.05,0.1,0.5,1]
}


# Support Vector Classifier parameters 
svc_params = {
    'C' : [0,1, 0.05,0.025]
    }

knn_params = {
    'n_neighbors':[1,2,5,10,20,30], 
    'p':[1,2],
    'weights':['uniform','distance']
}



classifiers_names = {'RF': 'RandomForestClassifier',
               'DT': 'DecisionTreeClassifier',
               'LG': 'LogisticRegression',
               'ET': 'ExtraTreesClassifier',
               'ADA': 'AdaBoostClassifier',
               'SVC': 'SVC',
               'KNN': 'KNeighborsClassifier'

              }
classifiers_params = {'RF': 'rf_params',
               'DT': 'dt_params',
               'LG': 'lg_params',
               'ET': 'et_params',
               'ADA': 'ada_params',
               'SVC': 'svc_params',
               'KNN': 'knn_params'

              }

classifiers_clf = {}

for key in classifiers_names:
    classifiers_clf[key] = eval(classifiers_names[key]+'()') # RandomForestClassifier()
    

In [232]:
#Search for best parameters.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

classifiers_gs = {}

for key in classifiers_clf:
    print('Classifier:',key)
    clf = eval(classifiers_names[key]+'()')
    params = eval(classifiers_params[key])
    classifiers_gs[key] = GridSearchCV(clf, param_grid=params, cv=cv,n_jobs=-1)
    _ = classifiers_gs[key].fit(X,y)
    print('Best params are:',classifiers_gs[key].best_params_)
    i = np.argmin(classifiers_gs[key].cv_results_['rank_test_score']) #index of min
    print('Best estimation score :', classifiers_gs[key].best_score_)
    print('Best estimation variance',classifiers_gs[key].cv_results_['std_test_score'][i],'\n')

Classifier: RF
Best params are: {'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 49}
Best estimation score : 0.8248823049400539
Best estimation variance 0.03240238785440985 

Classifier: DT
Best params are: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best estimation score : 0.8181658401858014
Best estimation variance 0.032567502513050496 

Classifier: LG
Best params are: {'C': 0.1}
Best estimation score : 0.7968300797187873
Best estimation variance 0.036088869955324863 

Classifier: ET
Best params are: {'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best estimation score : 0.809189630280585
Best estimation variance 0.03043478347137055 

Classifier: ADA
Best params are: {'learning_rate': 1, 'n_estimators': 49}
Best estimation score : 0.8058125666938674
Best estimation variance 0.038798301231628224 

Classifier: SVC
Best params are: {'C': 1}
Best estimation score : 0.679047140794677
Best estimation variance 0.028735987068

In [189]:
pred_test_stack = {}
pred_train_stack = pd.DataFrame(data=np.arange(X.shape[0]*len(classifiers_gs.keys())).reshape(X.shape[0],len(classifiers_gs.keys())),columns=classifiers_gs.keys())

for key in classifiers_gs:
    pred_test_stack[key] = pd.DataFrame()

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)


for key in classifiers_gs:
    fold=0
    for train_index, test_index in cv.split(X, y):
        X_train = X.loc[train_index]
        y_train = y.loc[train_index]

        X_test = X.loc[test_index]
        y_test = y.loc[test_index]
        
        params = classifiers_gs[key].best_params_
        model = eval(classifiers_names[key]+'(**params)')
        model.fit( X_train,  y_train)

        pred_train_stack[key][test_index] = model.predict(X_test)
        pred_sub   = model.predict(X_sub)
        pred_test_stack[key]['fold_'+str(fold)] = pred_sub

        fold +=1

In [190]:
pred_train_stack

Unnamed: 0,RF,DT,LG,ET,ADA,SVC,KNN
0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1
2,1,1,1,1,1,0,1
3,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
886,0,0,0,0,0,0,0
887,1,1,1,1,1,0,1
888,0,1,0,1,1,0,0
889,0,0,1,0,0,0,1


In [None]:
#Majority voting strategy and bagging
pred=pd.DataFrame()
for key in pred_test_stack:
    pred[key] = pred_test_stack[key].sum(axis=1)>2
    pred[key] = pred[key].map({True:1,False:0})
    
predictions = (pred.mean(axis=1) >=0.5).astype(int)
predictions

In [192]:
#Majority voting strategy
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

eclf1 = VotingClassifier(estimators=classifiers_gs.items(), voting='hard')
eclf1 = eclf1.fit(X, y)
predictions = eclf1.predict(sub_test) 
print(predictions)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [193]:
# Stacking
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 500,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1)

clf_gbm = GridSearchCV(gbm,param_grid=gb_params,cv=cv, n_jobs=-1)

clf_gbm.fit(pred_train_stack, y)

predictions = clf_gbm.predict(pred)

predictions


ValueError: feature_names mismatch: ['RF', 'DT', 'LG', 'ET', 'ADA', 'SVC', 'KNN'] []
expected ADA, DT, LG, KNN, ET, RF, SVC in input data

In [241]:
clf_gbm.return_train_score

False

In [290]:
my_final_sub = pd.read_csv('.\\titanic\\test.csv')[['PassengerId']]
my_final_sub['Survived'] = predictions

my_final_sub.to_csv('submission.csv', index=False)