In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st

from boostaroota import BoostARoota

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC 


# Seed NumPy's global RNG. The randomized searches further down are created
# without an explicit random_state, so this seed (and the order in which the
# cells are executed) determines which hyper-parameter candidates get sampled.
np.random.seed(0)

In [2]:
# Load the treated training data, discard the leftover row-number column written
# by the CSV export, and index the frame by passenger.
# `columns=` is used because recent pandas releases no longer accept `axis` as a
# positional argument to DataFrame.drop.
trainTreated = (
    pd.read_csv('dTrainTreated.csv')
    .drop(columns='Unnamed: 0')
    .set_index('PassengerId')
)

In [3]:
# Preview the first rows of the treated training frame (features plus the
# `Survived` target column).
trainTreated.head()

Unnamed: 0_level_0,Pclass_catP,Pclass_catB,Age_clean,Age_isBAD,SibSp_catP,SibSp_catB,Parch_catP,Parch_catB,Ticket_catP,Ticket_catB,...,FamilySize_lev_x_1,FamilySize_lev_x_2,FamilySize_lev_x_3,FamilySize_lev_x_4,FamilySize_lev_x_6,Title_lev_x_Master,Title_lev_x_Miss,Title_lev_x_Mr,Title_lev_x_Mrs,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.553872,-0.581229,22.0,0,0.244108,0.514673,0.744108,-0.182756,0.000842,0.0,...,0,1,0,0,0,0,0,1,0,0
2,0.240741,0.884583,38.0,0,0.244108,0.514673,0.744108,-0.182756,0.000842,0.0,...,0,1,0,0,0,0,0,0,1,1
3,0.555556,-0.682717,26.0,0,0.686869,-0.18695,0.765993,-0.138514,0.000842,0.0,...,1,0,0,0,0,0,1,0,0,1
4,0.242424,1.074061,35.0,0,0.240741,0.683852,0.765993,-0.138514,0.001684,-8.737153,...,0,1,0,0,0,0,0,0,1,1
5,0.543771,-0.740099,35.0,0,0.691919,-0.187189,0.772727,-0.200315,0.000842,0.0,...,1,0,0,0,0,0,0,1,0,0


In [4]:
# BoostARoota from: https://github.com/chasedehan/BoostARoota
# Feature selection: fit BoostARoota on every column except the target and keep
# only the columns it retains.
br = BoostARoota(metric='logloss')
train = trainTreated.drop(columns='Survived')
label = trainTreated['Survived']

br.fit(train, label)
new_train = br.transform(train)

# Report the shape before and after selection (the column count is the second
# entry of each tuple).
print(f"Original training set has {train.shape} dimensions. \n"
      f"BoostARoota with .fit() and .transform() reduces to {new_train.shape} dimensions.")

Round:  1  iteration:  1
Round:  1  iteration:  2
Round:  1  iteration:  3
Round:  1  iteration:  4
Round:  1  iteration:  5
Round:  1  iteration:  6
Round:  1  iteration:  7
Round:  1  iteration:  8
Round:  1  iteration:  9
Round:  1  iteration:  10
Round:  2  iteration:  1
Round:  2  iteration:  2
Round:  2  iteration:  3
Round:  2  iteration:  4
Round:  2  iteration:  5
Round:  2  iteration:  6
Round:  2  iteration:  7
Round:  2  iteration:  8
Round:  2  iteration:  9
Round:  2  iteration:  10
Round:  3  iteration:  1
Round:  3  iteration:  2
Round:  3  iteration:  3
Round:  3  iteration:  4
Round:  3  iteration:  5
Round:  3  iteration:  6
Round:  3  iteration:  7
Round:  3  iteration:  8
Round:  3  iteration:  9
Round:  3  iteration:  10
BoostARoota ran successfully! Algorithm went through  3  rounds.
Original training set has (891, 47) dimensions. 
BoostARoota with .fit() and .transform() reduces to (891, 19) dimensions.


In [5]:
# Names of the features that survived BoostARoota selection.
new_train.columns.tolist()

['Pclass_catP',
 'Pclass_catB',
 'Age_clean',
 'SibSp_catP',
 'SibSp_catB',
 'Parch_catP',
 'Ticket_catP',
 'Ticket_catB',
 'Fare_clean',
 'Cabin_catP',
 'Cabin_catB',
 'Embarked_catP',
 'Embarked_catB',
 'FamilySize_catP',
 'FamilySize_catB',
 'Title_catP',
 'Title_catB',
 'LastName_catP',
 'LastName_catB']

In [6]:
# Use every labelled row for training: the models below are scored with
# cross-validation, so no separate hold-out set is carved off here.
# train_test_split rejects test_size=0 in current scikit-learn releases, so the
# rows are shuffled with a seeded permutation instead of being "split".
seed = 7
test_size = 0
shuffled_rows = np.random.RandomState(seed).permutation(len(new_train))
X_train, y_train = new_train.iloc[shuffled_rows], label.iloc[shuffled_rows]
# Empty hold-out, kept so the X_test / y_test names remain defined.
X_test, y_test = new_train.iloc[:0], label.iloc[:0]

In [7]:
# distributions of params from: http://danielhnyk.cz/how-to-use-xgboost-in-python/

# Sampling distributions shared by several XGBoost hyper-parameters.
# Note scipy's (loc, scale) convention: uniform(a, b) covers [a, a + b].
one_to_left = st.beta(10, 1)          # on [0, 1], mass concentrated near 1
from_zero_positive = st.expon(0, 50)  # non-negative, scale 50

XGB_search_params = dict(
    n_estimators=st.randint(3, 40),
    max_depth=st.randint(3, 40),
    learning_rate=st.uniform(0.05, 0.4),
    colsample_bytree=one_to_left,
    subsample=one_to_left,
    gamma=st.uniform(0, 10),
    reg_alpha=from_zero_positive,
    min_child_weight=from_zero_positive,
)

# Randomized search: 200 sampled candidates, each scored by 10-fold CV ROC AUC.
XGBmodel = XGBClassifier()
XGBsearch = RandomizedSearchCV(
    estimator=XGBmodel,
    param_distributions=XGB_search_params,
    n_iter=200,
    scoring='roc_auc',
    cv=10,
)
XGBsearch.fit(X_train, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=200, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E168242B0>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E168244A8>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E16824940>, 'cols...E8>, 'min_child_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E168249E8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
    

In [8]:
# Random forest: randomized search over forest size, tree depth and leaf size.
# 100 sampled candidates, each scored by 10-fold CV ROC AUC.
RF_search_params = dict(
    n_estimators=st.randint(3, 100),
    max_depth=st.randint(3, 50),
    min_samples_leaf=st.randint(1, 100),
)

RFmodel = RandomForestClassifier()
RFsearch = RandomizedSearchCV(
    estimator=RFmodel,
    param_distributions=RF_search_params,
    n_iter=100,
    scoring='roc_auc',
    cv=10,
)
RFsearch.fit(X_train, y_train)


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E1697B6D8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E1697B320>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E1697B278>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [9]:
# Support vector classifier: randomized search over C and the kernel gamma.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so C is drawn from
# [0, 10] and gamma from [0.1, 1.1].
SVC_search_params = dict(
    C=st.uniform(0, 10),
    gamma=st.uniform(0.1, 1),
)

# probability=True so the fitted model exposes predict_proba, which the
# soft-voting ensemble built later relies on.
SVCmodel = SVC(probability=True)
SVCsearch = RandomizedSearchCV(
    estimator=SVCmodel,
    param_distributions=SVC_search_params,
    n_iter=100,
    scoring='roc_auc',
    cv=10,
)
SVCsearch.fit(X_train, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E1675DC88>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024E1675D898>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [16]:
# Winning hyper-parameters from each randomized search.
XGBparams = XGBsearch.best_params_
RFparams = RFsearch.best_params_
SVCparams = SVCsearch.best_params_

# Copy the winning settings onto each base estimator and refit it on the full
# training data, in the same XGB -> RF -> SVC order as before.
for base_model, best_params in (
    (XGBmodel, XGBparams),
    (RFmodel, RFparams),
    (SVCmodel, SVCparams),
):
    base_model.set_params(**best_params)
    base_model.fit(X_train, y_train)

# Soft-voting ensemble over the three tuned models.
eclf = VotingClassifier(
    estimators=[('XGB', XGBmodel), ('RF', RFmodel), ('SVC', SVCmodel)],
    voting='soft',
)
eclf.fit(X_train, y_train)

VotingClassifier(estimators=[('XGB', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9650190880431975, gamma=3.6638133543864893,
       learning_rate=0.16705691888920274, max_delta_step=0, max_depth=34,
       min_child_weight=4.380404876606213, missing=None, n_estimato...bf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [28]:
# Choose the probability cut-off that maximises TPR - FPR (Youden's J) along the
# ROC curve of the ensemble.
# NOTE: the curve is computed on the same rows the ensemble was fitted on, so
# the chosen threshold is an optimistic, in-sample estimate.
predict_probabilities = eclf.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, predict_probabilities)
optimal_threshold = thresholds[np.argmax(tpr - fpr)]
# Final bare expression so the cell actually displays the selected threshold.
optimal_threshold

0.4225112345868191

In [23]:
# Load the treated test data, discard the leftover row-number column written by
# the CSV export, and index the frame by passenger.
# `columns=` is used because recent pandas releases no longer accept `axis` as a
# positional argument to DataFrame.drop.
testTreated = (
    pd.read_csv('dTestTreated.csv')
    .drop(columns='Unnamed: 0')
    .set_index('PassengerId')
)

In [24]:
# Feature names recorded by the fitted XGBoost model, i.e. the columns kept by
# BoostARoota, in the order the models were trained on.
boruta_columns = XGBmodel.get_booster().feature_names

# Fail early, with a readable message, if the test file lacks any model feature.
missing_columns = [col for col in boruta_columns if col not in testTreated.columns]
if missing_columns:
    raise KeyError(f"Test set is missing model features: {missing_columns}")

# Select exactly the model features, in training order. `.copy()` makes test_br
# an independent frame so columns can be added to it later without pandas
# warning about writing to a slice of testTreated.
test_br = testTreated[boruta_columns].copy()

In [25]:
# Classify with the tuned threshold rather than eclf.predict's default cut-off.
# Only the model's feature columns are passed in, so the cell can be re-run even
# after `Survived` has been attached to test_br.
test_proba = eclf.predict_proba(test_br[boruta_columns])[:, 1]
# `>=` matches roc_curve's convention: a threshold t stands for "score >= t".
y_pred_test = np.where(test_proba >= optimal_threshold, 1, 0)
# assign() builds a new frame instead of writing into a slice of testTreated.
test_br = test_br.assign(Survived=y_pred_test)

In [26]:
# Build the submission table: PassengerId (moved out of the index) plus the
# predicted label, then write it to disk and preview it.
submission = test_br[['Survived']].reset_index()
submission.to_csv('submission.csv', header=True, index=False)
submission.head()