In [25]:
from xgboost.sklearn import XGBClassifier 
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from scipy.stats import randint
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from src.features.build_features import MostFrequentImputer, load_data, add_bucket, set_title

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Load data

In [5]:
# train data 
train_data = load_data("train.csv")
y_train = train_data["Survived"]

# test data
test_data = load_data("test.csv")

In [8]:
relatives = ['SibSp' ,'Parch']

def add_columns(df):
    df['family'] = df[relatives].sum(axis=1)
    df['traveling_alone'] = np.where(df['family']==0,1,0)
    df['Sex'] = np.where(df['Sex']=='female',0,1)
    df['Age_Bucket'] = add_bucket(df['Age'], bins=6)
    df['Fare_Bucket'] = add_bucket(df['Fare'], bins=6)
    df['Title'] = df['Name'].apply(set_title)
    df['name_length'] = df['Name'].apply(len)
    df['Cabin'] = df['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
    return df


train_data = add_columns(train_data)
test_data = add_columns(test_data)

## pipeline

In [10]:
# numerical pipeline
num_pipeline = Pipeline([ ("imputer", SimpleImputer(strategy="median")) ])
# categorical pipeline 
cat_pipeline = Pipeline([
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

# Full pipeline 
cat_attribs = ["Pclass", 'Embarked',  'Age_Bucket', 'Fare_Bucket', 'Title',]
num_attribs = [ "family", 'name_length',  'traveling_alone', 'Cabin', 'Sex', 'Age', 'Fare']

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

X_train = full_pipeline.fit_transform(train_data)
X_test = full_pipeline.transform(test_data)

Split the data into fifths, with the first 4/5ths train the models, use the remaining 1/5 of data to generate predictions. This predictions will be used as inputs for the second layer 

In [11]:
X_train.shape

(891, 29)

# Train Stage 1 models

## Log Reg

In [83]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
param_distribs = {
        'class_weight': [None, 'balanced'],
        'penalty': ['l1', 'l2'], 
        'C': np.logspace(-20, 20, 10000), 
        'solver' : [ 'liblinear']
    }

rnd_search = RandomizedSearchCV(log_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
log_clf = rnd_search.best_estimator_

In [84]:
log_clf.score(X_train, y_train)

0.8249158249158249

In [97]:
 log_clf.predict_proba(X_test)[:,1]

## Random Forest 

In [115]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier()

param_distribs = {
        'n_estimators': randint(low=1, high=500),
        'max_features': randint(low=1, high=10),
    }

rnd_search = RandomizedSearchCV(forest_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
forest_clf = rnd_search.best_estimator_

In [116]:
forest_clf.score(X_train, y_train)

0.9966329966329966

# GB

In [117]:
gb_clf = GradientBoostingClassifier()


param_distribs = {
        'n_estimators': randint(low=1, high=500),
        'max_features': randint(low=1, high=10),
    }

rnd_search = RandomizedSearchCV(gb_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
gb_clf = rnd_search.best_estimator_

In [118]:
gb_clf.score(X_train, y_train)

0.941638608305275

In [136]:
# add column 
X_stage2_data= np.c_[ log_clf.predict_proba(X_train)[:,1], forest_clf.predict_proba(X_train)[:,1],gb_clf.predict_proba(X_train)[:,1] ]

# Train stage 2 model 

In [141]:
import scipy.stats as st

xgb_clf = XGBClassifier(nthreads=-1)  


one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {  
    "n_estimators": randint(low=1, high=500),
     'max_features': randint(low=1, high=10),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

rnd_search = RandomizedSearchCV(xgb_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_stage2_data, y_train)
# rename classifier 
xgb_clf = rnd_search.best_estimator_

In [142]:
xgb_clf.score(X_stage2_data, y_train)

0.9966329966329966

# Predict on Test 

In [143]:
X_stage2_test= np.c_[ log_clf.predict_proba(X_test)[:,1], forest_clf.predict_proba(X_test)[:,1],gb_clf.predict_proba(X_test)[:,1] ]

In [144]:
test_data['Survived'] = xgb_clf.predict(X_stage2_test)
submission = test_data[['PassengerId', 'Survived']]
# save
submission.to_csv(path_or_buf = 'data/processed/submissions.csv', index=False)