In [1]:
from xgboost.sklearn import XGBClassifier 
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from scipy.stats import randint
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from src.features.build_features import MostFrequentImputer, load_data, add_bucket, set_title

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Load data

In [2]:
# train data 
train_data = load_data("train.csv")
y_train = train_data["Survived"]

# test data
test_data = load_data("test.csv")

In [3]:
relatives = ['SibSp' ,'Parch']

def add_columns(df):
    df['family'] = df[relatives].sum(axis=1)
    df['traveling_alone'] = np.where(df['family']==0,1,0)
    df['Sex'] = np.where(df['Sex']=='female',0,1)
    df['Age_Bucket'] = add_bucket(df['Age'], bins=6)
    df['Fare_Bucket'] = add_bucket(df['Fare'], bins=6)
    df['Title'] = df['Name'].apply(set_title)
    df['name_length'] = df['Name'].apply(len)
    df['Cabin'] = df['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
    return df


train_data = add_columns(train_data)
test_data = add_columns(test_data)

## pipeline

In [4]:
# numerical pipeline
num_pipeline = Pipeline([ ("imputer", SimpleImputer(strategy="median")) ])
# categorical pipeline 
cat_pipeline = Pipeline([
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

# Full pipeline 
cat_attribs = ["Pclass", 'Embarked',  'Age_Bucket', 'Fare_Bucket', 'Title',]
num_attribs = [ "family", 'name_length',  'traveling_alone', 'Cabin', 'Sex', 'Age', 'Fare']

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

X_train = full_pipeline.fit_transform(train_data)
X_test = full_pipeline.transform(test_data)

In [5]:
X_train.shape

(891, 29)

# Tune models

## Log Reg

In [6]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
param_distribs = {
        'class_weight': [None, 'balanced'],
        'penalty': ['l1', 'l2'], 
        'C': np.logspace(-20, 20, 10000), 
        'solver' : [ 'liblinear']
    }

rnd_search = RandomizedSearchCV(log_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
log_clf = rnd_search.best_estimator_

In [7]:
log_clf.score(X_train, y_train)

0.8249158249158249

## Random Forest 

In [9]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier()

param_distribs = {
        'n_estimators': randint(low=1, high=500),
        'max_features': randint(low=1, high=10),
    }

rnd_search = RandomizedSearchCV(forest_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
forest_clf = rnd_search.best_estimator_

In [10]:
forest_clf.score(X_train, y_train)

0.9966329966329966

# GB

In [11]:
gb_clf = GradientBoostingClassifier()


param_distribs = {
        'n_estimators': randint(low=1, high=500),
        'max_features': randint(low=1, high=10),
    }

rnd_search = RandomizedSearchCV(gb_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
gb_clf = rnd_search.best_estimator_

In [12]:
gb_clf.score(X_train, y_train)

0.9494949494949495

# XGboost

In [24]:
xgb_clf = XGBClassifier(nthreads=-1)  


param_distribs = {
        'n_estimators': randint(low=1, high=500),
        'max_features': randint(low=1, high=10),
    }

rnd_search = RandomizedSearchCV(xgb_clf , param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train, y_train)
# rename classifier 
xgb_clf = rnd_search.best_estimator_
xgb_clf.score(X_train, y_train)

0.9494949494949495

# Voting classifier

In [40]:
from sklearn.ensemble import VotingClassifier


vt_clf = VotingClassifier(estimators=[ ('log', log_clf ), ('rf', forest_clf ), ('gb', gb_clf)], voting='hard')
vt_clf.fit(X_train, y_train)
vt_clf.score(X_train, y_train)

0.9472502805836139

# Predict on Test 

In [41]:
test_data['Survived'] = vt_clf.predict(X_test)
submission = test_data[['PassengerId', 'Survived']]
# save
submission.to_csv(path_or_buf = 'data/processed/submissions.csv', index=False)

Voting Classifier (without xgboost): 0.76555
Voting Classifier (wit xgboost hard): 0.74162
Voting Classifier (wit xgboost soft): 0.74162