# Titanic

In [1]:
# Main packages
import numpy as np
import pandas as pd
import xgboost as xgb

# Classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Tools
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Load the train/test CSV files, using PassengerId as the row index
train = pd.read_csv('data/train.csv', index_col='PassengerId')
test = pd.read_csv('data/test.csv', index_col='PassengerId')

## Data processing functions

In [3]:
def _safe_inverse(value, sentinel=999999999):
    """Return ``1 / value``, or ``sentinel`` when ``value`` is exactly zero."""
    return 1/value if value != 0 else sentinel


def data_eng(df):
    """Impute Age/Fare and add the engineered feature columns.

    The input frame is NOT modified; a transformed copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Name``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Fare``, ``Pclass`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where:
        * missing ``Age`` / ``Fare`` are replaced by that column's mean
          (computed on ``df`` itself);
        * ``Mr`` / ``Mrs`` / ``Miss`` flag the title found in ``Name``;
        * ``Cherbourg`` / ``Southampton`` / ``Queenstown`` flag the port in
          ``Embarked`` (all three are False when ``Embarked`` is missing);
        * ``Sex`` is overwritten with a boolean that is True unless the
          original value contains ``'female'``;
        * ``Family`` is ``SibSp + Parch + 1``;
        * ``Pclass*Age`` and ``Fare*Age`` hold the *reciprocal* of the
          respective product (999999999 when the product is zero);
        * ``Pclass*Age*Fare`` is ``Pclass*Age`` multiplied by ``Fare``.
    """
    # Work on a copy so the caller's frame is left untouched and the cell
    # calling this function can be re-run safely.
    df = df.copy()

    # Filling NA of original columns we'll use
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Creating columns
    ## Based on the Name column
    names = df['Name']
    is_mrs = names.str.contains('Mrs')
    # 'Mr' is a substring of 'Mrs', so married women must be excluded explicitly
    df['Mr'] = names.str.contains('Mr') & ~is_mrs
    df['Mrs'] = is_mrs
    df['Miss'] = names.str.contains('Miss')

    ## Based on the Embarked column
    # na=False: a missing port yields False instead of NaN, which keeps the
    # three flags boolean whether or not the frame has missing Embarked values
    embarked = df['Embarked']
    df['Cherbourg'] = embarked.str.contains('C', na=False)
    df['Southampton'] = embarked.str.contains('S', na=False)
    df['Queenstown'] = embarked.str.contains('Q', na=False)

    ## Based on the Sex column
    df['Sex'] = ~df['Sex'].str.contains('female')

    ## Based on SibSp and Parch
    df['Family'] = df['SibSp'] + df['Parch'] + 1

    # Combining columns (despite the names, these store reciprocals of the products)
    df['Pclass*Age'] = (df['Pclass']*df['Age']).apply(_safe_inverse)
    df['Fare*Age'] = (df['Fare']*df['Age']).apply(_safe_inverse)
    df['Pclass*Age*Fare'] = df['Pclass*Age']*df['Fare']

    return df

def fill_na(df):
    """Fill the remaining gaps in the engineered feature columns.

    The input frame is NOT modified; a filled copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Pclass*Age``, ``Fare*Age``, ``Pclass*Age*Fare``,
        ``Cherbourg``, ``Southampton`` and ``Queenstown``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which missing values of the three combined columns
        are replaced by that column's mean, and missing port flags are
        replaced by False and stored as plain booleans.
    """
    # Copy first so the caller's frame is never altered
    df = df.copy()

    for column in ('Pclass*Age', 'Fare*Age', 'Pclass*Age*Fare'):
        df[column] = df[column].fillna(df[column].mean())

    for port in ('Cherbourg', 'Southampton', 'Queenstown'):
        # A flag column holding NaN is object-typed; cast after filling so the
        # result is always a real boolean column
        df[port] = df[port].fillna(value=False).astype(bool)

    return df

def prepare_df(df):
    """Run the whole preprocessing chain on a raw frame.

    Applies ``data_eng``, removes the ``Name``, ``Ticket``, ``Cabin`` and
    ``Embarked`` columns, then passes the result through ``fill_na`` and
    returns it.
    """
    # Dropping columns
    unused_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']

    engineered = data_eng(df)
    trimmed = engineered.drop(columns=unused_columns)

    return fill_na(trimmed)

Apply the data processing to the training set, then split it into the features (**X_train**) and the target (**y_train**).

In [4]:
# Preprocess the training frame, then separate the target column from the features
train = prepare_df(train)
y_train = train['Survived']
X_train = train.drop(columns='Survived')

Apply the same data processing to the test set.

In [5]:
# Same pipeline as for train; the imputation means are computed from the frame passed in
test = prepare_df(test)

## Training part

In [6]:
# Models
# A fixed seed on every stochastic learner makes the grid-search scores and the
# final predictions repeatable across "Restart Kernel -> Run All".
SEED = 42

rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
lr = LogisticRegression(solver='lbfgs')
dt = DecisionTreeClassifier(random_state=SEED)
# `algorithm` is left at the library default: explicitly requesting "SAMME.R"
# is deprecated and later rejected by recent scikit-learn releases, while older
# releases already default to it.
ada = AdaBoostClassifier(DecisionTreeClassifier(random_state=SEED), n_estimators=100, random_state=SEED)
xg = xgb.XGBClassifier(n_estimators=100, random_state=SEED)
svc = SVC()
knn = KNeighborsClassifier(n_neighbors=5)

In [7]:
# Meta
# Second-level model; it is handed to the stack as its meta_classifier
meta_clf = LogisticRegression(solver='lbfgs')

In [8]:
# Stacking: the seven base models below sit under the meta-classifier
classifiers = [
    rf,
    lr,
    dt,
    ada,
    xg,
    svc,
    knn,
]
stack_clf = StackingClassifier(
    meta_classifier=meta_clf,
    classifiers=classifiers,
)

In [9]:
# Parameters
# Candidate values shared by the tree-based learners of the stack
max_depth = range(1, 3)  # i.e. depths 1 and 2
splits = [10, 18, 30]

# Keys follow the "<estimator name>__<parameter>" convention used by the grid search
params = dict(
    randomforestclassifier__max_depth=max_depth,
    randomforestclassifier__min_samples_split=splits,
    decisiontreeclassifier__max_depth=max_depth,
    decisiontreeclassifier__min_samples_split=splits,
    xgbclassifier__max_depth=max_depth,
)

In [10]:
import warnings
# NOTE(review): this filter stays active for the rest of the session and hides
# every warning category, not only the ones raised during the fit below.
warnings.filterwarnings('ignore')

# Grid Search
# Exhaustive search over `params` with 5-fold cross-validation; refit=True
# retrains the best configuration so that `grid` can be used for prediction.
grid = GridSearchCV(estimator=stack_clf, param_grid=params, cv=5, refit=True, verbose=1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  4.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=StackingClassifier(average_probas=False,
          classifiers=[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_...ures=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestclassifier__max_depth': range(1, 3), 'randomforestclassifier__min_samples_split': [10, 18, 30], 'decisiontreeclassifier__max_depth': range(1, 3), 'decisiontreeclassifier__min_samples_split': [10, 18, 30], 'xgbclassifier__max_depth': range(1, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [11]:
# Mean cross-validated score of the best parameter combination (scoring=None, so the estimator's default scorer is used)
grid.best_score_

0.8092031425364759

## Generating the results

Predict survival for the test dataset with the best estimator found by the grid search.

In [12]:
# Predict with the estimator that the grid search refit on its best parameter combination
res = grid.best_estimator_.predict(test)

Build the results table and write it to a .csv file.

In [13]:
# One row per test passenger: the frame index supplies PassengerId, `res` the predicted label
df_res = pd.DataFrame({'PassengerId':test.index, 'Survived':res})

In [14]:
# index=False leaves the positional row index out, so the file holds only the two columns
df_res.to_csv('stacking.csv', index=False)