# Kaggle Titanic Challenge
#### Alphonse Doutriaux - March 2018

## 0. Preliminaries

### 0.1. Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import warnings

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from datetime import date, datetime
from xgboost import XGBClassifier
# Silence every warning (from any library) for all cells that run after this one.
warnings.filterwarnings('ignore')

### 0.2. Data

In [2]:
# Read the training and test CSV files, using the first column of each file
# as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [3]:
# Stack the training rows on top of the test rows so that the preprocessing
# below is applied to both sets in one pass.
data = pd.concat(objs=[train, test])

## 1. Preprocessing

In [4]:
# Target: the 'Survived' column of the training set, kept as a one-column frame.
y_train = train[['Survived']]

# Features: an independent copy of the columns used for modelling, taken from
# the combined train + test frame.
X = data.loc[
    :,
    ['Pclass', 'Name', 'Cabin', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
].copy()

### Feature extraction: from `Name` to `Title` using regular expressions

In [5]:
# Extract the title from each name: the run of letters that follows the first
# ", " in the string.  The vectorised string accessor works on whatever index
# the frame has, instead of relying on a manual counter that assumes the index
# labels are exactly 1, 2, 3, ... in row order.
X['Name'] = X['Name'].str.extract(r"[,][ ]([A-Za-z]*)", expand=False)
X = X.rename(columns={'Name': 'Title'})

# Merge spelling variants and rare titles into a small set of groups.
# None of the targets is itself a key, so applying the mapping in a single
# pass gives the same result as applying the replacements one after another.
title_groups = {
    'Mme': 'Mrs',
    'Ms': 'Mrs',
    'Mlle': 'Miss',
    'Col': 'Military',
    'Major': 'Military',
    'Capt': 'Military',
    # Kept from the original mapping: only matches if a row still holds this
    # full name instead of an extracted title.
    'Dooley, Mr. Patrick': 'Mr',
    'Jonkheer': 'Mr',
    'Don': 'Sir',
    # 'the' is what the pattern captures when the text after the comma
    # starts with "the ".
    'the': 'Lady',
    'Dona': 'Lady',
}
X['Title'] = X['Title'].replace(title_groups)

### Handle missing values

#### For the `Age` and `Fare` features, we replace `NaN` values with the median

In [6]:
# Replace missing ages and fares with the median of the column (computed over
# the combined train + test frame held in X).
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on X['Age'] operates on a temporary object and leaves X unchanged when pandas
# Copy-on-Write is active.
X['Age'] = X['Age'].fillna(X['Age'].median())
X['Fare'] = X['Fare'].fillna(X['Fare'].median())

#### For the harbor (PassengerId 62 & 830, two women in the B28 cabin)
A priori, we cannot reasonably impute `Q`, since only two first-class passengers embarked there. We impute `S` instead: 127 first-class passengers embarked there, compared with 85 at `C`.

In [7]:
# Fill the missing embarkation ports with 'S'.  The result is assigned back
# because fillna(inplace=True) on X['Embarked'] modifies a temporary object and
# leaves X unchanged when pandas Copy-on-Write is active.
X['Embarked'] = X['Embarked'].fillna('S')

#### For the cabins

My hypothesis is that the cabin number carries information about the family and, more importantly, about the location on the ship (which deck, and whether towards the bow or the stern). Additional information showed that the lower cabin numbers are located at the front of the ship (source: http://s4.e-monsite.com/2011/05/15/759725893-pont-b-agrandi-jpg.jpg)

We choose to split the cabin number into cabin letter (=deck) and cabin number (=location on the boat)

In [8]:
# Passengers without a recorded cabin get the placeholder 'Z0' (deck 'Z',
# number 0).  The result is assigned back because fillna(inplace=True) on
# X['Cabin'] modifies a temporary object and leaves X unchanged when pandas
# Copy-on-Write is active.
X['Cabin'] = X['Cabin'].fillna('Z0')

# cleaning the improper cabin num ('D' & 'T')
# Keys are labels of X's index (presumably PassengerId -- confirm against the
# CSV files); values are the cabin string to store for that row.
cabin_corrections = {
    340: 'T0',
    1001: 'F0',
    1193: 'D0',
    949: 'G63',
    1180: 'E46',
    1213: 'E57',
    293: 'D0',
    328: 'D0',
    474: 'D0',
}
for row_label, cabin_value in cabin_corrections.items():
    X.loc[row_label, 'Cabin'] = cabin_value

In [9]:
# Split every cabin string into two new columns:
#   CabinNum  - the first run of digits, stored as a float
#   CabinDeck - the first capital letter
# The vectorised string accessor aligns on X's own index, so the result no
# longer depends on a manual counter that assumes the labels are 1, 2, 3, ...
# A cabin string without digits (or without a capital letter) yields NaN in the
# corresponding column instead of raising.
X['CabinNum'] = X['Cabin'].str.extract(r"(\d+)", expand=False).astype(float)
X['CabinDeck'] = X['Cabin'].str.extract(r"([A-Z])", expand=False)

X = X.drop(['Cabin'], axis=1)

We transform the alphabetical information (about the deck) into numerical information. Decks `A` to `G` are numbered from 7 (highest) down to 1 (lowest). The `T` deck is the highest deck: it is treated like `A`.

In [10]:
# Numeric level for each deck letter: 'A' (and 'T') -> 7 down to 'G' -> 1.
# 'Z' becomes missing and is imputed with the median below.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
deck_levels = {
    'T': 7, 'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1,
    'Z': np.nan,
}
# Cast to float explicitly so the column is numeric whatever dtype replace()
# returns; any letter missing from the mapping raises here instead of staying
# in the column as a string.
X['CabinDeck'] = X['CabinDeck'].replace(deck_levels).astype(float)
X['CabinDeck'] = X['CabinDeck'].fillna(X['CabinDeck'].median())

Missing values in the `CabinNum` column are replaced with the median of the column

In [11]:
# A cabin number of 0 is treated as missing, then every missing number is
# replaced with the median of the remaining ones.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2, and the
# filled column is assigned back because fillna(inplace=True) on X['CabinNum']
# leaves X unchanged when pandas Copy-on-Write is active.
X['CabinNum'] = X['CabinNum'].replace(0, np.nan)
X['CabinNum'] = X['CabinNum'].fillna(X['CabinNum'].median())

### One hot encoding

In [12]:
# Categorical columns to expand into indicator columns; each new column is
# named "<original column>_<value>".
columns_to_encode = ['Title', 'Sex', 'Embarked']
X = pd.get_dummies(
    data=X,
    prefix=columns_to_encode,
    columns=columns_to_encode,
)

### FamilySize & IsChild

In [13]:
# FamilySize = SibSp + Parch + 1; the two source columns are dropped once
# they have been combined.
X = (
    X
    .assign(FamilySize=X['SibSp'] + X['Parch'] + 1)
    .drop(columns=['Parch', 'SibSp'])
)

### Sex x Class columns

The Wikipedia article about the sinking of the *Titanic* says that the survival rate for men is strongly dependent on the class

In [14]:
# Create the three indicator columns, all set to 0 for every row.
for flag_column in ('FirstClassMale', 'SecondClassMale', 'ThirdClassMale'):
    X[flag_column] = 0

In [15]:
# Set each flag to 1 for male passengers of the matching class, 0 otherwise.
# Computed with boolean masks over whole columns: this does not depend on the
# index labels being 1, 2, 3, ..., and avoids rebuilding the list of matching
# labels once per passenger.
is_male = X['Sex_male'] == 1
for pclass, flag_column in (
    (1, 'FirstClassMale'),
    (2, 'SecondClassMale'),
    (3, 'ThirdClassMale'),
):
    X[flag_column] = ((X['Pclass'] == pclass) & is_male).astype(int)

### Train / test split

In [16]:
# The first len(train) rows of X are the training rows (train was stacked
# first when the combined frame was built).
X_train = X.iloc[:len(train)]

## 2. XGBoost
* Hyperparameter tuning

In [17]:
# Exhaustive grid search over the XGBoost hyper-parameters listed below.
# this script takes circa. 20s

n_estimators = [20, 100, 1000]
learning_rate = [0.1, 0.3, 0.5]
gamma = [0]
max_depth = [5, 7, 9]
colsample_bytree = np.arange(0.2, 1, 0.2)

param_grid_xgb = dict(n_estimators=n_estimators, learning_rate=learning_rate, gamma=gamma, max_depth=max_depth, colsample_bytree=colsample_bytree)

# scoring and cv are stated explicitly so the search uses the same metric and
# the same number of folds as the cross_val_score call on the tuned model,
# independently of the library defaults.
xgb_gs = GridSearchCV(XGBClassifier(), param_grid=param_grid_xgb, scoring="accuracy", cv=3, n_jobs=-1)
grid_result_xgb = xgb_gs.fit(X_train,y_train)

# best_score_ is the mean cross-validated accuracy of the best parameter set
# (this is a classifier, so the score is not an R2).
print("Best accuracy using XGBoost: {:.2%}".format(xgb_gs.best_score_))
print(xgb_gs.best_params_)

Best R2 score using XGBoost: 84.40%
{'colsample_bytree': 0.6000000000000001, 'gamma': 0, 'learning_rate': 0.3, 'max_depth': 9, 'n_estimators': 20}


* Fit and cross validation

In [18]:
# Build a fresh (unfitted) classifier configured with the best value the grid
# search found for each of the five tuned hyper-parameters.
xgb = XGBClassifier(**{
    param_name: xgb_gs.best_params_[param_name]
    for param_name in (
        'n_estimators',
        'learning_rate',
        'gamma',
        'max_depth',
        'colsample_bytree',
    )
})

In [19]:
# 3-fold cross-validated accuracy of the tuned classifier on the training set.
xgb_scores = cross_val_score(
    estimator=xgb,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

# Report the mean accuracy and the band mean +/- 3 standard deviations.
mean_accuracy = xgb_scores.mean()
three_sigma = 3*xgb_scores.std()
print("XGB Average accuracy: {:.2%}".format(mean_accuracy))
print("Interval: [", round(mean_accuracy-three_sigma,4), ";", round(mean_accuracy+three_sigma,4),"]")

XGB Average accuracy: 84.40%
Interval: [ 0.7943 ; 0.8937 ]


## 3. Predictions on testset

In [20]:
# Train the tuned classifier on the full training set; the fitted estimator is
# the cell's displayed value.
xgb.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6000000000000001, gamma=0, learning_rate=0.3,
       max_delta_step=0, max_depth=9, min_child_weight=1, missing=None,
       n_estimators=20, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [21]:
# Every row after the training rows belongs to the test set.  The boundary is
# taken from len(train), exactly as for X_train, instead of a hard-coded 891.
# Note: this rebinds `test` from the raw CSV frame to the preprocessed features.
test = X.iloc[len(train):]

In [23]:
# Predict a class label for every row of the preprocessed test features with
# the classifier fitted above.
preds = xgb.predict(test)

In [24]:
# Wrap the predictions in a one-column frame that shares the test set's index,
# with the 'Survived' column stored as integers.
preds = pd.DataFrame({"Survived":preds}, index=test.index).astype({"Survived": int})

### Kaggle submission file preparation

In [25]:
# Write the predictions to a CSV file whose name carries the current local
# date and time (day, month, year, then hours, minutes, seconds).
submission_stamp = datetime.now().strftime("%d%m%Y-%H%M%S")
submission_path = './submission_files/preds_' + submission_stamp + '.csv'
preds.to_csv(path_or_buf=submission_path)