 # Titanic: Machine Learning from Disaster
 
 https://www.kaggle.com/c/titanic

## Load required packages

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import Imputer, LabelEncoder, StandardScaler
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, 
    learning_curve, validation_curve, GridSearchCV
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

## Load dataset

In [2]:
# Read the Kaggle Titanic training data from the current working directory.
dataset = pd.read_csv(filepath_or_buffer="train.csv")
# Preview the first five rows to check that the file parsed as expected.
dataset.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data Dictionary

Variable	Definition	
survival 	Survival 	
pclass 	Ticket class 
sex 	Sex 	
Age 	Age in years 	
sibsp 	# of siblings / spouses aboard the Titanic 	
parch 	# of parents / children aboard the Titanic 	
ticket 	Ticket number 	
fare 	Passenger fare 	
cabin 	Cabin number 	
embarked 	Port of Embarkation 	C = Cherbourg, Q = Queenstown, S = Southampton

## Data Preprocessing

In [3]:
# Columns that are not fed to the models: the target, the row identifier,
# and the Cabin / Name / Ticket fields.
excluded_columns = ["PassengerId", "Survived", "Cabin", "Name", "Ticket"]

# Target vector.
y = dataset["Survived"]

# Index.difference returns the remaining labels in sorted order, which fixes
# the feature column order used by every later cell.
feature_columns = dataset.columns.difference(excluded_columns)
X = pd.DataFrame(dataset[feature_columns])
X.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22.0,S,7.25,0,3,male,1
1,38.0,C,71.2833,0,1,female,1
2,26.0,S,7.925,0,3,female,0
3,35.0,S,53.1,0,1,female,1
4,35.0,S,8.05,0,3,male,0


### Impute missing values

In [4]:
# Number of missing values in each feature column.
X.isna().sum(axis=0)

Age         177
Embarked      2
Fare          0
Parch         0
Pclass        0
Sex           0
SibSp         0
dtype: int64

In [5]:
# Embarked is categorical: fill its missing entries with the most frequent value
# (value_counts() sorts by descending frequency, so index[0] is the mode).
most_frequent_port = X["Embarked"].value_counts().index[0]
X["Embarked"] = X["Embarked"].fillna(most_frequent_port)

# Age is numeric: fill its missing entries with the mean of the observed ages.
# Series.mean() skips NaN, so this produces the same fill value as the former
# sklearn `Imputer(strategy="mean")`, which was deprecated in scikit-learn 0.20
# and removed in 0.22 (the cell could no longer run on current releases).
# NOTE(review): both fill values are computed on the full dataset, i.e. before
# the train/test split, so held-out rows influence them. Consider moving the
# imputation into the modelling pipeline.
X["Age"] = X["Age"].fillna(X["Age"].mean())

### Handling categorical data

In [6]:
# One-hot encode the categorical features. With drop_first=True a feature with
# k levels is represented by k-1 indicator columns.
categorical_columns = ["Pclass", "Embarked", "Sex"]
X = pd.get_dummies(data=X, columns=categorical_columns, drop_first=True)

### Create training and test sets

In [7]:
# Hold out 30% of the rows for the final evaluation. Stratifying on y keeps the
# class proportions the same in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [8]:
# Share of each Survived value (position i = label i) in the two splits.
for split_label, split_target in (("Train split: ", y_train), ("Test split: ", y_test)):
    print(split_label, np.bincount(split_target) / len(split_target))

Train split:  [0.61637239 0.38362761]
Test split:  [0.61567164 0.38432836]


### Standardize features

In [55]:
class SubsetStandardScaler(StandardScaler):
    """StandardScaler that standardizes only selected columns of a DataFrame.

    When ``columns`` is ``None`` the class behaves exactly like
    ``StandardScaler``. Otherwise ``X`` must support label-based column
    selection (e.g. a pandas DataFrame); only the listed columns are
    standardized and all other columns are passed through unchanged.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to standardize. ``None`` (default) standardizes every column.
    copy, with_mean, with_std
        Forwarded unchanged to ``StandardScaler``.
    """

    def __init__(self, columns=None, *, copy=True, with_mean=True, with_std=True):
        # The StandardScaler options are spelled out (instead of **kwargs) so
        # that get_params()/clone() can see them. GridSearchCV and
        # cross_val_score clone the estimator, and options hidden behind
        # **kwargs would be silently reset to their defaults there.
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns

    def fit(self, X, y=None):
        """Learn mean/scale from the selected columns (or from all of ``X``)."""
        if self.columns is None:
            return super().fit(X, y)
        return super().fit(X[self.columns], y)

    def transform(self, X, y="deprecated", copy=None):
        """Standardize the selected columns and return a NumPy array.

        ``y`` is accepted only for backward compatibility with the old
        ``StandardScaler.transform`` signature and is ignored: it is not
        forwarded, because newer scikit-learn releases no longer accept it.

        Returns
        -------
        numpy.ndarray
            All columns of ``X`` in their original order, with the selected
            columns replaced by their standardized values.
        """
        if self.columns is None:
            return super().transform(X, copy=copy)

        # Work on a copy so the caller's DataFrame is never modified.
        X_std = X.copy()
        X_subset_std = super().transform(X_std[self.columns], copy=copy)
        # Write the scaled values back column by column, keeping column order.
        for position, column in enumerate(self.columns):
            X_std[column] = X_subset_std[:, position]
        return X_std.values

## Algorithm Selection

Use nested cross-validation to select the best algorithm among the following:
- logistic regression
- svc
- decision tree classifier
- random forests
- k-nearest neighbors

### Logistic Regression

In [81]:
# Candidate model: scale Age and Fare, then fit a logistic regression.
pipe_lr = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    LogisticRegression(),
)

# Inner loop: 2-fold grid search over the LogisticRegression parameter C.
lr_param_grid = [{"logisticregression__C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}]
gs = GridSearchCV(
    estimator=pipe_lr,
    param_grid=lr_param_grid,
    scoring="accuracy",
    cv=2,
)

# Outer loop: 5-fold cross-validation of the whole tuning procedure.
scores = cross_val_score(gs, X_train, y_train, scoring="accuracy", cv=5)

print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

CV accuracy: 0.793 +/- 0.027


### SVC

In [83]:
# Candidate model: scale Age and Fare, then fit a support vector classifier.
pipe_svc = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    SVC(random_state=1),
)

# Powers of ten from 10**-2 to 10**1, shared by C and gamma.
param_range = [10 ** exponent for exponent in range(-2, 2)]
linear_grid = {"svc__C": param_range, "svc__kernel": ["linear"]}
rbf_grid = {"svc__C": param_range, "svc__gamma": param_range, "svc__kernel": ["rbf"]}
param_grid = [linear_grid, rbf_grid]

# Inner loop: 2-fold grid search over kernel, C and (for rbf) gamma.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring="accuracy",
    cv=2,
    n_jobs=-1,
)

# Outer loop: 5-fold cross-validation of the whole tuning procedure.
scores = cross_val_score(gs, X_train, y_train, scoring="accuracy", cv=5)

print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

CV accuracy: 0.825 +/- 0.012


### Decision Tree Classifier

In [87]:
# Candidate model: scale Age and Fare, then fit a decision tree.
dtc_pipe = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    DecisionTreeClassifier(random_state=1),
)

# Inner loop: 2-fold grid search over the tree depth (None = unlimited depth).
dtc_param_grid = [{"decisiontreeclassifier__max_depth": [1, 2, 3, 4, 5, 6, 7, None]}]
gs = GridSearchCV(
    estimator=dtc_pipe,
    param_grid=dtc_param_grid,
    scoring="accuracy",
    cv=2,
    n_jobs=-1,
)

# Outer loop: 5-fold cross-validation of the whole tuning procedure.
scores = cross_val_score(gs, X_train, y_train, scoring="accuracy", cv=5)

print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

CV accuracy: 0.799 +/- 0.008


### Random Forest

In [89]:
# Candidate model: scale Age and Fare, then fit a random forest.
rf_pipe = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    RandomForestClassifier(random_state=1),
)

# Inner loop: 2-fold grid search over tree depth and number of trees.
rf_param_grid = [
    {
        "randomforestclassifier__max_depth": [1, 2, 3, None],
        "randomforestclassifier__n_estimators": [10, 50, 100, 150, 200, 250],
    }
]
gs = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    scoring="accuracy",
    cv=2,
    n_jobs=-1,
)

# Outer loop: 5-fold cross-validation of the whole tuning procedure.
scores = cross_val_score(gs, X_train, y_train, scoring="accuracy", cv=5)

print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

CV accuracy: 0.796 +/- 0.021


### K-nearest neighbors

In [95]:
# Candidate model: scale Age and Fare, then fit k-nearest neighbors using the
# Minkowski metric with p=2.
knn_pipe = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    KNeighborsClassifier(p=2, metric="minkowski"),
)

# Inner loop: 2-fold grid search over the number of neighbors.
knn_param_grid = [{"kneighborsclassifier__n_neighbors": [2, 5, 10, 15, 20, 25]}]
gs = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_param_grid,
    scoring="accuracy",
    cv=2,
    n_jobs=-1,
)

# Outer loop: 5-fold cross-validation of the whole tuning procedure.
scores = cross_val_score(gs, X_train, y_train, scoring="accuracy", cv=5)

print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

CV accuracy: 0.778 +/- 0.017


SVC, with a nested cross-validation accuracy of 0.825, is the best-performing algorithm.

## SVC - Model Selection

In [105]:
# Rebuild the SVC pipeline and tune it once on the full training split.
pipe_svc = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    SVC(random_state=1),
)

# Powers of ten from 10**-2 to 10**1, shared by C and gamma.
param_range = [10 ** exponent for exponent in range(-2, 2)]
linear_grid = {"svc__C": param_range, "svc__kernel": ["linear"]}
rbf_grid = {"svc__C": param_range, "svc__gamma": param_range, "svc__kernel": ["rbf"]}
param_grid = [linear_grid, rbf_grid]

gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring="accuracy",
    cv=2,
    n_jobs=-1,
)
# fit() returns the fitted search object itself, so `gs` stays bound to it.
gs = gs.fit(X_train, y_train)

In [106]:
# Parameter combination that the grid search above selected as best (scoring="accuracy").
gs.best_params_

{'svc__C': 1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}

In [108]:
# Final model: SVC with the hyperparameters reported by gs.best_params_
# (rbf kernel, C=1, gamma=0.1), trained on the whole training split.
final_svc = SVC(C=1, gamma=0.1, kernel="rbf")
pipe_svc = make_pipeline(
    SubsetStandardScaler(columns=["Age", "Fare"]),
    final_svc,
)
# fit() returns the fitted pipeline itself, so `pipe_svc` stays bound to it.
pipe_svc = pipe_svc.fit(X_train, y_train)

In [110]:
# Accuracy of the final pipeline on the held-out test split.
test_accuracy = pipe_svc.score(X_test, y_test)
print("SVC accuracy on test set: %.3f" % test_accuracy)

SVC accuracy on test set: 0.836
