# The Titanic data challenge

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Notebook-wide switches: draw the diagnostic plots / run the hyperparameter grid search
showPlots, doGridSearch = False, False

In [None]:
# Load the data: the labelled training set and the unlabelled set to predict on
train_df = pd.read_csv('data/titanic/train.csv')
apply_df = pd.read_csv('data/titanic/test.csv')
print(train_df[:3], '\n')
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that only appended a stray "None" line).
train_df.info()

## Test submissions
Test files:
* random prediction
* survival for women and children only

In [None]:
# Switches for writing the two baseline submission files below
createRandom = False
createWomenChildren = False

In [None]:
if createRandom:
    # Baseline submission: one random 0/1 "Survived" value per passenger
    id_df = apply_df['PassengerId']
    coin_flips = np.random.randint(low=0, high=2, size=(len(id_df), 1))
    random_df = pd.DataFrame(coin_flips, columns=['Survived'])
    result_df = pd.concat([id_df, random_df], axis=1)

    # Write the submission file
    result_df.to_csv("results/result_random.csv", index=False, float_format='%.0f')

In [None]:
if createWomenChildren:
    # Baseline submission: predict survival for women and for children under 16.
    # Work on a copy: assigning the new column through an alias of apply_df
    # would add 'Survived' to apply_df itself and thereby to every frame
    # derived from it afterwards.
    result_df = apply_df.copy()
    result_df['Survived'] = ((result_df['Sex'] == 'female') | (result_df['Age'] < 16)).astype(int)

    # Save output to file
    result_df.to_csv("results/result_children_women.csv", columns=['PassengerId', 'Survived'], index=False, float_format='%.0f')

## Inspect the data

In [None]:
# Have a general look: summary statistics of the training table
# (the notebook displays the returned frame)
train_df.describe()

In [None]:
# Find how many unique entries the object-dtype (string) columns have
train_df.describe(include='O')

In [None]:
# Number of rows whose 'Sex' entry is 'male'
int((train_df['Sex'] == 'male').sum())

#### Take a closer look at single features

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

if showPlots:
    # One facet per 'Survived' value, each showing a 20-bin histogram of 'Age'
    g = sns.FacetGrid(train_df, col='Survived')
    g.map(plt.hist, 'Age', bins=20)

In [None]:
if showPlots:
    # Age histograms on a grid with one row per 'Pclass' and one column per 'Survived'.
    # NOTE(review): newer seaborn releases renamed FacetGrid's `size` keyword to
    # `height` - confirm against the installed seaborn version.
    # grid = sns.FacetGrid(train_df, col='Pclass', hue='Survived')
    grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
    grid.map(plt.hist, 'Age', alpha=.5, bins=20)
    grid.add_legend()

In [None]:
if showPlots:
    # One facet per 'Embarked' value: point plot of 'Survived' against 'Pclass',
    # with 'Sex' as hue
    grid = sns.FacetGrid(train_df, col='Embarked')
#     grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6)
    grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep', 
             hue_order=['female', 'male'], order=[1,2,3])
    grid.add_legend()

In [None]:
if showPlots:
    # Bar plot of 'Fare' per 'Sex' on a grid with one row per 'Embarked' value
    # and one column per 'Survived' value.
    # NOTE(review): newer seaborn releases renamed FacetGrid's `size` keyword to
    # `height` - confirm against the installed seaborn version.
#     grid = sns.FacetGrid(train_df, col='Embarked', hue='Survived', palette={0: 'k', 1: 'w'})
    grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6)
    grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
    grid.add_legend()

## Data preparation

### Remove unusable data
* drop PassengerId, Ticket

In [None]:
# Target column and feature tables (identifier-like columns removed)
y_full = train_df["Survived"]
X_full = train_df.drop(["Survived", "PassengerId", "Ticket"], axis='columns')
X_apply = apply_df.drop(["PassengerId", "Ticket"], axis='columns')
X_full.head()

### Feature engineering
* number of family members
* coarser binning for age
* title from name
* maybe add some products, sums etc?

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class MyFeatureAdder(BaseEstimator, TransformerMixin):
    """Add engineered features to the raw passenger table.

    Depending on the constructor flags the transformer

    * extracts a ``Title`` column from ``Name`` and merges rare or synonymous
      titles (``add_title``),
    * adds ``Family`` as ``SibSp + Parch`` (``add_relatives``),
    * replaces ``Age`` by the index (0 .. 3) of its quartile band
      (``add_age_band``); missing ages stay missing.

    The ``Name`` column is always dropped.  ``transform`` returns a new frame
    and never writes into the frame it is given.
    """

    # Titles that are merged into the single category 'Rare'
    _RARE_TITLES = ('Lady', 'Countess', 'Capt', 'Col',
                    'Don', 'Dr', 'Major', 'Rev', 'Sir',
                    'Jonkheer', 'Dona')
    # Spelling variants mapped onto their common form
    _TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    # Number of age bands
    _N_AGE_BINS = 4

    def __init__(self, add_relatives=True, add_title=True, add_age_band=True):
        self.add_title = add_title
        self.add_relatives = add_relatives
        self.add_age_band = add_age_band

    def fit(self, X, y=None):
        """Learn the age-band edges from X so that every data set transformed
        afterwards is binned with the same edges."""
        self.age_band_edges_ = self._age_edges(X['Age']) if self.add_age_band else None
        return self

    def _age_edges(self, ages):
        """Return the inner quantiles of ``ages`` that separate the bands."""
        fractions = [q / self._N_AGE_BINS for q in range(1, self._N_AGE_BINS)]
        return ages.quantile(fractions).values

    def transform(self, X, y=None):
        """Return a copy of X with the configured features added and ``Name`` removed."""
        X = X.copy()  # keep the caller's frame untouched

        if self.add_title:
            titles = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
            titles = titles.replace(list(self._RARE_TITLES), 'Rare')
            X['Title'] = titles.replace(self._TITLE_SYNONYMS)

        if self.add_relatives:
            X['Family'] = X['SibSp'] + X['Parch']

        if self.add_age_band:
            edges = getattr(self, 'age_band_edges_', None)
            if edges is None:
                # transform() called without a preceding fit(): fall back to
                # the quantiles of the data at hand
                edges = self._age_edges(X['Age'])
            ages = X['Age']
            # side='right': an age equal to an edge belongs to the upper band;
            # the result is the number of edges <= age, i.e. 0 .. _N_AGE_BINS-1
            bands = pd.Series(np.searchsorted(edges, ages.values, side='right'),
                              index=ages.index).astype(float)
            # Keep missing ages missing so that a later imputation step can
            # handle them instead of silently assigning them to the top band
            X['Age'] = bands.where(ages.notnull())

        return X.drop('Name', axis=1)

# feat_adder = MyFeatureAdder()
# X_new = feat_adder.fit_transform(X_full)
# print(X_new.head())
# X_new.describe()

### Transform non-numeric labels to numeric ones
* fill NaN values with sensible defaults
* fill missing values with medians
* integer labels for 'Sex', 'Embarked' and 'Cabin'

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


class MyNumericizer(BaseEstimator, TransformerMixin):
    """Turn the non-numeric columns into numbers and fill missing values.

    * ``Sex`` is binarized.
    * ``Embarked`` is label-encoded, or one-hot encoded into ``Em_*`` columns
      when ``use_one_hot`` is set; missing entries become ``'unknown'``.
    * ``Cabin`` is label-encoded (``add_cabin_info=True``) or dropped.
    * Missing ``Age`` / ``Fare`` values are replaced by the column median.
    * ``Title`` (if present) is label-encoded.

    All encoders and medians are derived from the frame passed to
    ``transform``; nothing is learned in ``fit``.  ``transform`` returns a new
    frame and never writes into the frame it is given.
    """

    def __init__(self, add_cabin_info=True, use_one_hot=False):
        # no *args, **kargs to make use of BaseEstimator class
        # other args can be steered later as hyperparameters
        self.add_cabin_info = add_cabin_info
        self.use_one_hot = use_one_hot

    def fit(self, X, y=None):
        return self  # nothing to do

    def transform(self, X, y=None):
        """Return a numeric copy of X."""
        X = X.copy()  # keep the caller's frame untouched

        # Sex
        X['Sex'] = LabelBinarizer(sparse_output=False).fit_transform(X['Sex']).ravel()

        # Embarked
        X['Embarked'] = X['Embarked'].fillna('unknown')
        if self.use_one_hot:
            classes = ['C', 'S', 'Q', 'unknown']
            # Reuse X's index: with a fresh 0..n-1 index the concat below
            # misaligns whenever X carries any other index (e.g. a subset of
            # rows) and produces NaN rows.
            X_embarked = pd.DataFrame(label_binarize(X['Embarked'], classes=classes),
                                      columns=['Em_' + s for s in classes],
                                      index=X.index)
            X = pd.concat([X.drop(['Embarked'], axis=1), X_embarked], axis=1)
        else:
            X['Embarked'] = LabelEncoder().fit_transform(X['Embarked'])

        # Cabin
        if self.add_cabin_info:
            X['Cabin'] = LabelEncoder().fit_transform(X['Cabin'].fillna('unknown'))
        else:
            X = X.drop(['Cabin'], axis=1)

        # Age, Fare: fill gaps with the median of the column
        for column in ('Age', 'Fare'):
            X[column] = X[column].fillna(X[column].median())

        # Title
        if 'Title' in X.columns:
            X['Title'] = LabelEncoder().fit_transform(X['Title'].fillna('unknown'))

        if X.isnull().any().any():
            self._report_missing(X)

        return X

    @staticmethod
    def _report_missing(X):
        """Print a per-column NaN summary and the rows that still contain NaN."""
        missing = X.isnull()
        print('Warning: NaN values detected:')
        info_df = pd.DataFrame({'NaN?': missing.any(),
                                '#NaN': missing.sum(),
                                'Total': "/ " + str(X.shape[0])},
                               columns=['NaN?', '#NaN', 'Total'])
        print(info_df)
        print('Single rows containing NaN values:')
        print(X[missing.any(axis=1)])


# print(X_full.head())
# feat_adder = MyFeatureAdder()
# X_new = feat_adder.fit_transform(X_full)
# print(X_new.head())
# numericizer = MyNumericizer()
# X_new = numericizer.fit_transform(X_new)
# print(X_new.head())
# scaler = StandardScaler()
# colNames = X_new.columns
# X_new = scaler.fit_transform(X_new)
# print(pd.DataFrame(X_new, columns=colNames).head())


### Visualization

#### Before transformation to numerical values

In [None]:
from pandas.plotting import scatter_matrix

if showPlots:
    # Histograms (50 bins) of the columns of the untransformed feature table
    plt.figure()
    X_full.hist(bins=50, figsize=(20, 15))
    1  # prevent matplotlib printout

#### After transformation to numerical values

In [None]:
feat_adder = MyFeatureAdder()
my_num = MyNumericizer()
# Feed a copy into the preprocessing so that X_full, which is split into the
# training and validation sets further down, is guaranteed to stay unchanged.
X_plot = my_num.fit_transform(feat_adder.fit_transform(X_full.copy()))

if showPlots:
    # Histograms (50 bins) of all columns after the numeric transformation
    plt.figure()
    X_plot.hist(bins=50, figsize=(20, 15))
    1  # prevent matplotlib printout

#### Correlation plots

In [None]:
if showPlots:
    # Pairwise scatter plots of the transformed features together with the target
    plt.figure()
    scatter_matrix(pd.concat([X_plot, y_full], axis=1), figsize=(12, 8))
    1  # prevent matplotlib printout

### Create pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier


add_cabin_info = False  # adding this worsens accuracy
add_relatives = True  # adding this improves accuracy
add_title = True  # in current implementation worsens accuracy


def _build_pipeline(classifier):
    """Return the shared preprocessing chain followed by ``classifier``.

    Every pipeline gets its own transformer instances.  The steps are passed
    as a list, which is the type ``Pipeline`` documents for ``steps``.
    """
    return Pipeline([
        ('MyFeatureAdder', MyFeatureAdder(add_relatives=add_relatives, add_title=add_title)),
        ('MyNumericizer', MyNumericizer(add_cabin_info=add_cabin_info)),
        ('scaler', StandardScaler()),
        ('clf', classifier),
    ])


# name -> pipeline; the insertion order is kept because it decides ties when
# the pipelines are ranked later on
clfs = {
    name: _build_pipeline(classifier)
    for name, classifier in (
        ('linear_svc', LinearSVC(C=1, loss='hinge')),
        ('rbf_svc', SVC(kernel='rbf', C=1)),
        ('poly_svc', SVC(kernel='poly', C=1)),
        ('tree', DecisionTreeClassifier()),
        ('kNN', KNeighborsClassifier(n_neighbors=3)),
        ('gradBoost', GradientBoostingClassifier(learning_rate=1.0, n_estimators=3, max_depth=2)),
        ('randomForest', RandomForestClassifier()),
        ('log_reg', LogisticRegression()),
        ('naive_bayes', GaussianNB()),
        ('perceptron', Perceptron(max_iter=100)),
        ('SGD', SGDClassifier(max_iter=100)),
    )
}

## Training

### Split training sample into train and validation set

In [None]:
from sklearn.model_selection import train_test_split
# Split off 25% as validation set; each part gets a fresh 0..n-1 index
# to avoid NaN rows later on
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X_full, y_full, test_size=0.25, random_state=1337)
)
print('Split', X_full.shape[0], 'events into', X_train.shape[0], 'training and', X_test.shape[0], 'test events!')

### Training

In [None]:
# Fit every pipeline on the training part
for pipeline in clfs.values():
    pipeline.fit(X_train, y_train)

## Evaluation

### Feature relevance

In [None]:
# Take the feature names from the preprocessing steps of the fitted log_reg
# pipeline itself.  A separately configured transformer can yield a different
# column set (e.g. keep 'Cabin'), which pairs names with the wrong coefficients.
log_reg_pipeline = clfs['log_reg']
feat_adder = log_reg_pipeline.named_steps['MyFeatureAdder']
numericizer = log_reg_pipeline.named_steps['MyNumericizer']
X_new = numericizer.transform(feat_adder.transform(X_full.copy()))

# The constructor raises if the number of names and coefficients differ,
# so a mismatch cannot go unnoticed.
coeff_df = pd.DataFrame({'Feature': list(X_new.columns),
                         'Coeff': log_reg_pipeline.named_steps['clf'].coef_[0]},
                        columns=['Feature', 'Coeff'])

coeff_df.sort_values(by='Coeff', ascending=False)

### Model accuracy and cross validation

In [None]:
import tabulate

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


def get_metrics(clf, name):
    """Collect the performance figures of one fitted classifier.

    Uses the notebook-level X_train / y_train / X_test / y_test.  The test
    accuracy comes from ``clf.predict(X_test)``; all other figures (including
    the 'Train data' accuracy) are computed from 3-fold cross-validation on
    the training set.

    Returns a dict with the table 'header' (list of column titles) and the
    'metrics' row (tuple starting with ``name``).
    """
    test_accuracy = round(accuracy_score(y_test, clf.predict(X_test)), 3)

    cv_scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    cv_predictions = cross_val_predict(clf, X_train, y_train, cv=3)

    row = (
        name,
        test_accuracy,
        round(accuracy_score(y_train, cv_predictions), 3),
        cv_scores.mean(),
        precision_score(y_train, cv_predictions),
        recall_score(y_train, cv_predictions),
        f1_score(y_train, cv_predictions),
        confusion_matrix(y_train, cv_predictions),
    )
    column_titles = ['Algorithm', 'Accuracy\nTest data', 'Accuracy\nTrain data', 'Xval score\nTrain data',
                     'Precision', 'Recall', 'F1', 'Confusion']

    return {'header': column_titles, 'metrics': row}

# Evaluate every pipeline and rank the rows by test accuracy (best first)
results = [get_metrics(clf, name) for name, clf in clfs.items()]
headers = results[-1]['header'] if results else []
metrics = sorted((result['metrics'] for result in results),
                 key=lambda row: row[1], reverse=True)
print(tabulate.tabulate(metrics, headers=headers, tablefmt='grid'))

# Name of the best-ranked pipeline
top = metrics[0][0]

### ROC curves

In [None]:
def plot_precision_recall_vs_threshold(prec, rec, thresh, axes=None):
    """Draw precision and recall as functions of the decision threshold.

    The last element of ``prec`` and ``rec`` is left out when plotting against
    ``thresh``.  A new figure is created when no ``axes`` is passed.
    """
    if axes is None:
        _, axes = plt.subplots(1,1)
    for curve, curve_label in ((prec, 'Precision'), (rec, 'Recall')):
        axes.plot(thresh, curve[:-1], label=curve_label)
    axes.set_ylim([0, 1])
    axes.set_xlabel('Threshold')
    axes.legend(loc='best')


def plot_precision_vs_recall(prec, rec, axes=None):
    """Draw precision (y) against recall (x) as a dashed blue line.

    A new figure is created when no ``axes`` is passed.
    """
    target = plt.subplots(1,1)[1] if axes is None else axes
    target.plot(rec, prec, 'b--', label='Precision')
    target.set_ylim([0, 1])
    target.set_ylabel('Precision')
    target.set_xlabel('Recall')


def plot_roc_curve(fpr, tpr, label=None, axes=None):
    """Draw one ROC curve together with the dashed diagonal on [0, 1] x [0, 1].

    A new figure is created when no ``axes`` is passed.
    """
    target = plt.subplots(1,1)[1] if axes is None else axes
    target.plot(fpr, tpr, linewidth=2, label=label)
    diagonal = [0, 1]
    target.plot(diagonal, [0, 1], 'k--')
    target.axis([0, 1, 0, 1])
    target.set_ylabel('True Positive Rate')
    target.set_xlabel('False Positive Rate')
    
def plot_roc_curves(fprs, tprs, names):
    """Overlay several ROC curves, one per (fpr, tpr, name) triple, in a new figure."""
    plt.figure()
    for x_values, y_values, curve_name in zip(fprs, tprs, names):
        plt.plot(x_values, y_values, label=curve_name)
    # dashed diagonal as reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc='lower right')

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score


# Cross-validated decision scores per pipeline (in ranking order); the ROC
# points are collected for the overlay plot
fprs, tprs, names = [], [], []
for idx, row in enumerate(metrics):
    name = row[0]
    try:
        y_scores = cross_val_predict(clfs[name], X_train, y_train, cv=3, method='decision_function')
    except AttributeError:
        print('No decision_function method for', name + '?')
        continue

    precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)
    fpr, tpr, roc_thresholds = roc_curve(y_train, y_scores)
    fprs.append(fpr)
    tprs.append(tpr)
    names.append(name)

    if not showPlots:
        continue

    # Three panels side by side: curves vs threshold, precision vs recall, ROC
    plt.figure(idx, figsize=(9, 2))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds, axes=plt.subplot(131))
    plot_precision_vs_recall(precisions, recalls, axes=plt.subplot(132))
    plot_roc_curve(fpr, tpr, axes=plt.subplot(133))
    plt.suptitle(name)
    plt.tight_layout()

In [None]:
if showPlots:
    # Overlay the collected ROC curves in one figure
    plot_roc_curves(fprs, tprs, names)

## Model optimization

### Perform a grid search for hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

if not doGridSearch:
    print('Skip optimization!')
    top_optimized = clfs[top]
else:
    # Classifier-specific grid (only defined for gradient boosting) ...
    parameters = {}
    if top == 'gradBoost':
        parameters.update({
            'clf__learning_rate': [0.1, 0.2, 1.0],
            'clf__n_estimators': [3, 10, 100],
            'clf__max_depth': [2, 3, 4],
        })

    # ... plus the on/off switches of the preprocessing steps
    parameters.update({
        'MyFeatureAdder__add_title': (False, True),
        'MyFeatureAdder__add_relatives': (False, True),
        'MyNumericizer__add_cabin_info': (False, True),
        'MyFeatureAdder__add_age_band': (False, True),
    })

    grid_search = GridSearchCV(clfs[top], parameters, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_lines = sorted(f'{key}: {val}' for key, val in grid_search.best_params_.items())
    print('\n'.join(best_lines))

    top_optimized = grid_search.best_estimator_

    print('Top score:', grid_search.best_score_)

### Investigate the optimal estimator

#### Compare performance metrics

In [None]:
# Table with the optimized estimator in the first row and the original one below
topOptimizedMetrics = get_metrics(top_optimized, 'Optimized\n' + top)
topMetrics = get_metrics(clfs[top], 'Unoptimized\n' + top)

headers = topOptimizedMetrics['header']
metrics = [entry['metrics'] for entry in (topOptimizedMetrics, topMetrics)]

print(tabulate.tabulate(metrics, headers=headers, tablefmt='grid'))

#### Show plots

In [None]:
description = ['Optimized', 'Unoptimized']

# Same three diagnostic panels for the optimized and the original estimator
for idx, (label, clf) in enumerate(zip(description, (top_optimized, clfs[top]))):
    try:
        y_scores = cross_val_predict(clf, X_train, y_train, cv=3, method='decision_function')
    except AttributeError:
        print(str(idx) + ': no decision_function method?')
        continue

    precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)
    fpr, tpr, roc_thresholds = roc_curve(y_train, y_scores)

    if not showPlots:
        continue

    plt.figure(idx, figsize=(9, 2))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds, axes=plt.subplot(131))
    plot_precision_vs_recall(precisions, recalls, axes=plt.subplot(132))
    plot_roc_curve(fpr, tpr, axes=plt.subplot(133))
    plt.suptitle(label)
    plt.tight_layout()

## Prediction

In [None]:
# Predict on the unlabelled table with the selected estimator
# (alternative: the un-tuned pipeline)
# y_pred = clfs[top].predict(X_apply)
y_pred = top_optimized.predict(X_apply)

In [None]:
# Put passenger ids and predictions side by side
id_pred = apply_df[['PassengerId']]
y_pred = pd.DataFrame(y_pred, columns=['Survived'])
result_df = pd.concat([id_pred, y_pred], axis=1)
print(result_df.head())

# Write the submission file
result_df.to_csv("results/result.csv", index=False, float_format='%.0f')