# The Titanic data challenge

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Settings
# Master switch for the exploratory plots: every plotting cell below is
# wrapped in `if showPlots:` and is skipped while this is False.
showPlots = False

In [None]:
# Load the data
# train_df: passengers from data/train.csv (includes the 'Survived' label).
# apply_df: passengers from data/test.csv, for which predictions are written.
train_df = pd.read_csv('data/train.csv')
apply_df = pd.read_csv('data/test.csv')
print(train_df[:3], '\n')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train_df.info()

## Test submissions
Test files:
* random prediction
* survival for women and children only

In [None]:
# Flags for the two baseline submissions; each one guards one of the cells
# below that writes a CSV file into results/. Both are switched off here.
createRandom, createWomenChildren = False, False

In [None]:
if createRandom:
    # Baseline submission: draw 0 or 1 at random for every row of apply_df
    n_passengers = apply_df.shape[0]
    coin_flips = np.random.randint(low=0, high=2, size=(n_passengers, 1))
    result_df = pd.concat(
        [apply_df['PassengerId'], pd.DataFrame(coin_flips, columns=['Survived'])],
        axis=1,
    )

    # Write the submission file
    result_df.to_csv("results/result_random.csv", index=False, float_format='%.0f')

In [None]:
if createWomenChildren:
    # Baseline submission: predict survival for women and for passengers
    # younger than 16 (a missing age compares as False, i.e. "not a child").
    #
    # The result is built as a separate two-column frame. Aliasing apply_df
    # (`result_df = apply_df`) and assigning 'Survived' through the alias added
    # that column to apply_df itself, so it later showed up as a feature in
    # X_apply.
    survived = (apply_df['Sex'] == 'female') | (apply_df['Age'] < 16)
    result_df = pd.DataFrame({
        'PassengerId': apply_df['PassengerId'],
        'Survived': survived.astype(int),
    })

    # Save output to file
    result_df.to_csv("results/result_children_women.csv", columns=['PassengerId', 'Survived'], index=False, float_format='%.0f')

## Inspect the data

In [None]:
# Have a general look
# describe() with default arguments summarises the numeric columns
# (count, mean, std, min, quartiles, max).
train_df.describe()

In [None]:
# Find how many unique entries
# include='O' restricts the summary to the object-dtype (string) columns and
# reports count, number of unique values, most frequent value and its frequency.
train_df.describe(include='O')

In [None]:
# Number of rows in the training data whose 'Sex' is 'male'
len(train_df[train_df['Sex'] == 'male'])

#### Take a closer look at single features

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

if showPlots:
    # Age histograms, one panel per value of 'Survived'
    age_grid = sns.FacetGrid(train_df, col='Survived')
    age_grid.map(plt.hist, 'Age', bins=20)

In [None]:
if showPlots:
    # Age histograms on a grid with one row per 'Pclass' and one column per
    # 'Survived' value.
    # seaborn renamed FacetGrid's `size` argument to `height` in v0.9;
    # `height` is the spelling that keeps working on current releases.
    grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
    grid.map(plt.hist, 'Age', alpha=.5, bins=20)
    grid.add_legend()

In [None]:
if showPlots:
    # Point plot of 'Survived' against 'Pclass' with 'Sex' as hue,
    # one panel per value of 'Embarked'
    grid = sns.FacetGrid(train_df, col='Embarked')
    grid.map(
        sns.pointplot, 'Pclass', 'Survived', 'Sex',
        palette='deep', hue_order=['female', 'male'], order=[1, 2, 3],
    )
    grid.add_legend()

In [None]:
if showPlots:
    # Bar plot of 'Fare' per 'Sex' on a grid with one row per 'Embarked' value
    # and one column per 'Survived' value.
    # seaborn renamed FacetGrid's `size` argument to `height` in v0.9;
    # `height` is the spelling that keeps working on current releases.
    grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)
    grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
    grid.add_legend()

## Data preparation

### Remove unusable data
* drop PassengerId, Ticket

In [None]:
# 'PassengerId' and 'Ticket' are removed from both feature frames; the label
# column 'Survived' is split off from the training data as y_full.
unused_columns = ["PassengerId", "Ticket"]
y_full = train_df["Survived"]
X_full = train_df.drop(["Survived"] + unused_columns, axis=1)
X_apply = apply_df.drop(unused_columns, axis=1)
X_full.head()

### Feature engineering
* number of family members
* title (Mr, Mrs, Miss, ...) extracted from the passenger name
* coarser binning for age?

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class MyFeatureAdder(BaseEstimator, TransformerMixin):
    """Add engineered columns to the passenger frame and drop the raw 'Name'.

    Parameters
    ----------
    add_relatives : bool, default True
        Append a 'Family' column holding ``SibSp + Parch``.
    add_title : bool, default True
        Append a 'Title' column with the honorific extracted from 'Name'.
        Uncommon honorifics are merged into 'Rare'; 'Mlle' and 'Ms' become
        'Miss', 'Mme' becomes 'Mrs'.

    The transformer is stateless: ``fit`` learns nothing. ``transform`` works
    on a copy, so the frame passed in is left unmodified.
    """

    # Honorifics that are collapsed into the single label 'Rare'.
    _RARE_TITLES = ('Lady', 'Countess', 'Capt', 'Col',
                    'Don', 'Dr', 'Major', 'Rev', 'Sir',
                    'Jonkheer', 'Dona')
    # Alternative spellings mapped onto the common title.
    _TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

    def __init__(self, add_relatives=True, add_title=True):
        self.add_title = add_title
        self.add_relatives = add_relatives

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the requested features and without 'Name'.

        Requires a 'Name' column, plus 'SibSp' and 'Parch' when
        ``add_relatives`` is set.
        """
        # Work on a copy: assigning columns on the caller's frame added 'Title'
        # to it as a side effect and needed the pandas chained-assignment
        # warning to be switched off globally.
        X = X.copy()

        if self.add_title:
            # The title is the word directly in front of a '.' in the name;
            # names without such a word yield NaN.
            title = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
            title = title.replace(list(self._RARE_TITLES), 'Rare')
            X['Title'] = title.replace(self._TITLE_SYNONYMS)

        if self.add_relatives:
            X['Family'] = X['SibSp'] + X['Parch']

        return X.drop('Name', axis=1)

# feat_adder = MyFeatureAdder()
# X_new = feat_adder.fit_transform(X_full)
# print(X_new.head())

### Transform non-numeric labels to numeric ones
* fill missing categorical values with the placeholder label `'unknown'`
* fill missing `Age` and `Fare` values with the column median
* integer labels for 'Sex', 'Embarked', 'Cabin' and 'Title'

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


class MyNumericizer(BaseEstimator, TransformerMixin):
    """Replace string labels by integer codes and fill missing values.

    Parameters
    ----------
    add_cabin_info : bool, default True
        Keep 'Cabin' as an integer-coded column. If False the column is
        dropped.

    ``fit`` learns, from the data it is given, one integer code per label for
    'Sex', 'Embarked', 'Cabin' and 'Title' (whichever of them are present) and
    the medians of 'Age' and 'Fare'. ``transform`` applies exactly these, so
    training, validation and prediction data are encoded and filled the same
    way. Previously the encoders and medians were re-fitted on every frame
    passed to ``transform``, so the same label could receive different codes
    in different frames.

    Attributes set by ``fit``
    -------------------------
    label_codes_ : dict of {column: {label: int}}
    medians_ : dict of {column: float}
    """

    _LABEL_COLUMNS = ('Sex', 'Embarked', 'Cabin', 'Title')
    _MEDIAN_COLUMNS = ('Age', 'Fare')
    _MISSING_LABEL = 'unknown'  # placeholder label for missing values
    _UNSEEN_CODE = -1           # code for labels that were not present in fit

    def __init__(self, add_cabin_info=True):
        # no *args, **kargs to make use of BaseEstimator class
        # other args can be steered later as hyperparameters
        self.add_cabin_info = add_cabin_info

    def _learn(self, X):
        """Return (label codes, medians) derived from the columns of X."""
        codes = {}
        for col in self._LABEL_COLUMNS:
            if col not in X.columns:
                continue  # e.g. no 'Title' when MyFeatureAdder(add_title=False)
            if col == 'Cabin' and not self.add_cabin_info:
                continue
            labels = X[col].fillna(self._MISSING_LABEL)
            # Codes follow the sorted label order, starting at 0.
            codes[col] = {label: code
                          for code, label in enumerate(sorted(labels.unique()))}
        medians = {col: X[col].median()
                   for col in self._MEDIAN_COLUMNS if col in X.columns}
        return codes, medians

    def fit(self, X, y=None):
        """Learn label codes and medians from X; returns self."""
        self.label_codes_, self.medians_ = self._learn(X)
        return self

    def transform(self, X, y=None):
        """Return a numeric copy of X; X itself is not modified.

        When called on an instance that has not been fitted, codes and medians
        are derived from X itself, which is what every call used to do.
        """
        if hasattr(self, 'label_codes_'):
            codes, medians = self.label_codes_, self.medians_
        else:
            codes, medians = self._learn(X)

        X = X.copy()
        if not self.add_cabin_info and 'Cabin' in X.columns:
            X = X.drop(['Cabin'], axis=1)

        for col, mapping in codes.items():
            if col in X.columns:
                encoded = X[col].fillna(self._MISSING_LABEL).map(mapping)
                # Labels without a learned code map to NaN -> _UNSEEN_CODE
                X[col] = encoded.fillna(self._UNSEEN_CODE).astype(int)

        for col, median in medians.items():
            if col in X.columns:
                X[col] = X[col].fillna(median)

        if X.isnull().any().any():
            print('Warning: null value detected:')
            print(X.isnull().any())

        return X


# print(X_full.head())
# feat_adder = MyFeatureAdder()
# X_new = feat_adder.fit_transform(X_full)
# print(X_new.head())
# numericizer = MyNumericizer()
# X_new = numericizer.fit_transform(X_new)
# print(X_new.head())
# scaler = StandardScaler()
# colNames = X_new.columns
# X_new = scaler.fit_transform(X_new)
# print(pd.DataFrame(X_new, columns=colNames).head())

### Visualization

#### Before transformation to numerical values

In [None]:
from pandas.plotting import scatter_matrix

if showPlots:
    # Histograms of the numeric columns before any transformation.
    # DataFrame.hist opens its own figure, so a preceding plt.figure() only
    # produced an additional empty figure. Nothing is echoed from inside an
    # `if` block, so no dummy trailing expression is needed either.
    X_full.hist(bins=50, figsize=(20, 15))

#### After transformation to numerical values

In [None]:
# Run the two custom transformers outside of a pipeline to obtain an
# all-numeric frame for plotting. A copy of X_full is passed in so that
# X_full, which is split and used for training further down, is guaranteed
# to stay untouched by this visualization step.
feat_adder = MyFeatureAdder()
my_num = MyNumericizer()
X_plot = my_num.fit_transform(feat_adder.fit_transform(X_full.copy()))

if showPlots:
    # DataFrame.hist opens its own figure; a preceding plt.figure() would
    # only add an empty one.
    X_plot.hist(bins=50, figsize=(20, 15))

#### Correlation plots

In [None]:
if showPlots:
    # Pairwise scatter plots of the transformed features together with the
    # 'Survived' label. scatter_matrix creates its own figure, so no
    # plt.figure() call is needed (it would only add an empty figure).
    scatter_matrix(pd.concat([X_plot, y_full], axis=1), figsize=(12, 8))

### Create pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

add_cabin_info = False  # adding this worsens accuracy
add_relatives = True  # adding this improves accuracy
add_title = True  # in current implementation worsens accuracy

# Seed for the estimators that draw random numbers, so that the accuracy
# ranking (and with it the model picked for the submission) is the same on
# every run.
random_seed = 1337


def make_classifier_pipeline(name, classifier):
    """Return the shared preprocessing steps followed by ``classifier``.

    Every pipeline gets its own transformer instances, configured from the
    add_* flags above. ``name`` is used as the name of the final step. The
    steps are passed as a list, which is the container Pipeline documents.
    """
    return Pipeline([
        ('MyFeatureAdder', MyFeatureAdder(add_relatives=add_relatives, add_title=add_title)),
        ('MyNumericizer', MyNumericizer(add_cabin_info=add_cabin_info)),
        ('scaler', StandardScaler()),
        (name, classifier),
    ])


# (name, estimator) pairs; the order is kept in clfs.
classifiers = [
    ('linear_svc', LinearSVC(C=1, loss='hinge', random_state=random_seed)),
    ('rbf_svc', SVC(kernel='rbf', C=1)),
    ('poly_svc', SVC(kernel='poly', C=1)),
    ('tree', DecisionTreeClassifier(random_state=random_seed)),
    ('kNN', KNeighborsClassifier()),
    ('gradBoost', GradientBoostingClassifier(learning_rate=1.0, n_estimators=3, max_depth=2,
                                             random_state=random_seed)),
    ('randomForest', RandomForestClassifier(random_state=random_seed)),
]

clfs = {name: make_classifier_pipeline(name, classifier)
        for name, classifier in classifiers}

## Training

### Split training sample into train and validation set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25 % of the labelled data with a fixed seed. Every part gets a
# fresh 0..n-1 index (the shuffled original index is discarded) to avoid
# NaN rows.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X_full, y_full, test_size=0.25, random_state=1337)
)

### Training

In [None]:
# Fit every pipeline on the training part of the split
for pipeline in clfs.values():
    pipeline.fit(X_train, y_train)

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# (name, accuracy) for every fitted pipeline, measured on the held-out part
# and rounded to three decimals
accuracies = [
    (name, round(accuracy_score(y_test, clf.predict(X_test)), 3))
    for name, clf in clfs.items()
]

# Best score first; the sort is stable, so ties keep the order of clfs
accuracies = sorted(accuracies, key=lambda entry: entry[1], reverse=True)
for name, accuracy in accuracies:
    print(name, ':', accuracy)

# Name of the best-scoring pipeline
top = accuracies[0][0]

## Prediction

In [None]:
# Predict on the submission data with the pipeline that scored best in the
# evaluation cell (`top` holds its name). The pipeline is used as fitted on
# X_train; it is not refitted on the full labelled data here.
y_pred = clfs[top].predict(X_apply)

In [None]:
# Put each PassengerId next to its predicted 'Survived' value
result_df = pd.concat(
    [apply_df[['PassengerId']], pd.DataFrame(y_pred, columns=['Survived'])],
    axis=1,
)
print(result_df.head())

# Write the submission file
result_df.to_csv("results/result.csv", index=False, float_format='%.0f')