# The Titanic data challenge

In [None]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the labelled training set and the unlabelled set we must predict on
train_df = pd.read_csv('data/train.csv')
apply_df = pd.read_csv('data/test.csv')

# Preview the first three rows, then print the per-column summary
print(train_df.head(3), '\n')
print(train_df.info())

## Test submissions
Test files:
* random prediction
* survival for women and children only

In [None]:
# Switches for (re)generating the two baseline submission files below
createRandom = False
createWomenChildren = False

In [None]:
if createRandom:
    # Baseline 1: a coin flip (0 or 1) per passenger in the apply set
    id_df = apply_df['PassengerId']
    n_passengers = id_df.shape[0]
    coin_flips = np.random.randint(low=0, high=2, size=(n_passengers, 1))
    random_df = pd.DataFrame(coin_flips, columns=['Survived'])

    # Put ids and random predictions side by side
    result_df = pd.concat([id_df, random_df], axis=1)

    # Write the submission file
    result_df.to_csv("results/result_random.csv", index=False, float_format='%.0f')

In [None]:
if createWomenChildren:
    # Baseline 2: predict survival for every woman and every child under 16.
    # Passengers with a missing age compare as False and therefore only
    # survive if they are female.
    is_woman = apply_df['Sex'] == 'female'
    is_child = apply_df['Age'] < 16

    # Build the submission in a fresh frame. Aliasing apply_df here would add
    # a 'Survived' column to apply_df itself, which would then end up as an
    # extra feature in X_apply and break the final prediction step.
    result_df = apply_df[['PassengerId']].copy()
    result_df['Survived'] = (is_woman | is_child).astype(int)

    # Save output to file
    result_df.to_csv("results/result_children_women.csv", columns=['PassengerId', 'Survived'], index=False, float_format='%.0f')

## Investigate the data

In [None]:
# Have a general look: summary statistics with describe()'s default column selection
train_df.describe()

In [None]:
# Describe only the object-dtype (string) columns, e.g. to see how many
# unique entries each of them has
train_df.describe(include='O')

In [None]:
# Number of male passengers in the training set (True counts as 1)
int((train_df['Sex'] == 'male').sum())

## Data preparation

### Remove unusable data
* drop PassengerId, Name, Ticket

In [None]:
# Columns that are dropped from both the training and the apply set
unused_columns = ["PassengerId", "Name", "Ticket"]

# Target and features of the labelled data
y_full = train_df["Survived"]
X_full = train_df.drop(columns=["Survived"] + unused_columns)

# Features of the data we have to predict on (it has no 'Survived' column)
X_apply = apply_df.drop(columns=unused_columns)
X_full.head()

### Transform non-numeric labels
* fill missing 'Embarked' and 'Cabin' values with the placeholder label 'unknown'
* fill missing 'Age' and 'Fare' values with the column median
* integer labels for 'Sex', 'Embarked' and 'Cabin'

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer, LabelEncoder


class MyNumericizer(BaseEstimator, TransformerMixin):
    """Turn the raw Titanic feature frame into a purely numeric one.

    The transformer expects a DataFrame that contains the columns 'Sex',
    'Embarked', 'Cabin', 'Age' and 'Fare' and

    * replaces the labels in 'Sex', 'Embarked' (and optionally 'Cabin') by
      integer codes; missing labels are treated as the label 'unknown',
    * fills missing 'Age' and 'Fare' values with the column median.

    The label-to-code mapping and the medians are learned in ``fit`` and
    re-used in ``transform``, so the training, validation and apply sets are
    all encoded and imputed in exactly the same way. Labels that were not
    seen during ``fit`` receive the code of 'unknown'.

    Parameters
    ----------
    add_cabin_info : bool, default=True
        If True the 'Cabin' column is integer-encoded, otherwise it is
        dropped from the output.

    Attributes
    ----------
    categories_ : dict
        Maps each label column name to its ``{label: integer code}`` dict.
    medians_ : dict
        Maps 'Age' and 'Fare' to the median learned in ``fit``.
    """

    # Placeholder label used for missing values and for unseen labels
    _MISSING_LABEL = 'unknown'
    # Order matters: encoded columns are appended to the output in this order
    _LABEL_COLUMNS = ('Sex', 'Embarked', 'Cabin')
    _MEDIAN_COLUMNS = ('Age', 'Fare')

    def __init__(self, add_cabin_info=True):
        # no *args, **kargs to make use of BaseEstimator class
        # other args can be steered later as hyperparameters
        self.add_cabin_info = add_cabin_info

    def _learn(self, X):
        """Return the (label code mappings, medians) derived from X."""
        categories = {}
        for name in self._LABEL_COLUMNS:
            labels = set(X[name].fillna(self._MISSING_LABEL))
            # Always reserve a code for the placeholder so that missing or
            # unseen labels can be encoded even if fit saw no missing value.
            labels.add(self._MISSING_LABEL)
            # Sorted labels get ascending codes (same order LabelEncoder uses)
            categories[name] = {
                label: code for code, label in enumerate(sorted(labels))
            }
        medians = {name: X[name].median() for name in self._MEDIAN_COLUMNS}
        return categories, medians

    def fit(self, X, y=None):
        """Learn the label codes and the medians from X and return self."""
        self.categories_, self.medians_ = self._learn(X)
        return self

    def transform(self, X, y=None):
        """Return a numeric copy of X; X itself is left untouched.

        The row index of X is preserved. The encoded columns are moved to the
        end of the frame in the order 'Sex', 'Embarked', 'Cabin'.
        """
        if hasattr(self, 'categories_'):
            categories, medians = self.categories_, self.medians_
        else:
            # Not fitted: fall back to statistics of X itself, which is what
            # this transformer did before it learned anything in fit().
            categories, medians = self._learn(X)

        X = X.copy()  # never modify the caller's frame

        for name in self._LABEL_COLUMNS:
            raw = X.pop(name)
            if name == 'Cabin' and not self.add_cabin_info:
                continue  # column is dropped
            codes = categories[name]
            fallback = codes[self._MISSING_LABEL]
            encoded = raw.fillna(self._MISSING_LABEL).map(codes)
            # Labels unknown to the mapping come back as NaN from map()
            X[name] = encoded.fillna(fallback).astype(int)

        for name in self._MEDIAN_COLUMNS:
            X[name] = X[name].fillna(medians[name])

        if X.isnull().any().any():
            print('Warning: null value detected:')
            print(X.isnull().any())

        return X

# attr_adder = MyNumericizer(add_cabin_info=True)
# X_new = attr_adder.fit_transform(X_full)
# print(X_new.head())
# scaler = StandardScaler()
# X_new = scaler.fit_transform(X_new)
# print(pd.DataFrame(X_new, columns = X_full.columns).head())



### Feature engineering
* number of family members (planned, not implemented yet)

### Create pipeline

In [None]:
# Full model: numeric encoding -> feature scaling -> linear SVM.
# The steps are passed as a list of (name, estimator) tuples, which is the
# type scikit-learn documents for `steps`; a tuple of tuples fails in
# versions that assign into `steps` while fitting.
svm_clf = Pipeline([
    ('MyNumericizer', MyNumericizer(add_cabin_info=True)),
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge')),
    # Alternative classifier: ('linear_svc', SVC(kernel='linear', C=1))
])

## Training

### Split training sample into train and validation set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled data for validation (fixed seed for
# reproducibility). Each split gets a fresh 0..n-1 index so that a later
# column-wise concatenation with newly built frames lines up row by row
# instead of producing NaN rows.
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in train_test_split(X_full, y_full, test_size=0.25, random_state=1337)
)

### Training

In [None]:
# Fit the whole pipeline on the training split
svm_clf.fit(X_train, y_train)

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Score the held-out validation split
y_pred = svm_clf.predict(X_test)
raw_accuracy = accuracy_score(y_test, y_pred)

# Fraction of correct predictions, rounded to three decimals for display
accuracy = round(raw_accuracy, 3)
accuracy

## Prediction

In [None]:
# Predict on the apply set (features derived from data/test.csv)
y_pred = svm_clf.predict(X_apply)

In [None]:
# One-column frames for the passenger ids and the predicted labels
id_pred = apply_df[['PassengerId']]
y_pred = pd.DataFrame(y_pred, columns=['Survived'])

# Combine them column-wise into the submission table and preview it
result_df = pd.concat([id_pred, y_pred], axis=1)
print(result_df[:5])

# Write the submission file
result_df.to_csv("results/result.csv", index=False, float_format='%.0f')