## Imports

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path

In [5]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

In [6]:
import seaborn as sns

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

In [8]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

## Config

In [9]:
# Directory holding the input CSVs, relative to the notebook's working
# directory; the file paths built below are derived from it.
data_fp = Path('../data/')

In [10]:
%matplotlib inline

## Get data

### Read data

In [11]:
# Locations of the three CSV files inside the data directory.
example_fp = data_fp / 'gender_submission.csv'
train_fp = data_fp / 'train.csv'
test_fp = data_fp / 'test.csv'

In [12]:
# Load each CSV into its own untouched "raw" frame.
example_raw, train_raw, test_raw = (
    pd.read_csv(csv_path) for csv_path in (example_fp, train_fp, test_fp)
)

## Functions

In [59]:
def preprocess(df, test=False, km_mdl=None, age_max=None):
    """One-hot encode, binarise and scale the passenger feature columns.

    Steps, in order:

    1. ``Pclass`` and ``Sex`` are replaced by dummy columns named
       ``Pclass_<value>`` and ``Sex_<value>``.
    2. ``SibSp`` and ``Parch`` are replaced by 0/1 flags ``Sib?`` and
       ``Par?`` (1 when the count is greater than zero).
    3. ``Age`` is divided by ``age_max``.
    4. Every remaining missing value in the frame is filled with -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Pclass``, ``Sex``, ``SibSp``,
        ``Parch`` and ``Age``. Other columns are passed through. The input
        frame is not modified.
    test : bool, optional
        Accepted for backward compatibility; currently unused.
    km_mdl : object, optional
        Accepted for backward compatibility; currently unused.
    age_max : float, optional
        Divisor applied to ``Age``. When omitted, the maximum ``Age`` of
        ``df`` itself is used. Pass the maximum taken from another frame
        (e.g. the training data) to put both frames on the same scale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed columns and the index of ``df``.
    """
    # Encoding via ``columns=`` keeps everything in one frame. Joining a
    # separately built dummy frame on the index would multiply rows whenever
    # the index contains duplicate labels.
    df_proc = pd.get_dummies(df, columns=['Pclass', 'Sex'])

    # Vectorised "has any siblings/spouses" and "has any parents/children" flags.
    df_proc['Sib?'] = (df_proc['SibSp'] > 0).astype(int)
    df_proc['Par?'] = (df_proc['Parch'] > 0).astype(int)
    df_proc = df_proc.drop(['SibSp', 'Parch'], axis=1)

    if age_max is None:
        age_max = df_proc['Age'].max()
    df_proc['Age'] = df_proc['Age'] / age_max

    # Missing ages stay NaN through the division and become -1 here.
    df_proc = df_proc.fillna(-1)

    return df_proc

## Preprocessing

### Separate out data columns

In [60]:
# Columns kept for modelling; the training list additionally carries the label.
proc_cols_train = [
    'PassengerId', 'Survived',
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
]
proc_cols_test = [
    'PassengerId',
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
]

# Narrow each raw frame to its column list and index it by passenger id.
train_df = train_raw.loc[:, proc_cols_train].set_index('PassengerId')
test_df = test_raw.loc[:, proc_cols_test].set_index('PassengerId')

### Preprocess

In [61]:
train_proc = preprocess(train_df)
test_proc = preprocess(test_df)

# By default preprocess() divides Age by the maximum of whichever frame it is
# given, which would leave the test ages on a different scale from the
# training ages the model is fitted on. Recompute the test Age with the
# training maximum (missing ages stay encoded as -1) so both frames agree.
test_proc['Age'] = (test_df['Age'] / train_df['Age'].max()).fillna(-1)

## Modelling

### Data

In [62]:
# Show the column labels of the preprocessed training frame.
train_proc.columns

Index(['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'Sib?', 'Par?'],
      dtype='object')

In [63]:
# Label column.
y_cols = ['Survived']

# Feature columns, grouped by the raw field each one was derived from.
x_cols = [
    'Age',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_female', 'Sex_male',
    'Sib?', 'Par?',
]

In [64]:
# Feature matrix and label frame taken from the preprocessed training data.
X, y = train_proc[x_cols], train_proc[y_cols]

# The same selections under the names used for fitting and scoring.
X_train = train_proc.loc[:, x_cols]
y_train = train_proc.loc[:, y_cols]

### Models

In [66]:
# Logistic-regression classifier configured with the 'liblinear' solver.
mdl = LogisticRegression(solver='liblinear')

### Run Models

In [67]:
# Fit on the training features (labels flattened to 1-D), then predict those
# same rows; fit() returns the estimator, so the two calls can be chained.
preds = mdl.fit(X_train, np.ravel(y_train.values)).predict(X_train)

### Score Models

In [68]:
# Compares `preds` with `y_train`. NOTE(review): when `preds` comes from
# predicting X_train (as in the cell above), this is in-sample accuracy on the
# rows the model was fitted on, not a held-out estimate.
print('Accuracy: %.3f\n' % accuracy_score(y_train, preds))

Accuracy: 0.787



## Predictions

In [69]:
# Predict for the preprocessed test rows; this rebinds `preds`.
preds = mdl.predict(test_proc.loc[:, x_cols])

In [73]:
# Save the predictions, indexed like the test frame, as predictions.csv in
# the configured data directory. The path is built from `data_fp`, like the
# input paths, so changing the directory in the config cell also moves this file.
(
    pd.DataFrame(data=preds, index=test_proc.index, columns=['Survived'])
    .to_csv(data_fp.joinpath('predictions.csv'))
)