# Titanic Survival Prediction
In this notebook I apply a random forest model to the Titanic dataset in order to predict which passengers survived.

## Imports

In [1]:
# Reload imported modules automatically so edits to local code are picked up live.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [3]:
# Directory containing the Titanic CSV files, relative to this notebook.
# Keep the trailing slash: file names are appended directly (e.g. f'{PATH}train.csv').
PATH = '../../data/titanic/'

In [4]:
# List the data directory to confirm which files are available.
!ls {PATH}

gender_submission.csv
test.csv
train.csv


## Explore Data

In [5]:
# Load the training CSV (it includes the `Survived` target column) into a DataFrame.
df_raw = pd.read_csv(f'{PATH}train.csv')

In [6]:
def display_all(data):
    """Display `data` with pandas' row and column display limits raised to 1000.

    The raised limits are applied through a pandas option context, so they are
    in effect only while `data` is being displayed and the previous settings
    are restored afterwards.

    Parameters
    ----------
    data : object
        Whatever should be handed to `display` (typically a DataFrame or Series).
    """
    wide_limits = ('display.max_rows', 1000, 'display.max_columns', 1000)
    with pd.option_context(*wide_limits):
        display(data)

In [7]:
# Show the last five rows transposed, so each column of the frame is printed as its own row.
display_all(df_raw.tail().T)

Unnamed: 0,886,887,888,889,890
PassengerId,887,888,889,890,891
Survived,0,1,0,1,0
Pclass,2,1,3,1,3
Name,"Montvila, Rev. Juozas","Graham, Miss. Margaret Edith","Johnston, Miss. Catherine Helen ""Carrie""","Behr, Mr. Karl Howell","Dooley, Mr. Patrick"
Sex,male,female,female,male,male
Age,27,19,,26,32
SibSp,0,0,1,0,0
Parch,0,0,2,0,0
Ticket,211536,112053,W./C. 6607,111369,370376
Fare,13,30,23.45,30,7.75


In [8]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
display_all(df_raw.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891,,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0
Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891,891.0,"Perreault, Miss. Anne",1.0,,,,,,,
Sex,891,2.0,male,577.0,,,,,,,
Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891,681.0,1601,7.0,,,,,,,
Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329


## Data Prep
Here we convert the categorical features from strings to categories.

In [9]:
# fastai helper used here to turn the string columns into pandas categoricals.
# NOTE(review): the return value is unused, so this appears to modify df_raw in place — confirm.
train_cats(df_raw)

In [10]:
# Inspect the category labels now attached to the Ticket column.
df_raw.Ticket.cat.categories

Index(['110152', '110413', '110465', '110564', '110813', '111240', '111320',
       '111361', '111369', '111426',
       ...
       'STON/O2. 3101290', 'SW/PP 751', 'W./C. 14258', 'W./C. 14263',
       'W./C. 6607', 'W./C. 6608', 'W./C. 6609', 'W.E.P. 5734', 'W/C 14208',
       'WE/P 5735'],
      dtype='object', length=681)

In [11]:
# Fraction of missing values in each column, listed alphabetically by column name.
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

Age            0.198653
Cabin          0.771044
Embarked       0.002245
Fare           0.000000
Name           0.000000
Parch          0.000000
PassengerId    0.000000
Pclass         0.000000
Sex            0.000000
SibSp          0.000000
Survived       0.000000
Ticket         0.000000
dtype: float64

Save the DataFrame to a file now that it is in a usable state.

In [12]:
# Create the cache directory if it does not exist yet, then store the prepared frame in Feather format.
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/titanic_raw')

## Preprocessing from Feather
You can start here to avoid reloading the data from CSV every time.

In [38]:
# Reload the cached frame from the Feather file written by the save step.
df_raw = pd.read_feather('tmp/titanic_raw')

In [39]:
# fastai helper: presumably separates the 'Survived' target (y) from the feature frame (df)
# and returns `nas`, describing how missing values were handled — confirm against the fastai docs.
df, y, nas = proc_df(df_raw, y_fld='Survived')

In [40]:
# Sanity check: number of rows in the processed feature frame.
len(df)

891

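Now we create training, validation, and dev sets for use in model iteration, before we train on the full dataset. The dev set is a 200-row random sample of the training set, small enough to make experiments fast.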

In [71]:
def split_data(df, y, n_val, random_state=None):
    """Randomly split `df` and `y` into a training part and a validation part.

    Rows are selected by *position*, so the split is correct for any index.
    (Matching randomly drawn positions against index labels silently yields
    fewer than `n_val` validation rows whenever the index is not exactly
    0..len(df)-1.)

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows.
    y : array-like
        Targets aligned row-for-row with `df`; must support boolean-mask
        indexing and `.copy()` (e.g. a NumPy array or pandas Series).
    n_val : int
        Number of rows to place in the validation part.
    random_state : int or None, optional
        Seed for a private RNG, giving a reproducible split. With the default
        ``None`` the draw comes from NumPy's global RNG, exactly as before.

    Returns
    -------
    tuple
        ``(df_train, y_train, df_valid, y_valid)``. Both frames are copies
        with a fresh 0..n-1 index; the targets are copies of the matching rows.

    Raises
    ------
    ValueError
        Raised by NumPy when `n_val` exceeds ``len(df)``.
    """
    # Use the global RNG unless the caller asked for an isolated, seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    chosen = rng.choice(len(df), size=n_val, replace=False)

    # Positional boolean mask: True marks a validation row.
    val_mask = np.zeros(len(df), dtype=bool)
    val_mask[chosen] = True

    return (df[~val_mask].copy().reset_index(drop=True), y[~val_mask].copy(),
            df[val_mask].copy().reset_index(drop=True), y[val_mask].copy())

In [72]:
# Seed NumPy's global RNG so that the random splits are identical on every
# Restart & Run All (cells must be run in order for this to hold).
np.random.seed(42)
# Hold out 200 rows for validation; the rest become the training split.
df_train, y_train, df_valid, y_valid = split_data(df, y, 200)

In [73]:
# Draw a 200-row random sample of the training split to use as the dev set;
# the remaining rows returned by split_data are discarded here.
_, _, df_dev, y_dev = split_data(df_train, y_train, 200)

## Create Model

In [80]:
def print_score(m):
    """Collect accuracy and log-loss figures for a fitted classifier.

    The model is scored against the notebook-level `df_dev`/`y_dev` (reported
    under the ``train_*`` keys) and `df_valid`/`y_valid` (reported under the
    ``valid_*`` keys).

    Parameters
    ----------
    m : object
        Fitted model exposing ``score(X, y)`` and ``predict_proba(X)``.

    Returns
    -------
    dict
        Keys ``train_accuracy``, ``valid_accuracy``, ``train_loss`` and
        ``valid_loss``, plus ``oob_score`` when the model has an
        ``oob_score_`` attribute.
    """
    not_set = object()
    scores = {}
    scores['train_accuracy'] = m.score(df_dev, y_dev)
    scores['valid_accuracy'] = m.score(df_valid, y_valid)
    scores['train_loss'] = metrics.log_loss(y_dev, m.predict_proba(df_dev))
    scores['valid_loss'] = metrics.log_loss(y_valid, m.predict_proba(df_valid))

    # Only models that expose an out-of-bag estimate carry this attribute.
    oob = getattr(m, 'oob_score_', not_set)
    if oob is not not_set:
        scores['oob_score'] = oob
    return scores

In [137]:
# Set random_state, bootstrap, and max_features to make RF deterministic.
# NOTE(review): with bootstrap=False and max_features=None every tree is given all
# rows and all features, so the 10 trees are likely near-identical — confirm this is intended.
m = RandomForestClassifier(n_estimators=10, max_features=None, n_jobs=-1, bootstrap=False, random_state=0)

In [138]:
# Fit on the 200-row dev sample only (not the whole training split) and time the fit.
%time m.fit(df_dev, y_dev)
# Report accuracy and log loss on the fitted dev sample and on the held-out validation set.
print_score(m)

Wall time: 115 ms


{'train_accuracy': 1.0,
 'valid_accuracy': 0.725,
 'train_loss': 9.992007221626413e-16,
 'valid_loss': 6.3676198257653205}