## Predicting survival in Titanic using Random Forests Classifier

<h3> Import Libraries </h3>

In [73]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn .ensemble import RandomForestClassifier

from sklearn.cross_validation import train_test_split

<h3> Load the data </h3>

<h3> Data preparation </h3>

In [2]:
train = pd.read_csv("Dataset/train.csv")
test = pd.read_csv("Dataset/test.csv")

full = train.append(test, ignore_index = True)
titanic = full[ :891 ]

del train, test
print ('Datasets:' , 'full:' , full.shape , 'titanic:' , titanic.shape)

Datasets: full: (1309, 12) titanic: (891, 12)


In [7]:
full.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [6]:
# transform Sex into binary values 0 and 1
sex = pd.Series( np.where(full.Sex == 'male', 1, 0), name = 'Sex')
sex.head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

In [9]:
# transform embarked
embarked = pd.get_dummies( full.Embarked, prefix='Embarked').astype(int)
embarked.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [11]:
#transform Pclass
pclass = pd.get_dummies( full.Pclass, prefix='Pclass').astype(int)
pclass.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


<h3> Check for missing variables and fill missing values </h3>

In [15]:
full.isnull().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

<h3> Feature Engineering </h3>

In [37]:
# Extract titles/salutations in names of passengers
full['Initial'] =0
full['Titles'] =0
for i in full:
    full['Titles']=full['Initial'] = full.Name.str.extract('([A-Za-z]+)\.') # look for strings which lie between A-Z or a-z and followed by a .(dot) using Regex 
full['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don','Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Mrs'],inplace=True)

In [38]:
# Average age depending on their title/salutation
full.groupby('Initial')['Age'].mean()

Initial
Master     5.419344
Miss      21.866377
Mr        32.650000
Mrs       36.905000
Other     44.923077
Name: Age, dtype: float64

In [39]:
# Fill missing values of Age using features extracted from titles/salutations
full.loc[(full.Age.isnull())&(full.Initial=='Mr'),'Age']=33
full.loc[(full.Age.isnull())&(full.Initial=='Mrs'),'Age']=37
full.loc[(full.Age.isnull())&(full.Initial=='Master'),'Age']=5
full.loc[(full.Age.isnull())&(full.Initial=='Miss'),'Age']=22
full.loc[(full.Age.isnull())&(full.Initial=='Other'),'Age']=45

In [40]:
# Transform age into age groups
full['Age_band']=0
full.loc[full['Age']<=16,'Age_band']=0
full.loc[(full['Age']>16)&(full['Age']<=32),'Age_band']=1
full.loc[(full['Age']>32)&(full['Age']<=48),'Age_band']=2
full.loc[(full['Age']>48)&(full['Age']<=64),'Age_band']=3
full.loc[full['Age']>64,'Age_band']=4
Age_band = pd.DataFrame(full['Age_band'])

In [50]:
# Fill missing values of Fare with the average of Fare
full['Fare'] = full.Fare.fillna( full.Fare.mean())

# Transform fare into fare categories
full['Fare_cat'] = 0
full.loc[full['Fare']<=7.91, 'Fare_cat'] =0 
full.loc[(full[ 'Fare' ]>7.91)&(full[ 'Fare' ]<=14.454),'Fare_cat']=1
full.loc[(full[ 'Fare' ]>14.454)&(full[ 'Fare' ]<=31),'Fare_cat']=2
full.loc[(full[ 'Fare' ]>31)&(full[ 'Fare' ]<=513),'Fare_cat']=3
Fare_cat = pd.DataFrame(full['Fare_cat'])

In [55]:
# Transform titles 
title = pd.get_dummies( full.Titles, prefix='Title').astype(int)
title.head()

Unnamed: 0,Title_Capt,Title_Col,Title_Countess,Title_Don,Title_Dona,Title_Dr,Title_Jonkheer,Title_Lady,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [57]:
full['Family_Size'] = 0 
# Family Size
full['Family_Size'] = full['Parch'] + full['SibSp'] 
family = pd.DataFrame(full['Family_Size'])

<h3> Assemble final dataset for modelling </h3>

<h4> Feature Selection </h4>

In [58]:
full_X = pd.concat([Fare_cat,Age_band,embarked,pclass,family,title,sex], axis=1)

<h3> Prepare Training and Test sets </h3>

In [63]:
train_valid_X = full_X[ 0:891 ]
train_valid_Y = titanic.Survived
train_X , valid_X , train_y , valid_y = train_test_split( train_valid_X , train_valid_Y , train_size = .7 )

<h3> Modeling </h3>

In [64]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')

In [65]:
model.fit( train_X , train_y )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

<h3> Evaluation </h3>

In [66]:
print (model.score( train_X , train_y ) , model.score( valid_X , valid_y ))

0.906902086677 0.798507462687
