# Kaggle Titanic workflow  
Predicting survival from the sinking of the titanic using passenger data  

Aine Fairbrother-Browne  
12/20  

## Setup

In [46]:
# import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# import ML libraries 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold # for K-fold cross validation
from sklearn.model_selection import cross_val_score # score evaluation
from sklearn.model_selection import cross_val_predict # prediction
from sklearn.metrics import confusion_matrix

# settings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 7.97 ms


## Import data

In [13]:
# import data
os.chdir("/home/abrowne/ML/kaggle_titanic_survival/")

# test data
train_data = pd.read_csv(open("./data/train.csv"), encoding="utf-8", engine='python', index_col=0, header=0)

# train data
test_data = pd.read_csv(open("./data/test.csv"), encoding="utf-8", engine='python', index_col=0, header=0)

time: 35.9 ms


## View data

| Variable | Definition                                 | Key                                            |
|----------|--------------------------------------------|------------------------------------------------|
| survival | Survival                                   | 0 = No, 1 = Yes                                |
| pclass   | Ticket class                               | 1 = 1st, 2 = 2nd, 3 = 3rd                      |
| sex      | Sex                                        |                                                |
| Age      | Age in years                               |                                                |
| sibsp    | # of siblings / spouses aboard the Titanic |                                                |
| parch    | # of parents / children aboard the Titanic |                                                |
| ticket   | Ticket number                              |                                                |
| fare     | Passenger fare                             |                                                |
| cabin    | Cabin number                               |                                                |
| embarked | Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |


* age:  
Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5  

* sibsp:  
The dataset defines family relations in this way...  
Sibling = brother, sister, stepbrother, stepsister  
Spouse = husband, wife (mistresses and fiancés were ignored)  

* parch:  
The dataset defines family relations in this way...  
Parent = mother, father  
Child = daughter, son, stepdaughter, stepson  
Some children travelled only with a nanny, therefore parch=0 for them  

In [20]:
print(train_data.shape) # 891 passengers, 11 features
train_data.head(10)

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


time: 25 ms


## Dealing with missing data

## Feature engineering

In [41]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095
time: 5.09 ms


## Trying models

### 1. Random forest classifier  

In [76]:
# import the RFC from sklearn
from sklearn.ensemble import RandomForestClassifier

# function to run rfc for fecture vectors
def run_rfc(feature_vector, save_output=False):
    
    print('features selected: ',feature_vector)
    
    # define features that we want to use to predict survival
    features = feature_vector

    # selecting features to train the model on
    pred_features = pd.get_dummies(train_data[features])
    target = train_data["Survived"]

    # splitting training set test:train 30:70
    X_train,X_test,y_train,y_test = train_test_split(pred_features, target, test_size=0.3, random_state=42)

    # initialise RFC model
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

    # fit model
    model.fit(X_train_dummies, y_train)

    # get predictions 
    predictions = model.predict(X_test_dummies)

    print('The accuracy of the Random Forest Classifier is', round(accuracy_score(predictions, y_test)*100,2))
    
    if save_output==True:
        # export 
        output = pd.DataFrame({'PassengerId': test_data.index, 'Survived': predictions})
        output.to_csv('./submissions/submission1.csv', index=False)
        
    
    

time: 3.8 ms


In [77]:
run_rfc(feature_vector = ["Pclass", "Sex", "SibSp", "Parch"])

features selected:  ['Pclass', 'Sex', 'SibSp', 'Parch']
The accuracy of the Random Forest Classifier is 80.6
time: 239 ms
