In [180]:
# V2 - June 20 2018
# script to predict titanic kaggle competion survival rates
# author - Benson
# language - python 2

# load the libraries needed
import numpy as np # scientific computing library
import pandas as pd # library for data analysis

from sklearn.ensemble import RandomForestClassifier #  ML classier algorithm
import random # to randomly replace missing values

# load data 
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# show me 5 rows
# train.head(5)
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [181]:
# describe the data
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [182]:
# check missing data in columns
print 'Columns with null in train {}'.format(train.columns[train.isnull().any()])
print 'Columns with null in test {}'.format(test.columns[test.isnull().any()])

Columns with null in train Index([u'Age', u'Cabin', u'Embarked'], dtype='object')
Columns with null in test Index([u'Age', u'Fare', u'Cabin'], dtype='object')


In [183]:
# define variables of interest

# Survived, Pclass, Sex, Age, SibSp, Parch, Fare, mbarked
# To Drop - PassengerId, Name, Ticket, Cabin

variables_to_drop_in_train = ['PassengerId','Name','Ticket','Fare','Cabin']
variables_to_drop_in_test = ['Name','Ticket','Fare','Cabin']

trimmed_train=train.drop(variables_to_drop_in_train, axis=1)
trimmed_test=test.drop(variables_to_drop_in_test, axis=1)

In [184]:
# features engineering
# 1. Embarking - convert to int, replace null with S - has most occurences - 270/418, 664/891
trimmed_train['Embarked']=trimmed_train['Embarked'].fillna('S')
trimmed_test['Embarked']=trimmed_test['Embarked'].fillna('S')

# convert to int 1-S 1-Q 2-C
trimmed_train['Embarked'] = trimmed_train['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})
trimmed_test['Embarked'] = trimmed_test['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})

In [185]:
# 2. Age - replace null with a random number: between mean and std

# get mean ages
train_mean_age = trimmed_train['Age'].mean()
test_mean_age = trimmed_test['Age'].mean()

# Standard deviation
train_sd_age = trimmed_train['Age'].std()
test_sd_age = trimmed_test['Age'].std()

# lower and upper bounds
train_lower_bound = train_mean_age - train_sd_age
train_upper_bound = train_mean_age + train_sd_age

test_lower_bound = test_mean_age - test_sd_age
test_upper_bound = test_mean_age + test_sd_age

# generate random ages within bounds
train_random_age =  random.randint(int(train_lower_bound), int(train_upper_bound))
test_random_age =  random.randint(int(test_lower_bound), int(test_upper_bound))

# convert Age to int and drop na values
trimmed_train['Age'] = trimmed_train['Age'].dropna().astype(int)
trimmed_test['Age'] = trimmed_test['Age'].dropna().astype(int)

# replace nan with random ages
trimmed_train['Age']=trimmed_train['Age'].fillna(train_random_age)
trimmed_test['Age']=trimmed_test['Age'].fillna(test_random_age)


In [186]:
# 3 sex - convert to int
trimmed_train['Sex']=trimmed_train['Sex'].map({'female':0, 'male': 1})
trimmed_test['Sex']=trimmed_test['Sex'].map({'female':0, 'male': 1})


In [189]:
# 4 SibSp
# trimmed_train['SibSp'].unique()
# trimmed_train['SibSp'].describe() # mean is 0.5, min is 0 max 8, sd is 1.1
trimmed_train['SibSp']=trimmed_train['SibSp'].fillna(random.randint(0,1))

# 5 Parch
# trimmed_train['Parch'].unique()
# trimmed_train['Parch'].describe() # mean 0.38, std 0.8, max 6, min 0

# random 1 or 0 all null
trimmed_train['Parch']=trimmed_train['Parch'].fillna(random.randint(0,1))

# 6. NumRelatives - What's the relation between SibSp, Parch - relatives
trimmed_train['NumRelatives'] = trimmed_train['Parch'] + trimmed_train['SibSp'] + 1
trimmed_test['NumRelatives'] = trimmed_test['Parch'] + trimmed_test['SibSp'] + 1

In [190]:
# show me the cleaned trained data
trimmed_train.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,NumRelatives
0,0,3,1,22.0,1,0,0,2
1,1,1,0,38.0,1,0,2,2
2,1,3,0,26.0,0,0,0,1
3,1,1,0,35.0,1,0,0,2
4,0,3,1,35.0,0,0,0,1


In [191]:
# show me the cleaned test data
trimmed_test.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked,NumRelatives
0,892,3,1,34.0,0,0,1,1
1,893,3,0,47.0,1,0,0,2
2,894,2,1,62.0,0,0,1,1
3,895,3,1,27.0,0,0,0,1
4,896,3,0,22.0,1,1,0,3


In [192]:
# training and testing set
X_train = trimmed_train.drop("Survived",axis=1)
Y_train = trimmed_train["Survived"]
X_test  = trimmed_test.drop("PassengerId",axis=1).copy()

# random forest classifier
rf=RandomForestClassifier(n_estimators=100)

# train
rf.fit(X_train, Y_train)

# predict
Y_predict=rf.predict(X_test)

# score
rf.score(X_train, Y_train)

0.9349046015712682

In [193]:
# finally save my prediction
prediction = pd.DataFrame({'PassengerId': trimmed_test['PassengerId'], 'Survived': Y_predict})

# save to csv
prediction.to_csv('prediction.csv', index=False)

# show me some outputs
prediction.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [194]:
print 'Hurrayy!!!'


Hurrayy!!!
