In [1]:
#Import useful packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#For easier reading of numbers
pd.set_option('display.precision',2)

In [3]:
#Import the training data
data = pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#For this Random Forest Classifier we decide which columns to make use of and discard the rest
#We are deciding not to look at the names of the passengers, the ticket, nor the cabin
data_train = data[['Survived', 'Pclass',  'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

#View the completeness of the data (this is only looking for NaNs, None, Null)
data_train.count()

Survived    891
Pclass      891
Sex         891
Age         714
SibSp       891
Parch       891
Fare        891
Embarked    889
dtype: int64

In [5]:
#The above shows that the age of 177 passengers is missing, and that the embarkation points are missing for two passengers
#Missing data can be resolved by either suplimenting an relatively neutral new value (mean, mode, etc.), or by dropping those data points

#Resolve missing age data by assigning mean value to passengers missing the age information
data_train = data_train.fillna(value={'Age' : np.mean(data_train['Age'])})
data_train.count()

Survived    891
Pclass      891
Sex         891
Age         891
SibSp       891
Parch       891
Fare        891
Embarked    889
dtype: int64

In [6]:
#Resolve missing embarkation data by removing data (only two data points)
#dropna() will remove the rows with NaNs, which we know are in the 'Embarked' col
data_train = data_train.dropna()

In [7]:
data_train.count()

Survived    889
Pclass      889
Sex         889
Age         889
SibSp       889
Parch       889
Fare        889
Embarked    889
dtype: int64

In [8]:
#Our Random Forest Classifier requires numerical data as input to the model, so we need to use "one-hot-encoding"

#Resolve 'Sex' and 'Embarked' columns non-numerical data using Pandas' wonderful .get_dummies()
data_train = pd.get_dummies(data_train)

In [9]:
data_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.28,1,0,1,0,0
2,1,3,26.0,0,0,7.92,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [10]:
#Define the Random Forest Classifier Model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)

In [11]:
#train / fit the model
model.fit(data_train.drop(labels='Survived', axis=1).values, data_train['Survived'].values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
estimator = model.estimators_[15]

In [13]:
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = data_train.drop(labels='Survived', axis=1).columns.values,
                class_names = ['Survived', 'Perished'],
                rounded = True, proportion = False, 
                precision = 2, filled = True)

In [14]:
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()


In [15]:
tree = graphviz.Source(dot_graph, format='pdf')

In [16]:
tree.render(filename="Titanic-Tree")

'Titanic-Tree.pdf'

In [17]:
data_test = pd.read_csv('test.csv')
data_test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [18]:
#Collect relevant data, including PassengerId
data_test = data_test[['PassengerId', 'Pclass',  'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
#Have a look at the completeness of the data (this is only looking for Nans)
data_test.count()

PassengerId    418
Pclass         418
Sex            418
Age            332
SibSp          418
Parch          418
Fare           417
Embarked       418
dtype: int64

In [19]:
#Set missing age and fare information with their mean values
data_test = data_test.fillna(value={'Age':np.mean(data_test['Age']), 
                                    'Fare':np.mean(data_test['Fare'])})
data_test = pd.get_dummies(data_test)
data_test.count()

PassengerId    418
Pclass         418
Age            418
SibSp          418
Parch          418
Fare           418
Sex_female     418
Sex_male       418
Embarked_C     418
Embarked_Q     418
Embarked_S     418
dtype: int64

In [20]:
#Predict the survival of the passenger by adding a new data column
data_test['Survived'] = model.predict(data_test.drop(labels='PassengerId', axis=1))

In [21]:
#Print a csv that is formatted for submission to the Kaggle competition
#CSV won't contain an index column, only the PassengerId and whether or not we predict that they survived
data_test.to_csv('submission_RFC_improved.csv',index=False,columns=['PassengerId', 'Survived'])