In [46]:
# import packages 
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
# import dataset
titanic = pd.read_csv('titanic.csv')

In [20]:
# data quality & cleaning
# DQ checks
# Finding null values per column
missing = pd.DataFrame({'Column':[],'Number of missing datapoints':[]})
for i in range(len(titanic.columns)):
    missing.loc[i,['Column']] = titanic.columns[i]
    missing.loc[i,['Number of missing datapoints']] = titanic.iloc[:,i].isnull().sum()

# look at the # of missing points in the first ten columns
missing = missing.sort_values(by = ['Number of missing datapoints'],ascending = False)
missing.reset_index(inplace = True, drop = True)
missing
# replace NA with mean imputation where possible
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
# generate dummy variable for sex
titanic['Sex_dummy'] = np.where(titanic['Sex'].eq('female'),1,0)
# separate dependent and independent variable which are usable
y = titanic['Survived']
X = titanic[['Pclass','Age','SibSp','Parch','Sex_dummy']]

In [21]:
# split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

In [23]:
# Bagged Tree for the titanic dataset
base = DecisionTreeClassifier(max_depth=5)
ensemble = BaggingClassifier(base_estimator=base, n_estimators=100, random_state=7)

base.fit(X_train,y_train)
ensemble.fit(X_train,y_train)

# Test accuracy (% of true positives and true negatives out of the total)
print("Accuracy base:",base.score(X_test, y_test))
print("Accuracy ensemble:",ensemble.score(X_test, y_test))

Accuracy base: 0.8208955223880597
Accuracy ensemble: 0.8171641791044776


In [25]:
# Random Forest for the titanic dataset
forest = RandomForestClassifier(n_estimators=100, random_state=7)
forest.fit(X_train, y_train)
print("Accuracy forest:",forest.score(X_test, y_test))

Accuracy ensemble: 0.8208955223880597


In [41]:
# Boosted Tree
boost = AdaBoostClassifier(base_estimator = base,n_estimators = 100, random_state = 7)
boost.fit(X_train, y_train)
print("Accuracy ensemble:",boost.score(X_test, y_test))

Accuracy ensemble: 0.8171641791044776


In [26]:
# determine which of the features is the one that contributes the most to predicting whether a passenger survives or not.
feature_importance = pd.Series(forest.feature_importances_).sort_values(ascending=False)
feature_importance

1    0.383247
4    0.332609
0    0.154125
2    0.076831
3    0.053188
dtype: float64

In [42]:
# Only 1st 3 features contribute a lot to the predicting passenger survival
X2 = X.iloc[:,[1,4,0]]

# retrain the model
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state= 4) # 70% training and 30% test
base.fit(X_train,y_train)
forest.fit(X_train,y_train)

print("Accuracy base:",base.score(X_test, y_test))
print("Accuracy forest:",forest.score(X_test, y_test))

Accuracy base: 0.8432835820895522
Accuracy forest: 0.8246268656716418


In [48]:
# tune the parameters n_estimators and max_depth.
n_estimators = [int(i) for i in np.linspace(start = 10, stop = 100, num = 10)]
max_depth = [i for i in range(1,6)]
param_dict = {'n_estimators': n_estimators, 'max_depth':max_depth}
rf_grid = GridSearchCV(estimator = forest, param_grid = param_dict)
rf_grid.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=7),
             param_grid={'max_depth': [1, 2, 3, 4, 5],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]})

In [49]:
# Report the accuracy of all models and report which model performed the best, including the values for n_estimators and max_depth that the best model had
rf_grid.best_params_

{'max_depth': 4, 'n_estimators': 10}

In [50]:
rf_grid.score(X_test,y_test)

0.832089552238806

In [55]:
# rerun models with these parameters
forest = RandomForestClassifier(n_estimators=10, random_state=7,max_depth = 4)
forest.fit(X_train,y_train)
print(f"The score on training data with optimisted hyper-parameters is {forest.score(X_train,y_train)}.")
print(f"The score on testing data with optimisted hyper-parameters is {forest.score(X_test,y_test)}.")

The score on training data with optimisted hyper-parameters is 0.8057784911717496.
The score on testing data with optimisted hyper-parameters is 0.832089552238806.
