## One last try: remove alone and keep sibsp & parch

# Part 1: Create a decision tree model of the Titanic dataset that predicts survival from seaborn.

#-------- Import Packages --------#

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
Titanic = sns.load_dataset('titanic')
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


#-------- Data Wrangling --------#

### First, I need to drop any variables that are redundant or unnecessary.

### The following columns will be dropped for redundancy: 'class' (same as 'pclass'), 'who' (same as 'sex'), 'adult_male' (same as 'sex'), 'embark_town' (same as embarked), 'alive' (same as survived). 

In [3]:
Titanic2 = Titanic.drop(['class', 'who', 'adult_male', 'embark_town', 'alive', 'alone'], axis=1)

In [4]:
Titanic2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


### Next, recoding

In [5]:
Titanic2.sex.value_counts()

sex
male      577
female    314
Name: count, dtype: int64

### 'sex' will be coded female = 0 and male = 1

In [6]:
def sex (series):
    if series == 'male':
        return 1
    if series == 'female':
        return 0
Titanic2['sexR'] = Titanic2['sex'].apply(sex)

In [7]:
Titanic2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,sexR
0,0,3,male,22.0,1,0,7.25,S,,1
1,1,1,female,38.0,1,0,71.2833,C,C,0
2,1,3,female,26.0,0,0,7.925,S,,0
3,1,1,female,35.0,1,0,53.1,S,C,0
4,0,3,male,35.0,0,0,8.05,S,,1


In [8]:
Titanic2.embarked.value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

### Recode 'embarked': S = 0, C =1, Q =2

In [9]:
def embarked (series):
    if series == 'S':
        return 0
    if series == 'C':
        return 1
    if series =="Q":
        return 2
Titanic2['embarkedR'] =Titanic2['embarked'].apply(embarked)

In [10]:
Titanic2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,sexR,embarkedR
0,0,3,male,22.0,1,0,7.25,S,,1,0.0
1,1,1,female,38.0,1,0,71.2833,C,C,0,1.0
2,1,3,female,26.0,0,0,7.925,S,,0,0.0
3,1,1,female,35.0,1,0,53.1,S,C,0,0.0
4,0,3,male,35.0,0,0,8.05,S,,1,0.0


In [11]:
Titanic2.deck.value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

### Recode 'deck': A = 0, B = 1, C = 2, D = 3, E = 4, F = 5, G =6

In [12]:
def deck (series):
    if series == 'A':
        return 0
    if series == 'B':
        return 1
    if series == 'C':
        return 2
    if series == 'D':
        return 3
    if series == 'E':
        return 4
    if series == 'F':
        return 5
    if series == 'G':
        return 6
Titanic2['deckR'] = Titanic2['deck'].apply(deck)

In [90]:
Titanic2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,alone,sexR,embarkedR,deckR
0,0,3,male,22.0,1,0,7.25,S,,False,1,0.0,
1,1,1,female,38.0,1,0,71.2833,C,C,False,0,1.0,2.0
2,1,3,female,26.0,0,0,7.925,S,,True,0,0.0,
3,1,1,female,35.0,1,0,53.1,S,C,False,0,0.0,2.0
4,0,3,male,35.0,0,0,8.05,S,,True,1,0.0,


### I'm going to drop the old columns for a nice clean dataset.

In [14]:
Titanic3 = Titanic2.drop(['sex', 'embarked', 'deck'], axis=1)

In [15]:
Titanic3.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sexR,embarkedR,deckR
0,0,3,22.0,1,0,7.25,1,0.0,
1,1,1,38.0,1,0,71.2833,0,1.0,2.0
2,1,3,26.0,0,0,7.925,0,0.0,
3,1,1,35.0,1,0,53.1,0,0.0,2.0
4,0,3,35.0,0,0,8.05,1,0.0,


### Last wrangling step is to drop NA values.

In [16]:
Titanic3.dropna(inplace=True)

In [17]:
Titanic3.head(25)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sexR,embarkedR,deckR
1,1,1,38.0,1,0,71.2833,0,1.0,2
3,1,1,35.0,1,0,53.1,0,0.0,2
6,0,1,54.0,0,0,51.8625,1,0.0,4
10,1,3,4.0,1,1,16.7,0,0.0,6
11,1,1,58.0,0,0,26.55,0,0.0,2
21,1,2,34.0,0,0,13.0,1,0.0,3
23,1,1,28.0,0,0,35.5,1,0.0,0
27,0,1,19.0,3,2,263.0,1,0.0,2
52,1,1,49.0,1,0,76.7292,0,1.0,3
54,0,1,65.0,0,1,61.9792,1,1.0,1


### This view looks pretty nice, BUT there was a lot of data dropped.

### Now we need to define the x & y variables that will be fed into the model

In [18]:
x = Titanic3[['pclass', 'age', 'sibsp', 'parch', 'fare', 'sexR', 'embarkedR', 'deckR']]
y = Titanic3['survived']

## Test Train Split

### Now the x & y variable will be fed into the model. I will go with 30% reserve for testing, since that's pretty standard.

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

## Create Initial Decision Tree

In [20]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(x_train, y_train)

## Look at the ouput:

### First, predictions in a confusion matrix

In [21]:
treePredictions = decisionTree.predict(x_test)
print(confusion_matrix(y_test, treePredictions))

[[ 6 12]
 [ 9 28]]


### Not terribly great.

In [22]:
print(classification_report(y_test, treePredictions))

              precision    recall  f1-score   support

           0       0.40      0.33      0.36        18
           1       0.70      0.76      0.73        37

    accuracy                           0.62        55
   macro avg       0.55      0.55      0.55        55
weighted avg       0.60      0.62      0.61        55



### Here deaths were predcited with 40% accuracy and survival with 70%, but overall is only 60%. 

### I wonder if a smaller test set will help to improve this?

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [24]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(x_train, y_train)

In [25]:
treePredictions = decisionTree.predict(x_test)
print(confusion_matrix(y_test, treePredictions))

[[ 8  6]
 [ 2 21]]


In [26]:
print(classification_report(y_test, treePredictions))

              precision    recall  f1-score   support

           0       0.80      0.57      0.67        14
           1       0.78      0.91      0.84        23

    accuracy                           0.78        37
   macro avg       0.79      0.74      0.75        37
weighted avg       0.79      0.78      0.77        37



## Ohhh... This might be the best one yet! With 20% reserved for testing we get 80% accuracy on predicting deaths, 78% on predicting survival, and overall 79%.

# Random Forest Model

### Beware, for now you enter the forest, thick with decision trees. If you get lost here your mind may wander and never return. Please enter with caution.

## Initial Random Forest Model

In [27]:
forest = RandomForestClassifier(n_estimators=500)
forest.fit(x_train, y_train)

## Evaluate Model Fit

In [28]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[ 5  9]
 [ 1 22]]
              precision    recall  f1-score   support

           0       0.83      0.36      0.50        14
           1       0.71      0.96      0.81        23

    accuracy                           0.73        37
   macro avg       0.77      0.66      0.66        37
weighted avg       0.76      0.73      0.70        37



### Looks like prediction accuracy is 83% for death, 71% for survival, and 76% overall.

## Hypertuning the model

In [29]:
n_estimators_array = [1, 4, 5, 8, 10, 20, 50, 75, 100, 250, 500]
results = []
for n in n_estimators_array:
    forest = RandomForestClassifier(n_estimators=n, random_state=76)
    forest.fit(x_train, y_train)
    result = accuracy_score(y_test, forest.predict(x_test))
    results.append(result) 
    print(n, ':', result)

1 : 0.7027027027027027
4 : 0.7297297297297297
5 : 0.7567567567567568
8 : 0.7567567567567568
10 : 0.7567567567567568
20 : 0.7837837837837838
50 : 0.7567567567567568
75 : 0.7567567567567568
100 : 0.7027027027027027
250 : 0.6756756756756757
500 : 0.7297297297297297


### Looks like 20 gives us 78% accuracy, so we'll run with that. But we won't run with scissors.

### Continuing to tune our other 3 parameters

### First we feed:

In [30]:
max_features = ['sqrt', None, 'log2']

In [31]:
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, None]

In [32]:
min_samples_leaf = [1, 2, 4]

In [33]:
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'max_features': ['sqrt', None, 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, None], 'min_samples_leaf': [1, 2, 4]}


In [34]:
rf = RandomForestClassifier(n_estimators=10)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 90, cv = 3, random_state=42)

In [35]:
rf_random.fit(x_train, y_train)

### Then we read:

In [36]:
rf_random.best_params_

{'min_samples_leaf': 1, 'max_features': None, 'max_depth': 40}

### Looks like our best model is produced with n_estimators = 10, min_samles_leaf: 1, max_features = None, & max_depth = 40

### Note and huge discovery! Each time you rerun the code, a different output is generated. I depends on the exact model you run. This makes sense because each time the forest is generated it starts at a different point and this creates different results.

## Now to run the model with the best parameters numbers.

In [37]:
forest = RandomForestClassifier(n_estimators=10, min_samples_leaf=1, max_features=None, max_depth=40)
forest.fit(x_train, y_train)

## Prediction & Classification Report

In [38]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[ 7  7]
 [ 2 21]]
              precision    recall  f1-score   support

           0       0.78      0.50      0.61        14
           1       0.75      0.91      0.82        23

    accuracy                           0.76        37
   macro avg       0.76      0.71      0.72        37
weighted avg       0.76      0.76      0.74        37



## This looks like our best model overall: Accuracy of prediction is 78% of deaths and 75% of survivals with overall accuracy of 76%. 

## Features Importances:

In [39]:
feature_importances = pd.Series(forest.feature_importances_, index=x.columns)
feature_importances

pclass       0.007843
age          0.310802
sibsp        0.042607
parch        0.019708
fare         0.247610
sexR         0.249810
embarkedR    0.014792
deckR        0.106828
dtype: float64

In [40]:
feature_importances.sort_values(inplace=True, ascending=False)
print(feature_importances)

age          0.310802
sexR         0.249810
fare         0.247610
deckR        0.106828
sibsp        0.042607
parch        0.019708
embarkedR    0.014792
pclass       0.007843
dtype: float64


### Overall, it looks like age and sex have the biggest affect on survival. This makes sense in an age of chivalry. Men and older people would have been much more willing to sacrifice themselves for women and children over 100 years ago. From there, how much you paid for your ticket and what deck you were on when the event occurred seem to affect your survival next. Again, this makes sense, how much you paid would have dictated where on the ship you were allowed to hang out. Being farther below the upper decks where the lifeboats were located would have reduced your chances of escaping the ship. From there, other factors affected your rate of survival very little, surprisingly. 

### In this model: DT accuracy rates were 80% death, 78% survival, and 79% overall. While RF rates were 78% death, 75% survival, and 76% overall. These rates are actually very close. My guess is this has to do with the size of the sample, as well as the number of cases dropped for NA values. Some different cleanup methods could be used to see if this could affect the prediction rates.