# Random Forest Exercises

This .inpyb file will eventually be joined to model.ipynb per the curriculum instructions, but because model.ipynb needs to be rebuilt from the bottom up, I will store the solutions here for now, and copy to model.ipynb once that file is running as it should.

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

from env import user, password, host
from acquire import get_titanic_data
from prepare import prep_titanic

#### Q1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [2]:
df = prep_titanic()
dummy_df = pd.get_dummies(df['sex']).drop(columns=['male'])
df = pd.concat([df, dummy_df], axis=1).drop(columns=['sex'])
X = df[['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'Q', 'S', 'female']]
X['age'].fillna(inplace=True, value=X['age'].mean())
y = df[['survived']]
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123, stratify=y.survived)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123, stratify=y_train_validate.survived)

In [5]:
rf1 = RandomForestClassifier(random_state = 123, min_samples_leaf = 1, max_depth = 20)
rf1.fit(X_train, y_train)
print(rf1.feature_importances_)
print(X_train.columns)

[0.08958006 0.2462471  0.05101585 0.03355855 0.28881728 0.0234286
 0.01138673 0.02310322 0.23286261]
Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'Q', 'S', 'female'], dtype='object')


#### Q2. Evaluate your results using the model score, confusion matrix, and classification report.

In [7]:
y_pred1 = rf1.predict(X_train)
y_pred1_proba = rf1.predict_proba(X_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.99


In [9]:
print(confusion_matrix(y_train, y_pred1))

[[305   2]
 [  4 186]]


In [10]:
print(classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       307
           1       0.99      0.98      0.98       190

    accuracy                           0.99       497
   macro avg       0.99      0.99      0.99       497
weighted avg       0.99      0.99      0.99       497



#### Q3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [12]:
confusion_matrix1 = confusion_matrix(y_train, y_pred1)

print(f'Positive is defined as dying; Negative is defined as surviving')
true_positive_count = confusion_matrix1[0][0]
print(f'True positives: {true_positive_count}')
false_negative_count = confusion_matrix1[1][0]
print(f'False negatives: {false_negative_count}')
false_positive_count = confusion_matrix1[0][1]
print(f'False positives: {false_positive_count}')
true_negative_count = confusion_matrix1[1][1]
print(f'True negatives: {true_negative_count}')

Positive is defined as dying; Negative is defined as surviving
True positives: 305
False negatives: 4
False positives: 2
True negatives: 186


#### Q4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [13]:
rf2 = RandomForestClassifier(random_state = 123, min_samples_leaf = 5, max_depth = 3)
rf2.fit(X_train, y_train)
print(rf2.feature_importances_)
print(X_train.columns)

[0.14754544 0.06312049 0.04542362 0.02189998 0.2011621  0.05112113
 0.00816355 0.00850593 0.45305775]
Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'Q', 'S', 'female'], dtype='object')


In [14]:
y_pred2 = rf2.predict(X_train)
y_pred2_proba = rf2.predict_proba(X_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.83


In [15]:
print(confusion_matrix(y_train, y_pred2))

[[288  19]
 [ 66 124]]


In [16]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87       307
           1       0.87      0.65      0.74       190

    accuracy                           0.83       497
   macro avg       0.84      0.80      0.81       497
weighted avg       0.83      0.83      0.82       497



#### Q5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [17]:
print('Model 1: min_samples_leaf = 1, max_depth = 20')
print(classification_report(y_train, y_pred1))
print('Model 2: min_samples_leaf = 5, max_depth = 3')
print(classification_report(y_train, y_pred2))

Model 1: min_samples_leaf = 1, max_depth = 20
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       307
           1       0.99      0.98      0.98       190

    accuracy                           0.99       497
   macro avg       0.99      0.99      0.99       497
weighted avg       0.99      0.99      0.99       497

Model 2: min_samples_leaf = 5, max_depth = 3
              precision    recall  f1-score   support

           0       0.81      0.94      0.87       307
           1       0.87      0.65      0.74       190

    accuracy                           0.83       497
   macro avg       0.84      0.80      0.81       497
weighted avg       0.83      0.83      0.82       497



In [None]:
# Model 1 Performs better on every metric possible
# It is very likely that model 1 is overfitting the training data and would generalize very poorly to a validate or test set
# This is because the depth is so high and the minimum samples leaf setting is so low that each leaf in model 1 contains a very small number of observations

#### Q6. After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [19]:
# Age, Fare, and Female have the greatest weight in both models generated so far, so I will create a third model only using those features
X_train.drop(columns=['pclass', 'sibsp', 'parch', 'alone', 'Q', 'S'], inplace=True)
X_train.head()

Unnamed: 0,age,fare,female
583,36.0,40.125,0
337,41.0,134.5,1
50,7.0,39.6875,0
218,32.0,76.2917,1
31,29.642093,146.5208,1


In [20]:
rf3 = RandomForestClassifier(random_state = 123, min_samples_leaf = 5, max_depth = 3)
rf3.fit(X_train, y_train)
print(rf3.feature_importances_)
print(X_train.columns)

[0.17394507 0.34500947 0.48104546]
Index(['age', 'fare', 'female'], dtype='object')


In [22]:
y_pred3 = rf3.predict(X_train)
y_pred3_proba = rf3.predict_proba(X_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf3.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.80


In [23]:
print(confusion_matrix(y_train, y_pred3))

[[278  29]
 [ 71 119]]


In [24]:
print(classification_report(y_train, y_pred3))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       307
           1       0.80      0.63      0.70       190

    accuracy                           0.80       497
   macro avg       0.80      0.77      0.78       497
weighted avg       0.80      0.80      0.79       497



In [26]:
# Now we will compare the performance of all three models on validate
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(rf1.score(X_validate, y_validate)))
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))


Accuracy of Decision Tree classifier on validate set: 0.84
Accuracy of Decision Tree classifier on validate set: 0.79


In [28]:
X_validate.drop(columns=['pclass', 'sibsp', 'parch', 'alone', 'Q', 'S'], inplace=True)
X_validate.head()

Unnamed: 0,age,fare,female
610,39.0,31.275,1
424,18.0,20.2125,0
568,29.642093,7.2292,0
701,35.0,26.2875,0
101,29.642093,7.8958,0


In [29]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(rf3.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.75


Surprisingly, Model1 performed the best by a large margin despite the risk of overfitting. 

#### Recommendation: pass Model 1 to the testing set. 