# imports

In [70]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from pydataset import data

import env
import os
import acquire
import prepare

# data acquistion and prep

In [7]:
titanic = acquire.get_titanic_data(acquire.get_connection)

In [8]:
titanic = prepare.prep_titanic(titanic)

In [29]:
titanic


Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.2500,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.9250,1,0,0,1
3,1,1,1,0,53.1000,0,0,0,1
4,0,3,0,0,8.0500,1,1,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,1,1,0,1
887,1,1,0,0,30.0000,1,0,0,1
888,0,3,1,2,23.4500,0,0,0,1
889,1,1,0,0,30.0000,1,1,0,0


In [10]:
titanic = titanic.drop(columns=['sex','embark_town'])

# train, validate, test

In [11]:
train,validate,test = prepare.split_data(titanic,'survived')

In [12]:
train.shape, validate.shape, test.shape

((498, 9), (214, 9), (179, 9))

# x and y version of train

In [13]:
x_train = train.drop(columns=['survived'])
y_train = train.survived

In [14]:

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

In [15]:
x_test = test.drop(columns=['survived'])
y_test = test.survived

# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [17]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=42)

In [19]:
rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=10, random_state=42)

# Evaluate your results using the model score, confusion matrix, and classification report.

## score

In [52]:
rf.score(x_train,y_train)

0.9357429718875502

## feature importance

In [22]:
print(rf.feature_importances_)

[0.09543983 0.0526442  0.06021203 0.36923365 0.01694467 0.36171593
 0.0123579  0.03145179]


## make preditions

In [24]:
y_pred = rf.predict(x_train)

## estimate probability 

In [27]:
y_pred_proba = rf.predict_proba(x_train)

## compute the accuracy

### in sample

In [31]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


## confusion matrix

In [35]:
print(confusion_matrix(y_train, y_pred))

[[299   8]
 [ 24 167]]


### Precision

In [36]:
tp = 299 
tn = 167
fp = 8
fn = 24

In [57]:
comb = tp + tn + fp + fn
comb

498

In [38]:
precision = tp / (tp + fp)
precision

0.9739413680781759

### Recall

In [41]:
recall = tp / (tp + fn)
recall

0.9256965944272446

## F1-Score

In [45]:
F1 = 2 * (precision * recall) / (precision + recall)
F1


0.9492063492063493

## classification report

In [46]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       307
           1       0.95      0.87      0.91       191

    accuracy                           0.94       498
   macro avg       0.94      0.92      0.93       498
weighted avg       0.94      0.94      0.94       498



# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [63]:
accuracy = (tp + tn)/ comb
print(f"Accuracy: {accuracy}")

true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate}")

print(f"Precision: {precision}")

print(f"Recall: {recall}")

print(f"F1 Score: {f1_score}")

support_pos = tp + fn
print(f"Support (0): {support_pos}")

support_neg = fp + tn
print(f"Support (1): {support_neg}")

Accuracy: 0.9357429718875502
True Positive Rate: 0.9256965944272446
False Positive Rate: 0.045714285714285714
True Negative Rate: 0.9542857142857143
False Negative Rate: 0.07430340557275542
Precision: 0.9739413680781759
Recall: 0.9256965944272446
F1 Score: 0.9492063492063493
Support (0): 323
Support (1): 175


# Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [67]:
runs = []

# clarifies different max depth and min_samples_leaf
for i in range (1,10):
    for k in range(2,10):
        rft = RandomForestClassifier(max_depth=k, 
                                    min_samples_leaf=i, 
                                    random_state=123)
        # fit the model ( train)
        rft = rft.fit(x_train,y_train)
        
        # eval
        in_samp_acc = rft.score(x_train, y_train)
        
        out_samp_acc = rft.score(x_validate,y_validate)
        
        returns = {
            "min_samples_per_leaf": i,
            "max_depth": k,
            "train_accuracy": in_samp_acc,
            "validate_accuracy": out_samp_acc 
        }
        
        runs.append(returns)

In [71]:
df = pd.DataFrame(runs)

In [75]:
df['difference'] = df.train_accuracy - df.validate_accuracy

In [78]:
df.sort_values(by=['validate_accuracy'], ascending=False).head(10)

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
49,7,3,0.817269,0.803738,0.013531
9,2,3,0.819277,0.803738,0.015539
2,1,4,0.835341,0.803738,0.031603
57,8,3,0.819277,0.803738,0.015539
6,1,8,0.921687,0.803738,0.117948
7,1,9,0.931727,0.803738,0.127989
1,1,3,0.819277,0.803738,0.015539
25,4,3,0.817269,0.799065,0.018204
33,5,3,0.819277,0.799065,0.020212
65,9,3,0.813253,0.799065,0.014188
