# Random Forests: Excercises

<hr style="border:2px solid black">

# Imports:

In [12]:
## Imports:

#standard DS imports
import pandas as pd
import numpy as np

#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

#metrics import
from sklearn.metrics import classification_report, confusion_matrix

#custom modules
import acquire
import prepare

<hr style="border:2px solid black">

## Acquire
Plan --> **Acquire** --> Prepare --> Explore --> Model --> Deliver

In [13]:
#get my data
df = acquire.get_titanic_data('titanic_db')

## Prepare
Plan --> Acquire --> **Prepare** --> Explore --> Model --> Deliver

In [14]:
#clean my data
df = prepare.prep_titanic(df)

In [15]:
#split my data into train, validate, test
train, validate, test = prepare.split_function(df, 'survived')

In [16]:
# further split by x and y:
target = 'survived'

# train
x_train = train.drop(columns=[target])
y_train = train[target]

# validate
x_validate = validate.drop(columns=[target])
y_validate = validate[target]

# test
x_test = test.drop(columns=[target])
y_test = test[target]

## Model
Plan --> Acquire --> Prepare --> Explore --> **Model** --> Deliver

## 1.) Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



### Create Object:

In [17]:
#new import!
from sklearn.ensemble import RandomForestClassifier

In [18]:
#make the object
rf = RandomForestClassifier(random_state=666, min_samples_leaf=1, max_depth=10)
rf

### Fit Object:

In [19]:
#building our model on our train values
rf.fit(x_train, y_train)

### Transform:

In [20]:
# make predictions
y_pred = rf.predict(x_train)

## 2.) Evaluate your results using the model score, confusion matrix, and classification report.

### Evaluate:

In [21]:
# Score
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of random forest classifier on training set: 0.98


In [22]:
#find labels in our dataset & sort
labels = sorted(y_train.unique())
labels

[0, 1]

In [23]:
# Confusion Matrix
conf = confusion_matrix(y_train,y_pred)
conf = pd.DataFrame(conf,
            index=[str(label) + '_actual'for label in labels],
            columns=[str(label) + '_predict'for label in labels])
conf

Unnamed: 0,0_predict,1_predict
0_actual,329,0
1_actual,12,193


In [24]:
# Classification Report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       329
           1       1.00      0.94      0.97       205

    accuracy                           0.98       534
   macro avg       0.98      0.97      0.98       534
weighted avg       0.98      0.98      0.98       534



## 3.) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [25]:
TP, TN, FP, FN = 190, 329, 0, 15

In [26]:
all_ = (TP + TN + FP + FN)
accuracy = (TP + TN) / all_
TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)
TNR = TN / (FP + TN)
FNR = FN / (FN + TP)
precision =  TP / (TP + FP)
f1 =  2 * ((precision * recall) / ( precision + recall))
support_pos = TP + FN
support_neg = FP + TN
print(f"Accuracy: {accuracy: .2%}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR: .2%}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR: .2%}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR: .2%}")
print(f"False Negative Rate/Miss Rate: {FNR: .2%}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")


Accuracy:  97.19%

True Positive Rate/Sensitivity/Recall/Power:  92.68%
False Positive Rate/False Alarm Ratio/Fall-out:  0.00%
True Negative Rate/Specificity/Selectivity:  100.00%
False Negative Rate/Miss Rate:  7.32%

Precision/PPV: 1.0
F1 Score: 0.9620253164556963

Support (0): 205
Support (1): 329


## 4.) Run through steps increasing your min_samples_leaf and decreasing your max_depth.



In [27]:
scores_all = []

for x in range(1,11):
    
    # looping through min_samples_leaf front to back 
    # looping through max_depth back to front
    rf = RandomForestClassifier(random_state=123, min_samples_leaf=x, max_depth=11-x) # different if x = 10 vs x = 1
    #fit it
    rf.fit(x_train, y_train)
    #transform it
    train_acc = rf.score(x_train, y_train)
    
    #evaluate on my validate data
    val_acc = rf.score(x_validate, y_validate)
    
    scores_all.append([x, 11-x, train_acc, val_acc])
    
scores_df = pd.DataFrame(scores_all, columns =['min_samples_leaf','max_depth','train_acc','val_acc'])
scores_df

Unnamed: 0,min_samples_leaf,max_depth,train_acc,val_acc
0,1,10,0.97191,0.758427
1,2,9,0.910112,0.769663
2,3,8,0.893258,0.752809
3,4,7,0.872659,0.752809
4,5,6,0.857678,0.775281
5,6,5,0.853933,0.764045
6,7,4,0.844569,0.769663
7,8,3,0.837079,0.769663
8,9,2,0.812734,0.747191
9,10,1,0.79588,0.741573


## 5.) What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

**ANSWER: The best model for in-sample data is the model with min_samples_leaf = 1 and max_depth=10. This is because both increasing the max_depth and decreasing the min_samples_leaf lead to overfitting. A leaf sample of 1 is specific to a single data point, while overly increasing the max_depth will create too many/too specific splits.**

In [28]:
scores_df['difference'] = scores_df.train_acc - scores_df.val_acc
scores_df.sort_values('train_acc', ascending=False)

Unnamed: 0,min_samples_leaf,max_depth,train_acc,val_acc,difference
0,1,10,0.97191,0.758427,0.213483
1,2,9,0.910112,0.769663,0.140449
2,3,8,0.893258,0.752809,0.140449
3,4,7,0.872659,0.752809,0.11985
4,5,6,0.857678,0.775281,0.082397
5,6,5,0.853933,0.764045,0.089888
6,7,4,0.844569,0.769663,0.074906
7,8,3,0.837079,0.769663,0.067416
8,9,2,0.812734,0.747191,0.065543
9,10,1,0.79588,0.741573,0.054307
