In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn as sk

import warnings
warnings.filterwarnings('ignore')

from env import get_db_url, user, password, host

import acquire
import prepare
import explore

# pandas display preferences
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 3)
#pd.option_context('display.max_rows', None)

line_break = ('-' * 50)
line_break_2 = ('\n' + '=' * 50 + '\n')

# Titanic Random Forest



    Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

    Evaluate your results using the model score, confusion matrix, and classification report.

    Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

    Run through steps increasing your min_samples_leaf and decreasing your max_depth.

    What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [2]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)

Reading from local CSV...


In [3]:
target = 'survived'
positive = 1

In [4]:
train, test, validate = prepare.train_test_validate_split(df, target=target)

train	 n = 498
test	 n = 179
validate n = 214


In [5]:
x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [6]:
clf = RandomForestClassifier(random_state=42, min_samples_leaf=1, max_depth=10)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

In [7]:
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
train_results['predicted'] = y_pred

### 2. Evaluate your results using the model score, confusion matrix, and classification report.


In [8]:
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

In [9]:
print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Confusion Matrix: \n', confusion_matrix(y_train, y_pred))
print(line_break)
print('Classification Report: \n', class_report)

Model Score: 0.93
--------------------------------------------------
Confusion Matrix: 
 [[299   8]
 [ 27 164]]
--------------------------------------------------
Classification Report: 
                  0        1  accuracy  macro avg  weighted avg
precision    0.917    0.953      0.93      0.935         0.931
recall       0.974    0.859      0.93      0.916         0.930
f1-score     0.945    0.904      0.93      0.924         0.929
support    307.000  191.000      0.93    498.000       498.000


### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [10]:
accuracy = clf.score(x_train, y_train)
precision = sk.metrics.precision_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
support_1 = int(y_train[y_train == 1].count())
support_0 = int(y_train[y_train == 0].count())

n=len(train)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1_score:.2f}')
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy: 0.93
Precision: 0.95
Recall: 0.86
F1 Score: 0.90
Support 1: 191
Support 0: 307

True Postive Rate:	0.33
False Positive Rate:	0.02
True Negative Rate:	0.60
False Negative Rate:	0.05


### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.


In [11]:
min_samples_leaf = 1
max_depth = 10

for i in range(1, 11):
    
    print(f'FOR MIN_SAMPLES_LEAF = {min_samples_leaf}\nAND MAX_DEPTH = {max_depth}')
    print()

    # 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample)
    clf = RandomForestClassifier(random_state=42, 
                                 min_samples_leaf=min_samples_leaf, 
                                 max_depth=max_depth)
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_train)
    
    # 2. Evaluate your results using the model score, confusion matrix, and classification report.
    print(f'Model Score: {clf.score(x_train, y_train):.2f}')
    print(line_break)
    print('Confusion Matrix: \n', confusion_matrix(y_train, y_pred))
    print(line_break)
    print('Classification Report: \n', pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)))
    print(line_break)
    
    # 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
    accuracy = clf.score(x_train, y_train)
    precision = sk.metrics.precision_score(y_train, y_pred, pos_label=positive)
    recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
    f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
    support_1 = int(y_train[y_train == 1].count())
    support_0 = int(y_train[y_train == 0].count())

    n=len(train)
    tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
    fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
    tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
    fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n

    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1_score:.2f}')
    print(f'Support 1: {support_1}')
    print(f'Support 0: {support_0}')
    print()
    print(f'True Postive Rate:\t{tp_rate:.2f}')
    print(f'False Positive Rate:\t{fp_rate:.2f}')
    print(f'True Negative Rate:\t{tn_rate:.2f}')
    print(f'False Negative Rate:\t{fn_rate:.2f}')
    
    
    print(line_break_2)
    
    min_samples_leaf += 1
    max_depth -= 1

FOR MIN_SAMPLES_LEAF = 1
AND MAX_DEPTH = 10

Model Score: 0.93
--------------------------------------------------
Confusion Matrix: 
 [[299   8]
 [ 27 164]]
--------------------------------------------------
Classification Report: 
                  0        1  accuracy  macro avg  weighted avg
precision    0.917    0.953      0.93      0.935         0.931
recall       0.974    0.859      0.93      0.916         0.930
f1-score     0.945    0.904      0.93      0.924         0.929
support    307.000  191.000      0.93    498.000       498.000
--------------------------------------------------
Accuracy: 0.93
Precision: 0.95
Recall: 0.86
F1 Score: 0.90
Support 1: 191
Support 0: 307

True Postive Rate:	0.33
False Positive Rate:	0.02
True Negative Rate:	0.60
False Negative Rate:	0.05


FOR MIN_SAMPLES_LEAF = 2
AND MAX_DEPTH = 9

Model Score: 0.88
--------------------------------------------------
Confusion Matrix: 
 [[293  14]
 [ 45 146]]
--------------------------------------------------
C

### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?


In [12]:
# create empty dataframe to store model results
model_results = pd.DataFrame(columns=['model_number', 'metric_type', 'sample_type', 'score'])

# empty dataframe to store information about the model itself
model_info = pd.DataFrame(columns=['model_number', 'min_samples_leaf', 'max_depth'])
######################################################################################
# store baseline metrics

model_number = 'baseline'

# store info about the model
dct = {'model_number': model_number,
       'min_samples_leaf': np.nan,
       'max_depth': np.nan}
model_info = model_info.append(dct, ignore_index=True)

# establish baseline predictions for train sample
y_pred = baseline_pred = pd.Series([train[target].mode()[0]]).repeat(len(train))

# get metrics
dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'accuracy',
       'score': sk.metrics.accuracy_score(y_train, y_pred)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'precision',
       'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'recall',
       'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'f1_score',
       'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

# establish baseline predictions for validate sample
y_pred = baseline_pred = pd.Series([train[target].mode()[0]]).repeat(len(validate))

# get metrics
dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'f1_score',
       'score': sk.metrics.f1_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'accuracy',
       'score': sk.metrics.accuracy_score(y_validate, y_pred)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'precision',
       'score': sk.metrics.precision_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'recall',
       'score': sk.metrics.recall_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)


#######################################################################################
# create models

model_number = 1
max_depth = 10
min_samples_leaf = 1

for i in range(1, 11):
    
    # store info about the model
    dct = {'model_number': model_number,
           'min_samples_leaf': min_samples_leaf,
           'max_depth': max_depth}
    model_info = model_info.append(dct, ignore_index=True)
    
    # fit the classifier to the training sample and transform
    clf = RandomForestClassifier(random_state=42, 
                                     min_samples_leaf=min_samples_leaf, 
                                     max_depth=max_depth)
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_train)
    

    # results for train sample
    y_pred = clf.predict(x_train)
    
    
    # get metrics
    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_train, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)


    # results for validate sample
    y_pred = clf.predict(x_validate)
    
    # get metrics
    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_validate, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_validate, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_validate, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_validate, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)
    
    model_number += 1
    min_samples_leaf += 1
    max_depth -= 1

In [13]:
def display_model_results():
    return model_results.pivot_table(columns='model_number', 
                                     index=('metric_type', 'sample_type'), 
                                     values='score',
                                     aggfunc=lambda x: x)

In [14]:
model_info

Unnamed: 0,model_number,min_samples_leaf,max_depth
0,baseline,,
1,1,1.0,10.0
2,2,2.0,9.0
3,3,3.0,8.0
4,4,4.0,7.0
5,5,5.0,6.0
6,6,6.0,5.0
7,7,7.0,4.0
8,8,8.0,3.0
9,9,9.0,2.0


In [15]:
display_model_results().T

metric_type,accuracy,accuracy,f1_score,f1_score,precision,precision,recall,recall
sample_type,train,validate,train,validate,train,validate,train,validate
model_number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,0.93,0.804,0.904,0.716,0.953,0.803,0.859,0.646
2,0.882,0.818,0.832,0.738,0.912,0.821,0.764,0.671
3,0.865,0.818,0.808,0.742,0.892,0.812,0.738,0.683
4,0.863,0.813,0.806,0.73,0.887,0.818,0.738,0.659
5,0.851,0.818,0.792,0.742,0.855,0.812,0.738,0.683
6,0.833,0.818,0.764,0.735,0.838,0.831,0.702,0.659
7,0.821,0.808,0.752,0.732,0.804,0.789,0.707,0.683
8,0.817,0.822,0.738,0.743,0.821,0.833,0.67,0.671
9,0.801,0.808,0.715,0.725,0.795,0.806,0.649,0.659
10,0.743,0.729,0.508,0.482,0.957,0.9,0.346,0.329


Accuracy tends to be the highest metric across the models, followed by precision, then recall. 
Model 1 performs best on the in-sample data, most likely due to it's large max depth_value of 10. 

### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

Model 8 - with min_samples_leaf = 8 and max_depth = 3 - performs best across all metrics (i.e. has less dropoff in performance between train and validate sets. )