In [1]:
import new_lib as nl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
%matplotlib inline
import acquire as a
import prepare as p

In [2]:
titanic = a.get_titanic_data('titanic')
titanic

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [3]:
titanic = p.prep_titanic(titanic)
titanic

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.2500,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.9250,1,0,0,1
3,1,1,1,0,53.1000,0,0,0,1
4,0,3,0,0,8.0500,1,1,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,1,1,0,1
887,1,1,0,0,30.0000,1,0,0,1
888,0,3,1,2,23.4500,0,0,0,1
889,1,1,0,0,30.0000,1,1,0,0


In [5]:
train, val, test, X_train, y_train, X_val, y_val, X_test, y_test = nl.train_vailidate_test_split(titanic, 'survived')

In [39]:
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=1,
                             random_state=77)

In [40]:
rf.fit(X_train, y_train)

In [41]:
rf.score(X_train, y_train)

0.9257028112449799

In [10]:
for i in range(3, 11):
    rf = RandomForestClassifier(max_depth=i, n_estimators=200,
                                random_state=42, max_samples=.5)
    rf.fit(X_train, y_train)
    
    train_accuracy = rf.score(X_train, y_train)
    
    val_accuracy = rf.score(X_val, y_val)
    
    print(f'Max depth of {i}. Train accuracy: {train_accuracy}. Validation accuracy: {val_accuracy}.')
    print()
    print('---------------------------------------')
    print()

Max depth of 3. Train accuracy: 0.8232931726907631. Validation accuracy: 0.8037383177570093.

---------------------------------------

Max depth of 4. Train accuracy: 0.8253012048192772. Validation accuracy: 0.7990654205607477.

---------------------------------------

Max depth of 5. Train accuracy: 0.8413654618473896. Validation accuracy: 0.7897196261682243.

---------------------------------------

Max depth of 6. Train accuracy: 0.8614457831325302. Validation accuracy: 0.8037383177570093.

---------------------------------------

Max depth of 7. Train accuracy: 0.8795180722891566. Validation accuracy: 0.7850467289719626.

---------------------------------------

Max depth of 8. Train accuracy: 0.8975903614457831. Validation accuracy: 0.7850467289719626.

---------------------------------------

Max depth of 9. Train accuracy: 0.9036144578313253. Validation accuracy: 0.7897196261682243.

---------------------------------------

Max depth of 10. Train accuracy: 0.9056224899598394. Va

In [13]:
y_preds = rf.predict(X_train)

In [14]:
print(classification_report(y_train, y_preds))


              precision    recall  f1-score   support

           0       0.90      0.94      0.92       294
           1       0.91      0.86      0.88       204

    accuracy                           0.91       498
   macro avg       0.91      0.90      0.90       498
weighted avg       0.91      0.91      0.91       498



In [15]:
rf_cm= pd.DataFrame(confusion_matrix(y_train, y_preds))
rf_cm

Unnamed: 0,0,1
0,276,18
1,29,175


In [17]:
importance_df = pd.DataFrame({'feature': X_train.columns.tolist(),
              'importance': rf.feature_importances_})

In [18]:
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
3,fare,0.365255
5,sex_male,0.359981
0,pclass,0.096669
1,sibsp,0.05801
2,parch,0.049994
7,embark_town_Southampton,0.032444
4,alone,0.023005
6,embark_town_Queenstown,0.014642


In [27]:
tp = 276
fp = 18
fn = 29
tn = 175
support = train['survived'].value_counts().sum()
f1 = 2 * ((tp/(tp+fp)*tp/(tp+fn))/(tp/(tp+fp)+tp/(tp+fn)))

In [52]:
print(f'Accuracy is: {(tp+tn)/(tp+tn+fp+fn)}')
print(f'Precision is: {tp/(tp+fp)}')
print(f'Recall is: {tp/(tp+fn)}')
print(f'Support is: {support}')
print(f'F1 Score is: {f1}')
print(f'True Positive Rate is: {tp/(tp+fn)}')
print(f'False Positive Rate is: {fp/(fp+tn)}')
print(f'True Negative Rate is: {tn/(tn+fp)}')
print(f'False Negative Rate is: {fn/(fn+tp)}')

Accuracy is: 0.8955223880597015
Precision is: 0.9359430604982206
Recall is: 0.8945578231292517
Support is: 498
F1 Score is: 0.9215358931552589
True Positive Rate is: 0.8945578231292517
False Positive Rate is: 0.10285714285714286
True Negative Rate is: 0.8971428571428571
False Negative Rate is: 0.1054421768707483


In [35]:
rf2 = RandomForestClassifier(max_depth=7, min_samples_leaf=5,
                             random_state=77)

In [36]:
rf2.fit(X_train, y_train)

In [37]:
rf2.score(X_train, y_train)

0.8433734939759037

In [38]:
rf2.score(X_val, y_val)

0.7663551401869159

In [42]:
rf.score(X_train, y_train)

0.9257028112449799

In [43]:
rf.score(X_val, y_val)

0.7570093457943925

In [44]:
preds = rf2.predict(X_train)

In [48]:
cm2 = pd.DataFrame(confusion_matrix(preds, y_train))
cm2

Unnamed: 0,0,1
0,263,47
1,31,157


In [49]:
tp = 263
tn = 47
fn = 31
tn = 157

In [53]:
print(f'Accuracy is: {(tp+tn)/(tp+tn+fp+fn)}')
print(f'Precision is: {tp/(tp+fp)}')
print(f'Recall is: {tp/(tp+fn)}')
print(f'Support is: {support}')
print(f'F1 Score is: {f1}')
print(f'True Positive Rate is: {tp/(tp+fn)}')
print(f'False Positive Rate is: {fp/(fp+tn)}')
print(f'True Negative Rate is: {tn/(tn+fp)}')
print(f'False Negative Rate is: {fn/(fn+tp)}')

Accuracy is: 0.8955223880597015
Precision is: 0.9359430604982206
Recall is: 0.8945578231292517
Support is: 498
F1 Score is: 0.9215358931552589
True Positive Rate is: 0.8945578231292517
False Positive Rate is: 0.10285714285714286
True Negative Rate is: 0.8971428571428571
False Negative Rate is: 0.1054421768707483


first model is overfit and the second does a better job with validate