# Decision Tree: Exercises
Using the titanic data, in your classification-exercises repository, create a notebook, decision_tree.ipynb where you will do the following:

In [33]:
## Imports:

# Standard DS imports
import numpy as np
import pandas as pd

# acquisition:
from pydataset import data
import acquire

# prep
import prepare

# viz
import matplotlib.pyplot as plt
import seaborn as sns

# stats
from scipy import stats

# read_csv
import os

# Modeling
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier as DT, plot_tree, export_text

In [34]:
# aquire data
df = acquire.get_titanic_data('titanic_db')

In [35]:
# prep data
df = prepare.prep_titanic(df)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             891 non-null    int64  
 1   survived                 891 non-null    int64  
 2   pclass                   891 non-null    int64  
 3   sex                      891 non-null    object 
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   embark_town              891 non-null    object 
 8   alone                    891 non-null    int64  
 9   sex_male                 891 non-null    int64  
 10  embark_town_Queenstown   891 non-null    int64  
 11  embark_town_Southampton  891 non-null    int64  
dtypes: float64(1), int64(9), object(2)
memory usage: 90.5+ KB


In [37]:
df = df.drop(columns=['sex','embark_town', 'passenger_id', 'sibsp'])

In [38]:
# split data
train_titanic, validate_titanic, test_titanic = prepare.split_function(df, 'survived')
print(f'Prepared df: {df.shape}')
print()
print(f'Train: {train_titanic.shape}')
print(f'Validate: {validate_titanic.shape}')
print(f'Test: {test_titanic.shape}')

Prepared df: (891, 8)

Train: (534, 8)
Validate: (178, 8)
Test: (179, 8)


### 1.. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.



#### 1.1. What is your baseline prediction?

In [39]:
# The most common value is the encoded value '0', or "Did not survive"
train_titanic.survived.value_counts()

survived
0    329
1    205
Name: count, dtype: int64

#### 1.2. What is your baseline accuracy?

In [40]:
baseline_accuracy = (train_titanic.survived == 0).mean()
print(f'Baseline accuracy is{baseline_accuracy: .2%}')

Baseline accuracy is 61.61%


### 2.. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [41]:
### Split Train, validate, test into x and y:
# x and y for Training Set
x_train = train_titanic.drop(columns=['survived'])
y_train = train_titanic['survived']

# x and y for Validate Set
x_validate = validate_titanic.drop(columns=['survived'])
y_validate = validate_titanic['survived']

# x and y for Test Set
x_test = test_titanic.drop(columns=['survived'])
y_test = test_titanic['survived']

In [42]:
# 1. CREATE OBJECT:
clf = DT(random_state=666)

In [43]:
# 2. FIT OBJECT
clf.fit(x_train, y_train)

### 3.. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [44]:
def get_metrics(model,x,y):
    
    labels = sorted(y_train.unique())

    # OUTPUTS AN ARRAY OF PREDICTIONS
    preds = clf.predict(x)
    print("Accuracy Score:", model.score(x,y))
    print()
    print('Confusion Matrix:')
    conf = confusion_matrix(y,preds)
    conf = pd.DataFrame(conf,
            index=[str(label) + '_actual'for label in labels],
            columns=[str(label) + '_predict'for label in labels])
    print(conf)
    print()
    print("Classification Report:")
    print(classification_report(y, preds))

In [45]:
# Training Sample results from get_metrics()
get_metrics(clf,x_train,y_train)

Accuracy Score: 0.9438202247191011

Confusion Matrix:
          0_predict  1_predict
0_actual        327          2
1_actual         28        177

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96       329
           1       0.99      0.86      0.92       205

    accuracy                           0.94       534
   macro avg       0.95      0.93      0.94       534
weighted avg       0.95      0.94      0.94       534



### 4.. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
**ANSWER: Accuracy: 79.21%; 

In [46]:
def compute_metrics(TN,FP,FN,TP):
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN

    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [47]:
confu = confusion_matrix(y_train,clf.predict(x_train))
TN, FP, FN, TP = confu.ravel()
TN, FP, FN, TP 

(327, 2, 28, 177)

In [48]:
# Precision, Recall, f1-score, and support: Use classification_report
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.9438202247191011

True Positive Rate/Sensitivity/Recall/Power: 0.8634146341463415
False Positive Rate/False Alarm Ratio/Fall-out: 0.0060790273556231
True Negative Rate/Specificity/Selectivity: 0.993920972644377
False Negative Rate/Miss Rate: 0.13658536585365855

Precision/PPV: 0.9888268156424581
F1 Score: 0.921875

Support (0): 205
Support (1): 329


### 5.. Run through steps 2-4 using a different max_depth value.

In [49]:
# 2. Model
clf_2=DT(max_depth=2, random_state=666)
clf_2.fit(x_train, y_train)
clf_2.score(x_train, y_train)

0.7921348314606742

In [50]:
# 3. Eval
get_metrics(clf_2, x_train, y_train)

Accuracy Score: 0.7921348314606742

Confusion Matrix:
          0_predict  1_predict
0_actual        327          2
1_actual         28        177

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96       329
           1       0.99      0.86      0.92       205

    accuracy                           0.94       534
   macro avg       0.95      0.93      0.94       534
weighted avg       0.95      0.94      0.94       534



In [51]:
# 4. Compute
confu = confusion_matrix(y_train,clf_2.predict(x_train))
TN, FP, FN, TP = confu.ravel()
TN, FP, FN, TP 

(283, 46, 65, 140)

In [52]:
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.7921348314606742

True Positive Rate/Sensitivity/Recall/Power: 0.6829268292682927
False Positive Rate/False Alarm Ratio/Fall-out: 0.1398176291793313
True Negative Rate/Specificity/Selectivity: 0.8601823708206687
False Negative Rate/Miss Rate: 0.3170731707317073

Precision/PPV: 0.7526881720430108
F1 Score: 0.7161125319693095

Support (0): 205
Support (1): 329


In [53]:
for x in range(1,20):
    tree = DT(max_depth=x, random_state=666)
    tree.fit(x_train, y_train)
    acc = tree.score(x_train, y_train)
    print(f'for depth of {x:2}, the accuracy is {round(acc,2)}')

for depth of  1, the accuracy is 0.79
for depth of  2, the accuracy is 0.79
for depth of  3, the accuracy is 0.82
for depth of  4, the accuracy is 0.82
for depth of  5, the accuracy is 0.83
for depth of  6, the accuracy is 0.87
for depth of  7, the accuracy is 0.88
for depth of  8, the accuracy is 0.9
for depth of  9, the accuracy is 0.92
for depth of 10, the accuracy is 0.92
for depth of 11, the accuracy is 0.93
for depth of 12, the accuracy is 0.94
for depth of 13, the accuracy is 0.94
for depth of 14, the accuracy is 0.94
for depth of 15, the accuracy is 0.94
for depth of 16, the accuracy is 0.94
for depth of 17, the accuracy is 0.94
for depth of 18, the accuracy is 0.94
for depth of 19, the accuracy is 0.94


### 6.. Which model performs better on your in-sample data?
**ANSWER: For in-sample data, the model with a max depth of 10 had the highest accuracy.**

In [54]:
scores_all = []

for x in range(1,20):

    tree = DT(max_depth=x, random_state=666)
    tree.fit(x_train, y_train)
    
    # evaluate on train
    train_acc = tree.score(x_train, y_train)
    
    #evaluate on validate
    val_acc = tree.score(x_validate, y_validate)
    
    # difference between train and validate
    train_val_diff = abs(train_acc - val_acc)
    
    scores_all.append([x, train_acc, val_acc, train_val_diff])
    
scores_df = pd.DataFrame(scores_all, columns=['max_depth','train_acc','val_acc', 'train_val_diff'])
scores_df.sort_values(by=['val_acc'], ascending=False).head()

Unnamed: 0,max_depth,train_acc,val_acc,train_val_diff
2,3,0.818352,0.792135,0.026217
3,4,0.820225,0.792135,0.02809
9,10,0.921348,0.780899,0.140449
8,9,0.91573,0.780899,0.134831
6,7,0.878277,0.780899,0.097378


### 7.. Which model performs best on your out-of-sample data, the validate set?
**ANSWER: For the validate set, models with max_depth of 3 and 4 performed equally well. They had the highest accuracy and the smallest difference between train_accuracy and validate_accuracy.**

In [55]:
def DecisionTreeClassifier_depth_through_twenty(xtrain,ytrain,xvalidate,yvalidate):
    scores_all = []

    for x in range(1,20):

        tree = DT(max_depth=x, random_state=666)
        tree.fit(xtrain, ytrain)
        
        # evaluate on train
        train_acc = tree.score(xtrain, ytrain)
        
        #evaluate on validate
        val_acc = tree.score(xvalidate, yvalidate)
        
        # difference between train and validate
        train_val_diff = abs(train_acc - val_acc)
        
        scores_all.append([x, train_acc, val_acc, train_val_diff])
        
    scores_df = pd.DataFrame(scores_all, columns=['max_depth','train_acc','val_acc', 'train_val_diff'])
    return scores_df.sort_values(by=['val_acc'], ascending=False)

In [58]:
DecisionTreeClassifier_depth_through_twenty(x_train,y_train,x_validate,y_validate).head()

Unnamed: 0,max_depth,train_acc,val_acc,train_val_diff
2,3,0.818352,0.792135,0.026217
3,4,0.820225,0.792135,0.02809
9,10,0.921348,0.780899,0.140449
8,9,0.91573,0.780899,0.134831
6,7,0.878277,0.780899,0.097378
