In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from acquire import *
from prepare import *

In [2]:
# Acquire the raw Titanic data and pass it straight through the prep step.
titanic = prep_titanic_data(get_titanic_data())

# NOTE(review): 'survived' is handed to split_data -- presumably the column to
# stratify on; confirm against the helper's definition.
train, validate, test = split_data(titanic, 'survived')

# Shapes of the three splits as a quick sanity check.
tuple(split.shape for split in (train, validate, test))

((498, 11), (214, 11), (179, 11))

In [3]:
# Feature columns fed to the models below.
columns = ['pclass', 'sex_male', 'alone']

# Feature matrix (X) and target vector (y) for each split.
X_train, y_train = train[columns], train['survived']
X_validate, y_validate = validate[columns], validate['survived']
X_test, y_test = test[columns], test['survived']

# Decision Tree Exercises

## 1. 

What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [4]:
# Class balance of the target in the training split, as proportions.
train['survived'].value_counts(normalize = True)

0    0.616466
1    0.383534
Name: survived, dtype: float64

Our baseline model would be predicting the most common target value, which is did not survive. Given a baseline model that always predicts did not survive the accuracy of this model would be ~62%.

## 2.

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [5]:
# Shallow tree (max_depth of 2). random_state is pinned so that tie-breaking
# between equally good candidate splits is reproducible across kernel restarts.
model1 = DecisionTreeClassifier(max_depth = 2, random_state = 123)

In [6]:
# Fit the tree on the training features and target.
model1.fit(X = X_train, y = y_train)

DecisionTreeClassifier(max_depth=2)

In [7]:
# In-sample predictions from model1; these feed the confusion matrix and
# classification report below.
y_pred = model1.predict(X = X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

## 3

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [8]:
# Accuracy of model1 on the training split.
model1.score(X = X_train, y = y_train)

0.7831325301204819

In [9]:
# Rows are the actual class, columns the predicted class. labels = [0, 1]
# pins the ordering so row/column 0 lines up with "Did Not Survive".
index_labels = ['Actual Did Not Survive', 'Actual Survived']
column_labels = ['Predicted Did Not Survive', 'Predicted Survived']

pd.DataFrame(
    data = confusion_matrix(y_true = y_train, y_pred = y_pred, labels = [0, 1]),
    index = index_labels,
    columns = column_labels,
)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,303,4
Actual Survived,104,87


In [10]:
# Per-class precision / recall / f1 / support for the in-sample predictions.
# print() is needed so the report's newlines render as a table.
print(classification_report(y_true = y_train, y_pred = y_pred))

              precision    recall  f1-score   support

           0       0.74      0.99      0.85       307
           1       0.96      0.46      0.62       191

    accuracy                           0.78       498
   macro avg       0.85      0.72      0.73       498
weighted avg       0.83      0.78      0.76       498



## 4

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

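> Treating survived (1) as the positive class, the confusion matrix gives TP = 87, FN = 104, FP = 4, TN = 303.
<br>
> Accuracy:            0.78 ((TP + TN) / total = 390 / 498)
<br>
> True Positive Rate:  0.46 (TP / (TP + FN) = 87 / 191)
<br>
> False Positive Rate: 0.013 (FP / (FP + TN) = 4 / 307)
<br>
> True Negative Rate:  0.99 (TN / (TN + FP) = 303 / 307)
<br>
> False Negative Rate: 0.54 (FN / (FN + TP) = 104 / 191)
<br>
> Precision:           0.96 (survived), 0.74 (did not survive)
<br>
> Recall:              0.46 (survived), 0.99 (did not survive)
<br>
> f1-score:            0.62 (survived), 0.85 (did not survive)
<br>
> Support:             191 (survived), 307 (did not survive); 498 total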

## 5

Run through steps 2-4 using a different max_depth value.

In [11]:
# Deeper tree for comparison with model1 (max_depth 4 vs. 2). random_state is
# pinned so that tie-breaking between equally good candidate splits is
# reproducible across kernel restarts.
model2 = DecisionTreeClassifier(max_depth = 4, random_state = 123)
model2.fit(X_train, y_train)

# NOTE: y_pred is reused here, so from this point on it holds model2's
# in-sample predictions rather than model1's.
y_pred = model2.predict(X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

In [12]:
# Accuracy of model2 on the training split.
model2.score(X = X_train, y = y_train)

0.7951807228915663

In [13]:
# Confusion matrix for model2's in-sample predictions (y_pred was reassigned
# from model2 above); rows are actual, columns are predicted.
pd.DataFrame(
    data = confusion_matrix(y_true = y_train, y_pred = y_pred, labels = [0, 1]),
    index = index_labels,
    columns = column_labels,
)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,286,21
Actual Survived,81,110


In [14]:
# Per-class precision / recall / f1 / support for model2's in-sample predictions.
print(classification_report(y_true = y_train, y_pred = y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



## 6

Which model performs better on your in-sample data?

> In terms of accuracy, model 2 performed better on the training data, but only slightly (0.80 vs. 0.78).

## 7

Which model performs best on your out-of-sample data, the validate set?

In [15]:
# Out-of-sample accuracy on the validate split, in (model1, model2) order.
tuple(fitted.score(X_validate, y_validate) for fitted in (model1, model2))

(0.7897196261682243, 0.794392523364486)

Model 2 again performs slightly better on the validate set.