In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn as sk

import warnings
warnings.filterwarnings('ignore')

from env import get_db_url, user, password, host

import acquire
import prepare
import explore

# pandas display preferences
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 3)
#pd.option_context('display.max_rows', None)


# Titanic - Decision Tree

In [2]:
df = acquire.get_titanic_data()

Reading from local CSV...


In [3]:
df = prepare.prep_titanic(df)

In [4]:
train, test = train_test_split(df, test_size=.2, random_state=42, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=42, stratify=train.survived)

In [5]:
train.shape

(498, 10)

In [6]:
test.shape

(179, 10)

In [7]:
validate.shape

(214, 10)

In [8]:
target = 'survived'

x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

### 1. What is your baseline prediction? What is your baseline accuracy? 


In [9]:
target = 'survived'

train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
train_results.head(3)

Unnamed: 0,actual,baseline
779,1,0
159,0,0
738,0,0


In [13]:
print(f'Baseline prediction: Survived = {train[target].mode()[0]}')
print(f'Baseline accuracy: {sk.metrics.accuracy_score(train_results.actual, train_results.baseline, normalize=True):.2f}')

Baseline prediction: Survived = 0
Baseline accuracy: 0.62


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [56]:
# max depth = 3
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [57]:
line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted


Model Score: 0.81
------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       307
           1       0.79      0.68      0.73       191

    accuracy                           0.81       498
   macro avg       0.81      0.78      0.79       498
weighted avg       0.81      0.81      0.81       498

------------------------------
Confusion Matrix:
     0    1
0  273   34
1   61  130


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [58]:
train_results['predicted'] = y_pred
train_results.head(3)

Unnamed: 0,actual,baseline,predicted
779,1,0,1
159,0,0,0
738,0,0,0


In [59]:
positive = 1
negative = 0
n = len(train)

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy:		0.81
Precision:		0.79
F1 Score:		0.73
Support:		?__?

True Postive Rate:	0.26
False Positive Rate:	0.07
True Negative Rate:	0.55
False Negative Rate:	0.12


### 5. Run through steps 2-4 using a different max_depth value.


 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [60]:
# max depth = 4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [61]:
line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted

Model Score: 0.83
------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87       307
           1       0.87      0.65      0.74       191

    accuracy                           0.83       498
   macro avg       0.84      0.79      0.81       498
weighted avg       0.83      0.83      0.82       498

------------------------------
Confusion Matrix:
     0    1
0  288   19
1   67  124


 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [62]:
positive = 1
negative = 0
n = len(train)

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy:		0.83
Precision:		0.87
F1 Score:		0.74
Support:		?__?

True Postive Rate:	0.26
False Positive Rate:	0.07
True Negative Rate:	0.55
False Negative Rate:	0.12


### 6. Which model performs better on your in-sample data?


The model with max depth of 4 performs better in metrics of Accuracy, Precision, and F1 Score. 

### 7. Which model performs best on your out-of-sample data, the validate set?
