In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn as sk

import warnings
warnings.filterwarnings('ignore')

from env import get_db_url, user, password, host

import acquire
import prepare
import explore

# pandas display preferences
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 3)
#pd.option_context('display.max_rows', None)


# Titanic - Decision Tree

In [2]:
df = acquire.get_titanic_data()

Reading from local CSV...


In [3]:
df = prepare.prep_titanic(df)

In [4]:
train, test = train_test_split(df, test_size=.2, random_state=42, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=42, stratify=train.survived)

In [5]:
train.shape

(498, 10)

In [6]:
test.shape

(179, 10)

In [7]:
validate.shape

(214, 10)

In [8]:
target = 'survived'

x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

### 1. What is your baseline prediction? What is your baseline accuracy? 


In [9]:
target = 'survived'

train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
train_results.head(3)

Unnamed: 0,actual,baseline
779,1,0
159,0,0
738,0,0


In [10]:
print(f'Baseline prediction: {target} = {train[target].mode()[0]}')
print(f'Baseline accuracy: {sk.metrics.accuracy_score(train_results.actual, train_results.baseline, normalize=True):.2f}')

Baseline prediction: survived = 0
Baseline accuracy: 0.62


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [11]:
# max depth = 3
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [12]:
line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted


Model Score: 0.81
------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       307
           1       0.79      0.68      0.73       191

    accuracy                           0.81       498
   macro avg       0.81      0.78      0.79       498
weighted avg       0.81      0.81      0.81       498

------------------------------
Confusion Matrix:
     0    1
0  273   34
1   61  130


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [13]:
train_results['predicted'] = y_pred
train_results.head(3)

Unnamed: 0,actual,baseline,predicted
779,1,0,1
159,0,0,0
738,0,0,0


In [14]:
positive = 1
negative = 0
n = len(train)

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Recall: \t\t{recall:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy:		0.81
Precision:		0.79
F1 Score:		0.73
Recall: 		0.68
Support:		?__?

True Postive Rate:	0.26
False Positive Rate:	0.07
True Negative Rate:	0.55
False Negative Rate:	0.12


### 5. Run through steps 2-4 using a different max_depth value.


 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [15]:
# max depth = 4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)

In [16]:
y_pred = clf.predict(x_train)

 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [17]:
line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted

Model Score: 0.83
------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87       307
           1       0.87      0.65      0.74       191

    accuracy                           0.83       498
   macro avg       0.84      0.79      0.81       498
weighted avg       0.83      0.83      0.82       498

------------------------------
Confusion Matrix:
     0    1
0  288   19
1   67  124


 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [18]:
positive = 1
negative = 0
n = len(train)

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Recall: \t\t{recall:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy:		0.83
Precision:		0.87
F1 Score:		0.74
Recall: 		0.65
Support:		?__?

True Postive Rate:	0.26
False Positive Rate:	0.07
True Negative Rate:	0.55
False Negative Rate:	0.12


### 6. Which model performs better on your in-sample data?


The model with max depth of 4 performs better in metrics of Accuracy, Precision, and F1 Score. 

### 7. Which model performs best on your out-of-sample data, the validate set?


max_depth 3 performs better in precision

max_depth 4 performs better in recall, f1 score, and accuracy

(see below)

In [19]:
# re-establish classifier with max_depth 3
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       132
           1       0.82      0.67      0.74        82

    accuracy                           0.82       214
   macro avg       0.82      0.79      0.80       214
weighted avg       0.82      0.82      0.81       214



In [20]:
# re-establish classifier with max_depth 4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87       132
           1       0.86      0.66      0.74        82

    accuracy                           0.83       214
   macro avg       0.84      0.80      0.81       214
weighted avg       0.83      0.83      0.82       214



# Telco Decision Tree

In [21]:
df = acquire.get_telco_data()
df = prepare.prep_telco(df)

Reading from local CSV...


In [22]:
target = 'churn'

In [23]:
train, test, validate = prepare.train_test_validate_split(df, target)

train	 n = 3937
test	 n = 1407
validate n = 1688


In [24]:
x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

### 1. What is your baseline prediction? What is your baseline accuracy? 


In [25]:
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]

print(f'Baseline prediction: {target} = {train[target].mode()[0]}')
print(f'Baseline accuracy: {sk.metrics.accuracy_score(train_results.actual, train_results.baseline, normalize=True):.2f}')

Baseline prediction: churn = No
Baseline accuracy: 0.73


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [26]:
# no max_depth established
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [27]:
line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted

Model Score: 1.00
------------------------------
Classification Report:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00      2891
         Yes       1.00      0.99      1.00      1046

    accuracy                           1.00      3937
   macro avg       1.00      1.00      1.00      3937
weighted avg       1.00      1.00      1.00      3937

------------------------------
Confusion Matrix:
       No   Yes
No   2891     0
Yes     6  1040


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [28]:
positive = 'Yes'
negative = 'No'
n = len(train)

train_results['predicted'] = y_pred

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Recall: \t\t{recall:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy:		1.00
Precision:		1.00
F1 Score:		1.00
Recall: 		0.99
Support:		?__?

True Postive Rate:	0.26
False Positive Rate:	0.00
True Negative Rate:	0.73
False Negative Rate:	0.00


### 5. Run through steps 2-4 using a different max_depth value.

In [29]:
# reestablish classifier with max_depth=4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)

y_pred = clf.predict(x_train)

line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted

positive = 'Yes'
negative = 'No'
n = len(train)

train_results['predicted'] = y_pred

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(line_break)
print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Recall: \t\t{recall:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Model Score: 0.80
------------------------------
Classification Report:
              precision    recall  f1-score   support

          No       0.87      0.86      0.87      2891
         Yes       0.63      0.63      0.63      1046

    accuracy                           0.80      3937
   macro avg       0.75      0.75      0.75      3937
weighted avg       0.80      0.80      0.80      3937

------------------------------
Confusion Matrix:
       No  Yes
No   2499  392
Yes   383  663
------------------------------
Accuracy:		0.80
Precision:		0.63
F1 Score:		0.63
Recall: 		0.63
Support:		?__?

True Postive Rate:	0.17
False Positive Rate:	0.10
True Negative Rate:	0.63
False Negative Rate:	0.10


In [30]:
# reestablish classifier with max_depth=3
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)

y_pred = clf.predict(x_train)

line_break = ('-' * 30)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
# I'm not sure which represents actual vs predicted

positive = 'Yes'
negative = 'No'
n = len(train)

train_results['predicted'] = y_pred

accuracy = sk.metrics.accuracy_score(y_train, y_pred, normalize=True)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == negative)]) / n
tn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == negative)]) / n
fn_rate = len(train_results[(train_results.predicted == negative) & (train_results.actual == positive)]) / n
precision = sk.metrics.precision_score(y_train, y_pred, pos_label = positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
support = '?__?' # I don't undertsand what 'support' is and how to calculate it

print(line_break)
print(f'Accuracy:\t\t{accuracy:.2f}')
print(f'Precision:\t\t{precision:.2f}')
print(f'F1 Score:\t\t{f1_score:.2f}')
print(f'Recall: \t\t{recall:.2f}')
print(f'Support:\t\t{support}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Model Score: 0.79
------------------------------
Classification Report:
              precision    recall  f1-score   support

          No       0.81      0.93      0.87      2891
         Yes       0.68      0.41      0.51      1046

    accuracy                           0.79      3937
   macro avg       0.75      0.67      0.69      3937
weighted avg       0.78      0.79      0.77      3937

------------------------------
Confusion Matrix:
       No  Yes
No   2691  200
Yes   614  432
------------------------------
Accuracy:		0.79
Precision:		0.68
F1 Score:		0.51
Recall: 		0.41
Support:		?__?

True Postive Rate:	0.11
False Positive Rate:	0.05
True Negative Rate:	0.68
False Negative Rate:	0.16


### 6. Which model performs better on your in-sample data?


While the model with no established max-depth undoubtedly performs best, but this is likely due to overfitting.

The model with max-depth 4 performs better than the one with max-depth 3 on the in-sample data (for all metrics).

### 7. Which model performs best on your out-of-sample data, the validate set?


In [31]:
# re-establish classifier with no max_depth
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

          No       0.81      0.81      0.81      1239
         Yes       0.48      0.49      0.49       449

    accuracy                           0.72      1688
   macro avg       0.65      0.65      0.65      1688
weighted avg       0.73      0.72      0.72      1688



In [32]:
# re-establish classifier with max_depth 4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

          No       0.84      0.87      0.85      1239
         Yes       0.60      0.55      0.57       449

    accuracy                           0.78      1688
   macro avg       0.72      0.71      0.71      1688
weighted avg       0.78      0.78      0.78      1688



In [33]:
# re-establish classifier with max_depth 3
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

          No       0.80      0.93      0.86      1239
         Yes       0.66      0.36      0.47       449

    accuracy                           0.78      1688
   macro avg       0.73      0.65      0.66      1688
weighted avg       0.76      0.78      0.76      1688



#### Results: 
for out-of-sample data, max-depth of 3 performs best for precision, recall, and f1 score. Max-depth 3 and max-depth 4 have approximately equal accuracy. 

# OTHER DATA: Austin Animal Center Decision Tree
### Work through these exercises on other datasets with a higher number of output classes. 

In [34]:
intakes = pd.read_csv('aac_intakes_20220304.csv')
outcomes = pd.read_csv('aac_outcomes_20220304.csv')

In [35]:
df = prepare.aac_prep(intakes, outcomes)

In [36]:
df = prepare.aac_get_dogs(df)

In [37]:
df.head(3)

Unnamed: 0,animal_id,name,datetime_intake,found_location,intake_type,intake_condition,animal_type,outcome_type,month_intake,fixed,sex,breed_mixed,breed_1,breed_2,breed_3,color_1,color_2,age_intake
0,A786884,*Brock,2019-01-03 16:19:00,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Transfer,January,True,male,True,Beagle,,,Tricolor,,730 days
1,A706918,Belle,2015-07-05 12:59:00,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Return to Owner,July,True,female,False,English Springer Spaniel,,,White,Liver,2920 days
2,A724273,Runster,2016-04-14 18:43:00,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Return to Owner,April,False,male,True,Basenji,,,Sable,White,330 days


In [38]:
df = prepare.aac_prep_for_modeling(df)

In [39]:
df.head(3)

Unnamed: 0,outcome_type,age_intake,fixed_True,fixed_unknown,breed_mixed_True,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_type_Wildlife,intake_condition_Behavior,intake_condition_Feral,intake_condition_Injured,intake_condition_Med Attn,intake_condition_Medical,intake_condition_Neonatal,intake_condition_Normal,intake_condition_Nursing,intake_condition_Other,intake_condition_Pregnant,intake_condition_Sick,month_intake_August,month_intake_December,month_intake_February,month_intake_January,...,color_1_Blue Tiger,color_1_Brown,color_1_Brown Brindle,color_1_Brown Merle,color_1_Brown Tiger,color_1_Buff,color_1_Chocolate,color_1_Cream,color_1_Fawn,color_1_Gold,color_1_Gray,color_1_Liver,color_1_Liver Tick,color_1_Orange,color_1_Red,color_1_Red Merle,color_1_Red Tick,color_1_Ruddy,color_1_Sable,color_1_Silver,color_1_Tan,color_1_Tricolor,color_1_White,color_1_Yellow,color_1_Yellow Brindle
0,Transfer,730.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,Return to Owner,2920.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Return to Owner,330.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [40]:
target = 'outcome_type'
train, test, validate = prepare.train_test_validate_split(df, target)

train	 n = 29929
test	 n = 10690
validate n = 12827


In [41]:
x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

#### 1. What is your baseline prediction? What is your baseline accuracy? 


In [42]:
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
print(f'Baseline prediction: {target} = {train[target].mode()[0]}')
print(f'Baseline accuracy: {sk.metrics.accuracy_score(train_results.actual, train_results.baseline, normalize=True):.2f}')

Baseline prediction: outcome_type = Adoption
Baseline accuracy: 0.48


#### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [43]:
line_break = ('-' * 30)

# Max-depth = 3

clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
print(line_break)
# I'm not sure which represents actual vs predicted


for outcome_type in df.outcome_type.unique():
    
    print(f'For outcome_type == {outcome_type}:')
    positive = outcome_type
    n = len(train)

    train_results['predicted'] = y_pred

    tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
    fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
    tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
    fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n
    
    print()
    print(f'True Postive Rate:\t{tp_rate:.2f}')
    print(f'False Positive Rate:\t{fp_rate:.2f}')
    print(f'True Negative Rate:\t{tn_rate:.2f}')
    print(f'False Negative Rate:\t{fn_rate:.2f}')
    print(line_break)

Model Score: 0.58
------------------------------
Classification Report:
                 precision    recall  f1-score   support

       Adoption       0.58      0.82      0.68     14433
Return to Owner       0.61      0.60      0.61      7272
       Transfer       0.52      0.15      0.24      8224

       accuracy                           0.58     29929
      macro avg       0.57      0.52      0.51     29929
   weighted avg       0.57      0.58      0.54     29929

------------------------------
Confusion Matrix:
                 Adoption  Return to Owner  Transfer
Adoption            11805             1665       963
Return to Owner      2727             4358       187
Transfer             5906             1068      1250
------------------------------
For outcome_type == Transfer:

True Postive Rate:	0.04
False Positive Rate:	0.04
True Negative Rate:	0.69
False Negative Rate:	0.23
------------------------------
For outcome_type == Return to Owner:

True Postive Rate:	0.15
False Pos

#### 5. Run through steps 2-4 using a different max_depth value.

In [44]:
line_break = ('-' * 30)

# Max-depth = 4

clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Classification Report:')
print(classification_report(y_train, y_pred))
print(line_break)
labels = sorted(y_train.unique())
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels))
print(line_break)
# I'm not sure which represents actual vs predicted


for outcome_type in df.outcome_type.unique():
    
    print(f'For outcome_type == {outcome_type}:')
    positive = outcome_type
    n = len(train)

    train_results['predicted'] = y_pred

    tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
    fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
    tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
    fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n
    
    print()
    print(f'True Postive Rate:\t{tp_rate:.2f}')
    print(f'False Positive Rate:\t{fp_rate:.2f}')
    print(f'True Negative Rate:\t{tn_rate:.2f}')
    print(f'False Negative Rate:\t{fn_rate:.2f}')
    print(line_break)

Model Score: 0.59
------------------------------
Classification Report:
                 precision    recall  f1-score   support

       Adoption       0.57      0.85      0.68     14433
Return to Owner       0.62      0.59      0.60      7272
       Transfer       0.60      0.12      0.20      8224

       accuracy                           0.59     29929
      macro avg       0.60      0.52      0.50     29929
   weighted avg       0.59      0.59      0.53     29929

------------------------------
Confusion Matrix:
                 Adoption  Return to Owner  Transfer
Adoption            12244             1656       533
Return to Owner      2836             4302       134
Transfer             6236              998       990
------------------------------
For outcome_type == Transfer:

True Postive Rate:	0.03
False Positive Rate:	0.02
True Negative Rate:	0.70
False Negative Rate:	0.24
------------------------------
For outcome_type == Return to Owner:

True Postive Rate:	0.14
False Pos

#### 6. Which model performs better on your in-sample data?


In [45]:
# Max-Depth = 3
    # Classification Report:
    #                 precision    recall  f1-score   support

    #       Adoption       0.58      0.82      0.68     14433
    # Return to Owner      0.61      0.60      0.61      7272
    #       Transfer       0.52      0.15      0.24      8224

    #       accuracy                           0.58     29929
    
# Max-Depth = 4
# Classification Report:
#                  precision    recall  f1-score   support

#        Adoption       0.57      0.85      0.68     14433
# Return to Owner       0.62      0.59      0.60      7272
#        Transfer       0.60      0.12      0.20      8224

#        accuracy                           0.59     29929

There are only very slight differences in accuracy, precision, and recall between the two models (can be seen above) and which one is "better" would depend on what the goals of the model's predictions are and which outcome_type category we are trying to identify as our positive case. 

#### 7. Which model performs best on your out-of-sample data, the validate set?


In [46]:
# re-establish classifier with max-depth = 3
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print('Max-Depth = 3:')
print(classification_report(y_validate, y_pred))



# re-establish classifier with max_depth = 4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(x_train, y_train)

# create predictions for the validate set
y_pred = clf.predict(x_validate)

# display report
print(line_break)
print('Max-Depth = 4')
print(classification_report(y_validate, y_pred))

Max-Depth = 3:
                 precision    recall  f1-score   support

       Adoption       0.58      0.82      0.68      6185
Return to Owner       0.62      0.61      0.61      3117
       Transfer       0.53      0.15      0.24      3525

       accuracy                           0.59     12827
      macro avg       0.57      0.53      0.51     12827
   weighted avg       0.57      0.59      0.54     12827

------------------------------
Max-Depth = 4
                 precision    recall  f1-score   support

       Adoption       0.58      0.85      0.69      6185
Return to Owner       0.62      0.60      0.61      3117
       Transfer       0.59      0.13      0.21      3525

       accuracy                           0.59     12827
      macro avg       0.60      0.52      0.50     12827
   weighted avg       0.59      0.59      0.54     12827



Again, there are only very slight differences in accuracy, precision, and recall between the two models (can be seen above) and which one is "better" would depend on what the goals of the model's predictions are and which outcome_type category we are trying to identify as our positive case. 