In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from acquire import *
from prepare import *

In [2]:
titanic = get_titanic_data()
titanic = prep_titanic_data(titanic)
train, validate, test = split_data(titanic, 'survived')
train.shape, validate.shape, test.shape

((498, 11), (214, 11), (179, 11))

In [3]:
columns = ['pclass', 'sex_male', 'alone']

X_train = train[columns]
y_train = train.survived

X_validate = validate[columns]
y_validate = validate.survived

X_test = test[columns]
y_test = test.survived

# Decision Tree Exercises

## Titanic Dataset

### 1. 

What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [4]:
train.survived.value_counts(normalize = True)

0    0.616466
1    0.383534
Name: survived, dtype: float64

Our baseline model would be predicting the most common target value, which is did not survive. Given a baseline model that always predicts did not survive the accuracy of this model would be ~62%.

### 2.

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [5]:
model1 = DecisionTreeClassifier(max_depth = 2)

In [6]:
model1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2)

In [7]:
y_pred = model1.predict(X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

### 3

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [8]:
model1.score(X_train, y_train)

0.7831325301204819

In [9]:
index_labels = ['Actual Did Not Survive', 'Actual Survived']
column_labels = ['Predicted Did Not Survive', 'Predicted Survived']
pd.DataFrame(confusion_matrix(y_train, y_pred, labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,303,4
Actual Survived,104,87


In [10]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.99      0.85       307
           1       0.96      0.46      0.62       191

    accuracy                           0.78       498
   macro avg       0.85      0.72      0.73       498
weighted avg       0.83      0.78      0.76       498



### 4

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

> Accuracy:            0.78
<br>
> True Positive Rate:  0.17
<br>
> False Positive Rate: 0.008
<br>
> True Negative Rate:  0.61
<br>
> False Negative rate: 0.21
<br>
> Precision:           0.96, 0.74
<br>
> Recall:              0.46, 0.99
<br>
> f1-score:            0.62, 0.85
<br>
> Support:             498

### 5

Run through steps 2-4 using a different max_depth value.

In [11]:
model2 = DecisionTreeClassifier(max_depth = 4)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

In [12]:
model2.score(X_train, y_train)

0.7951807228915663

In [13]:
pd.DataFrame(confusion_matrix(y_train, y_pred, labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,286,21
Actual Survived,81,110


In [14]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



### 6

Which model performs better on your in-sample data?

> In terms of accuracy model 2 performed better, but only slightly better.

### 7

Which model performs best on your out-of-sample data, the validate set?

In [15]:
model1.score(X_validate, y_validate), model2.score(X_validate, y_validate)

(0.7897196261682243, 0.794392523364486)

Model 2 again performs slightly better on the validate set.

## Telco Dataset

### Acquire Data

In [16]:
telco = get_telco_data()
telco = prep_telco_data(telco)
train, validate, test = split_data(telco, 'churn')
train.shape, validate.shape, test.shape

((3937, 47), (1688, 47), (1407, 47))

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3937 entries, 5467 to 2212
Data columns (total 47 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 3937 non-null   object 
 1   senior_citizen                         3937 non-null   int64  
 2   partner                                3937 non-null   object 
 3   dependents                             3937 non-null   object 
 4   tenure                                 3937 non-null   int64  
 5   phone_service                          3937 non-null   object 
 6   multiple_lines                         3937 non-null   object 
 7   online_security                        3937 non-null   object 
 8   online_backup                          3937 non-null   object 
 9   device_protection                      3937 non-null   object 
 10  tech_support                           3937 non-null   object 
 11  s

### Create Model

> We'll look at a model utilizing the internet_service_type_Fiber optic, internet_service_type_None, tenure, and monthly_charges columns.

In [18]:
columns = ['internet_service_type_Fiber optic', 'internet_service_type_None', 'tenure', 'monthly_charges']

X_train = train[columns]
y_train = train.churn

X_validate = validate[columns]
y_validate = validate.churn

X_test = test[columns]
y_test = test.churn

In [19]:
model1 = DecisionTreeClassifier(max_depth = 3, random_state = 24)
model1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=24)

In [20]:
model1.score(X_train, y_train)

0.7866395732791466

In [21]:
print(classification_report(y_train, model1.predict(X_train)))

              precision    recall  f1-score   support

          No       0.81      0.93      0.86      2891
         Yes       0.66      0.40      0.50      1046

    accuracy                           0.79      3937
   macro avg       0.74      0.66      0.68      3937
weighted avg       0.77      0.79      0.77      3937



### A Model With Different max_depth

In [22]:
model2 = DecisionTreeClassifier(max_depth = 9, random_state = 24)
model2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=9, random_state=24)

In [23]:
model2.score(X_train, y_train)

0.8432816865633731

In [24]:
print(classification_report(y_train, model2.predict(X_train)))

              precision    recall  f1-score   support

          No       0.87      0.92      0.90      2891
         Yes       0.74      0.64      0.68      1046

    accuracy                           0.84      3937
   macro avg       0.81      0.78      0.79      3937
weighted avg       0.84      0.84      0.84      3937



In [25]:
model1.score(X_validate, y_validate), model2.score(X_validate, y_validate)

(0.7932464454976303, 0.7648104265402843)

### A Model Using Different Features

> Here we will use tenure, payment_type, and contract_type

In [26]:
columns = [
    'payment_type_Credit card (automatic)',
    'payment_type_Electronic check',
    'payment_type_Mailed check',
    'contract_type_One year',
    'contract_type_Two year',
    'tenure'
]

X_train = train[columns]
y_train = train.churn

X_validate = validate[columns]
y_validate = validate.churn

X_test = test[columns]
y_test = test.churn

In [27]:
model3 = DecisionTreeClassifier(max_depth = 9, random_state = 24)
model3.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=9, random_state=24)

In [28]:
model3.score(X_train, y_train)

0.7858775717551435

In [29]:
print(classification_report(y_train, model3.predict(X_train)))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      2891
         Yes       0.63      0.46      0.54      1046

    accuracy                           0.79      3937
   macro avg       0.73      0.68      0.70      3937
weighted avg       0.77      0.79      0.77      3937



> Let's try again with a different max_depth

In [30]:
model4 = DecisionTreeClassifier(max_depth = 15, random_state = 24)
model4.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=15, random_state=24)

In [31]:
model4.score(X_train, y_train)

0.7899415798831597

In [32]:
model3.score(X_validate, y_validate), model4.score(X_validate, y_validate)

(0.7618483412322274, 0.7541469194312796)

### Takeaways

Model 1 has consistent performance in predicting customer churn even with out of sample data.

# Random Forest Exercises

## 1

>Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [41]:
titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [42]:
train, validate, test = split_data(titanic, 'survived')

In [43]:
features = ['pclass', 'alone', 'sex_male']

X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [44]:
model1 = RandomForestClassifier(min_samples_leaf = 1, max_depth = 10)
model1.fit(X_train, y_train)

RandomForestClassifier(max_depth=10)

In [45]:
model1.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

## 2

> Evaluate your results using the model score, confusion matrix, and classification report.

In [46]:
model1.score(X_train, y_train)

0.7951807228915663

In [47]:
index_labels = ['Actual Did Not Survive', 'Actual Survived']
column_labels = ['Predicted Did Not Survive', 'Predicted Survived']
pd.DataFrame(confusion_matrix(y_train, model1.predict(X_train), labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,286,21
Actual Survived,81,110


In [48]:
print(classification_report(y_train, model1.predict(X_train)))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



## 3

> Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

- Accuracy:            0.80
- True Positive Rate:  0.22
- False Positive Rate: 0.04
- True Negative Rate:  0.57
- False Negative Rate: 0.16
- Precision:
--    Survived:        0.84
--    Did Not Survive: 0.78
- Recall:
--    Survived:        0.58
--    Did Not Survive: 0.93
- f1-score:
--    Survived:        0.68
--    Did Not Survive: 0.85
- Support:             498

## 4

> Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [50]:
model2 = RandomForestClassifier(min_samples_leaf = 3, max_depth = 8)
model2.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, min_samples_leaf=3)

In [51]:
print(classification_report(y_train, model2.predict(X_train)))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



In [52]:
model3 = RandomForestClassifier(min_samples_leaf = 5, max_depth = 6)
model3.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=5)

In [53]:
print(classification_report(y_train, model3.predict(X_train)))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



In [65]:
model4 = RandomForestClassifier(min_samples_leaf = 8, max_depth = 3)
model4.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=8)

In [66]:
print(classification_report(y_train, model4.predict(X_train)))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



In [63]:
model5 = RandomForestClassifier(min_samples_leaf = 10, max_depth = 1)
model5.fit(X_train, y_train)

RandomForestClassifier(max_depth=1, min_samples_leaf=10)

In [64]:
print(classification_report(y_train, model5.predict(X_train)))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82       307
           1       0.76      0.57      0.65       191

    accuracy                           0.76       498
   macro avg       0.76      0.73      0.73       498
weighted avg       0.76      0.76      0.75       498



## 5

> What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The evaluation metrics remain constant as the min_samples_leaf argument is increased and the max_depth argument is decreased with the exception that when min_samples_leaf is 10 and max_depth is 1 the performance of the model decreases. Any of models 1-4 perform better than model 5 in accuracy, precision and recall.

## *

> After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [69]:
models = [model1, model2, model3, model4, model5]

for index, model in enumerate(models):
    print(f'Train score for model {index + 1}: {model.score(X_train, y_train)}')
    print(f'Validate score for model {index + 1}: {model.score(X_validate, y_validate)}')
    print('-------------------------')

Train score for model 1: 0.7951807228915663
Validate score for model 1: 0.794392523364486
-------------------------
Train score for model 2: 0.7951807228915663
Validate score for model 2: 0.794392523364486
-------------------------
Train score for model 3: 0.7951807228915663
Validate score for model 3: 0.794392523364486
-------------------------
Train score for model 4: 0.7951807228915663
Validate score for model 4: 0.794392523364486
-------------------------
Train score for model 5: 0.7630522088353414
Validate score for model 5: 0.8084112149532711
-------------------------


Model 1 - 4 have consistent performance on both the train and validate datasets. Model 5 does perform better on the validate set, but not so much on the train set.