In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from acquire import *
from prepare import *

In [2]:
titanic = get_titanic_data()
titanic = prep_titanic_data(titanic)
train, validate, test = split_data(titanic, 'survived')
train.shape, validate.shape, test.shape

((498, 11), (214, 11), (179, 11))

In [3]:
columns = ['pclass', 'sex_male', 'alone']

X_train = train[columns]
y_train = train.survived

X_validate = validate[columns]
y_validate = validate.survived

X_test = test[columns]
y_test = test.survived

# Decision Tree Exercises

## Titanic Dataset

### 1. 

What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [4]:
train.survived.value_counts(normalize = True)

0    0.616466
1    0.383534
Name: survived, dtype: float64

Our baseline model would be predicting the most common target value, which is did not survive. Given a baseline model that always predicts did not survive the accuracy of this model would be ~62%.

### 2.

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [5]:
model1 = DecisionTreeClassifier(max_depth = 2)

In [6]:
model1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2)

In [7]:
y_pred = model1.predict(X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

### 3

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [8]:
model1.score(X_train, y_train)

0.7831325301204819

In [9]:
index_labels = ['Actual Did Not Survive', 'Actual Survived']
column_labels = ['Predicted Did Not Survive', 'Predicted Survived']
pd.DataFrame(confusion_matrix(y_train, y_pred, labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,303,4
Actual Survived,104,87


In [10]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.99      0.85       307
           1       0.96      0.46      0.62       191

    accuracy                           0.78       498
   macro avg       0.85      0.72      0.73       498
weighted avg       0.83      0.78      0.76       498



### 4

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

> Accuracy:            0.78
<br>
> True Positive Rate:  0.17
<br>
> False Positive Rate: 0.008
<br>
> True Negative Rate:  0.61
<br>
> False Negative rate: 0.21
<br>
> Precision:           0.96, 0.74
<br>
> Recall:              0.46, 0.99
<br>
> f1-score:            0.62, 0.85
<br>
> Support:             498

### 5

Run through steps 2-4 using a different max_depth value.

In [11]:
model2 = DecisionTreeClassifier(max_depth = 4)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,

In [12]:
model2.score(X_train, y_train)

0.7951807228915663

In [13]:
pd.DataFrame(confusion_matrix(y_train, y_pred, labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,286,21
Actual Survived,81,110


In [14]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       307
           1       0.84      0.58      0.68       191

    accuracy                           0.80       498
   macro avg       0.81      0.75      0.77       498
weighted avg       0.80      0.80      0.79       498



### 6

Which model performs better on your in-sample data?

> In terms of accuracy model 2 performed better, but only slightly better.

### 7

Which model performs best on your out-of-sample data, the validate set?

In [15]:
model1.score(X_validate, y_validate), model2.score(X_validate, y_validate)

(0.7897196261682243, 0.794392523364486)

Model 2 again performs slightly better on the validate set. Compared to the baseline which had an accuracy of 0.62 both models have better performance.

## Telco Dataset

### Acquire Data

In [16]:
telco = get_telco_data()
telco = prep_telco_data(telco)
train, validate, test = split_data(telco, 'churn')
train.shape, validate.shape, test.shape

((3937, 47), (1688, 47), (1407, 47))

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3937 entries, 5467 to 2212
Data columns (total 47 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 3937 non-null   object 
 1   senior_citizen                         3937 non-null   int64  
 2   partner                                3937 non-null   object 
 3   dependents                             3937 non-null   object 
 4   tenure                                 3937 non-null   int64  
 5   phone_service                          3937 non-null   object 
 6   multiple_lines                         3937 non-null   object 
 7   online_security                        3937 non-null   object 
 8   online_backup                          3937 non-null   object 
 9   device_protection                      3937 non-null   object 
 10  tech_support                           3937 non-null   object 
 11  s

### Establish Baseline

In [18]:
train.churn.value_counts(normalize = True)

No     0.734315
Yes    0.265685
Name: churn, dtype: float64

Our baseline model will be always predicting a customer does not churn which will have an accuracy of 0.73.

### Create Model

> We'll look at a model utilizing the internet_service_type_Fiber optic, internet_service_type_None, tenure, and monthly_charges columns.

In [19]:
columns = ['internet_service_type_Fiber optic', 'internet_service_type_None', 'tenure', 'monthly_charges']

X_train = train[columns]
y_train = train.churn

X_validate = validate[columns]
y_validate = validate.churn

X_test = test[columns]
y_test = test.churn

In [20]:
model1 = DecisionTreeClassifier(max_depth = 3, random_state = 24)
model1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=24)

In [21]:
model1.score(X_train, y_train)

0.7866395732791466

In [22]:
print(classification_report(y_train, model1.predict(X_train)))

              precision    recall  f1-score   support

          No       0.81      0.93      0.86      2891
         Yes       0.66      0.40      0.50      1046

    accuracy                           0.79      3937
   macro avg       0.74      0.66      0.68      3937
weighted avg       0.77      0.79      0.77      3937



### A Model With Different max_depth

In [23]:
model2 = DecisionTreeClassifier(max_depth = 9, random_state = 24)
model2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=9, random_state=24)

In [24]:
model2.score(X_train, y_train)

0.8432816865633731

In [25]:
print(classification_report(y_train, model2.predict(X_train)))

              precision    recall  f1-score   support

          No       0.87      0.92      0.90      2891
         Yes       0.74      0.64      0.68      1046

    accuracy                           0.84      3937
   macro avg       0.81      0.78      0.79      3937
weighted avg       0.84      0.84      0.84      3937



In [26]:
model1.score(X_validate, y_validate), model2.score(X_validate, y_validate)

(0.7932464454976303, 0.7648104265402843)

### A Model Using Different Features

> Here we will use tenure, payment_type, and contract_type

In [27]:
columns = [
    'payment_type_Credit card (automatic)',
    'payment_type_Electronic check',
    'payment_type_Mailed check',
    'contract_type_One year',
    'contract_type_Two year',
    'tenure'
]

X_train = train[columns]
y_train = train.churn

X_validate = validate[columns]
y_validate = validate.churn

X_test = test[columns]
y_test = test.churn

In [28]:
model3 = DecisionTreeClassifier(max_depth = 9, random_state = 24)
model3.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=9, random_state=24)

In [29]:
model3.score(X_train, y_train)

0.7858775717551435

In [30]:
print(classification_report(y_train, model3.predict(X_train)))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      2891
         Yes       0.63      0.46      0.54      1046

    accuracy                           0.79      3937
   macro avg       0.73      0.68      0.70      3937
weighted avg       0.77      0.79      0.77      3937



> Let's try again with a different max_depth

In [31]:
model4 = DecisionTreeClassifier(max_depth = 15, random_state = 24)
model4.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=15, random_state=24)

In [32]:
model4.score(X_train, y_train)

0.7899415798831597

In [33]:
model3.score(X_validate, y_validate), model4.score(X_validate, y_validate)

(0.7618483412322274, 0.7541469194312796)

### Takeaways

Model 1 has consistent performance in predicting customer churn even with out of sample data. Compared to the baseline which had an accuracy of 73% model 1 does perform better.

# Random Forest Exercises

## 1

>Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [34]:
titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [35]:
train, validate, test = split_data(titanic, 'survived')

In [36]:
features = ['pclass', 'alone', 'sex_male', 'sibsp', 'fare']

X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [37]:
model1 = RandomForestClassifier(min_samples_leaf = 1, max_depth = 10)
model1.fit(X_train, y_train)

RandomForestClassifier(max_depth=10)

In [38]:
model1.predict(X_train)

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,

## 2

> Evaluate your results using the model score, confusion matrix, and classification report.

In [39]:
model1.score(X_train, y_train)

0.9116465863453815

In [40]:
index_labels = ['Actual Did Not Survive', 'Actual Survived']
column_labels = ['Predicted Did Not Survive', 'Predicted Survived']
pd.DataFrame(confusion_matrix(y_train, model1.predict(X_train), labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,294,13
Actual Survived,31,160


In [41]:
print(classification_report(y_train, model1.predict(X_train)))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       307
           1       0.92      0.84      0.88       191

    accuracy                           0.91       498
   macro avg       0.91      0.90      0.90       498
weighted avg       0.91      0.91      0.91       498



## 3

> Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

- Accuracy:            0.91
- True Positive Rate:  0.33
- False Positive Rate: 0.03
- True Negative Rate:  0.58
- False Negative Rate: 0.05
- Precision:
--    Survived:        0.91
--    Did Not Survive: 0.92
- Recall:
--    Survived:        0.86
--    Did Not Survive: 0.95
- f1-score:
--    Survived:        0.88
--    Did Not Survive: 0.93
- Support:             498

## 4

> Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [42]:
models = [model1]
start = 2
stop = 11

for i in range(start, stop):
    clf = RandomForestClassifier(min_samples_leaf = i, max_depth = stop - i)
    clf.fit(X_train, y_train)
    print(classification_report(y_train, clf.predict(X_train)))
    
    models.append(clf)

              precision    recall  f1-score   support

           0       0.85      0.95      0.90       307
           1       0.90      0.72      0.80       191

    accuracy                           0.86       498
   macro avg       0.87      0.84      0.85       498
weighted avg       0.87      0.86      0.86       498

              precision    recall  f1-score   support

           0       0.84      0.93      0.89       307
           1       0.87      0.72      0.79       191

    accuracy                           0.85       498
   macro avg       0.86      0.83      0.84       498
weighted avg       0.85      0.85      0.85       498

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.86      0.69      0.77       191

    accuracy                           0.84       498
   macro avg       0.85      0.81      0.82       498
weighted avg       0.84      0.84      0.84       498

              preci

## 5

> What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

As min_samples_leaf is increased and max_depth is decreased the performance of the model overall decreases with min_samples_leaf at 1 and max_depth at 10 providing the best performance overall. This is likely because the decision trees generated by the random forest provide better predictions given they have a greater max_depth.

## *

> After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [43]:
for index, model in enumerate(models):
    print(f'Train score for model {index + 1}: {model.score(X_train, y_train)}')
    print(f'Validate score for model {index + 1}: {model.score(X_validate, y_validate)}')
    print(f'Validate precision for model {index + 1}: {precision_score(y_validate, model.predict(X_validate), pos_label = 0)}')
    print(f'Validate recall for model {index + 1}: {recall_score(y_validate, model.predict(X_validate), pos_label = 0)}')
    print('-------------------------')

Train score for model 1: 0.9116465863453815
Validate score for model 1: 0.794392523364486
Validate precision for model 1: 0.7933333333333333
Validate recall for model 1: 0.9015151515151515
-------------------------
Train score for model 2: 0.8634538152610441
Validate score for model 2: 0.8130841121495327
Validate precision for model 2: 0.7987012987012987
Validate recall for model 2: 0.9318181818181818
-------------------------
Train score for model 3: 0.8514056224899599
Validate score for model 3: 0.794392523364486
Validate precision for model 3: 0.7894736842105263
Validate recall for model 3: 0.9090909090909091
-------------------------
Train score for model 4: 0.8393574297188755
Validate score for model 4: 0.8037383177570093
Validate precision for model 4: 0.7848101265822784
Validate recall for model 4: 0.9393939393939394
-------------------------
Train score for model 5: 0.8313253012048193
Validate score for model 5: 0.8037383177570093
Validate precision for model 5: 0.78125
Validat

When testing the models on the validate set all models have similar accuracy, precision and recall with some variety of scores for each.

# KNN Exericses

In [44]:
titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [45]:
train.shape, validate.shape, test.shape

((498, 11), (214, 11), (179, 11))

## 1

> Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [46]:
model1 = KNeighborsClassifier(n_neighbors = 2, weights = 'uniform')
model1.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [47]:
model1.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,

## 2

> Evaluate your results using the model score, confusion matrix, and classification report.

In [48]:
model1.score(X_train, y_train)

0.8232931726907631

In [49]:
index_labels = ['Actual Did Not Survive', 'Actual Survived']
column_labels = ['Predicted Did Not Survive', 'Predicted Survived']
pd.DataFrame(confusion_matrix(y_train, model1.predict(X_train), labels = [0, 1]), index = index_labels, columns = column_labels)

Unnamed: 0,Predicted Did Not Survive,Predicted Survived
Actual Did Not Survive,304,3
Actual Survived,85,106


In [50]:
print(classification_report(y_train, model1.predict(X_train)))

              precision    recall  f1-score   support

           0       0.78      0.99      0.87       307
           1       0.97      0.55      0.71       191

    accuracy                           0.82       498
   macro avg       0.88      0.77      0.79       498
weighted avg       0.85      0.82      0.81       498



## 3

> Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

- Accuracy:            0.82
- True Positive Rate:  0.21
- False Positive Rate: 0.01
- True Negative Rate:  0.61
- False Negative Rate: 0.17
- Precision:
--    Survived:        0.97
--    Did Not Survive: 0.78
- Recall:
--    Survived:        0.55
--    Did Not Survive: 0.99
- f1-score:
--    Survived:        0.71
--    Did Not Survive: 0.87
- Support:             498

## 4

> Run through steps 1-3 setting k to 10

In [51]:
model2 = KNeighborsClassifier(n_neighbors = 10, weights = 'uniform')
model2.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [52]:
model2.predict(X_train)

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,

In [53]:
print(classification_report(y_train, model2.predict(X_train)))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       307
           1       0.77      0.65      0.71       191

    accuracy                           0.79       498
   macro avg       0.79      0.77      0.77       498
weighted avg       0.79      0.79      0.79       498



## 5

> Run through setps 1-3 setting k to 20

In [54]:
model3 = KNeighborsClassifier(n_neighbors = 20, weights = 'uniform')
model3.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [55]:
model3.predict(X_train)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,

In [56]:
print(classification_report(y_train, model3.predict(X_train)))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.69      0.50      0.58       191

    accuracy                           0.72       498
   macro avg       0.71      0.68      0.69       498
weighted avg       0.72      0.72      0.71       498



## 6

> What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Model 2 and 3 have the same performance and both perform better than model 1 in terms of accuracy and recall, but model 1 has better precision.

## 7

> Which model performs best on our out-of-sample data from validate?

In [57]:
models = [model1, model2, model3]

for index, model in enumerate(models):
    print(f'Model {index + 1}')
    print(classification_report(y_validate, model.predict(X_validate)))

Model 1
              precision    recall  f1-score   support

           0       0.75      0.93      0.83       132
           1       0.82      0.49      0.61        82

    accuracy                           0.76       214
   macro avg       0.78      0.71      0.72       214
weighted avg       0.77      0.76      0.74       214

Model 2
              precision    recall  f1-score   support

           0       0.76      0.86      0.81       132
           1       0.72      0.56      0.63        82

    accuracy                           0.75       214
   macro avg       0.74      0.71      0.72       214
weighted avg       0.74      0.75      0.74       214

Model 3
              precision    recall  f1-score   support

           0       0.70      0.87      0.78       132
           1       0.66      0.40      0.50        82

    accuracy                           0.69       214
   macro avg       0.68      0.64      0.64       214
weighted avg       0.69      0.69      0.67       

Overall as k is increased the performance of the model decreases with the model having k = 2 with the best performance. This is likely due to a larger value of k resulting in new data points being essentially dragged in too many directions and thus being more likely to be misclassified.

# Logistic Regression Exercises

## 1

> Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?