In [1]:
import env
import acquire
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import os

Given the following confusion matrix, evaluate (by hand) the model's performance.

|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |

In the context of this problem, what is a false positive?
In the context of this problem, what is a false negative?
How would you describe this model?

In [None]:
# False positive: you predicted dog but it was actually a cat

In [None]:
# False negative: you predicted a cat but it actually was a dog

In [2]:
tp = 46
tn = 34
fn = 7
fp = 13

In [4]:
accuracy = (tp + tn) / (tp + fp + fn + tn)
print(f'The accuracy of this model is {accuracy:.2%}')

The accuracy of this model is 80.00%


In [5]:
precision = tp / (tp + fp)
print(f'The precision of this model is {precision:.2%}')

The precision of this model is 77.97%


In [6]:
recall = tp / (tp + fn)
print(f'The recall of this model is {recall:.2%}')

The recall of this model is 86.79%


In [9]:
misclassification_rate = 1 - accuracy
print(f'The misclassification rate of this model is {misclassification_rate:.2%}')

The misclassification rate of this model is 20.00%


In [10]:
sensitivity = tp / (tp + fn)
print(f'The sensitivity of this model is {sensitivity:.2%}')

The sensitivity of this model is 86.79%


In [11]:
specificity = tn / (fp + tn)
print(f'The specificity of this model is {specificity:.2%}')

The specificity of this model is 72.34%


In [12]:
negative_predictive_value = tn / (tn + fn)
print(f'The negative predictive value of this model is {negative_predictive_value:.2%}')

The negative predictive value of this model is 82.93%


In [13]:
f1_score = 2 * ((precision * recall) / (precision + recall))
print(f'The F1 Score of this model is {f1_score:.2%}')

The F1 Score of this model is 82.14%


3. You are working as a datascientist working for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.

In [19]:
url = 'https://ds.codeup.com/data/c3.csv'

In [20]:
df = pd.read_csv(url)

In [21]:
df

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect
...,...,...,...,...
195,No Defect,No Defect,Defect,Defect
196,Defect,Defect,No Defect,No Defect
197,No Defect,No Defect,No Defect,No Defect
198,No Defect,No Defect,Defect,Defect


 - An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [22]:
pd.crosstab(df.actual, df.model1)

model1,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,8,8
No Defect,2,182


In [None]:
# For model 1:
# if defect is positive

m1_TP = 8
m1_TN = 182
m1_FP = 2
m1_FN = 8

In [None]:
# Recall Models

In [None]:
# focuses on the positive so defects

In [26]:
df['baseline'] = 'Defect' 

In [27]:
# This subset holds all the values in which the rubber duck
# was indeed defected. We pull this in order to access it against
# the predicted defects

subset = df[df.actual == 'Defect']

In [28]:
subset

Unnamed: 0,actual,model1,model2,model3,baseline
13,Defect,No Defect,Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect,Defect
65,Defect,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect,Defect
74,Defect,No Defect,No Defect,Defect,Defect
87,Defect,No Defect,Defect,Defect,Defect
118,Defect,No Defect,Defect,No Defect,Defect
135,Defect,Defect,No Defect,Defect,Defect
140,Defect,No Defect,Defect,Defect,Defect
147,Defect,Defect,No Defect,Defect,Defect


In [32]:
model1_recall = (subset.model1 == subset.actual)
model1_recall.mean()
# So model 1 only caught 50% of the defected rubber duckies

0.5

In [33]:
model2_recall = (subset.model2 == subset.actual).mean()
model2_recall
# So model 2 only caught 56.25% of the defected rubber duckies

0.5625

In [34]:
model3_recall = (subset.model3 == subset.actual).mean()
model3_recall
# so model 3 caught 81.25% of the defected rubber duckies

0.8125

Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you the really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [None]:
# So we need to find false positives 
# we want to identify the model that finds the false 
# positives
# Right???


In [37]:
subset = df[df.actual == 'No Defect']
subset

Unnamed: 0,actual,model1,model2,model3,baseline
0,No Defect,No Defect,Defect,No Defect,Defect
1,No Defect,No Defect,Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect,Defect
3,No Defect,Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect,Defect
...,...,...,...,...,...
193,No Defect,No Defect,Defect,Defect,Defect
195,No Defect,No Defect,Defect,Defect,Defect
197,No Defect,No Defect,No Defect,No Defect,Defect
198,No Defect,No Defect,Defect,Defect,Defect


In [46]:
model1_recall = (subset.actual != subset.model1).mean()
(model1_recall * 100)
# So when there was no defect, model 1's predictions said there
# was a defect 1.09% of the time

1.0869565217391304

In [47]:
model2_recall = (subset.actual != subset.model2).mean()
model2_recall * 100
# So model 2 said there was a defect when infact there was
# no defect 44.02% of the time

44.02173913043478

In [48]:
model3_recall = (subset.actual != subset.model3).mean()
model3_recall * 100
# So model 3 said that there was a defect when infact there
# was not a defect 46.74% of the time

46.73913043478261

Seems like model 1 will send the least amount of people to Hawaii 
under a false positive (they got a duck with zero defects)

4. You are working as a data scientist for Gives You Paws ™, a subscription based service that shows you cute pictures of dogs or cats (or both for an additional fee).
At Gives You Paws, anyone can upload pictures of their cats or dogs. The photos are then put through a two step process. First an automated algorithm tags pictures as either a cat or a dog (Phase I). Next, the photos that have been initially identified are put through another round of review, possibly with some human oversight, before being presented to the users (Phase II).

In [58]:
pet_df = pd.read_csv('https://ds.codeup.com/data/gives_you_paws.csv')

In [59]:
pet_df

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog
...,...,...,...,...,...
4995,dog,dog,dog,dog,dog
4996,dog,dog,cat,cat,dog
4997,dog,cat,cat,dog,dog
4998,cat,cat,cat,cat,dog


Given this dataset, use pandas to create a baseline model (i.e. a model that just predicts the most common class) and answer the following questions:

In [60]:
pet_df['baseline'] = 'dog'
# we've established dog as the positive

In [61]:
pet_df.actual.value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

a. In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?

In [None]:
# SO lets get the accuracy of our baseline model, compared to
# the other models

In [62]:
pet_df

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog
...,...,...,...,...,...,...
4995,dog,dog,dog,dog,dog,dog
4996,dog,dog,cat,cat,dog,dog
4997,dog,cat,cat,dog,dog,dog
4998,cat,cat,cat,cat,dog,dog


In [63]:
baseline_subset = pet_df[pet_df.baseline == 'dog']

In [65]:
baseline_precision = (baseline_subset.baseline == baseline_subset.actual).mean()

In [67]:
model1_subset = pet_df[pet_df.model1 == 'dog']
model1_precision = (model1_subset.model1 == model1_subset.actual).mean()

In [68]:
model2_subset = pet_df[pet_df.model2 == 'dog']
model2_precision = (model2_subset.model2 == model2_subset.actual).mean()

In [69]:
model3_subset = pet_df[pet_df.model3 == 'dog']
model3_precision = (model3_subset.model3 == model3_subset.actual).mean()

In [70]:
model4_subset = pet_df[pet_df.model4 == 'dog']
model4_precision = (model4_subset.model4 == model4_subset.actual).mean()

In [72]:
print(f'The baseline precision for the Gives You Paws model is {baseline_precision:.2%}')
print(f'The model 1 precision is {model1_precision:.2%}')
print(f'The model 2 precision is {model2_precision:.2%}')
print(f'The model 3 precision is {model3_precision:.2%}')
print(f'The model 4 precision is {model4_precision:.2%}')

The baseline precision for the Gives You Paws model is 65.08%
The model 1 precision is 89.00%
The model 2 precision is 89.32%
The model 3 precision is 65.99%
The model 4 precision is 73.12%


AAANNNDDDDD I Just realized I did precision and not accuracy...... ugh

In [75]:
model1_accuracy = (pet_df.actual == pet_df.model1).mean()
model2_accuracy = (pet_df.actual == pet_df.model2).mean()
model3_accuracy = (pet_df.actual == pet_df.model3).mean()
model4_accuracy = (pet_df.actual == pet_df.model4).mean()

In [76]:
baseline_accuracy = (pet_df.actual == pet_df.baseline).mean()

In [77]:
print(f'The baseline accuracy for the Gives You Paws model is {baseline_accuracy:.2%}')
print(f'The model 1 accuracy is {model1_accuracy:.2%}')
print(f'The model 2 accuracy is {model2_accuracy:.2%}')
print(f'The model 3 accuracy is {model3_accuracy:.2%}')
print(f'The model 4 accuracy is {model4_accuracy:.2%}')

The baseline accuracy for the Gives You Paws model is 65.08%
The model 1 accuracy is 80.74%
The model 2 accuracy is 63.04%
The model 3 accuracy is 50.96%
The model 4 accuracy is 74.26%


Models 1 and 4 performed significantly better than the baseline model, however models 2 and 3 performed worse than the baseline model. 

b. Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recommend?

In [None]:
# I would recoomend the recall evaluation metric as it
# measures the positive values which we can control as 'dog'

In [79]:
subset = pet_df[pet_df.actual == 'dog']
subset

Unnamed: 0,actual,model1,model2,model3,model4,baseline
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
5,dog,dog,dog,dog,dog,dog
8,dog,dog,cat,dog,dog,dog
...,...,...,...,...,...,...
4993,dog,dog,cat,dog,dog,dog
4995,dog,dog,dog,dog,dog,dog
4996,dog,dog,cat,cat,dog,dog
4997,dog,cat,cat,dog,dog,dog


In [82]:
model1_recall = (subset.actual == subset.model1).mean()
model2_recall = (subset.actual == subset.model2).mean()
model3_recall = (subset.actual == subset.model3).mean()
model4_recall = (subset.actual == subset.model4).mean()

In [84]:
print(f'Given the accuracy of predicting specificlly dog pictures, this is how the four models did:\n')
print(f' Model 1: {model1_recall:.2%}')
print(f' Model 2: {model2_recall:.2%}')
print(f' Model 3: {model3_recall:.2%}')
print(f' Model 4: {model4_recall:.2%}')

Given the accuracy of predicting specificlly dog pictures, this is how the four models did:

 Model 1: 80.33%
 Model 2: 49.08%
 Model 3: 50.86%
 Model 4: 95.57%


So, in order to predict the most dog pictures most accurately, I would recommend using model 4.

c. Suppose you are working on a team that solely deals with cat pictures. Which of these models would you recommend?

So I would do a recall evaluation but just change the positive value to cat.

In [85]:
subset_cat = pet_df[pet_df.actual == 'cat']
subset_cat

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog
6,cat,cat,cat,cat,dog,dog
7,cat,dog,cat,cat,dog,dog
11,cat,cat,dog,cat,cat,dog
...,...,...,...,...,...,...
4987,cat,dog,cat,dog,dog,dog
4989,cat,cat,cat,dog,cat,dog
4991,cat,cat,cat,cat,dog,dog
4994,cat,cat,cat,dog,dog,dog


In [86]:
model1_cat_recall = (subset_cat.actual == subset_cat.model1).mean()
model2_cat_recall = (subset_cat.actual == subset_cat.model2).mean()
model3_cat_recall = (subset_cat.actual == subset_cat.model3).mean()
model4_cat_recall = (subset_cat.actual == subset_cat.model4).mean()

In [87]:
print(f'Given the accuracy of predicting specificlly cat pictures, this is how the four models did:\n')
print(f' Model 1: {model1_cat_recall:.2%}')
print(f' Model 2: {model2_cat_recall:.2%}')
print(f' Model 3: {model3_cat_recall:.2%}')
print(f' Model 4: {model4_cat_recall:.2%}')

Given the accuracy of predicting specificlly cat pictures, this is how the four models did:

 Model 1: 81.50%
 Model 2: 89.06%
 Model 3: 51.15%
 Model 4: 34.54%


In [88]:
print(f'Given the accuracy of predicting specificlly dog pictures, this is how the four models did:\n')
print(f' Model 1: {model1_recall:.2%}')
print(f' Model 2: {model2_recall:.2%}')
print(f' Model 3: {model3_recall:.2%}')
print(f' Model 4: {model4_recall:.2%}')

Given the accuracy of predicting specificlly dog pictures, this is how the four models did:

 Model 1: 80.33%
 Model 2: 49.08%
 Model 3: 50.86%
 Model 4: 95.57%


For best predicting cat pictures, I would recommend model 2. 

5. Follow the links below to read the documentation about each function, then apply those functions to the data from the previous problem.

In [94]:
from sklearn.metrics import accuracy_score, precision_score

In [104]:
accuracy_score(pet_df.actual, pet_df.baseline)

0.6508

In [105]:
baseline_accuracy

0.6508

In [92]:
accuracy_score(pet_df.actual, pet_df.baseline) == baseline_accuracy

True

In [102]:
precision_score(baseline_subset.actual, baseline_subset.baseline, average=None, zero_division=1)

array([1.    , 0.6508])

In [103]:
baseline_precision

0.6508

In [106]:
from sklearn.metrics import recall_score

In [108]:
recall_score(subset_cat.actual, subset_cat.model1, average=None)

  _warn_prf(average, modifier, msg_start, len(result))


array([0.81500573, 0.        ])

In [109]:
model1_cat_recall

0.8150057273768614

In [110]:
from sklearn.metrics import classification_report

In [111]:
classification_report?

In [113]:
print(classification_report(pet_df.actual, pet_df.model1))

              precision    recall  f1-score   support

         cat       0.69      0.82      0.75      1746
         dog       0.89      0.80      0.84      3254

    accuracy                           0.81      5000
   macro avg       0.79      0.81      0.80      5000
weighted avg       0.82      0.81      0.81      5000



In [114]:
accuracy_score?