In [1]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# How Good is your model? 

# Classification Metrics
* Measuring model performance with accuracy:
    * fraction of correctly classified samples
    * not always a useful metric
### Class imbalance example: Emails
* Spam classification
    * 99% of emails are real and 1% are spam
    
* Could build a classifier that predicts ALL emails are real, thus:
    * 99% accurate
        * sounds good...
    * But it's horrible at actually classifying spam
    * fails its purpose
* We thus need a more nuanced method to asses the effectiveness of our model
### Diagnosing Classification Predictions
* confusion matrix
    * Provides accuracy
        *  $ \frac{tp\ +\ tn}{tp\ +\ tn\ +fp\ +\ fn} $
        * t:= true
        * f:= false
        * p:= positive
        * n:= negative
* Precision: 
    * $ \frac{tp}{tp+fp} $
    * positive predictive value
* Recall: 
    * $ \frac{tp\}{tp\ +\ fn} $
    * sensitivity
* F1score: 

    *  $ 2* \frac{"precision"\ *\ recall}{precision\ +\ recall} $
    
* 

In [6]:
diabetes = pd.read_csv('data/diabetes.csv')
diabetes.head()

X = diabetes.drop('diabetes', axis=1).values

y =diabetes.diabetes.values

In [7]:
X.shape

(768, 8)

In [8]:
y.shape

(768,)

In [17]:
# Import necessary modules
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split



logreg = LogisticRegression(max_iter=1000)

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.4, random_state=42)

logreg.fit(X_train, y_train)
# Compute predicted probabilities: y_pred_prob
y_pred_prob = logreg.predict_proba(X_test)[:,1]

# Compute and print AUC score
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# Compute cross-validated AUC scores: cv_auc
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc')

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))


AUC: 0.8150561429391034
AUC scores computed using 5-fold cross-validation: [0.81240741 0.80777778 0.82555556 0.87283019 0.84471698]
