In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score, auc, roc_curve
from sklearn.linear_model import LogisticRegression

### Case 1: Balanced Dataset

In [22]:
# Synthetic binary-classification problem: 10,000 rows, 4 features, none of
# them redundant. weights=[0.5] requests the balanced class mix for Case 1.
X, y = make_classification(
    n_samples=10000,
    n_features=4,
    n_redundant=0,
    random_state=42,
    weights=[0.5],
)

# Wrap the raw arrays in labelled frames: feat_1..feat_n plus a 'label' column.
columns = ['feat_{}'.format(idx) for idx in range(1, X.shape[1] + 1)]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame({'label': y})

# Both frames carry the same default index, so an index-on-index merge pairs
# every feature row with its own label.
df = pd.merge(X, y, left_index=True, right_index=True)

In [119]:
# Show one randomly chosen row to sanity-check the merged frame's columns.
df.sample()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,label
5883,0.727421,-1.513145,-0.778294,0.784069,0


In [120]:
# Hold out 10% of the rows for testing. shuffle=False keeps the original row
# order, so the split is a deterministic cut rather than a random sample.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

# Feature columns come from X, the target column name from y's single column.
X_train = train_df[X.columns]
y_train = train_df[y.columns[0]]
X_test = test_df[X.columns]
y_test = test_df[y.columns[0]]

In [121]:
# Fit a logistic regression with default settings on the training split.
clf = LogisticRegression()
# fit() stays the cell's last expression so the fitted estimator's repr is shown.
clf.fit(X_train, y_train)

LogisticRegression()

In [122]:
def evaluation(clf, X, y):
    """Score a fitted binary classifier with two threshold-free curve metrics.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    X : features handed to ``clf.predict_proba``.
    y : true binary labels, aligned row-for-row with ``X``.

    Returns
    -------
    dict
        ``'ROC AUC'`` (area under the ROC curve) and ``'PR AUC'`` (area under
        the precision-recall curve), both computed with ``auc``.
    """
    # Second predict_proba column: the score for the second class in
    # clf.classes_ (label 1 for the 0/1 targets used in this notebook).
    positive_scores = clf.predict_proba(X)[:, 1]

    # The threshold arrays returned by both curve helpers are not needed here.
    pr_precision, pr_recall, _ = precision_recall_curve(y, positive_scores)
    roc_fpr, roc_tpr, _ = roc_curve(y, positive_scores)

    # Alternatives: roc_auc_score(y, scores) for the ROC value, and
    # average_precision_score(y, scores) for PR (with a slight difference).
    scores = {}
    scores['ROC AUC'] = auc(roc_fpr, roc_tpr)
    scores['PR AUC'] = auc(pr_recall, pr_precision)
    return scores

In [123]:
# In-sample scores: ROC AUC and PR AUC on the data the model was fitted on.
evaluation(clf, X_train, y_train)

{'ROC AUC': 0.9488691535252812, 'PR AUC': 0.9410024055042296}

In [124]:
# Out-of-sample scores on the held-out 10% split.
evaluation(clf, X_test, y_test)

{'ROC AUC': 0.953169335834175, 'PR AUC': 0.9377326624603982}

### Case 2: Unbalanced Dataset

In [125]:
# Same synthetic problem as the balanced case (same size, features and seed),
# but weights=[0.9] skews the class mix for the unbalanced Case 2.
X, y = make_classification(
    n_samples=10000,
    n_features=4,
    n_redundant=0,
    random_state=42,
    weights=[0.9],
)

# Wrap the raw arrays in labelled frames: feat_1..feat_n plus a 'label' column.
columns = ['feat_{}'.format(idx) for idx in range(1, X.shape[1] + 1)]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame({'label': y})

# Both frames carry the same default index, so an index-on-index merge pairs
# every feature row with its own label.
df = pd.merge(X, y, left_index=True, right_index=True)

In [126]:
# Hold out 10% of the rows for testing. shuffle=False keeps the original row
# order, so the split is a deterministic cut rather than a random sample.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

# Feature columns come from X, the target column name from y's single column.
X_train = train_df[X.columns]
y_train = train_df[y.columns[0]]
X_test = test_df[X.columns]
y_test = test_df[y.columns[0]]

In [127]:
# class_weight='balanced' is the only change from the balanced case: it asks
# the estimator to compensate for the skewed class frequencies during fitting.
clf = LogisticRegression(class_weight='balanced')
# fit() stays the cell's last expression so the fitted estimator's repr is shown.
clf.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [128]:
# In-sample scores for the unbalanced dataset.
evaluation(clf, X_train, y_train)

{'ROC AUC': 0.9325527987637363, 'PR AUC': 0.7045278360634368}

In [129]:
# Out-of-sample scores for the unbalanced dataset (held-out 10% split).
evaluation(clf, X_test, y_test)

{'ROC AUC': 0.9613000145741545, 'PR AUC': 0.7173812596081262}

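PR vs. ROC: the PR curve is the more informative summary under class imbalance, particularly when the positive class is the one we want to weight most heavily. In the results above, ROC AUC on the test split barely moves between the balanced and unbalanced cases (≈0.95 vs. ≈0.96), while PR AUC drops from ≈0.94 to ≈0.72, so only PR AUC reveals how much harder the rare positive class is to identify precisely.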

## Build it Yourself

Notes: `auc` is typically found using the trapezoid rule. 

Source (scikit-learn's implementation): https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/metrics/_ranking.py#L100

In [121]:
def roc_auc_manual(y, y_predict_proba):
    """Approximate ROC AUC by sweeping a fixed grid of decision thresholds.

    For each threshold ``t`` in ``0.00, 0.01, ..., 1.00`` a sample is called
    positive when its score is strictly greater than ``t``. The resulting
    (FPR, TPR) points are integrated with the trapezoid rule. This is a
    coarse stand-in for a curve built from every distinct score, so small
    differences from library values are expected.

    Parameters
    ----------
    y : array-like of 0/1 labels (list, ndarray, Series or one-column frame).
        Only values equal to 1 count as positives and values equal to 0 as
        negatives.
    y_predict_proba : array-like of positive-class scores, positionally
        aligned with ``y``.

    Returns
    -------
    float
        Area under the threshold-grid ROC curve, or ``nan`` when ``y`` lacks
        either positives or negatives (TPR or FPR is then undefined).

    Raises
    ------
    ValueError
        If ``y`` and ``y_predict_proba`` differ in length.

    Notes
    -----
    Because the comparison is strict, a score of exactly 0.0 is never called
    positive, not even at threshold 0.
    """
    labels = np.asarray(y).ravel()
    scores = np.asarray(y_predict_proba, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            'y and y_predict_proba must have the same length, got '
            f'{labels.shape[0]} and {scores.shape[0]}'
        )

    is_pos = labels == 1
    is_neg = labels == 0
    n_pos = np.count_nonzero(is_pos)
    n_neg = np.count_nonzero(is_neg)
    if n_pos == 0 or n_neg == 0:
        # One class is missing, so one of the two rates would be 0 / 0.
        return np.nan

    thresholds = np.arange(0.0, 1.01, .01)

    tpr, fpr = [], []
    for thresh in thresholds:
        called_pos = scores > thresh
        tpr.append(np.count_nonzero(called_pos & is_pos) / n_pos)
        fpr.append(np.count_nonzero(called_pos & is_neg) / n_neg)
    tpr = np.asarray(tpr, dtype=float)
    fpr = np.asarray(fpr, dtype=float)

    # Trapezoid rule written out: sum of width * mean height per segment.
    signed_area = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)

    # Thresholds ascend, so FPR can only stay flat or fall and the integral
    # over FPR is always <= 0. Negate unconditionally: guessing the sign from
    # the first two points is wrong whenever they tie (e.g. no negative
    # scores at or below 0.01), which would flip the result negative.
    return -signed_area


def pr_auc_manual(y, y_predict_proba):
    """Approximate PR AUC by sweeping a fixed grid of decision thresholds.

    For each threshold ``t`` in ``0.00, 0.01, ..., 1.00`` a sample is called
    positive when its score is strictly greater than ``t``. Precision and
    recall at each threshold are integrated (precision over recall) with the
    trapezoid rule. This is not exactly how a library curve is built, but it
    is close enough for comparison.

    Parameters
    ----------
    y : array-like of 0/1 labels (list, ndarray, Series or one-column frame).
        Only values equal to 1 count as positives and values equal to 0 as
        negatives.
    y_predict_proba : array-like of positive-class scores, positionally
        aligned with ``y``.

    Returns
    -------
    float
        Area under the threshold-grid precision-recall curve.

    Raises
    ------
    ValueError
        If ``y`` and ``y_predict_proba`` differ in length.

    Notes
    -----
    Precision is taken as 1 when nothing is called positive, and recall as 1
    when ``y`` contains no positives at all.
    """
    labels = np.asarray(y).ravel()
    scores = np.asarray(y_predict_proba, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            'y and y_predict_proba must have the same length, got '
            f'{labels.shape[0]} and {scores.shape[0]}'
        )

    is_pos = labels == 1
    is_neg = labels == 0
    # TP + FN is the number of actual positives at every threshold, so it is
    # counted once instead of inside the loop.
    n_pos = np.count_nonzero(is_pos)

    precision, recall = [], []
    for thresh in np.arange(0.0, 1.01, .01):
        called_pos = scores > thresh
        tp = np.count_nonzero(called_pos & is_pos)
        fp = np.count_nonzero(called_pos & is_neg)

        precision.append(tp / (tp + fp) if tp + fp > 0 else 1)
        recall.append(tp / n_pos if n_pos > 0 else 1)
    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)

    # Trapezoid rule written out: sum of width * mean height per segment.
    signed_area = np.sum(np.diff(recall) * (precision[1:] + precision[:-1]) / 2.0)

    # Recall can only stay flat or fall as the threshold rises, so the
    # integral over recall is <= 0; negate it to report a positive area.
    return -signed_area


def evaluation(clf, X, y):
    """Report library and hand-rolled AUCs side by side for a fitted classifier.

    This definition replaces the earlier two-metric ``evaluation`` once the
    cell is run.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    X : features handed to ``clf.predict_proba``.
    y : true binary labels, aligned row-for-row with ``X``.

    Returns
    -------
    dict
        ``'ROC AUC'`` and ``'PR AUC'`` from ``roc_curve`` /
        ``precision_recall_curve`` plus ``auc``, and ``'ROC AUC Manual'`` and
        ``'PR AUC Manual'`` from ``roc_auc_manual`` / ``pr_auc_manual``.
    """
    # Second predict_proba column: the score for the second class in
    # clf.classes_ (label 1 for the 0/1 targets used in this notebook).
    positive_scores = clf.predict_proba(X)[:, 1]

    pr_precision, pr_recall, _ = precision_recall_curve(y, positive_scores)
    roc_fpr, roc_tpr, _ = roc_curve(y, positive_scores)

    report = {}
    report['ROC AUC'] = auc(roc_fpr, roc_tpr)
    report['PR AUC'] = auc(pr_recall, pr_precision)
    report['ROC AUC Manual'] = roc_auc_manual(y, positive_scores)
    report['PR AUC Manual'] = pr_auc_manual(y, positive_scores)
    return report

### Balanced Dataset

In [122]:
# Rebuild the balanced dataset (weights=[0.5]) so the manual metrics can be
# compared on the same data as Case 1.
X, y = make_classification(
    n_samples=10000,
    n_features=4,
    n_redundant=0,
    random_state=42,
    weights=[0.5],
)

# Wrap the raw arrays in labelled frames: feat_1..feat_n plus a 'label' column.
columns = ['feat_{}'.format(idx) for idx in range(1, X.shape[1] + 1)]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame({'label': y})
df = pd.merge(X, y, left_index=True, right_index=True)

# Deterministic 90/10 cut: shuffle=False keeps the original row order.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)
X_train = train_df[X.columns]
y_train = train_df[y.columns[0]]
X_test = test_df[X.columns]
y_test = test_df[y.columns[0]]

# fit() stays the cell's last expression so the fitted estimator's repr is shown.
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression()

In [123]:
# Balanced data, training split: library AUCs next to the manual versions.
evaluation(clf, X_train, y_train)

{'ROC AUC': 0.9488691535252812,
 'PR AUC': 0.9410024055042296,
 'ROC AUC Manual': 0.9488153012239954,
 'PR AUC Manual': 0.9441070778755031}

In [124]:
# Balanced data, held-out split: library AUCs next to the manual versions.
evaluation(clf, X_test, y_test)

{'ROC AUC': 0.953169335834175,
 'PR AUC': 0.9377326624603982,
 'ROC AUC Manual': 0.9531573125571107,
 'PR AUC Manual': 0.9413042289698784}

### Unbalanced Dataset

In [125]:
# Rebuild the unbalanced dataset (weights=[0.9]) so the manual metrics can be
# compared on the same data as Case 2.
X, y = make_classification(
    n_samples=10000,
    n_features=4,
    n_redundant=0,
    random_state=42,
    weights=[0.9],
)

# Wrap the raw arrays in labelled frames: feat_1..feat_n plus a 'label' column.
columns = ['feat_{}'.format(idx) for idx in range(1, X.shape[1] + 1)]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame({'label': y})
df = pd.merge(X, y, left_index=True, right_index=True)

# Deterministic 90/10 cut: shuffle=False keeps the original row order.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)
X_train = train_df[X.columns]
y_train = train_df[y.columns[0]]
X_test = test_df[X.columns]
y_test = test_df[y.columns[0]]

# fit() stays the cell's last expression so the fitted estimator's repr is shown.
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [126]:
# Unbalanced data, training split: library AUCs next to the manual versions.
evaluation(clf, X_train, y_train)

{'ROC AUC': 0.9325527987637363,
 'PR AUC': 0.7045278360634368,
 'ROC AUC Manual': 0.9324968892026183,
 'PR AUC Manual': 0.7107998620573341}

In [127]:
# Unbalanced data, held-out split: library AUCs next to the manual versions.
evaluation(clf, X_test, y_test)

{'ROC AUC': 0.9613000145741545,
 'PR AUC': 0.7173812596081262,
 'ROC AUC Manual': 0.9616923956546599,
 'PR AUC Manual': 0.7301772949225389}

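`precision_recall_curve` does not sweep a fixed threshold grid like the manual version above; it derives precision and recall from the true-positive and false-positive counts, as shown here: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/metrics/_ranking.py#L733. This is why the "Manual" PR AUC values differ slightly from the library ones.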

It is better to use an AUC than precision and recall scores at one specific threshold because the AUC summarizes how well the model does across all thresholds, so it isn't bogged down by the arbitrariness of choosing a single cut-off.