In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('creditcard.csv')[:80_000]
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [23]:
X = df.drop(columns = ['Time', 'Amount','Class']).values
y = df['Class'].values
f'Shapes of X={X.shape} y={y.shape}, #Fraud Cases={y.sum()}'

'Shapes of X=(80000, 28) y=(80000,), #Fraud Cases=196'

In [33]:
from sklearn.linear_model import LogisticRegression

mod = LogisticRegression(class_weight={0:1,1:2},max_iter=1000)
mod.fit(X,y).predict(X).sum()

# We set max_iter=1000 bcs by default it was 100, and we have a huge dataset to accommodate that, so we set it to 1000.
# setting class_weight allows us to catch more frauds and make the model more efficient

172

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer

grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid= {'class_weight':[{0:1,1:v} for v in np.linspace(1,20,30)]},
    scoring={'precision':make_scorer(precision_score),'recall_score':make_scorer(recall_score)},
    refit='precision',
    return_train_score=True,
    cv=10,
    n_jobs=1
)
grid.fit(X,y)

# we have to pass precision and recall under make_scorer func first
# used np.linspace for more iterations of v
# cv=10 for better accuracy

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [59]:
from sklearn.metrics import precision_score, recall_score

recall_score(y, grid.predict(X))

# precision_score tells us how accurate I am given that I predict the fraud
# recall_score tells us if we get all the fraud cases

0.6632653061224489

In [60]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,...,split2_train_recall_score,split3_train_recall_score,split4_train_recall_score,split5_train_recall_score,split6_train_recall_score,split7_train_recall_score,split8_train_recall_score,split9_train_recall_score,mean_train_recall_score,std_train_recall_score
0,0.164753,0.014309,0.008141,0.002588,"{0: 1, 1: 1.0}","{'class_weight': {0: 1, 1: 1.0}}",1.0,0.463415,0.583333,1.0,...,0.632768,0.559322,0.573864,0.573864,0.5625,0.619318,0.625,0.602273,0.611049,0.05175
1,0.167189,0.038433,0.007296,0.001007,"{0: 1, 1: 1.6551724137931034}","{'class_weight': {0: 1, 1: 1.6551724137931034}}",1.0,0.44186,0.583333,1.0,...,0.689266,0.627119,0.670455,0.647727,0.630682,0.681818,0.698864,0.6875,0.681366,0.053324
2,0.171041,0.031321,0.007911,0.001839,"{0: 1, 1: 2.310344827586207}","{'class_weight': {0: 1, 1: 2.310344827586207}}",1.0,0.431818,0.583333,1.0,...,0.734463,0.683616,0.715909,0.693182,0.681818,0.721591,0.744318,0.727273,0.723889,0.044372
3,0.163638,0.02633,0.008899,0.004483,"{0: 1, 1: 2.9655172413793105}","{'class_weight': {0: 1, 1: 2.9655172413793105}}",1.0,0.431818,0.583333,1.0,...,0.779661,0.706215,0.744318,0.732955,0.715909,0.755682,0.772727,0.738636,0.749413,0.039118
4,0.171571,0.028462,0.008246,0.002489,"{0: 1, 1: 3.6206896551724137}","{'class_weight': {0: 1, 1: 3.6206896551724137}}",1.0,0.431818,0.583333,1.0,...,0.819209,0.734463,0.761364,0.744318,0.732955,0.778409,0.784091,0.761364,0.770939,0.035655
5,0.180751,0.047346,0.006698,0.000459,"{0: 1, 1: 4.275862068965517}","{'class_weight': {0: 1, 1: 4.275862068965517}}",1.0,0.431818,0.583333,1.0,...,0.841808,0.774011,0.778409,0.789773,0.784091,0.789773,0.801136,0.772727,0.79419,0.029023
6,0.194371,0.020262,0.008782,0.001887,"{0: 1, 1: 4.931034482758621}","{'class_weight': {0: 1, 1: 4.931034482758621}}",1.0,0.431818,0.583333,1.0,...,0.847458,0.79096,0.795455,0.818182,0.795455,0.801136,0.823864,0.795455,0.812898,0.022564
7,0.197451,0.035464,0.008713,0.002233,"{0: 1, 1: 5.586206896551724}","{'class_weight': {0: 1, 1: 5.586206896551724}}",1.0,0.431818,0.583333,1.0,...,0.847458,0.819209,0.829545,0.829545,0.8125,0.8125,0.829545,0.818182,0.82708,0.014785
8,0.187395,0.020509,0.009777,0.002944,"{0: 1, 1: 6.241379310344827}","{'class_weight': {0: 1, 1: 6.241379310344827}}",1.0,0.422222,0.583333,0.947368,...,0.847458,0.830508,0.846591,0.835227,0.818182,0.829545,0.857955,0.835227,0.838996,0.01399
9,0.220176,0.059118,0.011197,0.004589,"{0: 1, 1: 6.896551724137931}","{'class_weight': {0: 1, 1: 6.896551724137931}}",0.944444,0.422222,0.583333,0.947368,...,0.847458,0.836158,0.846591,0.840909,0.829545,0.835227,0.863636,0.846591,0.845233,0.010835


In [39]:
lr = LogisticRegression()
??lr.score #this helps us to see the implementation of score

[1;31mSignature:[0m [0mlr[0m[1;33m.[0m[0mscore[0m[1;33m([0m[0mX[0m[1;33m,[0m [0my[0m[1;33m,[0m [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
    [1;32mdef[0m [0mscore[0m[1;33m([0m[0mself[0m[1;33m,[0m [0mX[0m[1;33m,[0m [0my[0m[1;33m,[0m [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m        [1;34m"""
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
          

In [64]:
plt.figure(figsize=(12,4))
df = pd.DataFrame(grid.cv_results_)
for score in ['mean_test_recall','mean_test_precision']:
    plt.plot([_[1] for _ in df['param_class_weight']],
            df[score],
            label=score)
plt.legend();

KeyError: 'mean_test_recall'

<Figure size 1200x400 with 0 Axes>

In [66]:
plt.figure(figsize=(12, 4))
df_results = pd.DataFrame(grid.cv_results_)
for score in ['mean_test_recall', 'mean_test_precision', 'mean_test_min_both']:
    plt.plot([_[1] for _ in df_results['param_class_weight']], 
             df_results[score], 
             label=score)
plt.legend();


KeyError: 'mean_test_recall'

<Figure size 1200x400 with 0 Axes>