In [3]:
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [5]:
# Load only the first 80,000 transactions. `nrows` makes the parser stop there
# instead of reading the entire CSV and then slicing most of it away.
df = pd.read_csv('creditcard.csv', nrows=80000)
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [9]:
# Features: every column except the timestamp, the transaction amount and the label.
x = df.drop(['Time', 'Amount', 'Class'], axis='columns').values
# Labels: the `Class` column; the summary below sums it to count fraud cases.
y = df['Class'].values
f"Shapes of x={x.shape} y={y.shape} #Fraud Cases={y.sum()}"


'Shapes of x=(80000, 28) y=(80000,) #Fraud Cases=196'

###### Only 196 of the 80,000 transactions are fraud cases, so the dataset is heavily imbalanced

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, make_scorer
# Baseline model: errors on fraud cases (class 1) count twice as much as
# errors on class 0.
mod = LogisticRegression(
    max_iter=1000,
    class_weight={0: 1, 1: 2},
)
mod.fit(x, y)
# Number of training rows the fitted model predicts as class 1.
mod.predict(x).sum()

171

##### Logistic Regression Model

In [11]:
def min_recall_precision(est, X, y_true, sample_weight=None):
    '''Return the smaller of the recall and precision scores of ``est`` on ``X``.

    The signature ``(estimator, X, y)`` lets this be passed directly as a
    scikit-learn ``scoring`` callable. Taking the minimum means a model only
    scores well when recall and precision are both high.

    Parameters
    ----------
    est : fitted estimator exposing ``predict``.
    X : features handed to ``est.predict``.
    y_true : ground-truth labels for ``X``.
    sample_weight : optional per-sample weights forwarded to both metrics;
        ``None`` (the default) leaves the metrics unweighted.

    Returns
    -------
    ``min(recall, precision)`` for the predictions of ``est`` on ``X``.
    '''
    y_pred = est.predict(X)
    # Forward the weights so a caller-supplied `sample_weight` is honoured
    # rather than silently dropped.
    recall = recall_score(y_true, y_pred, sample_weight=sample_weight)
    precision = precision_score(y_true, y_pred, sample_weight=sample_weight)
    return min(recall, precision)

# Sweep the class-1 weight over 30 evenly spaced values from 1 to 20 while
# class 0 stays at weight 1, tracking three metrics for every candidate.
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    {'class_weight': [{0: 1, 1: weight} for weight in np.linspace(1, 20, 30)]},
    scoring=dict(
        precision=make_scorer(precision_score),
        recall_score=make_scorer(recall_score),
        min_both=min_recall_precision,
    ),
    refit='precision',  # the final model is refit with the best-precision setting
    cv=10,              # 10-fold cross-validation
    n_jobs=-1,          # run the search in parallel on all available cores
)
grid.fit(x, y)

In [12]:
# Tabulate the grid-search results: one row per candidate class weight.
df1 = pd.DataFrame(data=grid.cv_results_)

In [18]:
# Put each candidate's precision rank next to its mean cross-validated precision.
cols = [
    'rank_test_precision',
    'mean_test_precision',
]
df1.loc[:, cols]

Unnamed: 0,rank_test_precision,mean_test_precision
0,29,0.780937
1,1,0.882453
2,1,0.882453
3,3,0.881349
4,6,0.874206
5,5,0.874683
6,4,0.876749
7,7,0.872877
8,8,0.863844
9,9,0.85937


In [16]:
# List every column of the grid-search results table (timings, parameters,
# and per-split / mean / std / rank values for each scoring metric).
df1.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_class_weight', 'params', 'split0_test_precision',
       'split1_test_precision', 'split2_test_precision',
       'split3_test_precision', 'split4_test_precision',
       'split5_test_precision', 'split6_test_precision',
       'split7_test_precision', 'split8_test_precision',
       'split9_test_precision', 'mean_test_precision', 'std_test_precision',
       'rank_test_precision', 'split0_test_recall_score',
       'split1_test_recall_score', 'split2_test_recall_score',
       'split3_test_recall_score', 'split4_test_recall_score',
       'split5_test_recall_score', 'split6_test_recall_score',
       'split7_test_recall_score', 'split8_test_recall_score',
       'split9_test_recall_score', 'mean_test_recall_score',
       'std_test_recall_score', 'rank_test_recall_score',
       'split0_test_min_both', 'split1_test_min_both', 'split2_test_min_both',
       'split3_test_min_both', 'split4_test_mi

array([0, 0, 0, ..., 0, 0, 0])