In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import statsmodels as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

import pprint
from pprint import pprint

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/creditcard.csv')
df = df.rename(columns={'Class': 'Fraud'})

In [None]:
df['Fraud'] = df['Fraud'].astype(int)

X = df.drop(['Fraud'], axis = 1)
Y = df["Fraud"]

xData = X.values
yData = Y.values

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(
        xData, yData, test_size = 0.2, random_state = 42)

pipe = Pipeline([('standardScaler', StandardScaler()), ('quantiletransformer', QuantileTransformer()), ('logistic_regression', LogisticRegression())])

In [None]:
xTrain.shape, xTest.shape, yTrain.shape, yTest.shape

((227845, 30), (56962, 30), (227845,), (56962,))

In [None]:
param_grid_pspp = [{
    'logistic_regression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logistic_regression__C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'logistic_regression__penalty': ['l2']
}]

grid_search_pspp = GridSearchCV(pipe, param_grid_pspp, cv=5, scoring= 'recall', verbose=2, n_jobs=-1)
grid_search_pspp.fit(xTrain, yTrain)

Fitting 5 folds for each of 25 candidates, totalling 125 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardScaler', StandardScaler()),
                                       ('quantiletransformer',
                                        QuantileTransformer()),
                                       ('logistic_regression',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'logistic_regression__C': [0.01, 0.1, 1.0, 10.0,
                                                     100.0],
                          'logistic_regression__penalty': ['l2'],
                          'logistic_regression__solver': ['newton-cg', 'lbfgs',
                                                          'liblinear', 'sag',
                                                          'saga']}],
             scoring='recall', verbose=2)

In [None]:
pprint(grid_search_pspp.cv_results_)

{'mean_fit_time': array([ 5.61145859,  3.54838786,  4.29658179,  4.91125631,  5.34654608,
        6.37752495,  4.21260753,  4.02222228,  5.27567477, 10.22523756,
        7.39283924,  4.81795459,  4.56169333,  7.06898284, 13.77252846,
        7.81372771,  5.18265414,  4.89906025,  8.29845471, 15.66867495,
        7.88574333,  5.51786485,  5.05793619,  8.94274349, 16.36387954]),
 'mean_score_time': array([0.43639269, 0.44243989, 0.50339742, 0.41421494, 0.42965169,
       0.43896551, 0.41601362, 0.41080809, 0.40075579, 0.48201661,
       0.42961645, 0.43053937, 0.41668172, 0.39964752, 0.40363965,
       0.42514834, 0.4056612 , 0.39823723, 0.37792201, 0.39890742,
       0.44397483, 0.41767049, 0.42583599, 0.3954783 , 0.37329655]),
 'mean_test_score': array([0.        , 0.        , 0.03050957, 0.        , 0.        ,
       0.69308666, 0.69308666, 0.70574489, 0.69308666, 0.69308666,
       0.76153846, 0.76153846, 0.76153846, 0.76153846, 0.76153846,
       0.76660175, 0.76660175, 0.76660175,

In [None]:
pprint(grid_search_pspp.best_estimator_)

Pipeline(steps=[('standardScaler', StandardScaler()),
                ('quantiletransformer', QuantileTransformer()),
                ('logistic_regression',
                 LogisticRegression(penalty='none', solver='newton-cg'))])


In [None]:
param_grid_pspp = [{
    'logistic_regression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'logistic_regression__C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'logistic_regression__penalty': ['none']
}]

grid_search_pspp = GridSearchCV(pipe, param_grid_pspp, cv=5, scoring= 'recall', verbose=2, n_jobs=-1)
grid_search_pspp.fit(xTrain, yTrain)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  "Setting penalty='none' will ignore the C and l1_ratio parameters"


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardScaler', StandardScaler()),
                                       ('quantiletransformer',
                                        QuantileTransformer()),
                                       ('logistic_regression',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'logistic_regression__C': [0.01, 0.1, 1.0, 10.0,
                                                     100.0],
                          'logistic_regression__penalty': ['none'],
                          'logistic_regression__solver': ['newton-cg', 'lbfgs',
                                                          'sag', 'saga']}],
             scoring='recall', verbose=2)

In [None]:
pprint(grid_search_pspp.cv_results_)

{'mean_fit_time': array([ 8.35656333,  6.17518458,  9.59073372, 16.53544998,  8.42137256,
        5.45700531,  9.72683573, 16.73693056,  7.83388543,  5.66814079,
        9.96281152, 16.15139246,  8.38768883,  5.28316269,  9.21215415,
       15.54550443,  7.58168201,  5.67666721,  9.34405022, 14.6911675 ]),
 'mean_score_time': array([0.53173752, 0.53239403, 0.4217598 , 0.41385984, 0.51495996,
       0.43952479, 0.39896436, 0.4089901 , 0.44433355, 0.44010606,
       0.38806386, 0.39944386, 0.46180649, 0.44521809, 0.3779284 ,
       0.38667736, 0.42122722, 0.51970048, 0.37006984, 0.36328964]),
 'mean_test_score': array([0.76660175, 0.76660175, 0.76660175, 0.76660175, 0.76660175,
       0.76660175, 0.76660175, 0.76660175, 0.76660175, 0.76660175,
       0.76660175, 0.76660175, 0.76660175, 0.76660175, 0.76660175,
       0.76660175, 0.76660175, 0.76660175, 0.76660175, 0.76660175]),
 'param_logistic_regression__C': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
      

In [None]:
param_grid_pspp = [{
    'logistic_regression__solver': ['liblinear', 'saga'],
    'logistic_regression__C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'logistic_regression__penalty': ['l1']
}]

grid_search_pspp = GridSearchCV(pipe, param_grid_pspp, cv=5, scoring= 'recall', verbose=2, n_jobs=-1)
grid_search_pspp.fit(xTrain, yTrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardScaler', StandardScaler()),
                                       ('quantiletransformer',
                                        QuantileTransformer()),
                                       ('logistic_regression',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'logistic_regression__C': [0.01, 0.1, 1.0, 10.0,
                                                     100.0],
                          'logistic_regression__penalty': ['l1'],
                          'logistic_regression__solver': ['liblinear',
                                                          'saga']}],
             scoring='recall', verbose=2)

In [None]:
pprint(grid_search_pspp.cv_results_)

{'mean_fit_time': array([ 4.67361164,  6.68351402,  8.20449524, 12.76315031, 17.0774261 ,
       18.37793341, 25.11043143, 21.75551462, 23.38286572, 21.98476739]),
 'mean_score_time': array([0.48502173, 0.47519479, 0.56765399, 0.4524765 , 0.47069955,
       0.45157747, 0.47144833, 0.46733499, 0.48290901, 0.46687922]),
 'mean_test_score': array([0.        , 0.        , 0.75144434, 0.72859461, 0.76660175,
       0.76660175, 0.76660175, 0.76660175, 0.76660175, 0.76660175]),
 'param_logistic_regression__C': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1.0, 1.0, 10.0, 10.0, 100.0,
                   100.0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object),
 'param_logistic_regression__penalty': masked_array(data=['l1', 'l1', 'l1', 'l1', 'l1', 'l1', 'l1', 'l1', 'l1',
                   'l1'],
             mask=[False, False, False, False, False, False, False, False,
                   Fal