In [30]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score


In [31]:
card_activity = pd.read_csv('creditcard.csv')
card_activity.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [33]:
card_activity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [34]:
X =card_activity.drop(columns=['Time', 'Class'])
y= card_activity['Class']
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [35]:
print(X.shape, y.shape)

(284807, 29) (284807,)


In [36]:
card_activity['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [37]:
#separate 'training' and 'testing' data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [38]:
unique_ytrain, counts_ytrain = np.unique(y_train, return_counts=True)
print(unique_ytrain,counts_ytrain)

[0 1] [213224    381]


In [39]:
unique_ytest, counts_ytest = np.unique(y_test, return_counts=True)
print(unique_ytest,counts_ytest)

[0 1] [71091   111]


In [40]:
print('percentage of positive values in the training sample:', counts_ytrain[1]*100/(counts_ytrain[1]+counts_ytrain[0]))
print('percentage of positive values in the test sample:', counts_ytest[1]*100/(counts_ytest[1]+counts_ytest[0]))

percentage of positive values in the training sample: 0.17836661126846282
percentage of positive values in the test sample: 0.15589449734558017


In [41]:
#///////////////////////////////////////////////////////////////////////////////////
#Since the sample is biased towards non-fraud transactions resampling is necessary
#use the resample package from sklearn
#the resampling should be done after splitting the test and train samples to avoid 
#having the same rows in both test and train samples
#//////////////////////////////////////////////////////////////////////////////////
from sklearn.utils import resample

#Now let's create a new test data sample by combining the X_tarin and y_train samples
#and separate fraud and non-fraud transactions
Xy_train = pd.concat([X_train, y_train], axis=1)

non_fraud = Xy_train[Xy_train['Class']==0]
fraud = Xy_train[Xy_train['Class']==1]

# upsample fraud transactions

fraud_new = resample(fraud,replace=True, # if true Implements resampling with replacement
                          n_samples=len(non_fraud), # no. of samples
                          random_state=1)

# now combine and create a train sample with eqaul no. of fraud and non-fraud transactions
Xy_train_new = pd.concat([non_fraud, fraud_new])

#split back X and y
X_train_new = Xy_train_new.drop('Class',axis=1)
y_train_new = Xy_train_new['Class']
print(X_train_new.shape, y_train_new.shape)

(426448, 29) (426448,)


In [42]:
#Logistic Regression (lbfgs)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=500)#default=100 
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
#scaling
from sklearn.preprocessing import MinMaxScaler
X_scale = MinMaxScaler().fit(X_train_new)

X_train_scaled = X_scale.transform(X_train_new)
X_test_scaled = X_scale.transform(X_test)
np.savetxt("X_train_scaled.csv", X_train_scaled, delimiter=",")
np.savetxt("y_train_new.csv", y_train_new, delimiter=",")
np.savetxt("X_test_scaled.csv", X_test_scaled, delimiter=",")
np.savetxt("y_test.csv", y_test, delimiter=",")

In [44]:
#fit
model = classifier.fit(X_train_scaled, y_train_new)

In [45]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
predictions
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71091
           1       0.08      0.86      0.14       111

    accuracy                           0.98     71202
   macro avg       0.54      0.92      0.57     71202
weighted avg       1.00      0.98      0.99     71202



In [46]:
#try grid search
from sklearn.model_selection import GridSearchCV
C = np.logspace(0,0.25) #did not converge when C is closer to 10
grid = GridSearchCV(classifier, dict(C=C),cv=5, verbose=3)
grid

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=500, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.        , 1.01181716, 1.02...
       1.34137751, 1.35722878, 1.37326737, 1.38949549, 1.40591539,
       1.42252931, 1.43933957, 1.45634848, 1.47355838, 1.49097166,
       1.50859071, 1.52641797, 1.54445589, 1.56270698, 1.58117374,
       1.59985872, 1.61876451, 1.63789371, 1.65724896, 1.67683294,
       1.696648

In [47]:
best_model = grid.fit(X_train_scaled, y_train_new)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=1.0 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=1.0, score=0.952, total=  12.8s
[CV] C=1.0 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.8s remaining:    0.0s


[CV] ............................... C=1.0, score=0.953, total=   8.6s
[CV] C=1.0 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   21.4s remaining:    0.0s


[CV] ............................... C=1.0, score=0.953, total=  11.8s
[CV] C=1.0 ...........................................................
[CV] ............................... C=1.0, score=0.952, total=  10.8s
[CV] C=1.0 ...........................................................
[CV] ............................... C=1.0, score=0.951, total=  11.8s
[CV] C=1.0118171605283166 ............................................
[CV] ................ C=1.0118171605283166, score=0.952, total=   9.6s
[CV] C=1.0118171605283166 ............................................
[CV] ................ C=1.0118171605283166, score=0.953, total=   6.3s
[CV] C=1.0118171605283166 ............................................
[CV] ................ C=1.0118171605283166, score=0.953, total=  17.4s
[CV] C=1.0118171605283166 ............................................
[CV] ................ C=1.0118171605283166, score=0.952, total=  17.3s
[CV] C=1.0118171605283166 ............................................
[CV] .

[CV] ................ C=1.1513953993264474, score=0.952, total=  13.4s
[CV] C=1.1513953993264474 ............................................
[CV] ................ C=1.1513953993264474, score=0.953, total=  13.7s
[CV] C=1.1513953993264474 ............................................
[CV] ................ C=1.1513953993264474, score=0.953, total=  14.4s
[CV] C=1.1513953993264474 ............................................
[CV] ................ C=1.1513953993264474, score=0.952, total=  12.5s
[CV] C=1.1513953993264474 ............................................
[CV] ................ C=1.1513953993264474, score=0.952, total=   8.7s
[CV] C=1.165001623591853 .............................................
[CV] ................. C=1.165001623591853, score=0.952, total=  13.9s
[CV] C=1.165001623591853 .............................................
[CV] ................. C=1.165001623591853, score=0.953, total=  10.6s
[CV] C=1.165001623591853 .............................................
[CV] .

[CV] ................ C=1.3102281887548675, score=0.952, total=  20.7s
[CV] C=1.3102281887548675 ............................................
[CV] ................ C=1.3102281887548675, score=0.952, total=  17.9s
[CV] C=1.3257113655901092 ............................................
[CV] ................ C=1.3257113655901092, score=0.952, total=  26.8s
[CV] C=1.3257113655901092 ............................................
[CV] ................ C=1.3257113655901092, score=0.953, total=  16.9s
[CV] C=1.3257113655901092 ............................................
[CV] ................ C=1.3257113655901092, score=0.952, total=  18.7s
[CV] C=1.3257113655901092 ............................................
[CV] ................ C=1.3257113655901092, score=0.952, total=  23.6s
[CV] C=1.3257113655901092 ............................................
[CV] ................ C=1.3257113655901092, score=0.952, total=  19.1s
[CV] C=1.341377509611501 .............................................
[CV] .

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................ C=1.3894954943731377, score=0.952, total=  23.3s
[CV] C=1.3894954943731377 ............................................
[CV] ................ C=1.3894954943731377, score=0.952, total=  16.0s
[CV] C=1.4059153856835176 ............................................
[CV] ................ C=1.4059153856835176, score=0.952, total=  18.4s
[CV] C=1.4059153856835176 ............................................
[CV] ................ C=1.4059153856835176, score=0.953, total=  21.7s
[CV] C=1.4059153856835176 ............................................
[CV] ................ C=1.4059153856835176, score=0.952, total=  16.3s
[CV] C=1.4059153856835176 ............................................
[CV] ................ C=1.4059153856835176, score=0.952, total=  21.5s
[CV] C=1.4059153856835176 ............................................
[CV] ................ C=1.4059153856835176, score=0.952, total=  16.9s
[CV] C=1.4225293134853696 ............................................
[CV] .

[CV] ................. C=1.599858719606058, score=0.952, total=  21.5s
[CV] C=1.599858719606058 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=1.599858719606058, score=0.952, total=  21.8s
[CV] C=1.599858719606058 .............................................
[CV] ................. C=1.599858719606058, score=0.952, total=  20.2s
[CV] C=1.599858719606058 .............................................
[CV] ................. C=1.599858719606058, score=0.952, total=  21.3s
[CV] C=1.6187645069182697 ............................................
[CV] ................ C=1.6187645069182697, score=0.952, total=  17.4s
[CV] C=1.6187645069182697 ............................................
[CV] ................ C=1.6187645069182697, score=0.952, total=  13.8s
[CV] C=1.6187645069182697 ............................................
[CV] ................ C=1.6187645069182697, score=0.952, total=  16.1s
[CV] C=1.6187645069182697 ............................................
[CV] ................ C=1.6187645069182697, score=0.952, total=  15.1s
[CV] C=1.6187645069182697 ............................................
[CV] .

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 68.6min finished


In [48]:
predictions_grid = grid.predict(X_test_scaled)
predictions_grid
print(classification_report(y_test, predictions_grid))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71091
           1       0.08      0.86      0.14       111

    accuracy                           0.98     71202
   macro avg       0.54      0.92      0.57     71202
weighted avg       1.00      0.98      0.99     71202



In [49]:
#trying the linear solver to see what happens
classifier_lin = LogisticRegression(max_iter=500,solver='liblinear')#default=100 
classifier_lin
#model_lin = classifier_lin.fit(X_train_scaled, y_train_new)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
model_lin = classifier_lin.fit(X_train_scaled, y_train_new)

In [51]:
predictions_lin = model_lin.predict(X_test_scaled)
predictions_lin
print(classification_report(y_test, predictions_lin))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71091
           1       0.08      0.86      0.14       111

    accuracy                           0.98     71202
   macro avg       0.54      0.92      0.57     71202
weighted avg       1.00      0.98      0.99     71202



In [52]:
# Save the model
import joblib
filename = 'logistic_model.sav'
joblib.dump(best_model, filename)

['logistic_model.sav']

In [53]:
logisticmodel = joblib.load(filename)
prediction1=logisticmodel.predict(X_test_scaled)
print(classification_report(y_test, prediction1))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71091
           1       0.08      0.86      0.14       111

    accuracy                           0.98     71202
   macro avg       0.54      0.92      0.57     71202
weighted avg       1.00      0.98      0.99     71202

