In [43]:
import pandas  as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report 
  

data = pd.read_csv('creditcard.csv') 
  

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [44]:
data['normAmount'] = StandardScaler().fit_transform(np.array(data['Amount']).reshape(-1, 1)) 
  
data = data.drop(['Time', 'Amount'], axis = 1) 
  
data['Class'].value_counts() 

Class
0    284315
1       492
Name: count, dtype: int64

In [45]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 

print("Number transactions X_train dataset: ", X_train.shape) 
print("Number transactions y_train dataset: ", y_train.shape) 
print("Number transactions X_test dataset: ", X_test.shape) 
print("Number transactions y_test dataset: ", y_test.shape) 

Number transactions X_train dataset:  (700, 2)
Number transactions y_train dataset:  (700,)
Number transactions X_test dataset:  (300, 2)
Number transactions y_test dataset:  (300,)


Using SMOTE 

In [46]:
from imblearn.over_sampling import SMOTE
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

sm = SMOTE(random_state=2) 
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel()) 

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0))) 


Before OverSampling, counts of label '1': 137
Before OverSampling, counts of label '0': 563 

After OverSampling, the shape of train_X: (1126, 2)
After OverSampling, the shape of train_y: (1126,) 

After OverSampling, counts of label '1': 563
After OverSampling, counts of label '0': 563


In [36]:
lr1 = LogisticRegression() 
lr1.fit(X_train_res, y_train_res.ravel()) 
predictions = lr1.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, predictions)) 

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       237
           1       1.00      0.97      0.98        63

    accuracy                           0.99       300
   macro avg       1.00      0.98      0.99       300
weighted avg       0.99      0.99      0.99       300



Oversampling Using NearMiss 

In [47]:
print("Before Undersampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before Undersampling, counts of label '0': {} \n".format(sum(y_train == 0))) 
  
from imblearn.under_sampling import NearMiss 
nr = NearMiss() 
  
X_train_miss, y_train_miss = nr.fit_resample(X_train, y_train.ravel()) 
  
print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape)) 
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_miss.shape)) 
  
print("After Undersampling, counts of label '1': {}".format(sum(y_train_miss == 1))) 
print("After Undersampling, counts of label '0': {}".format(sum(y_train_miss == 0))) 

Before Undersampling, counts of label '1': 137
Before Undersampling, counts of label '0': 563 

After Undersampling, the shape of train_X: (274, 2)
After Undersampling, the shape of train_y: (274,) 

After Undersampling, counts of label '1': 137
After Undersampling, counts of label '0': 137


In [48]:
lr2 = LogisticRegression() 
lr2.fit(X_train_miss, y_train_miss.ravel()) 
predictions = lr2.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, predictions)) 

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       237
           1       1.00      0.95      0.98        63

    accuracy                           0.99       300
   macro avg       0.99      0.98      0.98       300
weighted avg       0.99      0.99      0.99       300



Using Random Oversampling

In [40]:
print("Before Oversampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before Oversampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

# Apply random oversampling
from imblearn.over_sampling import RandomOverSampler 
ros = RandomOverSampler(random_state=42) 
  
X_train_res, y_train_res = ros.fit_resample(X_train, y_train.ravel()) 
  
print('After Oversampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After Oversampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After Oversampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After Oversampling, counts of label '0': {}".format(sum(y_train_res == 0)))


Before Oversampling, counts of label '1': 137
Before Oversampling, counts of label '0': 563 

After Oversampling, the shape of train_X: (1126, 2)
After Oversampling, the shape of train_y: (1126,) 

After Oversampling, counts of label '1': 563
After Oversampling, counts of label '0': 563


In [41]:
# train the model on train set 
lr2 = LogisticRegression() 
lr2.fit(X_train_miss, y_train_miss.ravel()) 
predictions = lr2.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, predictions)) 

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       237
           1       1.00      0.95      0.98        63

    accuracy                           0.99       300
   macro avg       0.99      0.98      0.98       300
weighted avg       0.99      0.99      0.99       300

