In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Load The Data

In [2]:
card = pd.read_csv('/kaggle/input/credit-card-dataset/creditcard.csv')

In [3]:
card.shape

(284807, 31)

In [4]:
card.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### Fraud transaction is denoted by Class '1' and not a fraud transaction is denoted by Class '0'

In [5]:
card['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

### We can see that our target variable has imbalanced data

In [6]:
# Target variable
y = card['Class']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [7]:
X = card.drop(['Class'], axis = 1)
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


### Split the data into training and testing

In [8]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)

## Classification Models: 
### Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

#### 1. Random Forest Classifier

In [10]:
# Model 
model = RandomForestClassifier()

In [11]:
# Fit the model
model.fit(X_train,y_train)

# Predict the model
y_pred = model.predict(X_test)

In [12]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[85298     9]
 [   25   111]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.93      0.82      0.87       136

    accuracy                           1.00     85443
   macro avg       0.96      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443

0.999602073897218


### Above result given by Random Forest Classifier is an amazing one, but we have to consider the fact that we have imbalanced data.

#### Now, to overcome these, we have a few methods, and we will try some, let's check whether we get even more improved results or a depreciated one.
#### Most common - Undersampling and Oversampling

## Undersampling :
##### This is a method in which the higher valued data is decreased and brought to near-abouts with the other data. Generally, this can be used for 
##### smaller datasets, as for bigger ones, it leads to loss in data, which affects the performance of the model poorly.

In [13]:
# To keep a count of class count
from collections import Counter

In [14]:
from imblearn.under_sampling import NearMiss

In [15]:
under_sample = NearMiss(sampling_strategy = 0.7)

In [16]:
X_train_us,y_train_us = under_sample.fit_resample(X_train,y_train)

In [17]:
print('Before fit:',Counter(y_train))
print('After fit:',Counter(y_train_us))

Before fit: Counter({0: 199008, 1: 356})
After fit: Counter({0: 508, 1: 356})


##### We can see that the number of class '0's have been decreased, and the lower valued data is about 70 percent off the higher valued data.

In [18]:
model = RandomForestClassifier()

In [19]:
model.fit(X_train_us,y_train_us)
y_pred = model.predict(X_test)

In [20]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[61695 23612]
 [    7   129]]
              precision    recall  f1-score   support

           0       1.00      0.72      0.84     85307
           1       0.01      0.95      0.01       136

    accuracy                           0.72     85443
   macro avg       0.50      0.84      0.43     85443
weighted avg       1.00      0.72      0.84     85443

0.7235700993644886


##### As we can see, loss in data due to under sampling has lowered the performance of the model.

#### Let's check with over sampling :

## Over Sampling :
##### This is a method in which the lower valued data is increased to within some distance of the higher valued data.

In [21]:
from imblearn.over_sampling import RandomOverSampler

In [22]:
os = RandomOverSampler(sampling_strategy = 0.8, random_state = 42)
model = RandomForestClassifier()

In [23]:
X_train_os,y_train_os = os.fit_resample(X_train,y_train)

In [24]:
print('Before fit:',Counter(y_train))
print('After fit:',Counter(y_train_os))

Before fit: Counter({0: 199008, 1: 356})
After fit: Counter({0: 199008, 1: 159206})


##### We have brought the lower valued data about 80 percent closer to the higher valued data, now let's see if our model improves.

In [25]:
model.fit(X_train_os,y_train_os)
y_pred = model.predict(X_test)

In [26]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[85302     5]
 [   26   110]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.96      0.81      0.88       136

    accuracy                           1.00     85443
   macro avg       0.98      0.90      0.94     85443
weighted avg       1.00      1.00      1.00     85443

0.9996371850239341


#### After applying oversampling, we made the count of Class '1' close to about 80 percent to the count of Class '0', and our model even improved slightly!!

### This was an amazing task which made me learn about imbalanced data, and gave good understanding about undersampling and oversampling.