## 1: Import libraries and dataset
### 1.1: Importing libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import sklearn
sklearn.__version__

'0.23.1'

### 1.2: Load dataset

In [8]:
data = pd.read_csv('creditcard.csv')
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [9]:
data.shape

(284807, 31)

### Check for any nan values

In [29]:
data.isnull().any()

Time      False
V1        False
V2        False
V3        False
V4        False
V5        False
V6        False
V7        False
V8        False
V9        False
V10       False
V11       False
V12       False
V13       False
V14       False
V15       False
V16       False
V17       False
V18       False
V19       False
V20       False
V21       False
V22       False
V23       False
V24       False
V25       False
V26       False
V27       False
V28       False
Amount    False
Class     False
dtype: bool

In [10]:
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

### Note: We can clearly see that the data is imbalanced, fraudulent entries are very less compared to non fraudulent entries
### We can try different approaches here: 
#### a. Use regular classifier (LogisticRegression and RandomForest) as these perform well for imbalanced data
#### b. Perform Under sampling and then train the model
#### c. Perform Over sampling and then train the model

## 2: Create train and test set
### 2.1: Creating the dependent feature, y that represents if the entry is fraudulent and independent features, X 
### To know more about the data check the readme.md file

In [11]:
X = data.iloc[:,:-1]
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00


In [12]:
y = data.iloc[:,-1] # y represents the outcome, where 1 indicates fraud detected and 0 indicates no fraud detected
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

### 2.2: Splitting the X and y data in train and test set

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

## 3: Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, GridSearchCV

### hyperparameters provided for grid search

In [11]:
lr = 10.0**np.arange(-4,4) #learning rates
lr

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [15]:
algo = LogisticRegression() 
grid_val = {'C': lr, 'penalty':['l1','l2']}
cv = KFold(n_splits=10,shuffle=False) # perform 10 different splits

### Training and Results

In [18]:
classifier1 = GridSearchCV(algo, grid_val, cv=cv, scoring='f1_macro', n_jobs=-1)
classifier1.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

In [20]:
classifier1.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [19]:
y_pred=classifier1.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85249    38]
 [   62    94]]
0.9988296291094648
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85287
           1       0.71      0.60      0.65       156

    accuracy                           1.00     85443
   macro avg       0.86      0.80      0.83     85443
weighted avg       1.00      1.00      1.00     85443



### Here, we focus on the precision value that is 'True Positive'. As we aim to increase the accuracy by which model detects fruadulent entries, we need to decrease the cases where a fraudulent entry is classified as non-fraudulent
### In this case, Precision=0.71 (achieved by using best parameters on Logistic Regression)

## 4: Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
y_train.value_counts()

0    199028
1       336
Name: Class, dtype: int64

### In order to reduce this imbalance, we can give more weight to entries with class 1

In [26]:
class_weight_rfc = dict({0:1,1:1000})

### Hyperparameters provided for grid search

In [27]:
algo_rfc = RandomForestClassifier(class_weight = class_weight_rfc, n_estimators=50, n_jobs=-1, max_features ='sqrt')
grid_val_rfc = { 
    'n_estimators': [100, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}
classifier2 = GridSearchCV(algo_rfc, grid_val_rfc, cv=5, )

### Training and results

In [28]:
classifier2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight={0: 1, 1: 1000},
                                              criterion='gini', max_depth=None,
                                              max_features='sqrt',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=-1,
                                              oob_score=False,
                                           

In [29]:
classifier2.best_params_

{'max_features': 'auto', 'n_estimators': 200}

In [31]:
y_pred_rfc=classifier2.predict(X_test)
print(confusion_matrix(y_test,y_pred_rfc))
print(accuracy_score(y_test,y_pred_rfc))
print(classification_report(y_test,y_pred_rfc))

[[85285     2]
 [   38   118]]
0.9995318516437859
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85287
           1       0.98      0.76      0.86       156

    accuracy                           1.00     85443
   macro avg       0.99      0.88      0.93     85443
weighted avg       1.00      1.00      1.00     85443



### Result: Precision has increased to 0.98 which suggests that are model is working well for imbalanced data as well

## 5: Under Sampling

In [14]:
from collections import Counter
from imblearn.under_sampling import NearMiss

In [16]:
ns = NearMiss(0.75) # no of 0 class entries will be reduced in a way that no of entries of class 1 will be 75% of 0 class entries
X_train_new, y_train_new = ns.fit_sample(X_train, y_train)
print(Counter(y_train)) # count no of entries of class 1 and 0 before under sampling
Counter(y_train_new) # count no of entries of class 1 and 0 after under sampling



Counter({0: 199002, 1: 362})


Counter({0: 482, 1: 362})

### Training and results

In [17]:
from sklearn.ensemble import RandomForestClassifier
classifier_us_rfc=RandomForestClassifier()
classifier_us_rfc.fit(X_train_new,y_train_new)

RandomForestClassifier()

In [20]:
y_pred_us_rfc=classifier_us_rfc.predict(X_test)
print(confusion_matrix(y_test,y_pred_us_rfc))
print(accuracy_score(y_test,y_pred_us_rfc))
print(classification_report(y_test,y_pred_us_rfc))

[[63803 21510]
 [   12   118]]
0.748112776939012
              precision    recall  f1-score   support

           0       1.00      0.75      0.86     85313
           1       0.01      0.91      0.01       130

    accuracy                           0.75     85443
   macro avg       0.50      0.83      0.43     85443
weighted avg       1.00      0.75      0.85     85443



### Result: Under sampling didn't perform well. We can attribute this to loss of data that occured due to under sampling

## 6: Over Sampling

In [25]:
from imblearn.over_sampling import RandomOverSampler

In [30]:
os = RandomOverSampler(0.75) # no of 1 class entries will be increased in a way that no of entries of class 1 will be 75% of 0 class entries
X_train_new_os, y_train_new_os = os.fit_sample(X_train, y_train)
print(Counter(y_train)) # count no of entries of class 1 and 0 before over sampling
Counter(y_train_new_os) # count no of entries of class 1 and 0 after over sampling



Counter({0: 199002, 1: 362})


Counter({0: 199002, 1: 149251})

### Training and results

In [27]:
from sklearn.ensemble import RandomForestClassifier
classifier_os_rfc=RandomForestClassifier()
classifier_os_rfc.fit(X_train_new_os,y_train_new_os)

RandomForestClassifier()

In [28]:
y_pred_os_rfc=classifier_os_rfc.predict(X_test)
print(confusion_matrix(y_test,y_pred_os_rfc))
print(accuracy_score(y_test,y_pred_os_rfc))
print(classification_report(y_test,y_pred_os_rfc))

[[85305     8]
 [   30   100]]
0.9995552590615966
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85313
           1       0.93      0.77      0.84       130

    accuracy                           1.00     85443
   macro avg       0.96      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



### Result: Over sampling performed fairly well with Precision = 0.93. We can attribute this to the fact that fraudulent entries increased by over sampling which has helped the model train better. 
### -------
### Note: Although we have not performed hyper parameter tuning here like in section 4: Random Forest Classifier. So it would be plausible to suggest that both over sampling and tree based classifiers perform well for problems like credit card fraud detection where data is likely to be imbalanced. 