# Credit Card Fraud Detection
***

In [51]:
# Importing all the dependencies

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### EDA and Data cleaning
***

The feature in the dataset have been converted by using PCA method. <br>
Also, the time column is the time elapsed in seconds relative to the first transaction. <br>
The amount is in US dollars <br>
Class represents if it's a fraudulent transaction or not. 0 represents authentic transaction and 1 represents fraudulent transaction.

In [52]:
# Loading the dataset
data = pd.read_csv('creditcard.csv')
print(len(data))
data.head(5)

284807


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [53]:
data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

As we can see from the above code cell, that the data is extremely unbalanced. There are only less than 1% of fraudulent transactions in the dataset. Training on this model will definitely cause overfitting.

In [54]:
# Separating the data for analysis
proper = data[data.Class == 0]
fraud = data[data.Class == 1]
print(len(proper), len(fraud))

284315 492


In [55]:
# Some more stats
proper.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [56]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

From the above stats, we can see that the mean is higher in fraudulent transactions. Which indicates that during most of the fraudulent transactions the amount being transferred is generally higher.<br>

So now, we are going to take 492 random values from the proper dataset and then concatenate it with the fraud dataset to have an even distribution of values to prevent overfitting.

In [57]:
samples = proper.sample(n=492)
new_data = pd.concat([samples, fraud], axis=0)
new_data['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [58]:
new_data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
144760,86379.0,1.145815,0.236698,0.375729,0.580525,-0.240182,-0.394883,-0.089779,0.073411,-0.229796,...,-0.192972,-0.576893,0.168165,0.181425,0.080672,0.093858,-0.0118,0.019976,6.99,0
253815,156432.0,-0.484251,1.377796,-0.011398,-0.571941,0.350166,-0.535951,-0.142496,-2.817439,-0.429618,...,-1.499451,-0.081123,0.22853,-0.013201,-0.558274,0.122371,0.223719,0.136846,1.98,0
42908,41292.0,0.991915,0.064761,0.434681,1.188117,-0.38676,-0.810127,0.368929,-0.283625,-0.182983,...,-0.139442,-0.620206,-0.02229,0.392778,0.371363,-0.648833,0.0104,0.054323,126.86,0
52737,45617.0,-0.632354,-0.593086,2.320049,1.498437,-1.226283,1.196163,0.40364,0.230964,0.156554,...,0.513296,1.311512,0.547495,0.226944,-0.326006,-0.164871,-0.11745,-0.171468,280.06,0
221488,142584.0,-5.035501,3.582426,-2.05479,-0.681565,-3.072506,-0.717198,-2.601171,3.676238,0.111333,...,0.389487,0.725079,0.036392,0.124007,0.392483,0.654422,0.084539,0.074134,12.71,0


## Model Training
***
Creating a train-test split

In [59]:
#Splitting the dataset into Targets and Features

X = new_data.drop(['Class'], axis=1)
Y = new_data['Class']
print(X.columns)
Y.value_counts()

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


Class
0    492
1    492
Name: count, dtype: int64

In [60]:
# Splitting the data into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [61]:
model = LogisticRegression()
model.fit(X_train, Y_train)


### Model Evaluation

In [62]:
train_pred = model.predict(X_train)
train_accuracy = accuracy_score(train_pred, Y_train)
print('Training data accuracy: ',train_accuracy)

test_pred = model.predict(X_test)
test_accuracy = accuracy_score(test_pred, Y_test)
print('Testing data accuracy: ', test_accuracy)

Training data accuracy:  0.9415501905972046
Testing data accuracy:  0.9137055837563451


Since there is not a huge difference between the accuracy of training and test data, we have solved the problem of overfitting the model. <br>
It is also clear that the model detects the fraudulent transactions fairly well.