#### Credit Card Fraud Detection using Machine Learning 
##### Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

##### 492 are fraud transactions. Label 1 -> fraud 
##### Therefore our data is highly unbalanced.

In [6]:
df.isnull().values.any()

False

##### Therefore there are no null values
##### Now we need to do some analysis on the data

In [7]:
#Separating the fraud and legit data rows 
true = df[df.Class == 0]
fraud = df[df.Class == 1]

print(true.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [8]:
true['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

##### Therefore from above you can see that an avg amount should be around 88$

In [9]:
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

##### Balancing data-set

##### Under Sampling - Building sample dataset containing similar distribution of normal and fraud transaction.

In [10]:
true_sample = true.sample(492)

In [11]:
true_sample.shape

(492, 31)

##### Now we will concatinate both of the dataframes

In [12]:
data = pd.concat([true_sample,fraud],axis=0)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
106116,69832.0,1.214401,-0.708763,-0.484615,-0.906243,1.299708,3.882048,-1.19789,1.047207,1.072464,...,-0.270845,-0.732854,0.020844,1.078721,0.285264,0.954954,-0.036583,0.012265,35.69,0
46056,42618.0,1.179103,-0.037859,-0.249428,1.108486,0.406273,0.613879,0.0299,0.118415,0.528509,...,-0.178811,-0.416464,-0.261539,-1.331829,0.801414,-0.227344,0.023533,0.007291,51.65,0
121064,76058.0,1.22673,-0.933952,-0.52439,-1.066887,-0.395067,-0.079237,-0.17224,-0.16262,-1.055194,...,-0.406574,-0.837707,-0.260893,-0.729488,0.458359,1.511506,-0.12122,-0.00695,129.0,0
163631,116103.0,1.87227,-0.14872,-0.404831,1.097344,-0.268672,-0.196052,-0.32447,0.028802,0.374475,...,-0.089824,-0.318973,0.299926,-0.408866,-0.406811,-0.979737,0.039487,-0.029931,43.8,0
175800,122496.0,0.096905,0.876,0.011669,-0.751738,0.769715,-0.72231,1.017551,-0.180639,-0.04306,...,-0.303263,-0.673402,-0.022139,-0.65318,-0.387232,0.18015,0.248691,0.090932,2.28,0


##### Remember axis = 0 means rows axis=1 means columns

In [15]:
data['Class'].value_counts()

1    492
0    492
Name: Class, dtype: int64

##### Importing Sklearn libraries

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [19]:
#Train test split 
X = data.drop(columns='Class',axis=1)
Y = data['Class']

In [20]:
X.shape

(984, 30)

In [21]:
Y.shape

(984,)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

##### Model Training

In [24]:
model = LogisticRegression()

In [25]:
model.fit(X_train,y_train)

LogisticRegression()

##### Model Evaluation 

In [31]:
predict_train = model.predict(X_train)
train_acc = accuracy_score(predict_train,y_train)
print('Accuracy Score: ', train_acc*100)

Accuracy Score:  90.97839898348158


In [32]:
predict_test = model.predict(X_test)
test_acc = accuracy_score(predict_test,y_test)
print('Acuuracy Score: ',test_acc*100)

Acuuracy Score:  91.37055837563452
