# TASK: CREDIT CARD FRAUD DETECTION

DOMAIN : DATA SCIENCE

## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Data Collection and Processing

In [2]:
# Creating a DataFrame using CSV file
data=pd.read_csv("creditcard.csv")

In [3]:
# print first five rows of the DataFrame
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# print the number of rows and columns
data.shape

(284807, 31)

In [5]:
# checking the null values
data.isnull().sum().sum()

0

In [6]:
# print columns nbames of the DataFrame
data.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [8]:
# Statistical analysis
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [9]:
# Checking for duplicate data
data.duplicated().any()

True

In [10]:
data.duplicated().sum()

1081

In [11]:
# Drop the dupliucate rows
data=data.drop_duplicates()
data.shape

(283726, 31)

In [12]:
data.duplicated().any()

False

In [13]:
# Categorise the legal and fraud Transactions 
data["Class"].value_counts()

Class
0    283253
1       473
Name: count, dtype: int64

 The data is highly biased

class=0 represents legal Transactions and 
class=1 represents fraud Transactions

In [14]:
print("percentage of legitimate data:",283253*100/283726)
print("percentage of fraud data:",473*100/283726)

percentage of legitimate data: 99.83328986416473
percentage of fraud data: 0.1667101358352777


In [15]:
legal=data[data["Class"]==0]
fraud=data[data["Class"]==1]

In [16]:
print(legal.shape)

(283253, 31)


In [17]:
print(fraud.shape)

(473, 31)


In [18]:
legal["Amount"].describe()

count    283253.000000
mean         88.413575
std         250.379023
min           0.000000
25%           5.670000
50%          22.000000
75%          77.460000
max       25691.160000
Name: Amount, dtype: float64

In [19]:
fraud["Amount"].describe()

count     473.000000
mean      123.871860
std       260.211041
min         0.000000
25%         1.000000
50%         9.820000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [20]:
# Compare the Values for both transactions 
data.groupby("Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94835.058093,0.013439,-0.009829,0.012853,-0.01044,0.006769,0.001251,0.010447,-0.002448,0.002613,...,-0.000489,-0.00115,-0.00016,0.00036,0.000393,-0.000301,6.5e-05,0.001409,0.000418,88.413575
1,80450.513742,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


## Under-Sampling

In [21]:
# Build a Sample Dataset contain similar Distribution  for Legal Transactions and Fraud Transaction 
legal_sample=legal.sample(n=473)

In [22]:
# concatenate the DataFrames
new_data = pd.concat([legal_sample, fraud], axis = 0)
new_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
13763,24408.0,-1.759468,1.501031,1.196554,1.809043,-1.130448,1.001991,-1.043187,1.291486,2.001229,...,-0.365608,-0.754018,0.123000,-0.377195,-0.416251,-0.519691,-0.862633,-0.281989,20.02,0
272421,165078.0,0.063408,0.680049,-0.173279,-0.950501,0.908650,-0.117556,0.797101,0.104650,-0.044495,...,-0.287133,-0.737531,-0.049157,-1.198784,-0.429766,0.197574,0.236014,0.073183,5.99,0
94809,65041.0,-4.642982,3.977754,-0.650665,-0.777976,-1.472330,-0.553084,-0.573768,1.593596,1.421184,...,-0.490588,-0.509181,0.248724,0.133599,0.144042,-0.047830,-0.405547,-0.881237,1.50,0
185750,126809.0,2.204838,-1.803297,-0.637110,-1.521885,-1.619358,-0.104144,-1.644262,0.062018,-0.515695,...,0.031637,0.493781,0.140159,0.446878,-0.215839,-0.005354,0.022981,-0.033690,58.00,0
56148,47289.0,-1.462415,-0.385962,1.179740,-1.517042,0.292525,-0.254485,-0.033292,-0.864329,-0.950800,...,0.808981,0.335552,0.300275,-0.236536,0.425234,-0.229173,0.172809,0.021649,33.74,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00,1
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00,1


In [23]:
new_data["Class"].value_counts()

Class
0    473
1    473
Name: count, dtype: int64

In [24]:
new_data.groupby(["Class"]).mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,95624.665962,-0.054161,-0.129371,-0.011815,-0.052183,-0.028931,0.029041,0.048814,0.009422,0.040777,...,0.101597,-0.010758,0.002299,-0.050523,-0.011882,0.009484,-0.011036,-0.004334,0.005583,122.658182
1,80450.513742,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


## Splitting the new data into Features and Target

In [25]:
X=new_data.drop("Class",axis=1)
Y=new_data["Class"]

In [26]:
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
13763,24408.0,-1.759468,1.501031,1.196554,1.809043,-1.130448,1.001991,-1.043187,1.291486,2.001229,...,-0.530213,-0.365608,-0.754018,0.123,-0.377195,-0.416251,-0.519691,-0.862633,-0.281989,20.02
272421,165078.0,0.063408,0.680049,-0.173279,-0.950501,0.90865,-0.117556,0.797101,0.10465,-0.044495,...,-0.071702,-0.287133,-0.737531,-0.049157,-1.198784,-0.429766,0.197574,0.236014,0.073183,5.99
94809,65041.0,-4.642982,3.977754,-0.650665,-0.777976,-1.47233,-0.553084,-0.573768,1.593596,1.421184,...,1.216584,-0.490588,-0.509181,0.248724,0.133599,0.144042,-0.04783,-0.405547,-0.881237,1.5
185750,126809.0,2.204838,-1.803297,-0.63711,-1.521885,-1.619358,-0.104144,-1.644262,0.062018,-0.515695,...,-0.400387,0.031637,0.493781,0.140159,0.446878,-0.215839,-0.005354,0.022981,-0.03369,58.0
56148,47289.0,-1.462415,-0.385962,1.17974,-1.517042,0.292525,-0.254485,-0.033292,-0.864329,-0.9508,...,-0.450555,0.808981,0.335552,0.300275,-0.236536,0.425234,-0.229173,0.172809,0.021649,33.74


In [27]:
Y.head()

13763     0
272421    0
94809     0
185750    0
56148     0
Name: Class, dtype: int64

## Splitting the data into Testing and Traning data

In [28]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=2,stratify=Y)

In [29]:
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(756, 30) (190, 30) (756,) (190,)


## Model Training

In [30]:
model=LogisticRegression(max_iter=150)

In [31]:
model.fit(X_train,Y_train)

## Model Evaluation

In [32]:
training_predict=model.predict(X_train)
training_predict

array([1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,

In [33]:
testing_predict=model.predict(X_test)
testing_predict

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [34]:
#Precision score of training data
precision_train=metrics.precision_score(training_predict,Y_train)
print("Precision score of training data:",precision_train)

#Precision score of testing data
precision_test=metrics.precision_score(testing_predict,Y_test)
print("Precision score of testing data:",precision_test)

Precision score of training data: 0.9074074074074074
Precision score of testing data: 0.9052631578947369


In [35]:
# Recall score of training data
recall_train=metrics.recall_score(training_predict,Y_train)
print("Recall score of training data:",recall_train)

#Recall score of testing data
recall_test=metrics.recall_score(testing_predict,Y_test)
print("Recall score of testing data:",recall_test)

Recall score of training data: 0.9772079772079773
Recall score of testing data: 0.9662921348314607


In [36]:
# f1-score on training data
f1score_train =metrics.f1_score(training_predict, Y_train)
print('F1-score Score of Training data:',f1score_train)

# f1-score on test data
f1score_test = metrics.f1_score(testing_predict, Y_test)
print('F1-score Score of Testing data:',f1score_test)

F1-score Score of Training data: 0.9410150891632374
F1-score Score of Testing data: 0.9347826086956522
