Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [6]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [7]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
11954,20631,1.504204,-0.411728,0.20009,-0.778753,-0.442232,-0.119677,-0.78266,-0.165178,0.691819,...,-0.136231,-0.217274,-0.14326,-1.057332,0.529188,-0.235062,-0.012089,0.000905,9.0,0.0
11955,20636,1.134994,0.09634,0.277921,0.319692,0.7428,1.611803,-0.458649,0.390012,1.424541,...,-0.395605,-0.743542,0.222256,-1.859104,-0.109777,0.279049,0.012398,-0.00909,0.99,0.0
11956,20638,-6.305012,3.944886,-4.707362,1.539602,-3.934785,-1.730565,-2.104936,3.843447,0.863458,...,0.07314,-0.039935,-0.108896,0.691434,-0.261979,-0.44754,0.2129,-0.031021,89.99,0.0
11957,20638,1.16196,-0.398297,1.123732,-0.474237,-1.226667,-0.519325,-0.804179,0.070134,3.262926,...,-0.121191,0.097255,0.050903,0.330479,0.315692,-0.712765,0.073836,0.028055,11.85,0.0
11958,20642,1.291096,-0.226628,0.708386,-0.719236,-0.659099,-0.273757,-0.612042,-0.111488,3.032258,...,,,,,,,,,,


In [8]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11959 entries, 0 to 11958
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    11959 non-null  int64  
 1   V1      11959 non-null  float64
 2   V2      11959 non-null  float64
 3   V3      11959 non-null  float64
 4   V4      11959 non-null  float64
 5   V5      11959 non-null  float64
 6   V6      11959 non-null  float64
 7   V7      11959 non-null  float64
 8   V8      11959 non-null  float64
 9   V9      11959 non-null  float64
 10  V10     11959 non-null  float64
 11  V11     11959 non-null  float64
 12  V12     11959 non-null  float64
 13  V13     11959 non-null  float64
 14  V14     11959 non-null  float64
 15  V15     11959 non-null  float64
 16  V16     11959 non-null  float64
 17  V17     11959 non-null  float64
 18  V18     11959 non-null  float64
 19  V19     11959 non-null  float64
 20  V20     11958 non-null  float64
 21  V21     11958 non-null  float64
 22

In [9]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [10]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    11906
1.0       52
Name: Class, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [11]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [12]:
print(legit.shape)
print(fraud.shape)

(11906, 31)
(52, 31)


In [13]:
# statistical measures of the data
legit.Amount.describe()

count    11906.000000
mean        62.198127
std        177.379105
min          0.000000
25%          5.292500
50%         15.950000
75%         50.000000
max       7712.430000
Name: Amount, dtype: float64

In [14]:
fraud.Amount.describe()

count      52.000000
mean       97.724808
std       321.188775
min         0.000000
25%         1.000000
50%         1.000000
75%         1.772500
max      1809.680000
Name: Amount, dtype: float64

In [15]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,7993.389048,-0.200839,0.259753,0.929229,0.255987,-0.076685,0.149326,-0.097364,-0.053087,0.927305,...,0.020836,-0.063076,-0.154427,-0.032369,0.018962,0.105861,0.061825,0.00533,1.2e-05,62.198127
1.0,11569.615385,-3.769104,4.258027,-8.202329,6.396396,-2.342431,-1.990562,-5.740207,0.95065,-2.729769,...,0.411236,0.305259,-0.021762,-0.340563,-0.406311,-0.139505,0.306967,0.692927,0.059751,97.724808


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

In [16]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [17]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
10486,17194,1.136472,-1.338895,0.013766,-0.667942,0.64393,4.418659,-1.890689,1.072206,1.499239,...,-0.547356,-0.891355,-0.056419,0.976312,0.291024,0.548713,0.011836,0.03501,106.9,0.0
2541,2099,-1.560814,1.109475,1.283456,-1.617793,0.500639,-0.693838,1.38137,-1.05094,1.556905,...,-0.271451,0.001855,-0.020731,0.106561,-0.271464,0.622213,-1.004605,-0.479473,0.85,0.0
7585,10481,-1.959669,0.85965,1.966415,-1.581699,-0.91321,1.677717,-1.82997,-1.965648,0.629066,...,-1.732163,-0.147844,-0.276269,-0.983788,0.315976,0.348215,-0.346492,0.003971,58.0,0.0
221,145,-2.420413,1.947885,0.553646,0.983069,-0.281518,2.408958,-1.401613,-0.188299,0.675878,...,1.213826,-1.23862,0.006927,-1.724222,0.239603,-0.313703,-0.188281,0.119831,6.0,0.0
723,547,1.066851,-0.464679,1.984353,1.521834,-1.287325,1.113014,-1.314876,0.528953,0.249078,...,-0.080174,0.115724,0.149055,0.419153,-0.36468,3.087444,-0.137987,-0.016174,1.48,0.0


In [19]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
10897,18690,-15.398845,7.472324,-19.026912,11.165526,-6.893856,-2.120937,-14.91333,-0.721214,-7.175097,...,-2.444884,0.727495,-0.345078,-0.981749,0.995271,0.816762,2.262942,-1.178063,1.0,1.0
11343,19762,-14.179165,7.42137,-21.405836,11.927512,-7.974281,-2.20271,-15.471612,-0.356595,-6.380125,...,-2.366836,1.130955,0.991153,-1.033132,-0.327179,0.634693,2.171905,-1.395288,1.0,1.0
11710,20011,-14.724627,7.875157,-21.872317,11.90617,-8.348734,-2.262846,-15.833443,0.077874,-6.356833,...,-2.362345,1.099557,1.037199,-1.036359,-0.254777,0.642343,2.161129,-1.401282,1.0,1.0
11841,20332,-15.271362,8.326581,-22.338591,11.885313,-8.721334,-2.324307,-16.196419,0.512882,-6.333685,...,-2.356896,1.068019,1.085617,-1.039797,-0.182006,0.649921,2.149247,-1.406811,1.0,1.0
11880,20451,-15.819179,8.775997,-22.804686,11.864868,-9.092361,-2.386893,-16.560368,0.948349,-6.310658,...,-2.350634,1.036362,1.136051,-1.043414,-0.108923,0.657437,2.136424,-1.411945,1.0,1.0


In [20]:
new_dataset['Class'].value_counts()

0.0    492
1.0     52
Name: Class, dtype: int64

In [21]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,7939.534553,-0.179933,0.213584,0.932331,0.252214,-0.068274,0.232671,-0.143118,-0.039756,0.95588,...,-0.002885,-0.029567,-0.127031,-0.009275,0.0309,0.105148,0.035961,-0.010301,-0.006786,59.966463
1.0,11569.615385,-3.769104,4.258027,-8.202329,6.396396,-2.342431,-1.990562,-5.740207,0.95065,-2.729769,...,0.411236,0.305259,-0.021762,-0.340563,-0.406311,-0.139505,0.306967,0.692927,0.059751,97.724808


Splitting the data into Features & Targets

In [22]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [23]:
print(X)

        Time         V1        V2         V3         V4        V5        V6  \
10486  17194   1.136472 -1.338895   0.013766  -0.667942  0.643930  4.418659   
2541    2099  -1.560814  1.109475   1.283456  -1.617793  0.500639 -0.693838   
7585   10481  -1.959669  0.859650   1.966415  -1.581699 -0.913210  1.677717   
221      145  -2.420413  1.947885   0.553646   0.983069 -0.281518  2.408958   
723      547   1.066851 -0.464679   1.984353   1.521834 -1.287325  1.113014   
...      ...        ...       ...        ...        ...       ...       ...   
10897  18690 -15.398845  7.472324 -19.026912  11.165526 -6.893856 -2.120937   
11343  19762 -14.179165  7.421370 -21.405836  11.927512 -7.974281 -2.202710   
11710  20011 -14.724627  7.875157 -21.872317  11.906170 -8.348734 -2.262846   
11841  20332 -15.271362  8.326581 -22.338591  11.885313 -8.721334 -2.324307   
11880  20451 -15.819179  8.775997 -22.804686  11.864868 -9.092361 -2.386893   

              V7        V8        V9  ...       V20

In [24]:
print(Y)

10486    0.0
2541     0.0
7585     0.0
221      0.0
723      0.0
        ... 
10897    1.0
11343    1.0
11710    1.0
11841    1.0
11880    1.0
Name: Class, Length: 544, dtype: float64


Split the data into Training data & Testing Data

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [26]:
print(X.shape, X_train.shape, X_test.shape)

(544, 30) (435, 30) (109, 30)


Model Training

Logistic Regression

In [27]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [37]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [36]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.993103448275862


In [35]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [33]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  1.0
