Importing the Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
#loading the dataset into a dataframe
credit_card = pd.read_csv('creditcard.csv')

In [4]:
#printing the first 5 rows of the dataset
credit_card.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
27814,34711,1.443955,-1.052462,-0.141721,-1.564017,-0.966274,-0.333886,-0.77706,0.023616,-2.168826,...,-0.47994,-1.16522,0.155226,-0.409701,0.179672,-0.446258,0.004329,6e-05,32.39,0.0
27815,34711,-0.263364,0.931818,1.193111,-0.507924,0.862019,0.249381,0.815449,-0.090801,-0.520583,...,-0.310112,-0.648621,-0.253746,-1.030111,-0.046091,0.107328,0.101296,-0.1246,1.49,0.0
27816,34712,0.976345,-1.024867,0.978714,0.639442,-1.413711,0.311635,-0.909035,0.232423,-0.493025,...,-0.279978,-0.599463,-0.081403,-0.059427,0.202311,-0.405753,0.055109,0.052975,151.0,0.0
27817,34712,1.464604,-0.437919,-0.018869,-1.057177,-0.154243,0.251215,-0.584866,-0.025483,-0.841369,...,-0.148384,-0.454571,-0.019732,-1.313689,0.37965,-0.368497,0.031746,0.007238,8.0,0.0
27818,34,,,,,,,,,,...,,,,,,,,,,


In [6]:
#information about the dataset
credit_card.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27819 entries, 0 to 27818
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    27819 non-null  int64  
 1   V1      27818 non-null  float64
 2   V2      27818 non-null  float64
 3   V3      27818 non-null  float64
 4   V4      27818 non-null  float64
 5   V5      27818 non-null  float64
 6   V6      27818 non-null  float64
 7   V7      27818 non-null  float64
 8   V8      27818 non-null  float64
 9   V9      27818 non-null  float64
 10  V10     27818 non-null  float64
 11  V11     27818 non-null  float64
 12  V12     27818 non-null  float64
 13  V13     27818 non-null  float64
 14  V14     27818 non-null  float64
 15  V15     27818 non-null  float64
 16  V16     27818 non-null  float64
 17  V17     27818 non-null  float64
 18  V18     27818 non-null  float64
 19  V19     27818 non-null  float64
 20  V20     27818 non-null  float64
 21  V21     27818 non-null  float64
 22

In [8]:
#checking the number of missing values in each column
credit_card.isnull().sum()

Time      0
V1        1
V2        1
V3        1
V4        1
V5        1
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [9]:
#distribution of legit transaction & fraudulent transactions
credit_card['Class'].value_counts()

0.0    27725
1.0       93
Name: Class, dtype: int64

The dataset is highly unbalanaced

0 ---> Normal Transaction
1 ---> Fraudulent Transaction

In [10]:
#separating the data for analysis
legit = credit_card[credit_card.Class ==0]
fraud = credit_card[credit_card.Class ==1]

In [13]:
print(legit.shape)
print(fraud.shape)

(27725, 31)
(93, 31)


In [14]:
#statistical measures of the data
legit.Amount.describe()


count    27725.000000
mean        77.232517
std        219.509762
min          0.000000
25%          6.490000
50%         19.950000
75%         69.320000
max       7879.420000
Name: Amount, dtype: float64

In [15]:
fraud.Amount.describe()

count      93.000000
mean       96.609677
std       259.128010
min         0.000000
25%         1.000000
50%         1.100000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [16]:
#compare the values for both transactions
credit_card.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,20440.754518,-0.190595,0.129284,0.7652,0.201632,-0.180681,0.092904,-0.099318,0.014884,0.405538,...,0.041673,-0.037007,-0.125822,-0.040202,0.01151,0.129192,0.022983,0.01065,0.003732,77.232517
1.0,18829.451613,-8.165086,6.134379,-11.690379,6.070066,-5.753486,-2.388962,-7.986805,4.121062,-2.922478,...,0.697894,0.549621,-0.372248,-0.340859,-0.260055,0.359772,0.179521,0.832824,0.1005,96.609677


Under - Sampling

Build a sample dataset containing similar distribution of Normal transactions and Fraudulent Transactions

No. of Fraudulent Transactions --> 93

In [17]:
legit_sample = legit.sample(n=93)

Concatinating two Dataframes

In [18]:
new_dataset = pd.concat([legit_sample, fraud], axis =0)

In [19]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
9978,14918,-0.976236,-0.270615,1.641633,1.657924,-0.531906,2.313757,1.279624,0.135497,2.092085,...,-0.472807,-0.978904,0.548617,-1.438807,0.20585,-0.359294,0.126832,-0.035187,358.97,0.0
3658,3126,1.017152,-0.572347,1.058118,0.318101,-1.169185,-0.315466,-0.420007,-0.054179,0.974584,...,-0.155981,-0.347512,-0.025914,0.51716,0.148134,0.926493,-0.043377,0.035135,109.63,0.0
9253,13326,-1.030074,0.96257,1.598449,-0.214944,0.780002,1.533915,0.171687,0.576482,0.837136,...,-0.175747,-0.217049,0.140339,-1.112447,-0.655745,0.120023,0.074389,0.143415,10.66,0.0
12527,21921,-1.708005,-0.657282,2.110471,-1.509254,-0.241803,-0.275357,0.194589,0.019443,2.300442,...,0.085685,0.553508,-0.304119,-0.016753,0.662199,-0.647657,0.007663,0.11934,185.99,0.0
18660,29668,0.795359,-0.85435,-0.41356,0.514078,0.009142,0.517579,0.196598,0.033688,0.492883,...,-0.154939,-0.91816,-0.308064,-1.29565,0.348068,0.408197,-0.07271,0.03988,267.5,0.0


In [20]:
new_dataset['Class'].value_counts()

0.0    93
1.0    93
Name: Class, dtype: int64

In [21]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
26802,34256,0.539276,1.55489,-2.06618,3.241617,0.184736,0.02833,-1.515521,0.537035,-1.999846,...,0.371773,0.111955,-0.305225,-1.053835,0.771175,0.240878,0.418435,0.23217,19.02,1.0
27362,34521,1.081234,0.416414,0.862919,2.520863,-0.005021,0.563341,-0.123372,0.223122,-0.673598,...,-0.159387,-0.305154,0.05362,0.011761,0.375146,-0.106299,0.021008,0.010559,1.52,1.0
27627,34634,0.333499,1.699873,-2.596561,3.643945,-0.585068,-0.654659,-2.275789,0.675229,-2.042416,...,0.469212,-0.144363,-0.317981,-0.769644,0.807855,0.228164,0.551002,0.305473,18.96,1.0
27738,34684,-2.439237,2.591458,-2.840126,1.286244,-1.777016,-1.436139,-2.206056,-2.282725,-0.292885,...,1.77446,-0.77139,0.065727,0.103916,-0.057578,0.242652,-0.268649,-0.743713,125.3,1.0
27749,34687,-0.860827,3.13179,-5.052968,5.420941,-2.494141,-1.811287,-5.479117,1.189472,-3.908206,...,1.192694,0.090356,-0.341881,-0.215924,1.053032,0.271139,1.3733,0.691195,19.02,1.0


In [22]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,20545.849462,-0.299854,0.027559,1.019748,0.515979,-0.147708,0.175587,0.067642,-0.044381,0.521336,...,0.126649,-0.005326,-0.158295,0.029446,0.001781,0.073045,-0.013514,-0.030163,-0.007459,113.347742
1.0,18829.451613,-8.165086,6.134379,-11.690379,6.070066,-5.753486,-2.388962,-7.986805,4.121062,-2.922478,...,0.697894,0.549621,-0.372248,-0.340859,-0.260055,0.359772,0.179521,0.832824,0.1005,96.609677


Splitting the data into Features and Targets

In [23]:
X = new_dataset.drop(columns = 'Class', axis =1)
Y = new_dataset['Class']

In [24]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
9978   14918 -0.976236 -0.270615  1.641633  1.657924 -0.531906  2.313757   
3658    3126  1.017152 -0.572347  1.058118  0.318101 -1.169185 -0.315466   
9253   13326 -1.030074  0.962570  1.598449 -0.214944  0.780002  1.533915   
12527  21921 -1.708005 -0.657282  2.110471 -1.509254 -0.241803 -0.275357   
18660  29668  0.795359 -0.854350 -0.413560  0.514078  0.009142  0.517579   
...      ...       ...       ...       ...       ...       ...       ...   
26802  34256  0.539276  1.554890 -2.066180  3.241617  0.184736  0.028330   
27362  34521  1.081234  0.416414  0.862919  2.520863 -0.005021  0.563341   
27627  34634  0.333499  1.699873 -2.596561  3.643945 -0.585068 -0.654659   
27738  34684 -2.439237  2.591458 -2.840126  1.286244 -1.777016 -1.436139   
27749  34687 -0.860827  3.131790 -5.052968  5.420941 -2.494141 -1.811287   

             V7        V8        V9  ...       V20       V21       V22  \
9978   1.2796

In [25]:
print(Y)

9978     0.0
3658     0.0
9253     0.0
12527    0.0
18660    0.0
        ... 
26802    1.0
27362    1.0
27627    1.0
27738    1.0
27749    1.0
Name: Class, Length: 186, dtype: float64


Splitting the data into Training data & Testing data

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state =2)

In [27]:
print(X.shape, X_train.shape, X_test.shape)

(186, 30) (148, 30) (38, 30)


Model Training

Logistic Regression

In [28]:
model = LogisticRegression()

In [29]:
#training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [31]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction , Y_train)

In [33]:
print('Accuracy on Training Data :' , training_data_accuracy)

Accuracy on Training Data : 0.9864864864864865


In [34]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [35]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9210526315789473
