#### Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from pathlib import Path

In [2]:
# Get the current working directory
current_directory = Path.cwd()

# Define the path to the CSV file
csv_file_path = current_directory / "datasets" / "creditcard.csv"

# loading the dataset
credit_card_data = pd.read_csv(csv_file_path)

In [3]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
245596,152819.0,-1.554734,0.192476,0.627004,-2.857962,0.408065,1.052658,-0.044063,0.946908,0.517252,...,0.325118,0.951835,-0.310487,-1.563841,-0.11587,0.650874,0.132185,-0.002876,69.0,0.0
245597,152820.0,1.80143,0.097644,-0.420684,3.787639,0.456912,1.391565,-0.372614,0.328466,-0.391756,...,0.111775,0.518605,0.019829,0.237727,0.131745,0.113939,-0.00784,-0.047842,36.89,0.0
245598,152821.0,2.211186,-1.629133,-1.085443,-1.75838,-1.070121,0.069483,-1.331975,0.062199,-1.212618,...,-0.132565,-0.050513,0.18638,0.153017,-0.234748,-0.200091,-0.000999,-0.049697,55.5,0.0
245599,152822.0,1.862801,-0.620087,-0.499155,0.205394,0.050954,1.405691,-0.960836,0.491321,1.247667,...,0.238479,0.983535,0.279639,-0.289275,-0.482972,0.635752,0.028929,-0.051554,11.5,0.0
245600,152822.0,-1.231687,0.235928,-3.124213,-0.433176,2.505621,0.050799,0.088498,0.949696,-0.601462,...,0.646796,,,,,,,,,


In [5]:
# dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245601 entries, 0 to 245600
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    245601 non-null  float64
 1   V1      245601 non-null  float64
 2   V2      245601 non-null  float64
 3   V3      245601 non-null  float64
 4   V4      245601 non-null  float64
 5   V5      245601 non-null  float64
 6   V6      245601 non-null  float64
 7   V7      245601 non-null  float64
 8   V8      245601 non-null  float64
 9   V9      245601 non-null  float64
 10  V10     245601 non-null  float64
 11  V11     245601 non-null  float64
 12  V12     245601 non-null  float64
 13  V13     245601 non-null  float64
 14  V14     245601 non-null  float64
 15  V15     245601 non-null  float64
 16  V16     245601 non-null  float64
 17  V17     245601 non-null  float64
 18  V18     245601 non-null  float64
 19  V19     245601 non-null  float64
 20  V20     245601 non-null  float64
 21  V21     24

In [6]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [7]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    245151
1.0       449
Name: Class, dtype: int64

#### The Dataset is highly unbalanced

1 --> Normal Transaction

0 --> Fraudulent transaction

In [8]:
# separation the data for analytics
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [9]:
legit.shape, fraud.shape

((245151, 31), (449, 31))

In [10]:
# statistical measures of the data
legit.Amount.describe()

count    245151.000000
mean         90.978622
std         252.598904
min           0.000000
25%           6.000000
50%          23.350000
75%          79.950000
max       19656.530000
Name: Amount, dtype: float64

In [11]:
fraud.Amount.describe()

count     449.000000
mean      121.118597
std       252.911251
min         0.000000
25%         1.000000
50%         9.290000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [12]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,84119.911964,-0.034588,-0.026619,0.127673,0.021529,-0.035365,0.023012,-0.005869,0.000794,0.005918,...,0.004708,-0.004264,-0.013414,-0.007057,0.000888,0.024308,0.001714,-0.001203,0.000858,90.978622
1.0,73172.202673,-5.113548,3.841349,-7.275478,4.632747,-3.497006,-1.365035,-5.940823,0.615337,-2.631902,...,0.35658,0.75575,0.022223,-0.048957,-0.087154,0.046406,0.040162,0.157467,0.073882,121.118597


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and fraudulent transactions

Number of Fraudulent Transactions --> 492

In [13]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [14]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [15]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
219206,141637.0,0.105191,0.172785,0.697912,-0.732619,0.186242,0.573118,0.119899,0.302033,0.606063,...,0.288672,0.871476,-0.074592,0.198502,-0.246711,-0.306755,0.059032,0.008897,43.0,0.0
154425,101499.0,1.378774,-0.901984,-0.466751,2.003703,-0.534268,-0.085748,0.083531,-0.197995,2.336721,...,-0.492295,-1.4833,0.187313,-0.19748,-0.431731,-1.169124,-0.018984,0.01353,294.94,0.0
58241,48298.0,1.234482,0.306881,0.426585,0.777227,-0.602569,-1.279505,0.037143,-0.178095,0.133904,...,-0.276115,-0.836317,0.180218,0.649095,0.145316,0.077042,-0.027814,0.032217,0.89,0.0
9318,13497.0,1.099169,-0.126592,0.917145,1.583308,-0.429145,0.797578,-0.717981,0.371159,2.213973,...,-0.227569,-0.165969,-0.0891,-0.386758,0.538424,-0.287397,0.029218,-0.001305,9.0,0.0
164174,116510.0,-0.758521,-1.076023,1.602959,-2.674388,-0.288483,-0.119375,0.074683,-0.196538,-2.42378,...,-0.089718,0.023442,-0.167609,-0.506054,0.62935,-0.065123,-0.165849,-0.167487,113.62,0.0


In [16]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
243848,152098.0,-4.124316,3.748597,-7.926507,7.763242,-0.769375,-2.031171,-3.474549,0.107121,-1.551352,...,0.547097,0.687854,0.429939,-0.620601,-0.369688,0.367349,-2.664767,0.417101,1.0,1.0
244004,152165.0,-4.673231,4.195976,-8.392423,7.743215,-1.138803,-2.094899,-3.839487,0.543053,-1.528448,...,0.554185,0.656076,0.482417,-0.624399,-0.296289,0.374802,-2.678544,0.412368,1.0,1.0
244333,152307.0,-5.222968,4.641827,-8.858204,7.723502,-1.507035,-2.159484,-4.205164,0.979334,-1.505637,...,0.561892,0.624207,0.536429,-0.628334,-0.222651,0.382208,-2.693036,0.407935,1.0,1.0
245347,152710.0,0.051075,1.310427,0.733222,2.620282,1.402358,0.528489,1.086014,-0.146423,-1.724333,...,0.229936,0.766927,-0.189624,0.766853,-0.141401,-0.077278,-0.297595,-0.221816,2.47,1.0
245556,152802.0,1.322724,-0.843911,-2.096888,0.759759,-0.196377,-1.166353,0.482534,-0.349791,1.045007,...,0.133815,-0.121562,-0.208574,-0.254752,-0.098324,-0.613874,0.002654,0.072386,357.95,1.0


In [17]:
new_dataset['Class'].value_counts()

0.0    492
1.0    449
Name: Class, dtype: int64

In [18]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,82515.680894,-0.149059,-0.175388,0.159276,0.047606,-0.060326,0.021041,0.004676,-0.030852,0.087695,...,0.051805,-0.00574,-0.117748,-0.092542,0.019947,0.039382,-0.011295,0.017556,-0.014237,117.005305
1.0,73172.202673,-5.113548,3.841349,-7.275478,4.632747,-3.497006,-1.365035,-5.940823,0.615337,-2.631902,...,0.35658,0.75575,0.022223,-0.048957,-0.087154,0.046406,0.040162,0.157467,0.073882,121.118597


#### Splitting the data into Features & Targets

In [19]:
X = new_dataset.drop(columns='Class', axis=1)
y = new_dataset['Class']

In [20]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
219206,141637.0,0.105191,0.172785,0.697912,-0.732619,0.186242,0.573118,0.119899,0.302033,0.606063,...,-0.072680,0.288672,0.871476,-0.074592,0.198502,-0.246711,-0.306755,0.059032,0.008897,43.00
154425,101499.0,1.378774,-0.901984,-0.466751,2.003703,-0.534268,-0.085748,0.083531,-0.197995,2.336721,...,0.283665,-0.492295,-1.483300,0.187313,-0.197480,-0.431731,-1.169124,-0.018984,0.013530,294.94
58241,48298.0,1.234482,0.306881,0.426585,0.777227,-0.602569,-1.279505,0.037143,-0.178095,0.133904,...,-0.145453,-0.276115,-0.836317,0.180218,0.649095,0.145316,0.077042,-0.027814,0.032217,0.89
9318,13497.0,1.099169,-0.126592,0.917145,1.583308,-0.429145,0.797578,-0.717981,0.371159,2.213973,...,-0.344675,-0.227569,-0.165969,-0.089100,-0.386758,0.538424,-0.287397,0.029218,-0.001305,9.00
164174,116510.0,-0.758521,-1.076023,1.602959,-2.674388,-0.288483,-0.119375,0.074683,-0.196538,-2.423780,...,0.177311,-0.089718,0.023442,-0.167609,-0.506054,0.629350,-0.065123,-0.165849,-0.167487,113.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243848,152098.0,-4.124316,3.748597,-7.926507,7.763242,-0.769375,-2.031171,-3.474549,0.107121,-1.551352,...,-0.181979,0.547097,0.687854,0.429939,-0.620601,-0.369688,0.367349,-2.664767,0.417101,1.00
244004,152165.0,-4.673231,4.195976,-8.392423,7.743215,-1.138803,-2.094899,-3.839487,0.543053,-1.528448,...,-0.180279,0.554185,0.656076,0.482417,-0.624399,-0.296289,0.374802,-2.678544,0.412368,1.00
244333,152307.0,-5.222968,4.641827,-8.858204,7.723502,-1.507035,-2.159484,-4.205164,0.979334,-1.505637,...,-0.176673,0.561892,0.624207,0.536429,-0.628334,-0.222651,0.382208,-2.693036,0.407935,1.00
245347,152710.0,0.051075,1.310427,0.733222,2.620282,1.402358,0.528489,1.086014,-0.146423,-1.724333,...,-0.125877,0.229936,0.766927,-0.189624,0.766853,-0.141401,-0.077278,-0.297595,-0.221816,2.47


In [21]:
y

219206    0.0
154425    0.0
58241     0.0
9318      0.0
164174    0.0
         ... 
243848    1.0
244004    1.0
244333    1.0
245347    1.0
245556    1.0
Name: Class, Length: 941, dtype: float64

#### Splitting the data into Training & Test Dataset

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=2
)

In [23]:
X.shape, X_train.shape, X_test.shape

((941, 30), (752, 30), (189, 30))

#### Model Training

Logistic Regression

In [24]:
model = LogisticRegression()

In [25]:
# training the Logistic Regression Model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Model Evaluation

Accuracy Score

In [26]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

print("Accuracy on Training Data = ", training_data_accuracy)

Accuracy on Training Data =  0.9481382978723404


In [27]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)

print("Accuracy on Test Data = ", test_data_accuracy)

Accuracy on Test Data =  0.9365079365079365


#### Building a Predictive System

In [28]:
input_data = (74, 1.038370334, 0.127486127,	0.184455888,	1.109949791,
              0.4416989,	0.945282527,	-0.036714604,	0.350995,	0.11894954,
              -0.24328924,	0.578062601,	0.67472982,	-0.534230568,	0.446601378,
              1.122884672,	-1.768000514,	1.241156963,	-2.449499863,	-1.747255172,
              -0.335519851,	0.102519797,	0.60508853,	0.023092156,	-0.626462661,
              0.479120272,	-0.166936836,	0.081246718,	0.001191576,	1.18)

# change the input data to a numpy array
input_data_as_numpy_array = np.array(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)
prediction



array([0.])

In [30]:
if prediction[0] == 0:
  print("The Person is NOT likely to commit fraud")
elif prediction[0] == 1:
  print("The Person is likely to commit FRAUD")

The Person is NOT likely to commit fraud
