**Importing Dependencies**



In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# read the credit card transactions CSV into a pandas DataFrame
credit_card_data = pd.read_csv(filepath_or_buffer='/content/sample_data/creditcard.csv')

In [7]:
# preview the first 5 rows of the dataset
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [8]:
# preview the last 5 rows of the dataset
credit_card_data.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
158597,111499.0,1.753695,0.018874,-0.429072,4.161717,0.21527,0.811187,-0.328388,0.124104,1.297017,...,-0.059761,0.04654,0.032887,0.375723,0.0504,0.075297,-0.062722,-0.037895,89.03,0.0
158598,111499.0,-0.223112,0.969299,0.774386,-0.162188,0.134878,-1.627333,0.917096,-0.39672,1.209086,...,-0.337662,-0.560819,0.282673,0.869122,-0.610198,0.034188,0.09854,0.091763,1.79,0.0
158599,111499.0,-0.324219,1.078371,-0.661833,-1.060278,1.553663,-1.123551,1.788409,-0.653065,0.89969,...,0.053192,0.870073,-0.223665,-0.390476,-0.264169,0.032943,0.325164,0.093798,31.0,0.0
158600,111500.0,-0.909674,1.204345,-0.038838,0.528709,2.760196,1.27091,0.929798,0.373622,0.090421,...,-0.066235,0.123375,-0.53833,-0.920971,0.985594,-0.251958,0.046382,0.055573,1.0,0.0
158601,111502.0,1.939272,-0.528638,-0.457417,0.28969,-0.413718,0.061268,-0.754919,,,...,,,,,,,,,,


In [9]:
# dataset information: index range, per-column non-null counts and dtypes
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158602 entries, 0 to 158601
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    158602 non-null  float64
 1   V1      158602 non-null  float64
 2   V2      158602 non-null  float64
 3   V3      158602 non-null  float64
 4   V4      158602 non-null  float64
 5   V5      158602 non-null  float64
 6   V6      158602 non-null  float64
 7   V7      158602 non-null  float64
 8   V8      158601 non-null  float64
 9   V9      158601 non-null  float64
 10  V10     158601 non-null  float64
 11  V11     158601 non-null  float64
 12  V12     158601 non-null  float64
 13  V13     158601 non-null  float64
 14  V14     158601 non-null  float64
 15  V15     158601 non-null  float64
 16  V16     158601 non-null  float64
 17  V17     158601 non-null  float64
 18  V18     158601 non-null  float64
 19  V19     158601 non-null  float64
 20  V20     158601 non-null  float64
 21  V21     15

**Data Cleaning**

In [10]:
# count the missing values in each column
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

Because only one row has missing values (in columns V8 through Class), we **delete** that row.

Otherwise (if many rows had missing values), we would use an **imputation** technique to replace the missing data: mean imputation, median imputation, mode imputation, or predictive imputation.

In [15]:
# drop every row that contains at least one missing value
credit_card_data = credit_card_data.dropna(axis=0, how='any')

In [16]:
# confirm the incomplete last row is gone by looking at the final 5 rows
credit_card_data.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
158596,111498.0,1.969378,-0.242307,-0.068347,0.720586,-0.647182,-0.470082,-0.648397,-0.167131,2.626792,...,0.059016,0.724484,0.127695,-0.026085,-0.107126,-0.251481,0.023643,-0.041423,9.99,0.0
158597,111499.0,1.753695,0.018874,-0.429072,4.161717,0.21527,0.811187,-0.328388,0.124104,1.297017,...,-0.059761,0.04654,0.032887,0.375723,0.0504,0.075297,-0.062722,-0.037895,89.03,0.0
158598,111499.0,-0.223112,0.969299,0.774386,-0.162188,0.134878,-1.627333,0.917096,-0.39672,1.209086,...,-0.337662,-0.560819,0.282673,0.869122,-0.610198,0.034188,0.09854,0.091763,1.79,0.0
158599,111499.0,-0.324219,1.078371,-0.661833,-1.060278,1.553663,-1.123551,1.788409,-0.653065,0.89969,...,0.053192,0.870073,-0.223665,-0.390476,-0.264169,0.032943,0.325164,0.093798,31.0,0.0
158600,111500.0,-0.909674,1.204345,-0.038838,0.528709,2.760196,1.27091,0.929798,0.373622,0.090421,...,-0.066235,0.123375,-0.53833,-0.920971,0.985594,-0.251958,0.046382,0.055573,1.0,0.0


**Data Exploration**

In [22]:
# number of legit vs. fraudulent transactions
credit_card_data.Class.value_counts()

0.0    158245
1.0       356
Name: Class, dtype: int64

Based on the above, the two classes are highly imbalanced: only 356 of the 158,601 transactions are fraudulent.


0 --> Normal Transactions

1 --> Fraudulent Transactions

In [23]:
# separating the data for analysis: one DataFrame per class label
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [24]:
# (rows, columns) of the legit frame, then of the fraud frame, one per line
print(legit.shape, fraud.shape, sep='\n')

(158245, 31)
(356, 31)


In [59]:
# summary statistics of the transaction amount for legit transactions
legit['Amount'].describe()

count    158245.000000
mean         87.693471
std         245.861341
min           0.000000
25%           5.650000
50%          21.970000
75%          77.190000
max       19656.530000
Name: Amount, dtype: float64

From the summary statistics above for the `Amount` of normal (legitimate) transactions, we can infer several insights:

1. **Transaction Count**: There were 158245 transactions classified as normal transactions.
2. **Average Transaction Amount**: The average amount involved in these normal transactions was approximately dollar 87.69.
3. **Standard Deviation**: There is a large variation in normal transaction amounts, as indicated by the standard deviation of around dollar 245.86.
4. **Minimum Amount**: The smallest normal transaction amount is dollar 0.00, which suggests that there are entries where no amount was transacted or where the amount is a placeholder value.
5. **Percentiles**:
   - **25% of transactions** involved dollar 5.65 or less, indicating a large number of low-amount normal transactions.
   - **The median transaction amount** is dollar 21.97, meaning that half of the normal transactions are below this amount.
   - **75% of transactions** are dollar 77.19 or less, which shows that high-value normal transactions are less common.
6. **Maximum Amount**: The largest normal transaction amount is dollar 19656.53, suggesting that while most normal transactions are for smaller amounts, there can occasionally be very large transactions.

In [58]:
# summary statistics of the transaction amount for fraudulent transactions
fraud['Amount'].describe()

count     356.000000
mean      112.005000
std       228.517058
min         0.000000
25%         1.000000
50%         9.905000
75%       102.125000
max      1809.680000
Name: Amount, dtype: float64

From the data presented, which appears to be a summary statistics output for a variable named `Amount` related to fraud, we can infer several insights:

1. **Transaction Count**: There were 356 transactions classified as fraud.
2. **Average Transaction Amount**: The average amount involved in these fraudulent transactions was approximately dollar 112.01.
3. **Standard Deviation**: There is a large variation in transaction amounts, as indicated by the standard deviation of around dollar 228.52.
4. **Minimum Amount**: The smallest transaction amount recorded as fraud is dollar 0.00, which suggests that there are entries where no amount was transacted or it's a placeholder for a different type of fraud.
5. **Percentiles**:
   - **25% of transactions** involved dollar 1.00 or less, indicating a large number of low-amount frauds.
   - **The median transaction amount** is dollar 9.91, meaning that half of the fraudulent transactions are below this amount.
   - **75% of transactions** are dollar 102.13 or less, which shows that high-value frauds are less common.
6. **Maximum Amount**: The largest fraudulent transaction amount is dollar 1809.68, suggesting that while most fraudulent transactions are for smaller amounts, there can occasionally be quite large transactions.

In [27]:
# per-class mean of every feature, to compare legit and fraudulent transactions
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,56794.272988,-0.204776,0.041254,0.602823,0.129288,-0.2083,0.071758,-0.083543,0.041708,0.025439,...,0.03502,-0.039806,-0.106025,-0.026515,0.008905,0.107902,0.016377,0.001036,0.002282,87.693471
1.0,55968.873596,-6.057284,4.419181,-8.006879,4.902106,-4.402531,-1.44596,-7.057336,0.778068,-2.884994,...,0.413879,0.767732,-0.017997,-0.04505,-0.073773,0.080805,0.036841,0.181541,0.044143,112.005


**Potential Predictors:** The features with the largest differences in mean values between classes could be strong candidates for predicting fraud in a machine learning model.

Under-Sampling

Build a sample dataset containing a similar distribution of normal transactions and fraudulent transactions.

Number of Fraudulent Transactions --> 356

Since the number of fraudulent transactions is 356, we balance the dataset by sampling 356 legit transactions as well.

In [32]:
# Under-sample the legit class down to the number of fraud rows so the classes are balanced.
# Taking n from `fraud` keeps the two counts in sync if the data changes, and a fixed
# random_state makes the drawn sample (and every result derived from it) reproducible
# under Restart & Run All.
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating two Dataframes

In [33]:
# stack the sampled legit rows on top of all fraud rows (row-wise concatenation)
new_dataset = pd.concat(
    objs=[legit_sample, fraud],
    axis=0,
)

In [34]:
# preview the first 5 rows of the balanced dataset
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
27320,34498.0,-0.276963,0.989269,1.438328,1.199569,0.463889,-0.108903,0.907105,-0.251096,-0.197925,...,-0.134107,0.136893,-0.272678,-0.087069,0.04563,-0.252358,0.250046,-0.017981,13.08,0.0
67458,52543.0,-5.718193,-4.629232,0.018121,2.533407,-1.97994,1.619826,1.580104,0.222791,-0.20354,...,-0.102007,0.398741,-1.39635,-0.050602,-0.248192,-0.123538,1.143683,-0.768914,970.0,0.0
83772,60020.0,1.041054,-0.358149,0.19759,0.440109,-0.364994,-0.008968,-0.184395,0.04626,0.108729,...,0.203969,0.340168,-0.304666,-0.446257,0.496268,0.510873,-0.046954,0.017122,118.5,0.0
91849,63653.0,0.961936,-1.865977,0.525449,-1.262634,-1.866709,-0.174023,-1.008187,0.084908,-1.918779,...,-0.147901,-0.634248,0.031368,0.167577,-0.109944,-0.43223,0.007571,0.056203,240.65,0.0
69867,53618.0,-3.411858,-4.752182,2.519218,-0.192814,-1.320169,0.422772,0.420778,0.207099,0.222096,...,0.772237,0.788833,1.787582,0.366431,0.775974,-0.157396,-0.376669,-0.057185,750.0,0.0


In [35]:
# check that both classes now have the same number of rows
new_dataset.Class.value_counts()

0.0    356
1.0    356
Name: Class, dtype: int64

In [36]:
# per-class feature means of the balanced dataset
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,55696.910112,-0.174285,-0.061044,0.56958,0.13558,-0.240152,-0.005501,-0.002063,0.025394,-0.022065,...,0.056448,-0.045636,-0.096208,-0.004379,-0.011889,0.124821,0.023646,0.007962,0.021632,111.388764
1.0,55968.873596,-6.057284,4.419181,-8.006879,4.902106,-4.402531,-1.44596,-7.057336,0.778068,-2.884994,...,0.413879,0.767732,-0.017997,-0.04505,-0.073773,0.080805,0.036841,0.181541,0.044143,112.005


Splitting the data into Features & Targets

In [38]:
# features: every column except the label; target: the 'Class' label column
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.Class

In [39]:
# show the target labels (0.0 = normal, 1.0 = fraudulent)
print(Y)

27320     0.0
67458     0.0
83772     0.0
91849     0.0
69867     0.0
         ... 
156990    1.0
157585    1.0
157868    1.0
157871    1.0
157918    1.0
Name: Class, Length: 712, dtype: float64


**Split the data into Training Data & Testing Data**

In [47]:
# hold out 20% of the rows for testing; stratifying on Y keeps the class ratio
# the same in both splits, and random_state fixes the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [48]:
# shapes of the full, training and testing feature matrices, in that order
print(*(frame.shape for frame in (X, X_train, X_test)))

(712, 30) (569, 30) (143, 30)


**Model Training**

Logistic Regression

In [49]:
# logistic regression classifier with scikit-learn's default settings
# NOTE(review): the features are not scaled (Time and Amount are on a much larger scale
# than V1..V28 in the head() output), so the default solver may need more iterations to
# converge — TODO confirm that fit() raises no ConvergenceWarning
model = LogisticRegression()

In [51]:
# train the Logistic Regression model on the training split
model.fit(X=X_train, y=Y_train)

**Model Evaluation**

Accuracy Score

In [53]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
# Accuracy is symmetric so the value is unchanged, but keeping the documented order
# avoids silent errors if this line is adapted to an asymmetric metric
# (precision, recall, confusion matrix).
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [54]:
# report the fraction of training rows the model classified correctly
print('Accuracy on Training Data: ', training_data_accuracy)

Accuracy on Training Data:  0.9121265377855887


Comparing X_train_prediction to Y_train, the model achieved about 91% accuracy on the training data.

In [56]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
# Accuracy is symmetric so the value is unchanged, but keeping the documented order
# avoids silent errors if this line is adapted to an asymmetric metric
# (precision, recall, confusion matrix).
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [57]:
# report the fraction of held-out test rows the model classified correctly
print('Accuracy on Testing Data: ', test_data_accuracy)

Accuracy on Testing Data:  0.9370629370629371


Comparing X_test_prediction to Y_test, the model achieved about 94% accuracy on the test data.