# Importing Necessary Libraries:

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Location of the credit-card transactions CSV.
# The default is the original author's local path; set the CREDITCARD_CSV
# environment variable to point at another copy instead of editing this cell.
DATA_PATH = os.environ.get(
    "CREDITCARD_CSV",
    r"D:\Data Is Good\Projects\Credit Card Fruad ML Project\creditcard.csv",
)

# Load the raw transactions and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Studying the Data Set:

In [3]:
# Print the frame's index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# Per-column count of missing values and the percentage of rows they represent.

# Count the nulls once and reuse the result for both derived columns.
null_counts = data.isna().sum().values

pd.DataFrame({
    'Col': data.columns,
    'Null Count': null_counts,
    # Scale by 100 so the value is a real percentage, as the column label says
    # (the bare ratio would be a fraction between 0 and 1).
    '%tage Null': null_counts / data.shape[0] * 100
})

# we don't have any null values

Unnamed: 0,Col,Null Count,%tage Null
0,Time,0,0.0
1,V1,0,0.0
2,V2,0,0.0
3,V3,0,0.0
4,V4,0,0.0
5,V5,0,0.0
6,V6,0,0.0
7,V7,0,0.0
8,V8,0,0.0
9,V9,0,0.0


In [5]:
# Discard fully duplicated rows; `data` is rebound to the de-duplicated frame.

data = data.drop_duplicates()

In [6]:
# (rows, columns) of the frame after the duplicate rows were dropped.
data.shape

(283726, 31)

In [7]:
# The Time column is treated as not relevant for this analysis, so remove it.
# errors='ignore' keeps the cell re-runnable: executing it a second time, when
# 'Time' is already gone, is a no-op instead of raising KeyError.
data = data.drop(columns=['Time'], errors='ignore')

In [8]:
# How many rows fall into each value of the target column `Class`?

class_counts = data['Class'].value_counts()
class_counts
# The target is highly imbalanced between the two classes.

0    283253
1       473
Name: Class, dtype: int64

In [9]:
# Partition the transactions by target label:
# Class 0 rows go to `legal`, Class 1 rows go to `fraud`.

target = data['Class']
legal = data.loc[target.eq(0)]
fraud = data.loc[target.eq(1)]


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the legal subset.

legal.describe()

# The column of interest here is Amount; compare it with the fraud subset's Amount.

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,...,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0,283253.0
mean,0.013439,-0.009829,0.012853,-0.01044,0.006769,0.001251,0.010447,-0.002448,0.002613,0.007663,...,-0.00115,-0.00016,0.00036,0.000393,-0.000301,6.5e-05,0.001409,0.000418,88.413575,0.0
std,1.922179,1.63352,1.457593,1.398575,1.355816,1.329914,1.17748,1.15714,1.086902,1.036321,...,0.715629,0.723541,0.621165,0.605748,0.520612,0.48208,0.3927,0.327563,250.379023,0.0
min,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-31.764946,-73.216718,-6.29073,-14.741096,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0
25%,-0.913431,-0.601398,-0.883966,-0.851605,-0.687888,-0.767543,-0.550146,-0.208841,-0.641649,-0.533155,...,-0.228406,-0.542737,-0.16149,-0.354306,-0.317476,-0.326853,-0.07065,-0.052808,5.67,0.0
50%,0.022562,0.062561,0.182247,-0.0245,-0.052807,-0.274172,0.041664,0.021633,-0.051368,-0.09212,...,-0.029798,0.006675,-0.011077,0.041115,0.01619,-0.052293,0.001368,0.011238,22.0,0.0
75%,1.316788,0.797012,1.028261,0.734231,0.612442,0.397678,0.571029,0.324473,0.596969,0.454792,...,0.18547,0.528136,0.147633,0.440051,0.35049,0.240023,0.09076,0.077961,77.46,0.0
max,2.45493,18.902453,9.382558,16.875344,34.801666,73.301626,120.589494,18.709255,15.594995,23.745136,...,22.614889,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,0.0


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the fraud subset.
fraud.describe()

# The column of interest here is Amount; compare it with the legal subset's Amount.

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,473.0,473.0,473.0,473.0,473.0,473.0,473.0,473.0,473.0,473.0,...,473.0,473.0,473.0,473.0,473.0,473.0,473.0,473.0,473.0,473.0
mean,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,-5.453274,...,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186,1.0
std,6.593145,4.1225,6.909647,2.871523,5.278831,1.715347,6.858024,5.58595,2.465047,4.706451,...,2.731191,1.181295,1.50857,0.5179,0.806785,0.463016,1.245779,0.5331,260.211041,0.0
min,-30.55238,-8.402154,-31.103685,-1.313275,-22.105532,-6.406267,-43.557242,-41.044261,-13.434066,-24.588262,...,-22.797604,-8.887017,-19.254328,-2.028024,-4.781606,-1.152671,-7.263482,-1.86929,0.0,1.0
25%,-5.60369,1.145381,-7.926507,2.288644,-4.278983,-2.450444,-6.989195,-0.161518,-3.79676,-7.297803,...,0.027935,-0.521934,-0.341881,-0.436539,-0.320311,-0.263078,-0.015551,-0.097223,1.0,1.0
50%,-2.271755,2.617105,-4.875397,4.100098,-1.372245,-1.420468,-2.902079,0.617738,-2.099049,-4.466284,...,0.573898,0.055179,-0.075034,-0.061263,0.077913,0.012792,0.394682,0.145895,9.82,1.0
75%,-0.361428,4.571743,-2.171454,6.290918,0.260821,-0.413647,-0.907188,1.709417,-0.788388,-2.447469,...,1.192694,0.616383,0.287659,0.28203,0.463827,0.395528,0.821048,0.372393,105.89,1.0
max,2.132386,22.057729,2.25021,12.114672,11.095089,6.474115,5.802537,20.007208,3.353525,4.031435,...,27.202839,8.361985,5.46623,1.091435,2.208209,2.745261,3.052358,1.779364,2125.87,1.0


# Data Balancing:
* The number of non-fraudulent transactions is very high compared to the number of fraudulent transactions.
* Feeding this data directly to a model will bias it towards the legal transactions.