In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import joblib

### Loading and Initial Exploration of the Dataset
We begin by loading the credit card fraud dataset and performing an initial exploration of its structure.

In [2]:
data = pd.read_csv("creditcard.csv")

In [3]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Display dataset information to understand its structure and check for any potential data issues
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
# Check for missing values in the dataset
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
sc = StandardScaler()
data['Amount'] = sc.fit_transform(pd.DataFrame(data['Amount']))

In [7]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [8]:
data = data.drop(['Time'], axis =1)

In [9]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [10]:
data.duplicated().any()

True

In [11]:
data = data.drop_duplicates()

In [12]:
data.shape

(275663, 30)

In [13]:
# Checking the distribution of classes (legitimate vs fraudulent transactions)
data['Class'].value_counts()

Class
0    275190
1       473
Name: count, dtype: int64

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [15]:
X = data.drop('Class', axis = 1)
y = data['Class']

In [16]:
# Split the data into training and testing sets to ensure we have separate data for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#### Model Training with Logistic Regression & Decision Tree Classifier

In [17]:
classifier = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier()
}

for name, clf in classifier.items():
    print(f"\n=========={name}===========")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n Accuaracy: {accuracy_score(y_test, y_pred)}")
    print(f"\n Precision: {precision_score(y_test, y_pred)}")
    print(f"\n Recall: {recall_score(y_test, y_pred)}")
    print(f"\n F1 Score: {f1_score(y_test, y_pred)}")



 Accuaracy: 0.9992563437505668

 Precision: 0.890625

 Recall: 0.6263736263736264

 F1 Score: 0.7354838709677419


 Accuaracy: 0.9989479984764116

 Precision: 0.6774193548387096

 Recall: 0.6923076923076923

 F1 Score: 0.6847826086956522


### Handling Class Imbalance with Undersampling
In this section, we will undersample the majority class (legitimate transactions) to match the number of fraudulent transactions. This helps balance the dataset but can potentially lead to information loss.

In [18]:
normal = data[data['Class']==0]
fraud = data[data['Class']==1]

In [19]:
normal.shape

(275190, 30)

In [20]:
fraud.shape

(473, 30)

In [21]:
# Balancing the dataset by sampling legitimate transactions
normal_sample = normal.sample(n=473)

In [22]:
normal_sample.shape

(473, 30)

In [23]:
new_data = pd.concat([normal_sample,fraud], ignore_index=True)

In [24]:
new_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.328157,-0.356,1.106703,-1.547082,-0.474511,0.736896,0.131409,0.180283,-0.98937,0.028202,...,0.384445,0.796021,-0.051942,-1.335016,0.132072,-0.030952,0.042062,0.052351,0.210501,0
1,0.620091,1.108726,-1.508007,0.446946,0.940444,0.118048,0.110622,0.452583,-0.749159,-0.574432,...,0.022775,0.106847,0.086846,-1.406598,-0.657539,0.338089,-0.164037,-0.194032,-0.350191,0
2,1.13378,-0.013509,0.338259,0.727307,-0.111522,0.012412,0.04112,-0.062363,0.274371,-0.296708,...,-0.186935,-0.348384,-0.055188,-0.202263,0.459786,0.311717,-0.005437,0.016172,-0.157323,0
3,-2.277286,-7.610063,-4.236749,2.384822,-2.154222,0.136204,3.144186,-0.870913,-1.317378,0.190101,...,1.091493,-1.3937,-1.985233,-0.263785,-0.470549,-0.732943,-0.428577,0.326388,8.79919,0
4,0.177147,0.633098,1.786562,4.481495,0.856708,1.241945,-0.288915,0.140227,-1.586393,1.854879,...,0.265213,1.036814,-0.181712,-0.501455,-0.229221,0.507824,-0.117119,-0.268739,-0.353229,0


In [25]:
# Re-checking the distribution of classes (legitimate vs fraudulent transactions)
new_data['Class'].value_counts()

Class
0    473
1    473
Name: count, dtype: int64

In [26]:
X = new_data.drop('Class', axis = 1)
y= new_data['Class']

In [27]:
# Split the data into training and testing sets to ensure we have separate data for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [28]:
classifier = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier()
}

for name, clf in classifier.items():
    print(f"\n=========={name}===========")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n Accuaracy: {accuracy_score(y_test, y_pred)}")
    print(f"\n Precision: {precision_score(y_test, y_pred)}")
    print(f"\n Recall: {recall_score(y_test, y_pred)}")
    print(f"\n F1 Score: {f1_score(y_test, y_pred)}")



 Accuaracy: 0.9473684210526315

 Precision: 0.9791666666666666

 Recall: 0.9215686274509803

 F1 Score: 0.9494949494949495


 Accuaracy: 0.8894736842105263

 Precision: 0.9175257731958762

 Recall: 0.8725490196078431

 F1 Score: 0.8944723618090452


### Handling Class Imbalance with Oversampling (SMOTE)
Here, we address the class imbalance issue using SMOTE (Synthetic Minority Over-sampling Technique), which generates synthetic examples of the minority class to balance the dataset.

In [29]:
X = data.drop('Class', axis = 1)
y= data['Class']

In [30]:
X.shape

(275663, 29)

In [31]:
y.shape

(275663,)

In [32]:
# Apply SMOTE to balance the dataset by generating synthetic samples for the minority class
from imblearn.over_sampling import SMOTE

In [33]:
# Balancing the dataset by sampling legitimate transactions
X_res, y_res = SMOTE().fit_resample(X,y)

In [34]:
y_res.value_counts()

Class
0    275190
1    275190
Name: count, dtype: int64

In [35]:
# Split the data into training and testing sets to ensure we have separate data for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, random_state = 42)

In [36]:
classifier = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier()
}

for name, clf in classifier.items():
    print(f"\n=========={name}===========")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n Accuaracy: {accuracy_score(y_test, y_pred)}")
    print(f"\n Precision: {precision_score(y_test, y_pred)}")
    print(f"\n Recall: {recall_score(y_test, y_pred)}")
    print(f"\n F1 Score: {f1_score(y_test, y_pred)}")



 Accuaracy: 0.9452287510447327

 Precision: 0.9736913374859751

 Recall: 0.9151137210697599

 F1 Score: 0.9434941938386271


 Accuaracy: 0.9981013118209238

 Precision: 0.9974037763253449

 Recall: 0.9988000654509754

 F1 Score: 0.9981014325554355


In [37]:
dtc = DecisionTreeClassifier()
dtc.fit(X_res, y_res)

In [38]:
# Save the trained model to disk using joblib for future use
joblib.dump(dtc, "creditcardmodel.pkl")

['creditcardmodel.pkl']

In [39]:
# Load the saved model from disk for prediction or further evaluation
model = joblib.load("creditcardmodel.pkl")

### Real-Time Application of the Trained Model
In this section, we demonstrate how the trained model can be used in a real-time scenario to predict whether a specific transaction is fraudulent or not based on its features. This showcases the practical application of the model in a real-world setting.

In [40]:
pred = model.predict([[-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62]])



In [41]:
pred[0]

0

In [42]:
if pred[0] == 0:
    print("Normal Transcation")
else:
    print("Fraud Transcation")

Normal Transcation


### Conclusion
In this notebook, we successfully implemented credit card fraud detection using 2 different machine learning models. We handled class imbalance using both undersampling and oversampling techniques (SMOTE). Logistic Regression and Decision Tree Classifier were used for model training, and the models were evaluated on the test set. The trained model was saved using joblib for future use.