In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.callbacks import Callback
from imblearn.over_sampling import SMOTE

In [2]:
# Load the dataset
data = pd.read_csv(Path('Training_Fraudulent_E-Commerce_Transaction_Data_2.csv'))

# Review the DataFrame
print(data.nunique())
data.head()


Transaction ID        23634
Customer ID           23634
Transaction Amount    18375
Transaction Date      23607
Payment Method            4
Product Category          5
Quantity                  5
Customer Age             74
Customer Location     14868
Device Used               3
IP Address            23634
Shipping Address      23634
Billing Address       23634
Is Fraudulent             2
Account Age Days        365
Transaction Hour         24
dtype: int64


Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
0,c12e07a0-8a06-4c0d-b5cc-04f3af688570,8ca9f102-02a4-4207-ab63-484e83a1bdf0,42.32,2024-03-24 23:42:43,PayPal,electronics,1,40,East Jameshaven,desktop,110.87.246.85,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,0,282,23
1,7d187603-7961-4fce-9827-9698e2b6a201,4d158416-caae-4b09-bd5b-15235deb9129,301.34,2024-01-22 00:53:31,credit card,electronics,3,35,Kingstad,tablet,14.73.104.153,"5230 Stephanie Forge\nCollinsbury, PR 81853","5230 Stephanie Forge\nCollinsbury, PR 81853",0,223,0
2,f2c14f9d-92df-4aaf-8931-ceaf4e63ed72,ccae47b8-75c7-4f5a-aa9e-957deced2137,340.32,2024-01-22 08:06:03,debit card,toys & games,5,29,North Ryan,desktop,67.58.94.93,"195 Cole Oval\nPort Larry, IA 58422","4772 David Stravenue Apt. 447\nVelasquezside, ...",0,360,8
3,e9949bfa-194d-486b-84da-9565fca9e5ce,b04960c0-aeee-4907-b1cd-4819016adcef,95.77,2024-01-16 20:34:53,credit card,electronics,5,45,Kaylaville,mobile,202.122.126.216,"7609 Cynthia Square\nWest Brenda, NV 23016","7609 Cynthia Square\nWest Brenda, NV 23016",0,325,20
4,7362837c-7538-434e-8731-0df713f5f26d,de9d6351-b3a7-4bc7-9a55-8f013eb66928,77.45,2024-01-16 15:47:23,credit card,clothing,5,42,North Edwardborough,desktop,96.77.232.76,"2494 Robert Ramp Suite 313\nRobinsonport, AS 5...","2494 Robert Ramp Suite 313\nRobinsonport, AS 5...",0,116,15


In [3]:
# Display dataset structure
print(data.info())

# View the first few rows
print(data.head())

# Check for missing values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23634 entries, 0 to 23633
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Transaction ID      23634 non-null  object 
 1   Customer ID         23634 non-null  object 
 2   Transaction Amount  23634 non-null  float64
 3   Transaction Date    23634 non-null  object 
 4   Payment Method      23634 non-null  object 
 5   Product Category    23634 non-null  object 
 6   Quantity            23634 non-null  int64  
 7   Customer Age        23634 non-null  int64  
 8   Customer Location   23634 non-null  object 
 9   Device Used         23634 non-null  object 
 10  IP Address          23634 non-null  object 
 11  Shipping Address    23634 non-null  object 
 12  Billing Address     23634 non-null  object 
 13  Is Fraudulent       23634 non-null  int64  
 14  Account Age Days    23634 non-null  int64  
 15  Transaction Hour    23634 non-null  int64  
dtypes: f

In [4]:
# Drop irrelevant columns
data_cleaned = data.drop(columns=['Transaction ID', 'Customer ID', 'IP Address', 'Shipping Address', 'Billing Address', 'Customer Location'])
data_cleaned


# Convert 'Transaction Date' to datetime and extract features
data_cleaned['Transaction Date'] = pd.to_datetime(data_cleaned['Transaction Date'])
data_cleaned['Year'] = data_cleaned['Transaction Date'].dt.year
data_cleaned['Month'] = data_cleaned['Transaction Date'].dt.month
data_cleaned['Day'] = data_cleaned['Transaction Date'].dt.day
data_cleaned['Hour'] = data_cleaned['Transaction Date'].dt.hour
data_cleaned = data_cleaned.drop(columns=['Transaction Date'])


data_cleaned = pd.get_dummies(data_cleaned, columns=['Payment Method', 'Product Category', 'Device Used'], drop_first=True)
data_cleaned



Unnamed: 0,Transaction Amount,Quantity,Customer Age,Is Fraudulent,Account Age Days,Transaction Hour,Year,Month,Day,Hour,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet
0,42.32,1,40,0,282,23,2024,3,24,23,False,False,False,True,False,False,False,False,False
1,301.34,3,35,0,223,0,2024,1,22,0,False,True,False,True,False,False,False,False,True
2,340.32,5,29,0,360,8,2024,1,22,8,False,False,True,False,False,False,True,False,False
3,95.77,5,45,0,325,20,2024,1,16,20,False,True,False,True,False,False,False,True,False
4,77.45,5,42,0,116,15,2024,1,16,15,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23629,53.73,5,32,0,363,16,2024,1,26,16,False,False,False,False,False,False,True,False,True
23630,47.42,4,41,0,296,17,2024,2,25,17,False,True,False,False,False,False,False,False,False
23631,1045.23,1,9,0,329,23,2024,3,28,23,True,False,False,False,True,False,False,False,True
23632,34.25,2,39,0,347,11,2024,2,9,11,False,False,True,False,False,True,False,True,False


In [5]:
# Define features and target
X = data_cleaned.drop(columns=['Is Fraudulent'])
y = data_cleaned['Is Fraudulent']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (23634, 18)
Target shape: (23634,)


In [6]:
X

Unnamed: 0,Transaction Amount,Quantity,Customer Age,Account Age Days,Transaction Hour,Year,Month,Day,Hour,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet
0,42.32,1,40,282,23,2024,3,24,23,False,False,False,True,False,False,False,False,False
1,301.34,3,35,223,0,2024,1,22,0,False,True,False,True,False,False,False,False,True
2,340.32,5,29,360,8,2024,1,22,8,False,False,True,False,False,False,True,False,False
3,95.77,5,45,325,20,2024,1,16,20,False,True,False,True,False,False,False,True,False
4,77.45,5,42,116,15,2024,1,16,15,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23629,53.73,5,32,363,16,2024,1,26,16,False,False,False,False,False,False,True,False,True
23630,47.42,4,41,296,17,2024,2,25,17,False,True,False,False,False,False,False,False,False
23631,1045.23,1,9,329,23,2024,3,28,23,True,False,False,False,True,False,False,False,True
23632,34.25,2,39,347,11,2024,2,9,11,False,False,True,False,False,True,False,True,False


In [7]:
#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
X_train, X_test, y_train, y_test

(       Transaction Amount  Quantity  Customer Age  Account Age Days  \
 9446               294.72         5            19                91   
 14163              371.32         4            39                76   
 7619                50.27         3            21               169   
 22974              283.50         5            30                 8   
 21205               53.65         1            37                10   
 ...                   ...       ...           ...               ...   
 14815               76.42         5            23                27   
 9897               395.55         1            31               206   
 493                104.36         4            61               241   
 10041               43.74         2            14               347   
 117                150.43         4            39                43   
 
        Transaction Hour  Year  Month  Day  Hour  Payment Method_bank transfer  \
 9446                  6  2024      3   22     6    

In [8]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled: X
y_resampled: y
X_resampled, y_resampled = smote.fit_resample(X, y)

In [14]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [15]:
# intitialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = model.predict(X_test)

In [17]:
# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4555
           1       0.97      0.97      0.97      4410

    accuracy                           0.97      8965
   macro avg       0.97      0.97      0.97      8965
weighted avg       0.97      0.97      0.97      8965

