In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.callbacks import Callback
from imblearn.over_sampling import SMOTE

In [2]:
# Load the dataset
data = pd.read_csv(Path('Training_Fraudulent_E-Commerce_Transaction_Data_2.csv'))

# Review the DataFrame
print(data.nunique())
data.head()


Transaction ID        23634
Customer ID           23634
Transaction Amount    18375
Transaction Date      23607
Payment Method            4
Product Category          5
Quantity                  5
Customer Age             74
Customer Location     14868
Device Used               3
IP Address            23634
Shipping Address      23634
Billing Address       23634
Is Fraudulent             2
Account Age Days        365
Transaction Hour         24
dtype: int64


Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
0,c12e07a0-8a06-4c0d-b5cc-04f3af688570,8ca9f102-02a4-4207-ab63-484e83a1bdf0,42.32,2024-03-24 23:42:43,PayPal,electronics,1,40,East Jameshaven,desktop,110.87.246.85,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,0,282,23
1,7d187603-7961-4fce-9827-9698e2b6a201,4d158416-caae-4b09-bd5b-15235deb9129,301.34,2024-01-22 00:53:31,credit card,electronics,3,35,Kingstad,tablet,14.73.104.153,"5230 Stephanie Forge\nCollinsbury, PR 81853","5230 Stephanie Forge\nCollinsbury, PR 81853",0,223,0
2,f2c14f9d-92df-4aaf-8931-ceaf4e63ed72,ccae47b8-75c7-4f5a-aa9e-957deced2137,340.32,2024-01-22 08:06:03,debit card,toys & games,5,29,North Ryan,desktop,67.58.94.93,"195 Cole Oval\nPort Larry, IA 58422","4772 David Stravenue Apt. 447\nVelasquezside, ...",0,360,8
3,e9949bfa-194d-486b-84da-9565fca9e5ce,b04960c0-aeee-4907-b1cd-4819016adcef,95.77,2024-01-16 20:34:53,credit card,electronics,5,45,Kaylaville,mobile,202.122.126.216,"7609 Cynthia Square\nWest Brenda, NV 23016","7609 Cynthia Square\nWest Brenda, NV 23016",0,325,20
4,7362837c-7538-434e-8731-0df713f5f26d,de9d6351-b3a7-4bc7-9a55-8f013eb66928,77.45,2024-01-16 15:47:23,credit card,clothing,5,42,North Edwardborough,desktop,96.77.232.76,"2494 Robert Ramp Suite 313\nRobinsonport, AS 5...","2494 Robert Ramp Suite 313\nRobinsonport, AS 5...",0,116,15


In [3]:
# Display dataset structure
print(data.info())

# View the first few rows
print(data.head())

# Check for missing values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23634 entries, 0 to 23633
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Transaction ID      23634 non-null  object 
 1   Customer ID         23634 non-null  object 
 2   Transaction Amount  23634 non-null  float64
 3   Transaction Date    23634 non-null  object 
 4   Payment Method      23634 non-null  object 
 5   Product Category    23634 non-null  object 
 6   Quantity            23634 non-null  int64  
 7   Customer Age        23634 non-null  int64  
 8   Customer Location   23634 non-null  object 
 9   Device Used         23634 non-null  object 
 10  IP Address          23634 non-null  object 
 11  Shipping Address    23634 non-null  object 
 12  Billing Address     23634 non-null  object 
 13  Is Fraudulent       23634 non-null  int64  
 14  Account Age Days    23634 non-null  int64  
 15  Transaction Hour    23634 non-null  int64  
dtypes: f

In [4]:
# Drop irrelevant columns
data_cleaned = data.drop(columns=['Transaction ID', 'Customer ID', 'IP Address', 'Shipping Address', 'Billing Address'])
data_cleaned


# Convert 'Transaction Date' to datetime and extract features
data_cleaned['Transaction Date'] = pd.to_datetime(data_cleaned['Transaction Date'])
data_cleaned['Year'] = data_cleaned['Transaction Date'].dt.year
data_cleaned['Month'] = data_cleaned['Transaction Date'].dt.month
data_cleaned['Day'] = data_cleaned['Transaction Date'].dt.day
data_cleaned['Hour'] = data_cleaned['Transaction Date'].dt.hour
data_cleaned = data_cleaned.drop(columns=['Transaction Date'])


data_cleaned = pd.get_dummies(data_cleaned, columns=['Payment Method', 'Product Category', 'Device Used'], drop_first=True)
data_cleaned



Unnamed: 0,Transaction Amount,Quantity,Customer Age,Customer Location,Is Fraudulent,Account Age Days,Transaction Hour,Year,Month,Day,Hour,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet
0,42.32,1,40,East Jameshaven,0,282,23,2024,3,24,23,False,False,False,True,False,False,False,False,False
1,301.34,3,35,Kingstad,0,223,0,2024,1,22,0,False,True,False,True,False,False,False,False,True
2,340.32,5,29,North Ryan,0,360,8,2024,1,22,8,False,False,True,False,False,False,True,False,False
3,95.77,5,45,Kaylaville,0,325,20,2024,1,16,20,False,True,False,True,False,False,False,True,False
4,77.45,5,42,North Edwardborough,0,116,15,2024,1,16,15,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23629,53.73,5,32,Rebeccafurt,0,363,16,2024,1,26,16,False,False,False,False,False,False,True,False,True
23630,47.42,4,41,Nataliefort,0,296,17,2024,2,25,17,False,True,False,False,False,False,False,False,False
23631,1045.23,1,9,East Shannonville,0,329,23,2024,3,28,23,True,False,False,False,True,False,False,False,True
23632,34.25,2,39,Lake Nicole,0,347,11,2024,2,9,11,False,False,True,False,False,True,False,True,False


In [5]:
# Define features and target
X = data_cleaned.drop(columns=['Is Fraudulent'])
y = data_cleaned['Is Fraudulent']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (23634, 19)
Target shape: (23634,)


In [6]:
X

Unnamed: 0,Transaction Amount,Quantity,Customer Age,Customer Location,Account Age Days,Transaction Hour,Year,Month,Day,Hour,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet
0,42.32,1,40,East Jameshaven,282,23,2024,3,24,23,False,False,False,True,False,False,False,False,False
1,301.34,3,35,Kingstad,223,0,2024,1,22,0,False,True,False,True,False,False,False,False,True
2,340.32,5,29,North Ryan,360,8,2024,1,22,8,False,False,True,False,False,False,True,False,False
3,95.77,5,45,Kaylaville,325,20,2024,1,16,20,False,True,False,True,False,False,False,True,False
4,77.45,5,42,North Edwardborough,116,15,2024,1,16,15,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23629,53.73,5,32,Rebeccafurt,363,16,2024,1,26,16,False,False,False,False,False,False,True,False,True
23630,47.42,4,41,Nataliefort,296,17,2024,2,25,17,False,True,False,False,False,False,False,False,False
23631,1045.23,1,9,East Shannonville,329,23,2024,3,28,23,True,False,False,False,True,False,False,False,True
23632,34.25,2,39,Lake Nicole,347,11,2024,2,9,11,False,False,True,False,False,True,False,True,False


In [7]:
#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
X_train, X_test, y_train, y_test

(       Transaction Amount  Quantity  Customer Age  Customer Location  \
 9446               294.72         5            19        Melindafurt   
 14163              371.32         4            39        East Jeremy   
 7619                50.27         3            21       North Brandi   
 22974              283.50         5            30        Phillipside   
 21205               53.65         1            37    West Steveville   
 ...                   ...       ...           ...                ...   
 14815               76.42         5            23  New Christinaland   
 9897               395.55         1            31       Franklinland   
 493                104.36         4            61        Lake Robert   
 10041               43.74         2            14      Lake Jacktown   
 117                150.43         4            39          Jasonport   
 
        Account Age Days  Transaction Hour  Year  Month  Day  Hour  \
 9446                 91                 6  2024    

In [8]:
print("Customer Location" in X_train.columns)

True


In [9]:
X_train = X_train.drop(columns=['Customer Location'])
X_test = X_test.drop(columns=['Customer Location'])


In [10]:
# Check data types of all columns in X_train
print(X_train.dtypes)

# Check for non-numeric columns
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

Transaction Amount                  float64
Quantity                              int64
Customer Age                          int64
Account Age Days                      int64
Transaction Hour                      int64
Year                                  int32
Month                                 int32
Day                                   int32
Hour                                  int32
Payment Method_bank transfer           bool
Payment Method_credit card             bool
Payment Method_debit card              bool
Product Category_electronics           bool
Product Category_health & beauty       bool
Product Category_home & garden         bool
Product Category_toys & games          bool
Device Used_mobile                     bool
Device Used_tablet                     bool
dtype: object
Non-numeric columns: Index([], dtype='object')


In [11]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_resampled.value_counts())

Before SMOTE: Is Fraudulent
0    17929
1      978
Name: count, dtype: int64
After SMOTE: Is Fraudulent
0    17929
1    17929
Name: count, dtype: int64


In [12]:
print(X_train_resampled.shape)
print(y_train_resampled.value_counts())

(35858, 18)
Is Fraudulent
0    17929
1    17929
Name: count, dtype: int64


In [13]:
scaler = StandardScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Initialize the model
model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=20, 
    class_weight={0: 1, 1: 10},
    random_state=42
)

# Train the model
model.fit(X_train_resampled_scaled, y_train_resampled)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4219  264]
 [ 135  109]]
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      4483
           1       0.29      0.45      0.35       244

    accuracy                           0.92      4727
   macro avg       0.63      0.69      0.65      4727
weighted avg       0.93      0.92      0.92      4727



In [15]:
# Examine feature importance
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns, 
    'Importance': importances
}).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                             Feature  Importance
3                   Account Age Days    0.165812
0                 Transaction Amount    0.152478
4                   Transaction Hour    0.102212
2                       Customer Age    0.084552
7                                Day    0.082997
8                               Hour    0.077400
1                           Quantity    0.064295
6                              Month    0.046879
16                Device Used_mobile    0.032577
17                Device Used_tablet    0.028637
9       Payment Method_bank transfer    0.027348
11         Payment Method_debit card    0.026753
14    Product Category_home & garden    0.024496
10        Payment Method_credit card    0.024376
13  Product Category_health & beauty    0.020107
15     Product Category_toys & games    0.020033
12      Product Category_electronics    0.019047
5                               Year    0.000000


In [16]:
X_train = X_train.drop(columns=['Year'])
X_test = X_test.drop(columns=['Year'])

In [17]:
# Reinitialize the model
model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=25, 
    class_weight={0: 1, 1: 20},
    random_state=42
)

# Train the model
model.fit(X_train_resampled_scaled, y_train_resampled)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4316  167]
 [ 165   79]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      4483
           1       0.32      0.32      0.32       244

    accuracy                           0.93      4727
   macro avg       0.64      0.64      0.64      4727
weighted avg       0.93      0.93      0.93      4727



In [18]:
# Reinitialize the model
model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=25, 
    class_weight={0: 1, 1: 12},
    random_state=42
)

# Train the model
model.fit(X_train_resampled_scaled, y_train_resampled)

# Predict on the test set
# Adjust probability threshold for classifying 1
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
threshold = 0.4
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4248  235]
 [ 126  118]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      4483
           1       0.33      0.48      0.40       244

    accuracy                           0.92      4727
   macro avg       0.65      0.72      0.68      4727
weighted avg       0.94      0.92      0.93      4727



In [None]:
## BELOW MODEL HAS THE BEST BALANCE BETWEEN PRECISION AND RECALL FOR MINORITY CLASS
## WHILE MAINTAINING OVERALL ACCURACY

In [19]:
# Reinitialize the model
model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=25, 
    class_weight={0: 1, 1: 15},
    random_state=42
)

# Train the model
model.fit(X_train_resampled_scaled, y_train_resampled)

# Predict on the test set
# Adjust probability threshold for classifying 1
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
threshold = 0.3
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4056  427]
 [ 109  135]]
              precision    recall  f1-score   support

           0       0.97      0.90      0.94      4483
           1       0.24      0.55      0.33       244

    accuracy                           0.89      4727
   macro avg       0.61      0.73      0.64      4727
weighted avg       0.94      0.89      0.91      4727



In [20]:
# Reinitialize the model
model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=25, 
    class_weight={0: 1, 1: 18},
    random_state=42
)

# Train the model
model.fit(X_train_resampled_scaled, y_train_resampled)

# Predict on the test set
# Adjust probability threshold for classifying 1
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
threshold = 0.35
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4133  350]
 [ 116  128]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.95      4483
           1       0.27      0.52      0.35       244

    accuracy                           0.90      4727
   macro avg       0.62      0.72      0.65      4727
weighted avg       0.94      0.90      0.92      4727

