In [22]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

In [23]:
# Load the dataset
data = pd.read_csv(r"C:\Users\ksarn\OneDrive\Desktop\Project 4\Fraudulent_E-Commerce_Transaction_Data.csv")
# Review the DataFrame
print(data.nunique())
data.head()


Transaction ID        1472952
Customer ID           1472952
Transaction Amount     108998
Transaction Date      1346684
Payment Method              4
Product Category            5
Quantity                    5
Customer Age               97
Customer Location       99135
Device Used                 3
IP Address            1472651
Shipping Address      1472948
Billing Address       1472949
Is Fraudulent               2
Account Age Days          365
Transaction Hour           24
dtype: int64


Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
0,15d2e414-8735-46fc-9e02-80b472b2580f,d1b87f62-51b2-493b-ad6a-77e0fe13e785,58.09,2024-02-20 05:58:41,bank transfer,electronics,1,17,Amandaborough,tablet,212.195.49.198,Unit 8934 Box 0058\nDPO AA 05437,Unit 8934 Box 0058\nDPO AA 05437,0,30,5
1,0bfee1a0-6d5e-40da-a446-d04e73b1b177,37de64d5-e901-4a56-9ea0-af0c24c069cf,389.96,2024-02-25 08:09:45,debit card,electronics,2,40,East Timothy,desktop,208.106.249.121,"634 May Keys\nPort Cherylview, NV 75063","634 May Keys\nPort Cherylview, NV 75063",0,72,8
2,e588eef4-b754-468e-9d90-d0e0abfc1af0,1bac88d6-4b22-409a-a06b-425119c57225,134.19,2024-03-18 03:42:55,PayPal,home & garden,2,22,Davismouth,tablet,76.63.88.212,"16282 Dana Falls Suite 790\nRothhaven, IL 15564","16282 Dana Falls Suite 790\nRothhaven, IL 15564",0,63,3
3,4de46e52-60c3-49d9-be39-636681009789,2357c76e-9253-4ceb-b44e-ef4b71cb7d4d,226.17,2024-03-16 20:41:31,bank transfer,clothing,5,31,Lynnberg,desktop,207.208.171.73,"828 Strong Loaf Apt. 646\nNew Joshua, UT 84798","828 Strong Loaf Apt. 646\nNew Joshua, UT 84798",0,124,20
4,074a76de-fe2d-443e-a00c-f044cdb68e21,45071bc5-9588-43ea-8093-023caec8ea1c,121.53,2024-01-15 05:08:17,bank transfer,clothing,2,51,South Nicole,tablet,190.172.14.169,"29799 Jason Hills Apt. 439\nWest Richardtown, ...","29799 Jason Hills Apt. 439\nWest Richardtown, ...",0,158,5


In [24]:
# Display dataset structure
print(data.info())

# View the first few rows
print(data.head())

# Check for missing values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472952 entries, 0 to 1472951
Data columns (total 16 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Transaction ID      1472952 non-null  object 
 1   Customer ID         1472952 non-null  object 
 2   Transaction Amount  1472952 non-null  float64
 3   Transaction Date    1472952 non-null  object 
 4   Payment Method      1472952 non-null  object 
 5   Product Category    1472952 non-null  object 
 6   Quantity            1472952 non-null  int64  
 7   Customer Age        1472952 non-null  int64  
 8   Customer Location   1472952 non-null  object 
 9   Device Used         1472952 non-null  object 
 10  IP Address          1472952 non-null  object 
 11  Shipping Address    1472952 non-null  object 
 12  Billing Address     1472952 non-null  object 
 13  Is Fraudulent       1472952 non-null  int64  
 14  Account Age Days    1472952 non-null  int64  
 15  Transaction Hou

In [25]:
# Drop irrelevant columns
data_cleaned = data.drop(columns=['Transaction ID', 'Customer ID', 'IP Address', 'Shipping Address', 'Billing Address'])
data_cleaned


# Convert 'Transaction Date' to datetime and extract features
data_cleaned['Transaction Date'] = pd.to_datetime(data_cleaned['Transaction Date'])
data_cleaned['Year'] = data_cleaned['Transaction Date'].dt.year
data_cleaned['Month'] = data_cleaned['Transaction Date'].dt.month
data_cleaned['Day'] = data_cleaned['Transaction Date'].dt.day
data_cleaned['Hour'] = data_cleaned['Transaction Date'].dt.hour
data_cleaned = data_cleaned.drop(columns=['Transaction Date'])


data_cleaned = pd.get_dummies(data_cleaned, columns=['Payment Method', 'Product Category', 'Device Used'], drop_first=True)
data_cleaned



Unnamed: 0,Transaction Amount,Quantity,Customer Age,Customer Location,Is Fraudulent,Account Age Days,Transaction Hour,Year,Month,Day,Hour,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet
0,58.09,1,17,Amandaborough,0,30,5,2024,2,20,5,True,False,False,True,False,False,False,False,True
1,389.96,2,40,East Timothy,0,72,8,2024,2,25,8,False,False,True,True,False,False,False,False,False
2,134.19,2,22,Davismouth,0,63,3,2024,3,18,3,False,False,False,False,False,True,False,False,True
3,226.17,5,31,Lynnberg,0,124,20,2024,3,16,20,True,False,False,False,False,False,False,False,False
4,121.53,2,51,South Nicole,0,158,5,2024,1,15,5,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472947,208.48,2,29,North Laurafort,0,149,23,2024,1,12,23,False,False,False,False,True,False,False,False,False
1472948,231.57,2,32,Burnsville,0,132,4,2024,3,27,4,False,True,False,False,False,False,False,False,True
1472949,101.80,4,36,Garciafort,0,98,23,2024,1,31,23,True,False,False,True,False,False,False,True,False
1472950,61.80,5,34,Lake Josephberg,0,191,16,2024,1,12,16,True,False,False,True,False,False,False,False,False


In [41]:
# Define features and target
X = data_cleaned.drop(columns=['Is Fraudulent'])
y = data_cleaned['Is Fraudulent']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (1472952, 19)
Target shape: (1472952,)


In [27]:
X

Unnamed: 0,Transaction Amount,Quantity,Customer Age,Customer Location,Account Age Days,Transaction Hour,Year,Month,Day,Hour,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet
0,58.09,1,17,Amandaborough,30,5,2024,2,20,5,True,False,False,True,False,False,False,False,True
1,389.96,2,40,East Timothy,72,8,2024,2,25,8,False,False,True,True,False,False,False,False,False
2,134.19,2,22,Davismouth,63,3,2024,3,18,3,False,False,False,False,False,True,False,False,True
3,226.17,5,31,Lynnberg,124,20,2024,3,16,20,True,False,False,False,False,False,False,False,False
4,121.53,2,51,South Nicole,158,5,2024,1,15,5,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472947,208.48,2,29,North Laurafort,149,23,2024,1,12,23,False,False,False,False,True,False,False,False,False
1472948,231.57,2,32,Burnsville,132,4,2024,3,27,4,False,True,False,False,False,False,False,False,True
1472949,101.80,4,36,Garciafort,98,23,2024,1,31,23,True,False,False,True,False,False,False,True,False
1472950,61.80,5,34,Lake Josephberg,191,16,2024,1,12,16,True,False,False,True,False,False,False,False,False


In [28]:
#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [29]:
print("Customer Location" in X_train.columns)

True


In [36]:
# X_train = X_train.drop(columns=['Customer Location'])
# X_test = X_test.drop(columns=['Customer Location'])

# # # Encode the 'Customer Location' column using one-hot encoding
# X_train = pd.get_dummies(X_train, columns=['Customer Location'], drop_first=True)
# X_test = pd.get_dummies(X_test, columns=['Customer Location'], drop_first=True)

In [37]:
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
## Compile, Train and Evaluate the Model


In [42]:
# Define the number of input features
number_input_features = X_train.shape[1]

# Define the number of nodes for each layer
hidden_nodes_layer1 =40
hidden_nodes_layer2 = 50
hidden_nodes_layer3 = 20
hidden_nodes_layer4 = 70

# Initialize the sequential model
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))

# Output layer (binary classification)
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [43]:
# Compile the model
#  YOUR CODE GOES HERE
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
#  YOUR CODE GOES HERE
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 601us/step - accuracy: 0.9667 - loss: 0.1329
Epoch 2/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 600us/step - accuracy: 0.9712 - loss: 0.1141
Epoch 3/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 613us/step - accuracy: 0.9715 - loss: 0.1133
Epoch 4/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 619us/step - accuracy: 0.9716 - loss: 0.1125
Epoch 5/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 626us/step - accuracy: 0.9718 - loss: 0.1120
Epoch 6/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 646us/step - accuracy: 0.9718 - loss: 0.1122
Epoch 7/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 616us/step - accuracy: 0.9715 - loss: 0.1130
Epoch 8/100
[1m36824/36824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 637us/step - accuracy

In [None]:
scaler = StandardScaler()
numerical_columns = ['transaction_amount', 'other_numeric_feature']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export our model to HDF5 file
#  YOUR CODE GOES HERE
#nn.save("AlphabetSoupCharity.h5")

nn.save(" Fraudulent transaction_Optimization.h5")