# Task 2 - Model Building and Training

**Loading the datasets for modeling**

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
# Add the 'scripts' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))

# Set max rows and columns to display
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Configure logging
from logger import SetupLogger
# Assuming this class is defined in scripts/
from data_processor import DataPreprocessor  

logger = SetupLogger(log_file='../logs/notebooks.log').get_logger()

Load the datasets

In [2]:
# Initialize the DataPreprocessor with the logger and the path to the dataset
load_fraud = DataPreprocessor(filepath='../data/processed_fraud_data.csv', logger=logger)
load_credit = DataPreprocessor(filepath='../data/creditcard.csv', logger=logger)
fraud_data = load_fraud.load_dataset().set_index('user_id')
credit_data = load_credit.load_dataset()

In [3]:
fraud_data.head()

Unnamed: 0_level_0,Unnamed: 0,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,purchase_delay,hour_of_day,day_of_week,user_transaction_frequency,device_transaction_frequency,user_transaction_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22058,0,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,0.679914,732758400.0,0,-0.136057,-1.377455,0.99102,0.0,-0.261514,-0.230128,False,True,False,False,False,False,True
333320,1,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,2.304476,350311400.0,0,-1.571877,-1.522122,-1.501259,0.0,-0.261514,-0.229874,False,False,False,False,False,False,False
1359,2,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,2.304476,2621474000.0,1,-1.577617,0.937208,-0.005891,0.0,3.941861,4.345476,False,True,False,False,True,False,True
150084,3,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,0.911994,3840542000.0,0,-1.420213,0.213876,-1.501259,0.0,-0.261514,-0.23012,False,True,False,False,False,True,True
221365,4,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,1.376155,415583100.0,0,-0.182509,0.937208,-0.504347,0.0,-0.261514,-0.230128,False,False,False,False,False,True,True


In [4]:
# Explore the few rows
credit_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# Understand the shapes
credit_data.shape, fraud_data.shape

((284807, 31), (151112, 21))

In [6]:
# Check any missing values
print(credit_data.isnull().sum())
print(fraud_data.isnull().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Unnamed: 0                      0
signup_time                     0
purchase_time                   0
purchase_value                  0
device_id                       0
age                             0
ip_address                      0
class                           0
purchase_delay                  0
hour_of_day                     0
day_of_week                     0
user_transaction_frequency      0
device_transaction_frequency    0
user_transaction_velocity       0
source_Direct                   0
source_SEO                      0
browser_FireFox                 0
browser_IE                      0
bro

In [7]:
# Convert datetime columns
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'],format='%Y-%m-%d %H:%M:%S'  # Specify format if needed
)
fraud_data['signup_time'] = pd.to_datetime(
    fraud_data['signup_time'], 
    format='%Y-%m-%d %H:%M:%S'
)

# Extract numerical features
fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.dayofweek
fraud_data['signup_hour'] = fraud_data['signup_time'].dt.hour
fraud_data['signup_day'] = fraud_data['signup_time'].dt.dayofweek

# Drop original datetime columns
fraud_data = fraud_data.drop(columns=['purchase_time', 'signup_time'])

In [8]:
# Drop device_id
fraud_data = fraud_data.drop(columns=['device_id'])
# Drop ip_address
fraud_data = fraud_data.drop(columns=['ip_address'])

In [9]:
bool_cols = fraud_data.select_dtypes(include='bool').columns.tolist()
fraud_data[bool_cols] = fraud_data[bool_cols].astype('int64')

**Data Preparation:**

* Feature and Target Separation [‘Class’(creditcard), ‘class’(Fraud_Data)] Train-Test Split
  
For creditcard dataset (target column 'Class'):

In [10]:
from data_preparation import DataPreparation
# Assuming df_creditcard is the DataFrame for the credit card dataset
_creditcard = DataPreparation(credit_data, target_column='Class')
_creditcard.train_test_split(test_size=0.2, random_state=42)

# Retrieving the train and test sets
X_train_cc, X_test_cc, y_train_cc, y_test_cc = _creditcard.get_train_test_data()

Data split into training and testing sets successfully.


For Fraud_Data dataset (target column 'class'):

In [11]:
# Assuming df_fraud is the DataFrame for the fraud dataset
_fraud = DataPreparation(fraud_data, target_column='class')
_fraud.train_test_split(test_size=0.2, random_state=42)

# Retrieving the train and test sets
X_train_fd, X_test_fd, y_train_fd, y_test_fd = _fraud.get_train_test_data()

Data split into training and testing sets successfully.


## Model Selection
* Import ModelPipeline class from model_pipeline
* Train multiple models
* hyperparameter tune
* evaluate the model
* compare the model

In [12]:
import os
import mlflow
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Disable CUDA

# Import the class
from model_pipeline import ModelPipeline
mlflow.set_tracking_uri("http://localhost:5000")

**Train and Evaluate the models on the e-commerce fruad dataset**

In [13]:
# Credit Card Dataset
mlflow.set_experiment("CreditCard Detection")
model_pipeline_cc = ModelPipeline(X_train_cc, X_test_cc, y_train_cc, y_test_cc)
best_model_cc, best_model_name_cc = model_pipeline_cc.train_and_evaluate()
model_pipeline_cc.save_best_models(best_model_cc, best_model_name_cc, 'creditcard')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Quick tuning for Logistic Regression...
Quick tuning for Decision Tree...
Quick tuning for Random Forest...
Quick tuning for Gradient Boosting...




Logistic Regression took 9.98 seconds to train


Successfully registered model 'logistic_regression'.
2025/02/11 12:03:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 1
Created version '1' of model 'logistic_regression'.


Logistic Regression model trained and logged with MLflow
🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/1/runs/9a553cd33a744bd2b42137e3861e9fac
🧪 View experiment at: http://localhost:5000/#/experiments/1




Decision Tree took 15.51 seconds to train


Successfully registered model 'decision_tree'.
2025/02/11 12:03:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree, version 1
Created version '1' of model 'decision_tree'.


Decision Tree model trained and logged with MLflow
🏃 View run Decision Tree at: http://localhost:5000/#/experiments/1/runs/77b2ffbabdb6453488b55380bb7c54e5
🧪 View experiment at: http://localhost:5000/#/experiments/1




Random Forest took 148.33 seconds to train


Successfully registered model 'random_forest'.
2025/02/11 12:06:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 1
Created version '1' of model 'random_forest'.


Random Forest model trained and logged with MLflow
🏃 View run Random Forest at: http://localhost:5000/#/experiments/1/runs/b5eabfab3d094af2828c60f3877c4fc5
🧪 View experiment at: http://localhost:5000/#/experiments/1




Gradient Boosting took 344.48 seconds to train


Successfully registered model 'gradient_boosting'.
2025/02/11 12:11:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gradient_boosting, version 1
Created version '1' of model 'gradient_boosting'.


Gradient Boosting model trained and logged with MLflow
🏃 View run Gradient Boosting at: http://localhost:5000/#/experiments/1/runs/a702413a4cec493a9e7112490b56ae27
🧪 View experiment at: http://localhost:5000/#/experiments/1
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 478us/step




MLP took 21.73 seconds to train


Successfully registered model 'mlp'.
2025/02/11 12:12:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mlp, version 1
Created version '1' of model 'mlp'.


MLP model trained and logged with MLflow
🏃 View run MLP at: http://localhost:5000/#/experiments/1/runs/2bec209f8ff94282830735d108fb1f7e
🧪 View experiment at: http://localhost:5000/#/experiments/1
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step




RNN took 40.66 seconds to train


Successfully registered model 'rnn'.
2025/02/11 12:13:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rnn, version 1
Created version '1' of model 'rnn'.


RNN model trained and logged with MLflow
🏃 View run RNN at: http://localhost:5000/#/experiments/1/runs/0b753d2ff23740a983029c5f394b6d69
🧪 View experiment at: http://localhost:5000/#/experiments/1
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step




LSTM took 88.72 seconds to train


Successfully registered model 'lstm'.
2025/02/11 12:14:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lstm, version 1
Created version '1' of model 'lstm'.


LSTM model trained and logged with MLflow
🏃 View run LSTM at: http://localhost:5000/#/experiments/1/runs/d8ae6c1aa67247c3ba6fd4d763695412
🧪 View experiment at: http://localhost:5000/#/experiments/1
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 596us/step




CNN took 22.16 seconds to train


Successfully registered model 'cnn'.
2025/02/11 12:15:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cnn, version 1


CNN model trained and logged with MLflow
🏃 View run CNN at: http://localhost:5000/#/experiments/1/runs/4c9a9deb4b154559b7d6456cd84726dc
🧪 View experiment at: http://localhost:5000/#/experiments/1
Logistic Regression best model saved.


Created version '1' of model 'cnn'.


In [14]:
# Get the results 
results_fraud, y_probs_fraud = model_pipeline_cc.get_results()

In [15]:
pd.DataFrame(results_fraud).T

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.999122,0.863636,0.581633,0.695122,0.97467
Decision Tree,0.999386,0.879518,0.744898,0.80663,0.861737
Random Forest,0.999561,0.974026,0.765306,0.857143,0.974598
Gradient Boosting,0.998947,0.7375,0.602041,0.662921,0.785511
MLP,0.995716,0.212598,0.55102,0.306818,0.865403
RNN,0.998789,0.914286,0.326531,0.481203,0.973361
LSTM,0.999333,0.826087,0.77551,0.8,0.973326
CNN,0.999263,0.833333,0.714286,0.769231,0.912078


In [16]:
# Fraud Dataset
mlflow.set_experiment("Fraud Detection")
#mlflow.set_tracking_uri("https://dagshub.com/Jenber-Ligab/e-commerce-and-banking-fraud-detection.mlflow")
model_pipeline_fd = ModelPipeline(X_train_fd, X_test_fd, y_train_fd, y_test_fd)
best_model_fd, best_model_name_fd = model_pipeline_fd.train_and_evaluate()
model_pipeline_fd.save_best_models(best_model_fd, best_model_name_fd, 'fraud')

2025/02/11 12:15:14 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection' does not exist. Creating a new experiment.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Quick tuning for Logistic Regression...
Quick tuning for Decision Tree...
Quick tuning for Random Forest...
Quick tuning for Gradient Boosting...




Logistic Regression took 4.14 seconds to train


Registered model 'logistic_regression' already exists. Creating a new version of this model...
2025/02/11 12:16:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 2
Created version '2' of model 'logistic_regression'.


Logistic Regression model trained and logged with MLflow
🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/2/runs/32b1823004c240c0bdaf6056e287694f
🧪 View experiment at: http://localhost:5000/#/experiments/2




Decision Tree took 3.90 seconds to train


Registered model 'decision_tree' already exists. Creating a new version of this model...
2025/02/11 12:16:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree, version 2
Created version '2' of model 'decision_tree'.


Decision Tree model trained and logged with MLflow
🏃 View run Decision Tree at: http://localhost:5000/#/experiments/2/runs/4bbe49dfde6547bab33419cf391652e2
🧪 View experiment at: http://localhost:5000/#/experiments/2




Random Forest took 9.13 seconds to train


Registered model 'random_forest' already exists. Creating a new version of this model...
2025/02/11 12:16:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 2
Created version '2' of model 'random_forest'.


Random Forest model trained and logged with MLflow
🏃 View run Random Forest at: http://localhost:5000/#/experiments/2/runs/5384c63dec7c4465b049030024055c34
🧪 View experiment at: http://localhost:5000/#/experiments/2




Gradient Boosting took 25.19 seconds to train


Registered model 'gradient_boosting' already exists. Creating a new version of this model...
2025/02/11 12:17:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gradient_boosting, version 2
Created version '2' of model 'gradient_boosting'.


Gradient Boosting model trained and logged with MLflow
🏃 View run Gradient Boosting at: http://localhost:5000/#/experiments/2/runs/c00b30355b854a399a1b3082e50024ce
🧪 View experiment at: http://localhost:5000/#/experiments/2
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 506us/step




MLP took 13.67 seconds to train


Registered model 'mlp' already exists. Creating a new version of this model...
2025/02/11 12:17:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mlp, version 2
Created version '2' of model 'mlp'.


MLP model trained and logged with MLflow
🏃 View run MLP at: http://localhost:5000/#/experiments/2/runs/2129a8844b5941cf8735e026ba92a5f4
🧪 View experiment at: http://localhost:5000/#/experiments/2
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 944us/step




RNN took 21.42 seconds to train


Registered model 'rnn' already exists. Creating a new version of this model...
2025/02/11 12:17:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rnn, version 2
Created version '2' of model 'rnn'.


RNN model trained and logged with MLflow
🏃 View run RNN at: http://localhost:5000/#/experiments/2/runs/747412e9b78943f3a038f71dba6f70e2
🧪 View experiment at: http://localhost:5000/#/experiments/2
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




LSTM took 35.55 seconds to train


Registered model 'lstm' already exists. Creating a new version of this model...
2025/02/11 12:18:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lstm, version 2
Created version '2' of model 'lstm'.


LSTM model trained and logged with MLflow
🏃 View run LSTM at: http://localhost:5000/#/experiments/2/runs/271e2cede25544d7b44e0b9c112c5246
🧪 View experiment at: http://localhost:5000/#/experiments/2
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 603us/step




CNN took 15.74 seconds to train


Registered model 'cnn' already exists. Creating a new version of this model...
2025/02/11 12:18:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cnn, version 2


CNN model trained and logged with MLflow
🏃 View run CNN at: http://localhost:5000/#/experiments/2/runs/00a2275649be461fa9a427c55fdc23cd
🧪 View experiment at: http://localhost:5000/#/experiments/2
Gradient Boosting best model saved.


Created version '2' of model 'cnn'.


In [17]:
# Get the results 
results_fraud, y_probs_fraud = model_pipeline_fd.get_results()

In [18]:
pd.DataFrame(results_fraud).T

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.956292,0.997398,0.537895,0.698883,0.829598
Decision Tree,0.956391,0.999348,0.537895,0.699361,0.851547
Random Forest,0.956424,1.0,0.537895,0.699521,0.844066
Gradient Boosting,0.956424,1.0,0.537895,0.699521,0.851655
MLP,0.911193,0.941489,0.062105,0.116524,0.53603
RNN,0.956424,1.0,0.537895,0.699521,0.845892
LSTM,0.956424,1.0,0.537895,0.699521,0.775576
CNN,0.950369,0.885714,0.54386,0.673913,0.773826
