## Fraud Detection

### Fraud detection models for e-commerce and credit card transactions

#### Importing necessary libraries

In [1]:
import os 
import time
import sys
import random
from datetime import datetime
import pandas as pd
import sidetable as stb
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from mlflow import MlflowClient
from pprint import pprint

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Dense, Conv1D, MaxPooling1D, Flatten,Reshape, SimpleRNN, RNN
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping 
from tensorflow.keras.metrics import Accuracy, Precision, F1Score, Recall
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

import mlflow

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Make the project's helper modules in ../scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not append
# a duplicate entry to sys.path each time.
SCRIPTS_DIR = os.path.abspath('../scripts')
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

# Project-local helpers, resolved through the path added above.
from Utils import DataUtils
from Model_utils import ModelUtils
from Logger import LOGGER

# Shared helper instances and logger used by the cells below.
utils = DataUtils()
model_utils = ModelUtils()
logger = LOGGER

In [4]:
# Single named seed for the whole notebook instead of an inline magic number.
# NOTE(review): which random generators set_seeds actually covers is defined in
# the project's Utils module — confirm there.
SEED = 42
utils.set_seeds(SEED)

#### Loading data

In [5]:
# Load the two cleaned CSV files through the shared DataUtils loader:
# the e-commerce fraud data first, then the credit card data.
fraud_data, creditCard_data = (
    utils.load_data(csv_name)
    for csv_name in ('Clean_FraudData.csv', 'Clean_creditCard_data.csv')
)

2024-10-25 09:34 - DEBUG :: Loading data from file...
2024-10-25 09:34 - INFO :: Loading Clean_FraudData.csv took 4.45 seconds


2024-10-25 09:34 - DEBUG :: Loading data from file...
2024-10-25 09:34 - INFO :: Loading Clean_creditCard_data.csv took 1.53 seconds




#### Setting up MLflow

In [6]:
# setUp_mlflow returns three values: a client plus one experiment handle per
# dataset (e-commerce fraud, credit card).
(
    client,
    fraud_experiment,
    creditCard_experiment,
) = model_utils.setUp_mlflow()

2024-10-25 09:34 - INFO :: Setting up Mlflow
2024-10-25 09:34 - INFO :: Found existing experiment name: Ecommerce-Fraud-Data-forecasting
2024-10-25 09:34 - INFO :: Found existing experiment name: creditCard-Fraud-Data-forecasting


#### Preparing the data for training

In [7]:
# Split each dataset into training and validation features/targets.
# E-commerce fraud data:
(
    X_train_fraud,
    X_val_fraud,
    y_train_fraud,
    y_val_fraud,
) = model_utils.split_data(fraud_data)

# Credit card data:
(
    X_train_creditCard,
    X_val_creditCard,
    y_train_creditCard,
    y_val_creditCard,
) = model_utils.split_data(creditCard_data)

2024-10-25 09:35 - INFO :: Splitting Fraud data...
2024-10-25 09:35 - INFO :: Splitting credit card data...


#### Traditional models

In [9]:
# Candidate scikit-learn classifiers, all created with default settings; the
# parameter search for each one is delegated to model_utils.best_model.
lr_model = LogisticRegression()
tree_model = DecisionTreeClassifier()
forest_model = RandomForestClassifier()
boosting_model = GradientBoostingClassifier()
mlp_model = MLPClassifier()

# Log the start and the total wall-clock time, mirroring the credit card run,
# so both dataset runs report their overall duration the same way.
logger.info("Start Searching for best models using the ecommerce fraud dataset....\n\n")
start_time = time.time()

# NOTE(review): the trailing False appears to select the e-commerce fraud
# dataset (the credit card run passes True) — confirm against ModelUtils.best_model.
for model in [lr_model, tree_model, forest_model, boosting_model, mlp_model]:
    model_utils.best_model(X_train_fraud, y_train_fraud, X_val_fraud, y_val_fraud, model, False)

end_time = time.time()

logger.info(f"Training 5 different models took {round(end_time - start_time, 2)} seconds")

2024-10-24 10:33 - INFO :: Start searching for the best Params of a LogisticRegression model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 10:44 - INFO :: Searching for the best params for LogisticRegression took 619.64 seconds


2024-10-24 10:44 - INFO :: Start searching for the best Params of a DecisionTreeClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 10:48 - INFO :: Searching for the best params for DecisionTreeClassifier took 255.07 seconds


2024-10-24 10:48 - INFO :: Start searching for the best Params of a RandomForestClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 10:55 - INFO :: Searching for the best params for RandomForestClassifier took 400.64 seconds


2024-10-24 10:55 - INFO :: Start searching for the best Params of a GradientBoostingClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 11:25 - INFO :: Searching for the best params for GradientBoostingClassifier took 1822.57 seconds


2024-10-24 11:25 - INFO :: Start searching for the best Params of a MLPClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 12:03 - INFO :: Searching for the best params for MLPClassifier took 2270.77 seconds




In [10]:
# Fresh, unfitted estimators (default settings) for the credit card run.
(
    lr_model,
    tree_model,
    forest_model,
    boosting_model,
    mlp_model,
) = (
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(),
)

logger.info("Start Searching for best models using the creditcard dataset....\n\n")
start_time = time.time()

# Run the parameter search for each candidate in turn.
# NOTE(review): the trailing True appears to select the credit card dataset
# (the e-commerce run passes False) — confirm against ModelUtils.best_model.
for model in (lr_model, tree_model, forest_model, boosting_model, mlp_model):
    model_utils.best_model(
        X_train_creditCard,
        y_train_creditCard,
        X_val_creditCard,
        y_val_creditCard,
        model,
        True,
    )

# Report the total wall-clock time for all five searches.
end_time = time.time()
logger.info(f"Training 5 different models took {round(end_time - start_time, 2)} seconds")

2024-10-24 13:58 - INFO :: Start Searching for best models using the creditcard dataset....


2024-10-24 13:58 - INFO :: Start searching for the best Params of a LogisticRegression model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 14:04 - INFO :: Searching for the best params for LogisticRegression took 335.65 seconds


2024-10-24 14:04 - INFO :: Start searching for the best Params of a DecisionTreeClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 14:06 - INFO :: Searching for the best params for DecisionTreeClassifier took 122.95 seconds


2024-10-24 14:06 - INFO :: Start searching for the best Params of a RandomForestClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 14:33 - INFO :: Searching for the best params for RandomForestClassifier took 1597.9 seconds


2024-10-24 14:33 - INFO :: Start searching for the best Params of a GradientBoostingClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 16:23 - INFO :: Searching for the best params for GradientBoostingClassifier took 6632.0 seconds


2024-10-24 16:23 - INFO :: Start searching for the best Params of a MLPClassifier model


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024-10-24 17:55 - INFO :: Searching for the best params for MLPClassifier took 5502.71 seconds


2024-10-24 17:55 - INFO :: Training 5 different models took 14191.37 seconds


#### Building neural network models

In [8]:
# Train the neural network models on the credit card split.
# NOTE(review): the final True appears to flag the credit card dataset
# (the e-commerce call passes False) — confirm against ModelUtils.train_neural_models.
model_utils.train_neural_models(
    X_train_creditCard,
    y_train_creditCard,
    X_val_creditCard,
    y_val_creditCard,
    True,
)

2024-10-25 09:35 - INFO :: Start training 3 models, with credit card dataset....


2024-10-25 09:35 - INFO :: Start training LSTM model...


2024-10-25 09:35 - INFO :: Training LSTM took 16.66 seconds


2024-10-25 09:35 - INFO :: Start training CNN model...


2024-10-25 09:35 - INFO :: Training CNN took 17.21 seconds


2024-10-25 09:35 - INFO :: Start training RNN model...


2024-10-25 09:35 - INFO :: Training RNN took 19.01 seconds


2024-10-25 09:35 - INFO :: Training 3 different models took 53.13 seconds


In [9]:
# Train the neural network models on the e-commerce fraud split.
# NOTE(review): the final False appears to flag the e-commerce fraud dataset
# (the credit card call passes True) — confirm against ModelUtils.train_neural_models.
model_utils.train_neural_models(
    X_train_fraud,
    y_train_fraud,
    X_val_fraud,
    y_val_fraud,
    False,
)

2024-10-25 09:35 - INFO :: Start training 3 models, with ecommerce fraud dataset....


2024-10-25 09:35 - INFO :: Start training LSTM model...


2024-10-25 09:36 - INFO :: Training LSTM took 17.12 seconds


2024-10-25 09:36 - INFO :: Start training CNN model...


2024-10-25 09:36 - INFO :: Training CNN took 26.99 seconds


2024-10-25 09:36 - INFO :: Start training RNN model...


2024-10-25 09:37 - INFO :: Training RNN took 46.03 seconds


2024-10-25 09:37 - INFO :: Training 3 different models took 90.46 seconds
