In [7]:
import sys
import os

# Resolve the directory one level above the notebook's working directory.
# The imports that follow expect the `scripts` package to be importable from it.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make that directory importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

#import from scripts
from scripts.load_and_prepare import load_and_prepare_data  
from scripts.scale import scale_features 
from scripts.train_sklearn import train_sklearn_model 
from scripts.train_keras import train_keras_model     




In [8]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, roc_auc_score, confusion_matrix, 
    precision_score, recall_score, f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [9]:
# Initialize MLflow: select the "Fraud_Detection" experiment by name.
# As the captured log output shows, MLflow creates the experiment when it does
# not exist yet; the returned Experiment object is displayed as the cell output.
mlflow.set_experiment("Fraud_Detection")

2025/02/11 20:57:30 INFO mlflow.tracking.fluent: Experiment with name 'Fraud_Detection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Administrator.MMCY/OneDrive%20-%20MMCYTECH/Desktop/10a/Adey-Innovations-Inc-W8%269/notebooks/mlruns/160772129970918094', creation_time=1739296650189, experiment_id='160772129970918094', last_update_time=1739296650189, lifecycle_stage='active', name='Fraud_Detection', tags={}>

Running the Pipeline for Fraud_Data

In [None]:

# Read the processed fraud dataset and unpack the two returned objects
# into X and y; "class" is passed as the second argument
# (presumably the target column name — confirm against load_and_prepare_data).
X, y = load_and_prepare_data(
    "../src/fraud_data_processed.csv",
    "class",
)

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Identify numerical features to scale.
# Only int64/float64 columns are considered, and any column whose name contains
# 'scaled' is excluded (presumably already scaled upstream — TODO confirm).
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
pre_scaled = [col for col in numerical_features if 'scaled' in col]
# Filter with a comprehension instead of a set difference: set iteration order
# for strings can change between kernel sessions (hash randomization), which made
# the resulting column order non-reproducible. This keeps the DataFrame's order.
numerical_features = [col for col in numerical_features if col not in pre_scaled]

In [None]:
# Pass both splits and the list of columns to scale to scale_features,
# which returns the train and test counterparts.
X_train_scaled, X_test_scaled = scale_features(
    X_train,
    X_test,
    numerical_features,
)

In [None]:
# Balance the classes with SMOTE. Only the training split is resampled here;
# the test split is not passed to the sampler.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X=X_train_scaled, y=y_train)

In [None]:
# Define the scikit-learn candidates, keyed by the name handed to train_sklearn_model.
# The tree-based estimators are seeded (same seed as the split and SMOTE cells)
# so that repeated runs give comparable results. LogisticRegression is left
# unchanged: scikit-learn documents random_state as unused by its default solver.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

In [None]:
# Hand every candidate to train_sklearn_model together with its name,
# the resampled training data and the scaled test split.
for name in models:
    model = models[name]
    train_sklearn_model(model, name, X_res, y_res, X_test_scaled, y_test)

In [None]:
# Prepare inputs for the Keras models.
# The CNN needs a trailing channel axis, so build 3-D copies of both
# feature matrices shaped (rows, n_features, 1).
n_features = X_res.shape[1]
cnn_shape = (-1, n_features, 1)
X_res_reshaped = X_res.values.reshape(cnn_shape)
X_test_reshaped = X_test_scaled.values.reshape(cnn_shape)

In [None]:
# MLP: two ReLU hidden layers (64 then 32 units) followed by a single
# sigmoid output unit, fed with the flat (2-D) feature matrices.
mlp = Sequential()
mlp.add(Dense(64, activation='relu', input_dim=n_features))
mlp.add(Dense(32, activation='relu'))
mlp.add(Dense(1, activation='sigmoid'))
train_keras_model(mlp, "MLP", X_res, y_res, X_test_scaled, y_test)

In [None]:
# CNN: one 1-D convolution (32 filters, kernel size 3) over the feature axis,
# max-pooling by 2, then a dense head ending in a sigmoid unit.
# Uses the reshaped inputs that carry the extra channel axis.
cnn = Sequential()
cnn.add(Conv1D(32, 3, activation='relu', input_shape=(n_features, 1)))
cnn.add(MaxPooling1D(2))
cnn.add(Flatten())
cnn.add(Dense(64, activation='relu'))
cnn.add(Dense(1, activation='sigmoid'))
train_keras_model(cnn, "CNN", X_res_reshaped, y_res, X_test_reshaped, y_test)

In [None]:
# LSTM: the Reshape layer turns each flat row of n_features values into a
# (1, n_features) array before the 64-unit LSTM, so the model accepts the
# same 2-D inputs as the MLP. A single sigmoid unit produces the output.
lstm = Sequential()
lstm.add(Reshape((1, n_features), input_shape=(n_features,)))
lstm.add(LSTM(64))
lstm.add(Dense(1, activation='sigmoid'))
train_keras_model(lstm, "LSTM", X_res, y_res, X_test_scaled, y_test)

Running the Pipeline for Credit Card Data

In [None]:
# Read the processed credit card dataset and unpack the two returned objects
# into X and y; "Class" is passed as the second argument.
# This rebinds X and y, replacing the values loaded for the previous dataset.
# NOTE(review): this path is relative to the notebook's working directory and
# has no "../src/" prefix, unlike the fraud-data load — confirm the file location.
X, y = load_and_prepare_data(
    "creditcard_processed.csv",
    "Class",
)

In [None]:
# Hold out 30% of the rows for testing, stratified on y and seeded for
# repeatability. This rebinds the train/test names to the current dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)
