In [12]:
import sys
import os

# Get the parent directory of the current notebook
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the parent directory to sys.path
sys.path.append(parent_dir)

#import from scripts
from scripts.load_and_prepare import load_and_prepare_data  
from scripts.scale import scale_features 
from scripts.train_sklearn import train_sklearn_model 
from scripts.train_keras import train_keras_model     




In [13]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, roc_auc_score, confusion_matrix, 
    precision_score, recall_score, f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [14]:
#Initialize MLflow
mlflow.set_experiment("Fraud_Detection")

<Experiment: artifact_location='file:///c:/Users/Administrator.MMCY/OneDrive%20-%20MMCYTECH/Desktop/10a/Adey-Innovations-Inc-W8%269/notebooks/mlruns/927946428263586000', creation_time=1739368183212, experiment_id='927946428263586000', last_update_time=1739368183212, lifecycle_stage='active', name='Fraud_Detection', tags={}>

Running the Pipeline for Fraud_Data

In [15]:

# Load and prepare data
X, y = load_and_prepare_data("../src/data/fraud_data_processed.csv", "class")

In [16]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [17]:
# Identify numerical features to scale
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
pre_scaled = [col for col in numerical_features if 'scaled' in col]
numerical_features = list(set(numerical_features) - set(pre_scaled))

In [18]:
# Scale features
X_train_scaled, X_test_scaled = scale_features(X_train, X_test, numerical_features)

In [19]:
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_scaled, y_train)

In [20]:
# Define scikit-learn models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "GradientBoosting": GradientBoostingClassifier()
}

In [21]:
# Train scikit-learn models
for name, model in models.items():
    train_sklearn_model(model, name, X_res, y_res, X_test_scaled, y_test)



In [22]:
# Train Keras models
n_features = X_res.shape[1]
X_res_reshaped = X_res.values.reshape(-1, n_features, 1)  # For CNN
X_test_reshaped = X_test_scaled.values.reshape(-1, n_features, 1)

In [23]:
# MLP
mlp = Sequential([
    Dense(64, activation='relu', input_dim=n_features),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
train_keras_model(mlp, "MLP", X_res, y_res, X_test_scaled, y_test)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1417/1417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step




In [24]:
# 1. Convert data to numeric types
X_res = X_res.astype(float)
X_test_scaled = X_test_scaled.astype(float)

# 2. Reshape data for CNN
n_features = X_res.shape[1]
X_res_reshaped = X_res.values.reshape(-1, n_features, 1).astype(np.float32)
X_test_reshaped = X_test_scaled.values.reshape(-1, n_features, 1).astype(np.float32)

# 3. Define and train the CNN
cnn = Sequential([
    Conv1D(32, 3, activation='relu', input_shape=(n_features, 1)),
    MaxPooling1D(2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

train_keras_model(cnn, "CNN", X_res_reshaped, y_res, X_test_reshaped, y_test)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1417/1417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step




In [25]:
# LSTM
lstm = Sequential([
    Reshape((1, n_features), input_shape=(n_features,)),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
train_keras_model(lstm, "LSTM", X_res, y_res, X_test_scaled, y_test)

  super().__init__(**kwargs)


[1m1417/1417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step




Run the Pipeline for Credit Card Data

In [26]:
# Load and prepare data
X, y = load_and_prepare_data("../src/data/creditcard_processed.csv", "Class")

In [27]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


In [28]:
# Identify numerical features to scale
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
pre_scaled = [col for col in numerical_features if 'scaled' in col]
numerical_features = list(set(numerical_features) - set(pre_scaled))

In [29]:
# Scale features
X_train_scaled, X_test_scaled = scale_features(X_train, X_test, numerical_features)


In [30]:
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_scaled, y_train)

In [31]:
# Train scikit-learn models
for name, model in models.items():
    train_sklearn_model(model, name, X_res, y_res, X_test_scaled, y_test)



In [32]:
# Train Keras models
n_features = X_res.shape[1]
X_res_reshaped = X_res.values.reshape(-1, n_features, 1)  # For CNN
X_test_reshaped = X_test_scaled.values.reshape(-1, n_features, 1)

In [33]:
# MLP
mlp = Sequential([
    Dense(64, activation='relu', input_dim=n_features),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
train_keras_model(mlp, "MLP", X_res, y_res, X_test_scaled, y_test)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2660/2660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step




In [34]:
# CNN
cnn = Sequential([
    Conv1D(32, 3, activation='relu', input_shape=(n_features, 1)),
    MaxPooling1D(2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
train_keras_model(cnn, "CNN", X_res_reshaped, y_res, X_test_reshaped, y_test)




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2660/2660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step




In [35]:
# LSTM
lstm = Sequential([
    Reshape((1, n_features), input_shape=(n_features,)),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
train_keras_model(lstm, "LSTM", X_res, y_res, X_test_scaled, y_test)

  super().__init__(**kwargs)


[1m2660/2660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step




In [None]:
# View MLflow Results(on localhost)
!mlflow ui

^C
