#### Task 3 Model Building and Model Explainability


In [None]:
# Import python library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
import os,sys

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import mlflow
import mlflow.sklearn
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Dropout, Flatten
import numpy as np
from mlflow import log_metric

In [3]:
# Get the absolute path of the parent directory
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

In [4]:
# import util functions
from scripts.utils import  *

In [5]:
# Load the datasets
fraud_data = pd.read_csv("../data/Fraud_Data.csv")
credit_data = pd.read_csv("../data/creditcard.csv")

In [6]:
# Credit data Information
credit_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
# Fraud data information
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [7]:
# Set up MLflow
mlflow.set_experiment("Fraud Detection")

<Experiment: artifact_location='file:///home/brook/Music/10Academy/week-8/Adey-Innovations/notebooks/mlruns/105863702443849380', creation_time=1730276744241, experiment_id='105863702443849380', last_update_time=1730276744241, lifecycle_stage='active', name='Fraud Detection', tags={}>

Convert datetime strings to datetime objects


In [8]:
# Convert datetime strings to datetime objects
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

In [9]:
# Extract useful datetime components
fraud_data['signup_hour'] = fraud_data['signup_time'].dt.hour
fraud_data['signup_day'] = fraud_data['signup_time'].dt.dayofweek
fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.dayofweek

# Drop the original datetime columns
fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])

Model training

Preprocess Fraud Data

In [10]:
# Preprocess Fraud Data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

Preprocess Credit-card Data

In [11]:
# Preprocess creditcard.csv
X_credit = credit_data.drop(columns=['Class'])
y_credit = credit_data['Class']

Train-test split

In [12]:
# Train-test split
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud)
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit)

In [13]:
# Convert datetime columns if any
if 'TransactionDate' in X_train_fraud.columns:  # Replace with your datetime column
    X_train_fraud['TransactionDate'] = pd.to_datetime(X_train_fraud['TransactionDate'])
    X_train_fraud['TransactionYear'] = X_train_fraud['TransactionDate'].dt.year
    X_train_fraud['TransactionMonth'] = X_train_fraud['TransactionDate'].dt.month
    X_train_fraud['TransactionDay'] = X_train_fraud['TransactionDate'].dt.day
    X_train_fraud['TransactionHour'] = X_train_fraud['TransactionDate'].dt.hour
    # Drop the original datetime column
    X_train_fraud = X_train_fraud.drop('TransactionDate', axis=1)

# Repeat for X_test_fraud if needed

# Select only numeric columns for scaling
numeric_cols_fraud = X_train_fraud.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_cc = X_train_cc.select_dtypes(include=['float64', 'int64']).columns

Feature scaling (standardization)

In [14]:
# Feature scaling (standardization)
scaler = StandardScaler()

X_train_fraud_scaled = scaler.fit_transform(X_train_fraud[numeric_cols_fraud])
X_test_fraud_scaled = scaler.transform(X_test_fraud[numeric_cols_fraud])

X_train_cc_scaled = scaler.fit_transform(X_train_cc[numeric_cols_cc])
X_test_cc_scaled = scaler.transform(X_test_cc[numeric_cols_cc])

Evaluation Function

In [15]:
# Define Evaluation Function
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, roc_auc

Log function

In [16]:
def log_metrics(model_name, accuracy, precision, recall, f1, roc_auc):
    mlflow.log_param("Model", model_name)
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("Precision", precision)
    mlflow.log_metric("Recall", recall)
    mlflow.log_metric("F1 Score", f1)
    mlflow.log_metric("ROC-AUC", roc_auc)

Train and Evaluate Models

In [17]:
# Train and Evaluate Models
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy, precision, recall, f1, roc_auc = evaluate_model(y_test, y_pred)
    
    with mlflow.start_run():
        mlflow.sklearn.log_model(model, model_name)
        log_metrics(model_name, accuracy, precision, recall, f1, roc_auc)
        
    print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}, ROC-AUC: {roc_auc}")

In [18]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier(max_iter=500)
}

In [19]:
from sklearn.preprocessing import LabelEncoder

# Label encode 'sex' column
label_encoder = LabelEncoder()
X_train_fraud['sex'] = label_encoder.fit_transform(X_train_fraud['sex'])
X_test_fraud['sex'] = label_encoder.transform(X_test_fraud['sex'])

In [20]:
# Frequency encoding for 'device_id', 'source', and 'browser'
for col in ['device_id', 'source', 'browser']:
    freq_encoding = X_train_fraud[col].value_counts(normalize=True)
    X_train_fraud[col] = X_train_fraud[col].map(freq_encoding)
    X_test_fraud[col] = X_test_fraud[col].map(freq_encoding)

In [21]:
from category_encoders import TargetEncoder

# Target encode 'device_id', 'source', and 'browser'
target_encoder = TargetEncoder(cols=['device_id', 'source', 'browser'])
X_train_fraud = target_encoder.fit_transform(X_train_fraud, y_train_fraud)
X_test_fraud = target_encoder.transform(X_test_fraud)

In [22]:
# Encode only the top N frequent categories and group others
N = 10  # Choose the number of top categories to keep

for col in ['device_id', 'source', 'browser']:
    top_categories = X_train_fraud[col].value_counts().nlargest(N).index
    X_train_fraud[col] = X_train_fraud[col].where(X_train_fraud[col].isin(top_categories), other='Other')
    X_test_fraud[col] = X_test_fraud[col].where(X_test_fraud[col].isin(top_categories), other='Other')

# Then, apply one-hot encoding after reducing categories
X_train_fraud = pd.get_dummies(X_train_fraud, columns=['device_id', 'source', 'browser'], drop_first=True)
X_test_fraud = pd.get_dummies(X_test_fraud, columns=['device_id', 'source', 'browser'], drop_first=True)

Train and evaluate on Fraud Data and Creditcard

In [23]:
# Train and evaluate on Fraud_Data.csv
for model_name, model in models.items():
    train_and_evaluate(model, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, model_name)

# Train and evaluate on creditcard.csv
for model_name, model in models.items():
    train_and_evaluate(model, X_train_cc, X_test_cc, y_train_cc, y_test_cc, model_name)



Logistic Regression - Accuracy: 0.9063627039010026, Precision: 1.0, Recall: 0.0, F1: 0.0, ROC-AUC: 0.5




Decision Tree - Accuracy: 0.6365350891704993, Precision: 0.15488785442234448, Recall: 0.6466431095406361, F1: 0.2499146466370775, ROC-AUC: 0.6410669641814815




Random Forest - Accuracy: 0.6231677861231513, Precision: 0.15136456211812627, Recall: 0.6565371024734983, F1: 0.24601125455147302, ROC-AUC: 0.638128734495246




Gradient Boosting - Accuracy: 0.18654666975482248, Precision: 0.0882698058215678, Recall: 0.8240282685512368, F1: 0.1594584430236931, ROC-AUC: 0.4723580177494986




MLP - Accuracy: 0.9061310922145386, Precision: 0.1111111111111111, Recall: 0.00035335689045936394, F1: 0.0007044734061289186, ROC-AUC: 0.5000306557386989


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Accuracy: 0.9989993328885924, Precision: 0.6915887850467289, Recall: 0.7551020408163265, F1: 0.7219512195121951, ROC-AUC: 0.8772608543980338




Decision Tree - Accuracy: 0.9991046662687406, Precision: 0.7422680412371134, Recall: 0.7346938775510204, F1: 0.7384615384615385, ROC-AUC: 0.8671271160405637




Random Forest - Accuracy: 0.9996137776061234, Precision: 0.9318181818181818, Recall: 0.8367346938775511, F1: 0.8817204301075269, ROC-AUC: 0.9183145894823884




Gradient Boosting - Accuracy: 0.9983146659176293, Precision: 0.5294117647058824, Recall: 0.1836734693877551, F1: 0.2727272727272727, ROC-AUC: 0.5916960481435117




MLP - Accuracy: 0.9989642217618764, Precision: 0.656, Recall: 0.8367346938775511, F1: 0.7354260089686099, ROC-AUC: 0.9179892518346676


Convolutional Neural Network (CNN)

In [24]:
# Convolutional Neural Network (CNN)
def build_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(Flatten())  # Flatten the output from Conv1D before feeding into Dense layers
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # For binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [25]:
# Ensure y_train_fraud and y_test_fraud are NumPy arrays with the correct dtype
y_train_fraud = np.array(y_train_fraud, dtype='float32')
y_test_fraud = np.array(y_test_fraud, dtype='float32')

# Convert the DataFrame to a NumPy array and reshape it for CNN input
X_train_fraud_reshaped = X_train_fraud.values.reshape(-1, X_train_fraud.shape[1], 1).astype('float32')
X_test_fraud_reshaped = X_test_fraud.values.reshape(-1, X_test_fraud.shape[1], 1).astype('float32')

# Build and train the CNN model
cnn_model = build_cnn_model((X_train_fraud.shape[1], 1))
cnn_model.fit(X_train_fraud_reshaped, y_train_fraud, epochs=10, batch_size=64)

# Make predictions on the test set
y_pred_cnn = cnn_model.predict(X_test_fraud_reshaped)
y_pred_cnn = (y_pred_cnn > 0.5).astype(int)

# Evaluate the model
accuracy, precision, recall, f1, roc_auc = evaluate_model(y_test_fraud, y_pred_cnn)

# Log the model and metrics using MLflow
with mlflow.start_run():
    mlflow.keras.log_model(cnn_model, "CNN")
    log_metrics("CNN", accuracy, precision, recall, f1, roc_auc)


# Print the evaluation results
print(f"CNN - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}, ROC-AUC: {roc_auc}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-10-30 16:12:59.623917: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 10ms/step - accuracy: 0.8327 - loss: 2654066.5000
Epoch 2/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.8306 - loss: 767229.3750
Epoch 3/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.8285 - loss: 233463.3906
Epoch 4/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.8286 - loss: 110172.6094
Epoch 5/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.8339 - loss: 52973.3320
Epoch 6/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.8318 - loss: 22956.1172
Epoch 7/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step - accuracy: 0.8315 - loss: 11324.3857
Epoch 8/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.



CNN - Accuracy: 0.9063627039010026, Precision: 1.0, Recall: 0.0, F1: 0.0, ROC-AUC: 0.5


Build and train the LSTM model

In [28]:
# Convert DataFrames to NumPy arrays and reshape
X_train_fraud_reshaped = X_train_fraud.values.reshape(-1, X_train_fraud.shape[1], 1).astype('float32')
X_test_fraud_reshaped = X_test_fraud.values.reshape(-1, X_test_fraud.shape[1], 1).astype('float32')

# Ensure the target variables are NumPy arrays with the correct dtype
y_train_fraud = np.array(y_train_fraud, dtype='float32')
y_test_fraud = np.array(y_test_fraud, dtype='float32')

# Define the LSTM model building function
def build_lstm_model(input_shape):
    model = models.Sequential()
    model.add(layers.LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(layers.LSTM(32))
    model.add(layers.Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build and train the LSTM model
lstm_model = build_lstm_model((X_train_fraud.shape[1], 1))
lstm_model.fit(X_train_fraud_reshaped, y_train_fraud, epochs=10, batch_size=64)

# Make predictions on the test set
y_pred_lstm = lstm_model.predict(X_test_fraud_reshaped)
y_pred_lstm = (y_pred_lstm > 0.5).astype(int)

# Evaluate the model
def evaluate_model(y_true, y_pred):
    accuracy = np.mean(y_true == y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, roc_auc

accuracy, precision, recall, f1, roc_auc = evaluate_model(y_test_fraud, y_pred_lstm)

# Log the model and metrics using MLflow
with mlflow.start_run():
    mlflow.keras.log_model(lstm_model, "LSTM")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("roc_auc", roc_auc)

# Print the evaluation results
print(f"LSTM - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}, ROC-AUC: {roc_auc}")


  super().__init__(**kwargs)


Epoch 1/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 32ms/step - accuracy: 0.9146 - loss: 0.2912
Epoch 2/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 32ms/step - accuracy: 0.9469 - loss: 0.1938
Epoch 3/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 33ms/step - accuracy: 0.9495 - loss: 0.1864
Epoch 4/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 32ms/step - accuracy: 0.9497 - loss: 0.1841
Epoch 5/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 32ms/step - accuracy: 0.9498 - loss: 0.1847
Epoch 6/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 34ms/step - accuracy: 0.9506 - loss: 0.1812
Epoch 7/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 38ms/step - accuracy: 0.9513 - loss: 0.1805
Epoch 8/10
[1m1889/1889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 44ms/step - accuracy: 0.9514 - loss: 0.1791
Epoch 9/



LSTM - Accuracy: 0.42654734965386704, Precision: 0.11455472734405649, Recall: 0.72226148409894, F1: 0.19774585207758913, ROC-AUC: 0.5727541494893269


In [34]:
import joblib
# Assuming train_and_evaluate is a defined function
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    # Placeholder for the actual training and evaluation logic
    pass

# Your models dictionary
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
}

# Train and evaluate models
for model_name, model in models.items():
    train_and_evaluate(model, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, model_name)
    # Save the model to a .jolib file using the model_name
    joblib.dump(model, f"{model_name.replace(' ', '_')}.jolib")

# Save CNN model
joblib.dump(cnn_model, "CNN.jolib")

# Save LSTM model
joblib.dump(lstm_model, "LSTM.jolib")


['LSTM.jolib']