In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM, SimpleRNN

In [32]:

# import the data
fraud_data = pd.read_csv('../../data/fraud_data_preprocessed.csv')
creditcard_data = pd.read_csv('../../data/creditcard.csv')

# convert signup_time and purchase_time to datetime
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time']).astype(np.int64)
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time']).astype(np.int64)
fraud_data.drop(columns=['user_id'], inplace=True)
fraud_data.drop(columns=['device_id'], inplace=True)

X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

X_creditcard = creditcard_data.drop(columns=['Class'])  # Features from the credit card dataset
y_creditcard = creditcard_data['Class']    


X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(X_creditcard, y_creditcard, test_size=0.3, random_state=42)

### Train the models

In [33]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier(max_iter=1000)
}

### Train and evaluate the models on the fraud dataset

In [34]:
# Train models on fraud_data
for name, model in models.items():
    print(f"Training {name} model on fraud data...")
    model.fit(X_fraud_train, y_fraud_train)
    y_pred_fraud = model.predict(X_fraud_test)
    print(f"{name} model performance on fraud data:")
    print(classification_report(y_fraud_test, y_pred_fraud))
    print("\n")

Training Logistic Regression model on fraud data...
Logistic Regression model performance on fraud data:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     41117
           1       0.00      0.00      0.00      4217

    accuracy                           0.91     45334
   macro avg       0.45      0.50      0.48     45334
weighted avg       0.82      0.91      0.86     45334



Training Decision Tree model on fraud data...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision Tree model performance on fraud data:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     41117
           1       0.49      0.57      0.53      4217

    accuracy                           0.91     45334
   macro avg       0.72      0.75      0.74     45334
weighted avg       0.91      0.91      0.91     45334



Training Random Forest model on fraud data...
Random Forest model performance on fraud data:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       1.00      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.98      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334



Training Gradient Boosting model on fraud data...
Gradient Boosting model performance on fraud data:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Train models on creditcard_data


In [35]:
for name, model in models.items():
    print(f"Training {name} model on credit card data...")
    model.fit(X_creditcard_train, y_creditcard_train)
    y_pred_creditcard = model.predict(X_creditcard_test)
    print(f"{name} model performance on credit card data:")
    print(classification_report(y_creditcard_test, y_pred_creditcard))
    print("\n")

Training Logistic Regression model on credit card data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression model performance on credit card data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.80      0.62      0.70       136

    accuracy                           1.00     85443
   macro avg       0.90      0.81      0.85     85443
weighted avg       1.00      1.00      1.00     85443



Training Decision Tree model on credit card data...
Decision Tree model performance on credit card data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.73      0.82      0.77       136

    accuracy                           1.00     85443
   macro avg       0.86      0.91      0.89     85443
weighted avg       1.00      1.00      1.00     85443



Training Random Forest model on credit card data...
Random Forest model performance on credit card data:
              precision    recall  f1-score   support

           0      

### Neural Network Models

In [36]:


X_fraud_train_reshaped = X_fraud_train.values.reshape(X_fraud_train.shape[0], X_fraud_train.shape[1], 1)
X_fraud_test_reshaped = X_fraud_test.values.reshape(X_fraud_test.shape[0], X_fraud_test.shape[1], 1)

cnn_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_fraud_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])


cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_fraud_train_reshaped, y_fraud_train, epochs=10, batch_size=32)


y_pred_cnn = (cnn_model.predict(X_fraud_test_reshaped) > 0.5).astype("int32")
print("CNN model performance on fraud data:")
print(classification_report(y_fraud_test, y_pred_cnn))

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 662us/step - accuracy: 0.8290 - loss: 1177715734478848.0000
Epoch 2/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 632us/step - accuracy: 0.8328 - loss: 145289583263744.0000
Epoch 3/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 603us/step - accuracy: 0.8316 - loss: 12827342405632.0000
Epoch 4/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 601us/step - accuracy: 0.8532 - loss: 67041710080.0000
Epoch 5/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 613us/step - accuracy: 0.9154 - loss: 0.3361
Epoch 6/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 683us/step - accuracy: 0.9298 - loss: 0.2651
Epoch 7/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 631us/step - accuracy: 0.9293 - loss: 15892473.0000
Epoch 8/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 611us

In [37]:
# RNN and LSTM

rnn_model = Sequential([
    SimpleRNN(64, input_shape=(X_fraud_train.shape[1], 1), activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn_model.fit(X_fraud_train_reshaped, y_fraud_train, epochs=10, batch_size=32)

# LSTM Model
lstm_model = Sequential([
    LSTM(64, input_shape=(X_fraud_train.shape[1], 1), activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_fraud_train_reshaped, y_fraud_train, epochs=10, batch_size=32)


Epoch 1/10


  super().__init__(**kwargs)


[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8256 - loss: 9679227846656.0000
Epoch 2/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8289 - loss: 182239002624.0000
Epoch 3/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8361 - loss: 7422373888.0000
Epoch 4/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8292 - loss: 3403005440.0000
Epoch 5/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8307 - loss: 4839215104.0000
Epoch 6/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8374 - loss: 577618944.0000
Epoch 7/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8322 - loss: 109927144.0000
Epoch 8/10
[1m3306/3306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - a

<keras.src.callbacks.history.History at 0x17620a6f0>

### MLOps

In [66]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Set tracking URI to point to the local MLflow server
mlflow.set_tracking_uri("http://localhost:5000")



# Try using experiment_id instead of experiment name
try:
    mlflow.set_experiment(experiment_id="1")  # Replace with your actual experiment ID
except Exception as e:
    print(f"Experiment setting failed: {e}")

# Start MLflow run
with mlflow.start_run():
    # Train the model
    model = RandomForestClassifier(n_estimators=100, max_depth=None)
    model.fit(X_fraud_train, y_fraud_train)

    # Log the trained model
    mlflow.sklearn.log_model(model, "random_forest_model")

    # Make predictions and calculate accuracy
    y_pred_fraud = model.predict(X_fraud_test)
    accuracy = accuracy_score(y_fraud_test, y_pred_fraud)
    
    # Log metrics and parameters
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", None)



Experiment setting failed: API request to endpoint /api/2.0/mlflow/experiments/get failed with error code 403 != 200. Response body: ''


MlflowException: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''