In [1]:
!pip install mlflow
!pip install scikit-learn tensorflow pandas matplotlib


Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Downloading mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.0->mlflow)
  Downloading databricks_sdk-0.36.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Collect

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import mlflow
import mlflow.sklearn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
creditcard_data = pd.read_csv('creditcard.csv')

In [8]:
fraud_data = pd.read_csv('preprocessed_fraud_data.csv')


In [10]:
# Reduce data size by sampling (e.g., 10% of data)
fraud_data = fraud_data.sample(frac=0.1, random_state=42)
creditcard_data = creditcard_data.sample(frac=0.1, random_state=42)


In [11]:
# Convert signup_time and purchase_time to datetime
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Create a transaction velocity column (time difference in seconds)
fraud_data['transaction_velocity'] = (fraud_data['purchase_time'] - fraud_data['signup_time']).dt.total_seconds()

# Now drop the original datetime columns if they are not needed
fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])


In [12]:
# For Fraud Data
X_fraud = fraud_data.drop(columns=['class'])  # Features
y_fraud = fraud_data['class']  # Target variable

# For Credit Card Data
X_creditcard = creditcard_data.drop(columns=['Class'])  # Features
y_creditcard = creditcard_data['Class']  # Target variable


In [13]:
print(f"Number of samples in X_fraud: {len(X_fraud)}")
print(f"Number of samples in y_fraud: {len(y_fraud)}")


Number of samples in X_fraud: 15111
Number of samples in y_fraud: 15111


In [15]:
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)


In [16]:
# Step 1: One-hot encode the training and test sets separately
X_train_fraud_encoded = pd.get_dummies(X_train_fraud, columns=['device_id', 'source', 'browser', 'sex'], drop_first=True)
X_test_fraud_encoded = pd.get_dummies(X_test_fraud, columns=['device_id', 'source', 'browser', 'sex'], drop_first=True)

# Step 2: Ensure both train and test sets have the same columns
X_train_fraud_encoded, X_test_fraud_encoded = X_train_fraud_encoded.align(X_test_fraud_encoded, join='left', axis=1, fill_value=0)

# Check the shape of the processed train and test data
print(f"Train shape: {X_train_fraud_encoded.shape}")
print(f"Test shape: {X_test_fraud_encoded.shape}")


Train shape: (12088, 11864)
Test shape: (3023, 11864)


In [22]:
# Apply one-hot encoding to all categorical columns
categorical_cols = ['device_id', 'source', 'browser', 'sex']

# Apply one-hot encoding to both train and test data for fraud dataset
X_fraud_encoded = pd.get_dummies(X_fraud, columns=categorical_cols, drop_first=True)
X_test_fraud_encoded = pd.get_dummies(X_test_fraud, columns=categorical_cols, drop_first=True)

# Align columns in both train and test sets to ensure they have the same columns
X_fraud_encoded, X_test_fraud_encoded = X_fraud_encoded.align(X_test_fraud_encoded, join='left', axis=1, fill_value=0)


In [25]:
def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


In [26]:
# Initialize models
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=100)
gradient_boost = GradientBoostingClassifier()

# Train models on fraud dataset
print("Training on Fraud Dataset")
for model in [log_reg, decision_tree, random_forest, gradient_boost]:
    train_evaluate_model(model, X_fraud_encoded, X_test_fraud_encoded, y_fraud, y_test_fraud)

# Train models on credit card dataset (assuming X_train_credit and X_test_credit are already prepared)
print("Training on Credit Card Dataset")
for model in [log_reg, decision_tree, random_forest, gradient_boost]:
    train_evaluate_model(model, X_train_credit, X_test_credit, y_train_credit, y_test_credit)


Training on Fraud Dataset
Model: LogisticRegression
Accuracy: 0.9113463446907046
Confusion Matrix:
[[2755    0]
 [ 268    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2755
           1       0.00      0.00      0.00       268

    accuracy                           0.91      3023
   macro avg       0.46      0.50      0.48      3023
weighted avg       0.83      0.91      0.87      3023



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: DecisionTreeClassifier
Accuracy: 1.0
Confusion Matrix:
[[2755    0]
 [   0  268]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2755
           1       1.00      1.00      1.00       268

    accuracy                           1.00      3023
   macro avg       1.00      1.00      1.00      3023
weighted avg       1.00      1.00      1.00      3023

Model: RandomForestClassifier
Accuracy: 1.0
Confusion Matrix:
[[2755    0]
 [   0  268]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2755
           1       1.00      1.00      1.00       268

    accuracy                           1.00      3023
   macro avg       1.00      1.00      1.00      3023
weighted avg       1.00      1.00      1.00      3023

Model: GradientBoostingClassifier
Accuracy: 0.959311941779689
Confusion Matrix:
[[2755    0]
 [ 123  145]]
Classification 

NameError: name 'X_train_credit' is not defined

In [28]:
# Features and target
X_credit = creditcard_data.drop('Class', axis=1)
y_credit = creditcard_data['Class']


In [29]:
# Split the dataset into train and test sets (80% train, 20% test)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

# Check the shapes of the splits
print(X_train_credit.shape, X_test_credit.shape)


(1592, 30) (398, 30)


In [30]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data, and transform the test data
X_train_credit = scaler.fit_transform(X_train_credit)
X_test_credit = scaler.transform(X_test_credit)


In [31]:
print("Training on Credit Card Dataset")
for model in [log_reg, decision_tree, random_forest, gradient_boost]:
    train_evaluate_model(model, X_train_credit, X_test_credit, y_train_credit, y_test_credit)


Training on Credit Card Dataset
Model: LogisticRegression
Accuracy: 0.9974874371859297
Confusion Matrix:
[[394   0]
 [  1   3]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       394
         1.0       1.00      0.75      0.86         4

    accuracy                           1.00       398
   macro avg       1.00      0.88      0.93       398
weighted avg       1.00      1.00      1.00       398

Model: DecisionTreeClassifier
Accuracy: 0.9974874371859297
Confusion Matrix:
[[394   0]
 [  1   3]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       394
         1.0       1.00      0.75      0.86         4

    accuracy                           1.00       398
   macro avg       1.00      0.88      0.93       398
weighted avg       1.00      1.00      1.00       398

Model: RandomForestClassifier
Accuracy: 0.9974874371859297
Confusion Mat

In [34]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score

# Start an MLflow experiment
mlflow.set_experiment('fraud_detection_models')

# Ensure X_fraud is encoded
categorical_columns = ['device_id', 'source', 'browser', 'sex']  # Adjust based on your data
X_fraud_encoded = pd.get_dummies(X_fraud, columns=categorical_columns, drop_first=True)

for model in [log_reg, decision_tree, random_forest, gradient_boost]:
    with mlflow.start_run():
        # Log model and parameters
        mlflow.log_param("model_name", model.__class__.__name__)

        # Train the model on the encoded dataset
        model.fit(X_fraud_encoded, y_fraud)
        y_pred = model.predict(X_test_fraud_encoded)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy_score(y_test_fraud, y_pred))

        # Log model to MLflow
        mlflow.sklearn.log_model(model, f"model_{model.__class__.__name__}")


