In [1]:
from pathlib import Path

import pandas as pd

# Folder that holds the three CSV inputs. It is defined once so the notebook
# can be pointed at another location by editing a single line instead of
# every read_csv call.
DATA_DIR = Path('D:/week8&9 data')

# Load the datasets
fraud_data = pd.read_csv(DATA_DIR / 'Fraud_Data.csv')
ip_to_country = pd.read_csv(DATA_DIR / 'IpAddress_to_Country.csv')
credit_card = pd.read_csv(DATA_DIR / 'creditcard.csv')

In [2]:
# Show which columns the fraud_data table provides
fraud_columns = fraud_data.columns
print(fraud_columns)


Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class'],
      dtype='object')


In [3]:
# Pull the 'Class' label out of the creditcard table first, then keep every
# remaining column as a model input.
y_creditcard = credit_card['Class']
X_creditcard = credit_card.drop('Class', axis='columns')


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the creditcard rows for testing. Stratifying on the label
# keeps the class proportions the same in both partitions, and the fixed
# random_state makes the split repeatable.
creditcard_split = train_test_split(
    X_creditcard,
    y_creditcard,
    test_size=0.3,
    random_state=42,
    stratify=y_creditcard,
)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = creditcard_split


In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so no information from the test
# partition influences the scaling statistics.
scaler = StandardScaler().fit(X_creditcard_train)

# Apply the training-derived scaling to both Credit Card partitions
X_creditcard_train_scaled = scaler.transform(X_creditcard_train)
X_creditcard_test_scaled = scaler.transform(X_creditcard_test)


Model Training and Evaluation

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# One seed shared by every stochastic estimator below, so that re-running the
# cell reproduces the same fitted models and therefore the same scores.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the printed output.
# LogisticRegression is left unseeded: its default solver does not consume
# random_state, so seeding it would not change anything.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Multi-Layer Perceptron': MLPClassifier(random_state=RANDOM_STATE)
}

# Train and evaluate models for Credit Card dataset with scaled data
for name, model in models.items():
    # Fit on the scaled training partition, then score on the held-out one
    model.fit(X_creditcard_train_scaled, y_creditcard_train)
    y_pred = model.predict(X_creditcard_test_scaled)
    accuracy = accuracy_score(y_creditcard_test, y_pred)
    print(f'{name} Accuracy (Credit Card): {accuracy}')
    # The per-class report is printed alongside accuracy because a single
    # overall accuracy figure hides how well the minority class is detected.
    print(f'{name} Classification Report:\n{classification_report(y_creditcard_test, y_pred)}')


Logistic Regression Accuracy (Credit Card): 0.9991456292499094
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.85      0.61      0.71       148

    accuracy                           1.00     85443
   macro avg       0.92      0.81      0.86     85443
weighted avg       1.00      1.00      1.00     85443

Decision Tree Accuracy (Credit Card): 0.99916903666772
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.79      0.72      0.75       148

    accuracy                           1.00     85443
   macro avg       0.89      0.86      0.87     85443
weighted avg       1.00      1.00      1.00     85443

Random Forest Accuracy (Credit Card): 0.9995201479348805
Random Forest Classification Report:
              precision    recall  f1-score   support


Tracking Experiments with MLflow

In [7]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Select the MLflow experiment that the run below is recorded under
mlflow.set_experiment("Credit Card Fraud Detection")

# Logistic regression estimator tracked by this run
log_reg_creditcard = LogisticRegression(max_iter=1000)

# Everything inside the context manager is attached to a single MLflow run
with mlflow.start_run():
    # Fit on the scaled training partition and predict the held-out partition
    log_reg_creditcard.fit(X_creditcard_train_scaled, y_creditcard_train)
    y_pred = log_reg_creditcard.predict(X_creditcard_test_scaled)

    # Overall accuracy is logged under its own metric name
    accuracy = accuracy_score(y_creditcard_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Flatten the nested report into "<label>_<metric>" entries. Top-level
    # values that are not dicts are skipped here.
    report = classification_report(y_creditcard_test, y_pred, output_dict=True)
    flattened_report = {
        f"{label}_{metric}": value
        for label, metrics in report.items()
        if isinstance(metrics, dict)
        for metric, value in metrics.items()
    }
    for metric_name, metric_value in flattened_report.items():
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator with the run
    mlflow.sklearn.log_model(log_reg_creditcard, "logistic_regression_model")


2025/02/10 10:43:33 INFO mlflow.tracking.fluent: Experiment with name 'Credit Card Fraud Detection' does not exist. Creating a new experiment.
