<a href="https://colab.research.google.com/github/cathmac/Sprott/blob/main/DATA_5000_final_project_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
!pip install -q xgboost imbalanced-learn matplotlib seaborn scikit-learn fpdf


In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    average_precision_score, roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score, precision_recall_curve, roc_curve,
    matthews_corrcoef, ConfusionMatrixDisplay
)

# Unsupervised models
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.mixture import GaussianMixture

# Supervised models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Imbalanced data handling
from imblearn.under_sampling import RandomUnderSampler

# PDF generation
from fpdf import FPDF
from google.colab import files


In [56]:
# Load dataset into a DataFrame (update the path as needed)
df = pd.read_csv('/content/drive/MyDrive/DATA 5000/creditcard.csv')

# Separate features and target
X = df.drop(columns=['Class'])
y = df['Class']

# Standardize 'Amount' and 'Time' columns
scaler = StandardScaler()
X[['Amount', 'Time']] = scaler.fit_transform(X[['Amount', 'Time']])

# Handle class imbalance using Random Under Sampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled
)

# Verify the shapes of the splits
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (688, 30)
y_train shape: (688,)
X_test shape: (296, 30)
y_test shape: (296,)


In [57]:
os.makedirs("charts", exist_ok=True)


In [58]:
def train_and_evaluate_model(model, model_name, is_supervised=True):
    print(f"\nTraining and evaluating {model_name}...")

    X_train_np = X_train.to_numpy()
    X_test_np = X_test.to_numpy()

    # Training and predicting
    if is_supervised:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_scores = model.predict_proba(X_test)[:, 1]
    else:
        if model_name == "DBSCAN":
            model.fit(X_train_np)
            y_pred = model.fit_predict(X_test_np)
            y_scores = np.zeros(len(y_pred))
        elif model_name == "LocalOutlierFactor":
            model.fit(X_train_np)
            y_pred = model.predict(X_test_np)
            y_scores = -model._decision_function(X_test_np)
        elif model_name == "KMeans":
            model.fit(X_train_np)
            y_pred = model.predict(X_test_np)
            y_scores = -model.transform(X_test_np).min(axis=1)
        else:
            model.fit(X_train_np)
            y_pred = model.predict(X_test_np)
            y_scores = model.decision_function(X_test_np) if hasattr(model, "decision_function") else model.score_samples(X_test_np)

    # Map predictions for unsupervised models (-1 -> 1 for fraud, 1 -> 0 for normal)
    if not is_supervised:
        y_pred = [0 if pred == 1 else 1 for pred in y_pred]

    # Metrics
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_scores)
    pr_auc = average_precision_score(y_test, y_scores)
    mcc = matthews_corrcoef(y_test, y_pred)

    # Save Precision-Recall Curve
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_scores)
    plt.figure()
    plt.plot(recall_vals, precision_vals, marker='.', label=f'{model_name} (PR AUC={pr_auc:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve: {model_name}')
    plt.legend()
    plt.grid()
    pr_curve_path = f'charts/{model_name}_pr_curve.png'
    plt.savefig(pr_curve_path)
    plt.close()

    # Save ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    plt.figure()
    plt.plot(fpr, tpr, marker='.', label=f'{model_name} (ROC AUC={roc_auc:.4f})')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve: {model_name}')
    plt.legend()
    plt.grid()
    roc_curve_path = f'charts/{model_name}_roc_curve.png'
    plt.savefig(roc_curve_path)
    plt.close()

    # Save Confusion Matrix
    plt.figure(figsize=(6, 5))
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred), display_labels=[0, 1])
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix: {model_name}')
    cm_path = f'charts/{model_name}_confusion_matrix.png'
    plt.savefig(cm_path)
    plt.close()

    return {
        "Model": model_name,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "PR AUC": pr_auc,
        "MCC": mcc,
        "PR Curve": pr_curve_path,
        "ROC Curve": roc_curve_path,
        "Confusion Matrix": cm_path
    }


In [59]:
# Unsupervised models
unsupervised_models = {
    "IsolationForest": IsolationForest(n_estimators=100, contamination=0.01, random_state=42),
    "OneClassSVM": OneClassSVM(kernel='rbf', nu=0.01, gamma='auto'),
    "LocalOutlierFactor": LocalOutlierFactor(n_neighbors=10),
    "EllipticEnvelope": EllipticEnvelope(contamination=0.01, support_fraction=0.9),
    "KMeans": KMeans(n_clusters=2, init='k-means++', random_state=42),
    "GaussianMixture": GaussianMixture(n_components=2, covariance_type='full', random_state=42),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5)
}

# Supervised models
supervised_models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
}

# Train and evaluate all models
all_results = []

# Evaluate unsupervised models
for model_name, model in unsupervised_models.items():
    try:
        result = train_and_evaluate_model(model, model_name, is_supervised=False)
        all_results.append(result)
    except Exception as e:
        print(f"Error training {model_name}: {e}")

# Evaluate supervised models
for model_name, model in supervised_models.items():
    try:
        result = train_and_evaluate_model(model, model_name, is_supervised=True)
        all_results.append(result)
    except Exception as e:
        print(f"Error training {model_name}: {e}")



Training and evaluating IsolationForest...

Training and evaluating OneClassSVM...

Training and evaluating LocalOutlierFactor...
Error training LocalOutlierFactor: This 'LocalOutlierFactor' has no attribute 'predict'

Training and evaluating EllipticEnvelope...

Training and evaluating KMeans...

Training and evaluating GaussianMixture...

Training and evaluating DBSCAN...

Training and evaluating RandomForest...

Training and evaluating XGBoost...

Training and evaluating LogisticRegression...


<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

<Figure size 600x500 with 0 Axes>

In [60]:
def generate_pdf_report(results, filename="model_performance_report.pdf"):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=1, margin=15)

    # Title Page
    pdf.add_page()
    pdf.set_font("Arial", 'B', 20)
    pdf.cell(0, 10, "Model Performance Report", ln=True, align="C")
    pdf.ln(10)
    pdf.set_font("Arial", '', 14)
    pdf.multi_cell(0, 10, "This report evaluates various supervised and unsupervised machine learning models for anomaly detection in credit card transactions. The models are compared based on the Precision-Recall AUC (PR AUC) metric, which is well-suited for imbalanced datasets.")

    # Add each model's details with metrics and charts
    for result in results:
        pdf.add_page()
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, f"Algorithm: {result['Model']}", ln=True)
        pdf.ln(5)

        # Add metrics
        pdf.set_font("Arial", '', 12)
        pdf.cell(0, 10, f"Precision: {result['Precision']:.4f}", ln=True)
        pdf.cell(0, 10, f"Recall: {result['Recall']:.4f}", ln=True)
        pdf.cell(0, 10, f"F1 Score: {result['F1 Score']:.4f}", ln=True)
        pdf.cell(0, 10, f"ROC AUC: {result['ROC AUC']:.4f}", ln=True)
        pdf.cell(0, 10, f"PR AUC: {result['PR AUC']:.4f}", ln=True)
        pdf.cell(0, 10, f"Matthews Correlation Coefficient (MCC): {result['MCC']:.4f}", ln=True)
        pdf.ln(5)

        # Add PR Curve
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(0, 10, "Precision-Recall Curve:", ln=True)
        pdf.image(result['PR Curve'], x=10, w=190)
        pdf.ln(5)

        # Add ROC Curve
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(0, 10, "ROC Curve:", ln=True)
        pdf.image(result['ROC Curve'], x=10, w=190)
        pdf.ln(5)

        # Add Confusion Matrix
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(0, 10, "Confusion Matrix:", ln=True)
        pdf.image(result['Confusion Matrix'], x=10, w=190)
        pdf.ln(10)

    # Conclusion Page
    pdf.add_page()
    pdf.set_font("Arial", 'B', 20)
    pdf.cell(0, 10, "Conclusion", ln=True)
    pdf.ln(10)
    pdf.set_font("Arial", '', 14)
    best_model = max(results, key=lambda x: x['PR AUC'])
    pdf.multi_cell(0, 10, f"The best-performing model based on PR AUC is '{best_model['Model']}' with a PR AUC score of {best_model['PR AUC']:.4f}.\n\nThis indicates that '{best_model['Model']}' is the most effective model for identifying anomalies in this imbalanced dataset.")

    # Output the PDF
    pdf.output(filename)
    print(f"Report saved as {filename}")

    # Download the PDF
    files.download(filename)

    # Clean up saved images
    for result in results:
        os.remove(result['PR Curve'])
        os.remove(result['ROC Curve'])
        os.remove(result['Confusion Matrix'])

# Generate and download the report
generate_pdf_report(all_results, filename="model_performance_report.pdf")


Report saved as model_performance_report.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>