# Model Evaluation and Interpretation

This notebook evaluates the trained fraud detection models and interprets their results.

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, roc_auc_score, precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report
)
import tensorflow as tf

# Add the src directory to the path to import our modules
sys.path.append('..')
from src.models.fraud_model import compute_anomaly_scores

# Set plot style
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever name this installation
# actually registers instead of hard-coding one of them.
_whitegrid_style = next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    # Neither style sheet is registered; fall back to seaborn's own whitegrid theme.
    sns.set_style('whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

## 1. Load Test Data and Models

First, we'll load the test data and the trained models.

In [None]:
# Read the processed test split from disk, if it is there
test_data_path = '../data/processed/test_data.parquet'
if not os.path.exists(test_data_path):
    # Nothing to evaluate yet: report the missing file and how to produce it
    print(f"Test data not found: {test_data_path}")
    print("Please run the model training notebook first.")
else:
    test_df = pd.read_parquet(test_data_path)
    print(f"Test data loaded with {test_df.shape[0]} samples and {test_df.shape[1]} columns")

    # Label vector comes from the 'is_fraud' column; every other column is a feature
    y_test = test_df['is_fraud'].values
    X_test = test_df.drop('is_fraud', axis=1).values

In [None]:
# Locate the two saved Keras models under the results directory
model_dir = '../results/models'
classification_model_path, autoencoder_model_path = (
    os.path.join(model_dir, file_name)
    for file_name in ('classification_model.h5', 'autoencoder_model.h5')
)

# Classification model: the variable is only defined when the file exists,
# which is what the `in locals()` guards in later cells rely on
if not os.path.exists(classification_model_path):
    print(f"Classification model not found: {classification_model_path}")
else:
    classification_model = tf.keras.models.load_model(classification_model_path)
    print("Classification model loaded successfully.")

# Autoencoder model: same pattern as above
if not os.path.exists(autoencoder_model_path):
    print(f"Autoencoder model not found: {autoencoder_model_path}")
else:
    autoencoder_model = tf.keras.models.load_model(autoencoder_model_path)
    print("Autoencoder model loaded successfully.")

## 2. Evaluate Classification Model

Let's evaluate the performance of the classification model.

In [None]:
# Score the test set with the classification model, then report metrics and plots
if 'classification_model' in locals():
    # Raw model outputs flattened to 1-D, then hard labels at a 0.5 cut-off
    y_pred_proba = classification_model.predict(X_test).flatten()
    y_pred = (y_pred_proba >= 0.5).astype(int)

    # Label-based metrics share the same (y_true, y_pred) call signature
    accuracy, precision, recall, f1 = (
        metric_fn(y_test, y_pred)
        for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # AUC is computed from the continuous scores, not the thresholded labels
    auc = roc_auc_score(y_test, y_pred_proba)

    # Text summary
    print("Classification Model Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix - Classification Model')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    # ROC curve with the chance diagonal for reference
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve - Classification Model')
    ax.legend(loc='lower right')
    plt.show()

    # Precision-recall curve, labelled with average precision
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
    ap = average_precision_score(y_test, y_pred_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall_curve, precision_curve, label=f'AP = {ap:.4f}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve - Classification Model')
    ax.legend(loc='upper right')
    plt.show()

## 3. Evaluate Autoencoder Model

Now, let's evaluate the performance of the autoencoder model for anomaly detection.

In [None]:
# Score the test set with the autoencoder and evaluate it as an anomaly detector
if 'autoencoder_model' in locals():
    # Scores come from the project helper; presumably one reconstruction
    # error per test row — verify against compute_anomaly_scores
    anomaly_scores = compute_anomaly_scores(autoencoder_model, X_test)

    # Cut-off = 95th percentile of the scores of rows labelled non-fraud.
    # NOTE(review): the threshold is chosen using y_test labels — confirm this is intended.
    non_fraud_indices = (y_test == 0)
    non_fraud_scores = anomaly_scores[non_fraud_indices]
    threshold = np.percentile(non_fraud_scores, 95)
    print(f"Anomaly threshold (95th percentile): {threshold:.4f}")

    # Rows at or above the cut-off are flagged as fraud
    y_pred_autoencoder = (anomaly_scores >= threshold).astype(int)

    # Label-based metrics share the same (y_true, y_pred) call signature
    accuracy, precision, recall, f1 = (
        metric_fn(y_test, y_pred_autoencoder)
        for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Text summary
    print("Autoencoder Model Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_autoencoder))

    # Confusion matrix heatmap
    cm = confusion_matrix(y_test, y_pred_autoencoder)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix - Autoencoder Model')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    # Score histograms per class, with the decision threshold marked
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(anomaly_scores[y_test == 0], label='Normal', alpha=0.5, kde=True, ax=ax)
    sns.histplot(anomaly_scores[y_test == 1], label='Fraud', alpha=0.5, kde=True, ax=ax)
    ax.axvline(threshold, color='red', linestyle='--', label=f'Threshold: {threshold:.4f}')
    ax.set_xlabel('Anomaly Score (Reconstruction Error)')
    ax.set_ylabel('Count')
    ax.set_title('Anomaly Score Distribution - Autoencoder Model')
    ax.legend()
    plt.show()

    # ROC curve using the raw anomaly scores as the ranking signal
    fpr, tpr, thresholds = roc_curve(y_test, anomaly_scores)
    auc = roc_auc_score(y_test, anomaly_scores)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve - Autoencoder Model')
    ax.legend(loc='lower right')
    plt.show()

## 4. Compare Models

Let's compare the performance of both models.

In [None]:
# Compare model performance if both models are available.
# Relies on y_pred and y_pred_autoencoder produced by the two evaluation cells.
if 'classification_model' in locals() and 'autoencoder_model' in locals():
    # Prepare metrics for comparison
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    classification_metrics = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred)
    ]
    autoencoder_metrics = [
        accuracy_score(y_test, y_pred_autoencoder),
        precision_score(y_test, y_pred_autoencoder),
        recall_score(y_test, y_pred_autoencoder),
        f1_score(y_test, y_pred_autoencoder)
    ]

    # Create comparison dataframe (one row per metric, one column per model)
    comparison_df = pd.DataFrame({
        'Metric': metrics,
        'Classification Model': classification_metrics,
        'Autoencoder Model': autoencoder_metrics
    })

    # Display comparison
    print("Model Performance Comparison:")
    print(comparison_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # Plot comparison.
    # DataFrame.plot opens its own figure unless it is handed an Axes, which
    # previously left an empty 10x6 figure behind and ignored the requested
    # size. Create the Axes explicitly and draw onto it instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    comparison_df.set_index('Metric').plot(kind='bar', ax=ax)
    ax.set_title('Model Performance Comparison')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)
    ax.legend(title='Model')
    fig.tight_layout()
    plt.show()

## 5. Feature Importance Analysis

Let's analyze which features are most important for fraud detection.

In [None]:
# Analyze feature importance for classification model
if 'classification_model' in locals():
    # Get feature names (same column order as X_test, which was built from this frame)
    feature_names = test_df.drop(columns=['is_fraud']).columns.tolist()

    # For a simple model, we can use permutation importance
    from sklearn.inspection import permutation_importance

    # Define a wrapper function for the TensorFlow model
    def model_predict(X):
        """Return the classification model's outputs for X as a flat 1-D array."""
        # verbose=0 keeps Keras from printing a progress bar on every one of
        # the n_features * n_repeats prediction passes made below.
        return classification_model.predict(X, verbose=0).flatten()

    class _FittedModelAdapter:
        """Minimal estimator facade around ``model_predict``.

        ``permutation_importance`` only accepts objects exposing ``fit``; a
        bare function is rejected before any scoring happens. The wrapped
        model is already trained, so ``fit`` is a no-op.
        """

        def fit(self, X, y=None):
            """Do nothing and return self; present only to satisfy scikit-learn."""
            return self

        def predict(self, X):
            """Delegate to ``model_predict``."""
            return model_predict(X)

    def _roc_auc_scorer(estimator, X, y):
        """Scorer with the (estimator, X, y) signature scikit-learn expects.

        The built-in 'roc_auc' scorer looks for ``predict_proba`` or
        ``decision_function``, which the adapter does not provide, so the AUC
        is computed directly from the adapter's continuous predictions.
        """
        return roc_auc_score(y, estimator.predict(X))

    # Calculate permutation importance
    result = permutation_importance(
        _FittedModelAdapter(), X_test, y_test,
        n_repeats=10,
        random_state=42,
        scoring=_roc_auc_scorer
    )

    # Create importance dataframe
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': result.importances_mean,
        'Std': result.importances_std
    })

    # Sort by importance
    importance_df = importance_df.sort_values('Importance', ascending=False).reset_index(drop=True)

    # Display top 20 features
    print("Top 20 Important Features:")
    print(importance_df.head(20).to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title('Feature Importance (Permutation Importance)')
    plt.xlabel('Importance (Mean Decrease in AUC)')
    plt.tight_layout()
    plt.show()

## 6. Threshold Analysis

Let's analyze how different threshold values affect model performance.

In [None]:
# Analyze classification threshold for classification model
if 'classification_model' in locals():
    # Define threshold range.
    # Dedicated names are used here so this sweep does not overwrite the
    # autoencoder's `threshold` or the ROC `thresholds` left in the kernel
    # namespace by earlier cells.
    candidate_thresholds = np.linspace(0.1, 0.9, 9)

    # Calculate metrics for each threshold
    threshold_metrics = []
    for cutoff in candidate_thresholds:
        y_pred_threshold = (y_pred_proba >= cutoff).astype(int)
        threshold_metrics.append({
            'Threshold': cutoff,
            'Accuracy': accuracy_score(y_test, y_pred_threshold),
            # zero_division=0 returns the same 0.0 scikit-learn falls back to
            # when a cut-off yields no positive predictions, without emitting
            # an UndefinedMetricWarning for each such cut-off.
            'Precision': precision_score(y_test, y_pred_threshold, zero_division=0),
            'Recall': recall_score(y_test, y_pred_threshold, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred_threshold, zero_division=0)
        })

    # Create threshold dataframe (one row per candidate cut-off)
    threshold_df = pd.DataFrame(threshold_metrics)

    # Display threshold metrics
    print("Metrics at Different Thresholds:")
    print(threshold_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # Plot threshold metrics
    plt.figure(figsize=(12, 6))
    for metric in ['Accuracy', 'Precision', 'Recall', 'F1 Score']:
        plt.plot(threshold_df['Threshold'], threshold_df[metric], marker='o', label=metric)
    plt.title('Metrics vs. Classification Threshold')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

## 7. Error Analysis

Let's analyze the errors made by the classification model (its false positives and false negatives) to understand where it fails.

In [None]:
# Analyze errors made by the classification model
def _plot_error_feature_distributions(reference_df, errors_df, features,
                                      reference_label, errors_label):
    """Overlay per-feature histograms of an error subset on its reference class.

    Args:
        reference_df: Rows of the true class the errors belong to.
        errors_df: The misclassified rows from that class.
        features: Column names to plot; one subplot each on a 2x3 grid,
            so at most six are supported.
        reference_label: Legend label for ``reference_df``.
        errors_label: Legend label for ``errors_df``.
    """
    plt.figure(figsize=(12, 8))
    for i, feature in enumerate(features):
        plt.subplot(2, 3, i+1)
        sns.histplot(reference_df[feature], label=reference_label, alpha=0.5, kde=True)
        sns.histplot(errors_df[feature], label=errors_label, alpha=0.5, kde=True)
        plt.title(f'{feature} Distribution')
        plt.legend()
    plt.tight_layout()
    plt.show()


if 'classification_model' in locals():
    # Derive the feature columns here rather than relying on the
    # feature-importance cell having run successfully first.
    feature_names = test_df.drop(columns=['is_fraud']).columns.tolist()

    # Create a dataframe with actual and predicted values
    error_df = test_df.copy()
    error_df['predicted_proba'] = y_pred_proba
    error_df['predicted'] = y_pred
    error_df['correct'] = (error_df['is_fraud'] == error_df['predicted'])

    # Separate into different error types
    false_positives = error_df[(error_df['is_fraud'] == 0) & (error_df['predicted'] == 1)]
    false_negatives = error_df[(error_df['is_fraud'] == 1) & (error_df['predicted'] == 0)]

    print(f"Number of false positives: {len(false_positives)}")
    print(f"Number of false negatives: {len(false_negatives)}")

    # Analyze false positives
    if len(false_positives) > 0:
        print("\nFalse Positive Analysis:")
        print("Sample of false positives (legitimate transactions classified as fraud):")
        print(false_positives.head(5))

        # Compare false positives against all rows labelled non-fraud (first 5 features)
        _plot_error_feature_distributions(
            error_df[error_df['is_fraud'] == 0], false_positives,
            feature_names[:5], 'Normal', 'False Positive'
        )

    # Analyze false negatives
    if len(false_negatives) > 0:
        print("\nFalse Negative Analysis:")
        print("Sample of false negatives (fraudulent transactions classified as legitimate):")
        print(false_negatives.head(5))

        # Compare false negatives against all rows labelled fraud (first 5 features)
        _plot_error_feature_distributions(
            error_df[error_df['is_fraud'] == 1], false_negatives,
            feature_names[:5], 'Fraud', 'False Negative'
        )

## 8. Summary

In this notebook, we've:
1. Evaluated the performance of both classification and autoencoder models
2. Compared the models using various metrics
3. Analyzed feature importance to understand which features are most predictive of fraud
4. Performed threshold analysis to see how the classification threshold affects accuracy, precision, recall, and F1 score
5. Analyzed the classification model's false positives and false negatives to understand its limitations

These insights can help improve the models and guide deployment decisions.