In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Tuple, Dict, Any, Callable

import numpy as np
import torch
from numpy.typing import ArrayLike

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
)

from pyod.models.ecod import ECOD

from usflows import Flow

# ---------------------------------------------------------------------------
# Global configuration
# ---------------------------------------------------------------------------

# Seed passed as random_state to train_test_split so the split is reproducible.
RANDOM_STATE: int = 42
# Default folder searched for the ADBench "Classical" .npz files.
# The path is relative to the current working directory.
DATA_DIR: Path = Path("./nf4ad/data/adbench")


# ---------------------------------------------------------------------------
# Dataset resolution & loading (unchanged)
# ---------------------------------------------------------------------------

def _match_npz_candidates(name: str, data_dir: Path) -> list[Path]:
    """Return the .npz files in ``data_dir`` that ``name`` loosely refers to."""
    npz_files = data_dir.glob("*.npz")

    # A purely numeric name selects by index prefix: "6" -> "6_*.npz".
    if name.isdigit():
        prefix = f"{name}_"
        return [p for p in npz_files if p.name.startswith(prefix)]

    # Otherwise compare case-insensitively against the whole stem, or against
    # the part of the stem that follows the first underscore.
    wanted = name.lower()
    matches = []
    for p in npz_files:
        stem = p.stem.lower()
        if stem == wanted:
            matches.append(p)
        elif "_" in stem and stem.split("_", 1)[1] == wanted:
            matches.append(p)
    return matches


def resolve_npz_path(dataset_name: str, data_dir: Path = DATA_DIR) -> Path:
    """
    Resolve a user-supplied dataset name to exactly one .npz file.

    Resolution order:
      1) ``dataset_name`` is an existing file name inside ``data_dir``
         (e.g. "6_cardio.npz");
      2) ``dataset_name`` + ".npz" is an existing file (e.g. "6_cardio");
      3) a purely numeric name is used as an index prefix
         (e.g. "6" matches "6_cardio.npz");
      4) otherwise a case-insensitive match against the file stem or the
         part of the stem after the first underscore
         (e.g. "cardio" matches "6_cardio.npz");
      5) if nothing matched and the name ends in ".npz", steps 3-4 are
         retried without the extension (e.g. "cardio.npz").

    Parameters
    ----------
    dataset_name : str
        File name, stem, numeric index, or short name of the dataset.
        Surrounding whitespace is ignored.
    data_dir : Path
        Directory that holds the .npz files.

    Returns
    -------
    Path
        Path of the single matching file.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` is not a directory or no file matches.
    RuntimeError
        If the name matches more than one file.
    """
    dataset_name = dataset_name.strip()
    data_dir = Path(data_dir)

    if not data_dir.is_dir():
        raise FileNotFoundError(
            f"Data directory {data_dir.resolve()} does not exist. "
            "Make sure you created it and copied the ADBench .npz files into it."
        )

    # Steps 1-2: the name is (almost) a literal file name.
    candidate = data_dir / dataset_name
    if candidate.is_file():
        return candidate

    if not dataset_name.endswith(".npz"):
        candidate_with_ext = data_dir / f"{dataset_name}.npz"
        if candidate_with_ext.is_file():
            return candidate_with_ext

    # Steps 3-4: loose matching on the name exactly as given.
    candidates = _match_npz_candidates(dataset_name, data_dir)

    # Step 5: file stems carry no extension, so "cardio.npz" would otherwise
    # fail where "cardio" succeeds. Only used as a fallback, so every name
    # that resolved before still resolves to the same file.
    if not candidates and dataset_name.lower().endswith(".npz"):
        bare_name = dataset_name[: -len(".npz")]
        if bare_name:
            candidates = _match_npz_candidates(bare_name, data_dir)

    if not candidates:
        raise FileNotFoundError(
            f"Could not match dataset name '{dataset_name}' to any .npz file in "
            f"{data_dir.resolve()}."
        )

    if len(candidates) > 1:
        names = ", ".join(sorted(p.name for p in candidates))
        raise RuntimeError(
            f"Dataset name '{dataset_name}' is ambiguous; it matches multiple files: "
            f"{names}. Please specify a more precise name."
        )

    return candidates[0]


def load_classical_dataset(
    dataset_name: str,
    data_dir: Path = DATA_DIR,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load the feature matrix and labels of one dataset from its .npz file.

    Parameters
    ----------
    dataset_name : str
        Name understood by ``resolve_npz_path``.
    data_dir : Path
        Directory that holds the .npz files.

    Returns
    -------
    X : np.ndarray
        The array stored under key "X".
    y : np.ndarray
        The array stored under key "y", cast to int.

    Raises
    ------
    FileNotFoundError, RuntimeError
        Propagated from ``resolve_npz_path``.
    KeyError
        If the archive has no "X" or "y" entry.
    """
    npz_path = resolve_npz_path(dataset_name, data_dir)

    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
    # which can execute arbitrary code. Only load files from a trusted source.
    # The context manager closes the underlying archive; without it the file
    # handle stays open until garbage collection.
    with np.load(npz_path, allow_pickle=True) as npz:
        X = npz["X"]
        y = npz["y"].astype(int)

    print(
        f"Loaded {npz_path.name}: X.shape={X.shape}, "
        f"y.shape={y.shape}, anomaly_ratio={y.mean():.4f}"
    )

    return X, y


# ---------------------------------------------------------------------------
# Metrics (unchanged)
# ---------------------------------------------------------------------------


def evaluate_anomaly_scores(
    y_true: ArrayLike,
    scores: ArrayLike,
) -> Dict[str, float]:
    """
    Compute threshold-free and best-threshold metrics for anomaly scores.

    Parameters
    ----------
    y_true : ArrayLike
        Ground-truth labels; cast to int and flattened.
    scores : ArrayLike
        Anomaly scores; cast to float and flattened.

    Returns
    -------
    dict with keys
        "auc_roc"            : value of ``roc_auc_score``
        "auc_pr"             : value of ``average_precision_score``
        "best_f1"            : largest F1 along the precision-recall curve
        "best_f1_threshold"  : score threshold at that point, or NaN when the
                               best point has no associated threshold

    Raises
    ------
    ValueError
        If the two inputs differ in length, or ``y_true`` holds fewer than
        two distinct labels.
    """
    labels = np.asarray(y_true).astype(int).ravel()
    values = np.asarray(scores, dtype=float).ravel()

    n_labels, n_values = labels.shape[0], values.shape[0]
    if n_labels != n_values:
        raise ValueError(
            f"y_true and scores must have the same length, "
            f"got {n_labels} and {n_values}."
        )

    distinct_labels = np.unique(labels)
    if distinct_labels.size < 2:
        raise ValueError(
            "y_true must contain both normal (0) and anomalous (1) labels."
        )

    roc_value = float(roc_auc_score(labels, values))
    pr_value = float(average_precision_score(labels, values))

    # The small constant keeps the division defined where precision and
    # recall are both zero.
    precision, recall, thresholds = precision_recall_curve(labels, values)
    f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
    top = int(np.argmax(f1_curve))

    # The threshold lookup is guarded because `top` indexes the F1 curve,
    # not the thresholds array.
    if thresholds.size > 0 and top < thresholds.size:
        top_threshold = float(thresholds[top])
    else:
        top_threshold = float("nan")

    return {
        "auc_roc": roc_value,
        "auc_pr": pr_value,
        "best_f1": float(f1_curve[top]),
        "best_f1_threshold": top_threshold,
    }


# ---------------------------------------------------------------------------
# Your method stub (unchanged)
# ---------------------------------------------------------------------------

def create_flow_prior(latent_dim, device: torch.device):
    """
    Build a ``NonUSFlow`` over a ``latent_dim``-dimensional space.

    The flow uses a zero-mean, unit-scale Normal base distribution placed on
    ``device``, three coupling blocks, and a small MLP as conditioner.

    Parameters
    ----------
    latent_dim
        Dimensionality of the space the flow is defined on.
    device : torch.device
        Device for the base-distribution tensors; also handed to the flow.

    Returns
    -------
    The constructed ``NonUSFlow`` instance.
    """
    # Project and pyro imports are kept local to the function, as in the
    # rest of this file's method stubs.
    from nf4ad.flows import NonUSFlow
    import pyro.distributions as dist
    import torch.nn as nn

    class SimpleConditioner(nn.Module):
        """MLP with one hidden layer of 128 units: in_dim -> 128 -> out_dim."""

        def __init__(self, in_dim, out_dim):
            super().__init__()
            hidden_units = 128
            layers = [
                nn.Linear(in_dim, hidden_units),
                nn.ReLU(),
                nn.Linear(hidden_units, out_dim),
            ]
            self.net = nn.Sequential(*layers)

        def forward(self, x):
            return self.net(x)

    loc = torch.zeros(latent_dim, device=device)
    scale = torch.ones(latent_dim, device=device)
    standard_normal = dist.Normal(loc, scale)

    conditioner_kwargs = {
        'in_dim': latent_dim,
        # Twice the latent size: one half for scale, one for shift.
        'out_dim': latent_dim * 2,
    }

    return NonUSFlow(
        in_dims=[latent_dim],
        device=device,
        coupling_blocks=3,
        base_distribution=standard_normal,
        prior_scale=1.0,
        affine_conjugation=True,
        conditioner_cls=SimpleConditioner,
        conditioner_args=conditioner_kwargs,
        nonlinearity=nn.ReLU(),
    )


def your_method_scores(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
) -> np.ndarray:
    """
    Fit the VAE-flow anomaly detector on the training data and score the test data.

    Parameters
    ----------
    X_train : np.ndarray of shape (n_train, n_features)
        Training features; the second axis is read as the feature count.
    y_train : np.ndarray
        Training labels, forwarded unchanged to the model's ``fit``.
    X_test : np.ndarray
        Samples to score.

    Returns
    -------
    scores : np.ndarray of shape (n_test,)
        Anomaly scores for X_test, where higher means "more anomalous"
        (this is the contract expected by the evaluation code; it relies on
        ``predict_score`` following the same convention).
    """
    from nf4ad.adbench_wrapper import ADBenchVAEFlowTabular

    n_features = X_train.shape[1]

    # Use the GPU when one is present instead of failing on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(review): the previous code passed a hard-coded n_features=5 even
    # though the real feature count was computed above. Confirm against
    # ADBenchVAEFlowTabular that n_features means the input dimensionality.
    vaeflow = ADBenchVAEFlowTabular(
        flow_prior=create_flow_prior(n_features, device=device),
        n_features=n_features,
        latent_dim=n_features,
    )
    vaeflow.fit(X_train, y_train)
    return vaeflow.predict_score(X_test)


# ---------------------------------------------------------------------------
# PyOD baseline (ECOD)
# ---------------------------------------------------------------------------

def run_pyod_ecod_baseline(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
) -> np.ndarray:
    """
    Unsupervised baseline: PyOD's ECOD detector
    (Empirical Cumulative Distribution based Outlier Detection).

    The detector is fitted on ``X_train`` only; ``y_train`` is accepted so
    the signature matches the other method wrappers but is never used.

    Returns
    -------
    np.ndarray
        Flat float array with one ECOD decision score per row of ``X_test``
        (higher = more anomalous).
    """
    detector = ECOD()
    detector.fit(X_train)  # labels are intentionally not passed

    raw_scores = detector.decision_function(X_test)
    flat_scores = np.asarray(raw_scores, dtype=float).ravel()
    return flat_scores


# ---------------------------------------------------------------------------
# High-level helper to run everything on one dataset (slightly tweaked)
# ---------------------------------------------------------------------------

def run_single_dataset_example(
    dataset_name: str = "cardio",
    data_dir: Path = DATA_DIR,
    use_baseline: bool = True,
    method_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray] | None = None,
) -> Dict[str, Any]:
    """
    Run one anomaly-detection experiment end to end and print a short report.

    Pipeline:
      - load dataset from .npz
      - stratified 50/50 train/test split, seeded with RANDOM_STATE
      - standardize features (scaler is fitted on the training half only)
      - run either the ECOD baseline or your custom method
      - compute metrics on the test half

    Parameters
    ----------
    dataset_name : str
        Name understood by ``load_classical_dataset``.
    data_dir : Path
        Directory that holds the .npz files.
    use_baseline : bool
        If True, run the ECOD baseline and ignore ``method_fn``.
    method_fn : callable or None
        ``method_fn(X_train, y_train, X_test) -> scores``; required when
        ``use_baseline`` is False.

    Returns
    -------
    dict
        Keys: "dataset", "n_train", "n_test", "model", "metrics".

    Raises
    ------
    ValueError
        If ``use_baseline`` is False and ``method_fn`` is None.
    """
    X, y = load_classical_dataset(dataset_name, data_dir=data_dir)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.5,
        random_state=RANDOM_STATE,
        stratify=y,
    )

    # Fit the scaler on the training half only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    if use_baseline:
        scores = run_pyod_ecod_baseline(X_train_scaled, y_train, X_test_scaled)
        model_used = "PyOD-ECOD"
    else:
        if method_fn is None:
            raise ValueError(
                "use_baseline=False but no method_fn was provided. "
                "Pass your own method wrapper, e.g. method_fn=your_method_scores."
            )
        scores = method_fn(X_train_scaled, y_train, X_test_scaled)
        # Callables without a __name__ (e.g. functools.partial) get a generic label.
        model_used = getattr(method_fn, "__name__", "custom_method")

    metrics = evaluate_anomaly_scores(y_test, scores)

    result: Dict[str, Any] = {
        "dataset": dataset_name,
        "n_train": int(X_train.shape[0]),
        "n_test": int(X_test.shape[0]),
        "model": model_used,
        "metrics": metrics,
    }

    # The "≈" below replaces a mis-encoded byte sequence in the old message.
    print(
        f"\nResults on dataset='{dataset_name}' using model='{model_used}':\n"
        f"  AUC-ROC : {metrics['auc_roc']:.4f}\n"
        f"  AUC-PR  : {metrics['auc_pr']:.4f}\n"
        f"  best F1 : {metrics['best_f1']:.4f} "
        f"(at score threshold ≈ {metrics['best_f1_threshold']:.4f})"
    )

    return result


# ---------------------------------------------------------------------------
# Example call
# ---------------------------------------------------------------------------

# Run example: evaluate the custom flow method (not the ECOD baseline) on "cardio".
# NOTE: the recorded run of this cell stopped with
# "ModuleNotFoundError: No module named 'pyod'"; pyod must be installed
# because it is imported at the top of the cell.
example_result = run_single_dataset_example(
    dataset_name="cardio",
    use_baseline=False,
    method_fn=your_method_scores,
)
# Bare expression so the notebook displays the returned result dict.
example_result


ModuleNotFoundError: No module named 'pyod'

In [1]:
# Run example (same call as the previous cell).
# NOTE: the recorded output is "NameError: name 'run_single_dataset_example'
# is not defined", i.e. the cell that defines it had not run successfully
# in this kernel session.
example_result = run_single_dataset_example(
    dataset_name="cardio",
    use_baseline=False,
    method_fn=your_method_scores,
)
# Bare expression so the notebook displays the returned result dict.
example_result

NameError: name 'run_single_dataset_example' is not defined

# Flow Model Benchmarking on ADBench

This notebook demonstrates how to use the comprehensive benchmarking system for Flow models on ADBench datasets.

In [1]:
from nf4ad.adbench_benchmark import (
    ADBenchBenchmark,
    BenchmarkConfig,
    FlowConfig,
    ADBENCH_CLASSICAL_DATASETS,
)
from pathlib import Path
import pandas as pd
# Lift the limit on the number of displayed columns, and let pandas
# determine the display width itself, so result tables are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Quick Single Dataset Test

In [2]:
# Configure benchmark: where the datasets live, where results go, and the device.
# `config` is reused by later cells that build an ADBenchBenchmark.
config = BenchmarkConfig(
    data_dir=Path("./nf4ad/data/adbench"),
    output_dir=Path("./results/adbench"),
    device="cpu",
    verbose=True,
)

# Configure flow model (architecture and training hyperparameters).
# NOTE(review): the recorded run of this cell printed losses on the order of
# 1e19 and stopped early at epoch 15, and the result table shows an empty
# config_gradient_clip. That suggests unstable training -- consider checking
# input scaling or enabling gradient clipping; TODO confirm.
flow_config = FlowConfig(
    coupling_blocks=8,
    hidden_dim=128,
    lr=1e-3,
    batch_size=64,
    epochs=100,
    patience=3,
    clamp=1.5
)

# Run on single dataset
benchmark = ADBenchBenchmark(config)
results = benchmark.run_on_datasets(
    datasets=["6_cardio"],
    flow_config=flow_config,
)

# Display results (bare expression so the notebook renders the table)
benchmark.results_to_dataframe()


Running experiment on 6_cardio
Dataset: 6_cardio
Features: 21
Train: 915 samples (88 anomalies)
Test: 916 samples (88 anomalies)
Training Flow model on 915 samples...
Input shape: torch.Size([915, 21])
Device: cpu
Epoch 10/100, Loss: 24212999031666069504.0000
Early stopping at epoch 15/100
Restored best model with loss: 9445856125663156224.0000
Training completed. Final loss: 11029379515657467904.0000

Results:
  ROC-AUC: 0.8022
  PR-AUC:  0.3818
  Best F1: 0.4247
  Training time: 5.30s
  Epochs: 15


Unnamed: 0,dataset,n_features,n_train,n_test,n_anomalies_test,roc_auc,pr_auc,best_f1,training_time,inference_time,n_epochs,config_coupling_blocks,config_hidden_dim,config_lr,config_batch_size,config_epochs,config_patience,config_min_delta,config_gradient_clip,config_clamp
0,6_cardio,21,915,916,88,0.802207,0.381838,0.424658,5.304112,0.011694,15,8,128,0.001,64,100,3,0.0001,,1.5


## 2. Multiple Datasets Benchmark

In [None]:
# Select datasets to test (names carry the "<index>_" prefix, as in "6_cardio")
test_datasets = [
    "6_cardio",
    "2_annthyroid",
    "38_thyroid",
    "4_breastw",
    "18_Ionosphere",
]

# Run benchmark with the `config` and `flow_config` bound by an earlier cell.
# A fresh ADBenchBenchmark is created here, replacing the previous `benchmark`.
benchmark = ADBenchBenchmark(config)
results = benchmark.run_on_datasets(
    datasets=test_datasets,
    flow_config=flow_config,
)

# Display results: only the headline metric and cost columns
df = benchmark.results_to_dataframe()
print(df[['dataset', 'roc_auc', 'pr_auc', 'best_f1', 'training_time', 'n_epochs']])

## 3. Hyperparameter Search

In [None]:
# Define parameter grid: each key maps to the list of values to try.
# The full grid has 3 * 3 * 3 * 2 = 54 combinations.
param_grid = {
    'coupling_blocks': [4, 8, 12],
    'hidden_dim': [64, 128, 256],
    'lr': [1e-4, 1e-3, 1e-2],
    'batch_size': [32, 64],
    'epochs': [100],
    'patience': [10],
}

# Select datasets for tuning
tune_datasets = ["6_cardio", "2_annthyroid", "38_thyroid"]

# Run hyperparameter search. `results_df` is read by the analysis and
# plotting cells that follow.
benchmark = ADBenchBenchmark(config)
results_df = benchmark.hyperparameter_search(
    datasets=tune_datasets,
    param_grid=param_grid,
    n_trials=10,  # Try 10 random configurations
)

# Show top configurations: the ten rows with the highest ROC-AUC
print("Top 10 configurations by ROC-AUC:")
top_configs = results_df.nlargest(10, 'roc_auc')
print(top_configs[[
    'dataset', 'roc_auc', 'pr_auc', 'best_f1',
    'config_coupling_blocks', 'config_hidden_dim', 'config_lr', 'config_batch_size'
]])

## 4. Analyze Results by Dataset

In [None]:
# Group by dataset and show statistics.
# `results_df` is the frame returned by hyperparameter_search in an earlier cell.
# Metrics get mean/std/max across rows; training time and epoch count get the
# mean only. Everything is rounded to four decimals.
summary = results_df.groupby('dataset').agg({
    'roc_auc': ['mean', 'std', 'max'],
    'pr_auc': ['mean', 'std', 'max'],
    'best_f1': ['mean', 'std', 'max'],
    'training_time': 'mean',
    'n_epochs': 'mean',
}).round(4)

print(summary)

## 5. Visualize Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# All plots read `results_df`, the frame returned by hyperparameter_search
# in an earlier cell.

# Plot ROC-AUC by dataset (one box per dataset)
plt.figure(figsize=(12, 6))
sns.boxplot(data=results_df, x='dataset', y='roc_auc')
plt.xticks(rotation=45)
plt.title('ROC-AUC Distribution by Dataset')
plt.ylabel('ROC-AUC')
plt.tight_layout()
plt.show()

# Plot hyperparameter effects: a 2x2 grid, one hyperparameter per panel,
# each showing the ROC-AUC distribution per hyperparameter value.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Coupling blocks effect
sns.boxplot(data=results_df, x='config_coupling_blocks', y='roc_auc', ax=axes[0, 0])
axes[0, 0].set_title('Effect of Coupling Blocks')

# Hidden dim effect
sns.boxplot(data=results_df, x='config_hidden_dim', y='roc_auc', ax=axes[0, 1])
axes[0, 1].set_title('Effect of Hidden Dimension')

# Learning rate effect
sns.boxplot(data=results_df, x='config_lr', y='roc_auc', ax=axes[1, 0])
axes[1, 0].set_title('Effect of Learning Rate')

# Batch size effect
sns.boxplot(data=results_df, x='config_batch_size', y='roc_auc', ax=axes[1, 1])
axes[1, 1].set_title('Effect of Batch Size')

plt.tight_layout()
plt.show()

## 6. Training Curves Analysis

In [None]:
# Plot the training curve of the best-performing run.
# Select the result with the highest ROC-AUC; the previous code took
# benchmark.results[0], which is simply the first run, not the best one.
best_result = max(benchmark.results, key=lambda r: r.metrics['roc_auc'])

plt.figure(figsize=(10, 6))
plt.plot(best_result.training_losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title(f'Training Curve - {best_result.dataset}')
plt.grid(True)
plt.show()

# Short textual summary of the same run
print(f"Dataset: {best_result.dataset}")
print(f"Final ROC-AUC: {best_result.metrics['roc_auc']:.4f}")
print(f"Training time: {best_result.training_time:.2f}s")
print(f"Epochs trained: {best_result.n_epochs_trained}")

## 7. Save and Load Results

In [None]:
# Save results of the current `benchmark` under the name "my_experiment"
benchmark.save_results("my_experiment")

# Results are saved as:
# - JSON (full details with training curves)
# - CSV (summary table)

# Load and analyze later
import json

# assumes save_results wrote <config.output_dir>/my_experiment.json -- TODO
# confirm against ADBenchBenchmark.save_results
with open(config.output_dir / "my_experiment.json") as f:
    loaded_results = json.load(f)

print(f"Loaded {len(loaded_results)} results")

## 8. Using USFlow Model

In [None]:
# Test with USFlow instead of NonUSFlow
# NOTE: this rebinds the notebook-level `config`; cells executed afterwards
# that call ADBenchBenchmark(config) will use the usflow output directory.
config = BenchmarkConfig(
    data_dir=Path("./nf4ad/data/adbench"),
    output_dir=Path("./results/adbench/usflow"),
    device="cpu",
    verbose=True,
)

# Configure USFlow model: shared training settings plus the options that are
# only set for USFlow in this notebook (lu_transform, householder, masktype).
usflow_config = FlowConfig(
    flow_type="usflow",  # Switch to USFlow
    coupling_blocks=8,
    hidden_dim=128,
    lr=1e-3,
    batch_size=64,
    epochs=50,
    patience=5,
    lu_transform=1,
    householder=0,
    affine_conjugation=True,
    prior_scale=1.0,
    masktype="checkerboard",
)

# Run on single dataset
benchmark = ADBenchBenchmark(config)
results = benchmark.run_on_datasets(
    datasets=["6_cardio"],
    flow_config=usflow_config,
)

# Display results (bare expression so the notebook renders the table)
benchmark.results_to_dataframe()

## 9. Compare NonUSFlow vs USFlow

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Run both flow types on same datasets.
# A single benchmark object is used for both so that its results can be
# exported into one frame below.
# NOTE: `config` is whatever an earlier cell bound last; in notebook order
# that is the one whose output_dir is "./results/adbench/usflow".
test_datasets = ["6_cardio", "2_annthyroid"]
benchmark = ADBenchBenchmark(config)

for flow_type in ["nonusflow", "usflow"]:
    # Identical training settings for both types; only flow_type differs.
    flow_config = FlowConfig(
        flow_type=flow_type,
        coupling_blocks=8,
        hidden_dim=128,
        lr=1e-3,
        batch_size=64,
        epochs=30,
        patience=5,
    )
    
    # USFlow-only options are set on the config after construction.
    if flow_type == "usflow":
        flow_config.lu_transform = 1
        flow_config.householder = 0
        flow_config.masktype = "checkerboard"
    
    benchmark.run_on_datasets(
        datasets=test_datasets,
        flow_config=flow_config,
    )

# Compare results
df = benchmark.results_to_dataframe()

# Plot comparison: one panel per metric, bars grouped by dataset and
# colored by flow type.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, metric in enumerate(['roc_auc', 'pr_auc', 'best_f1']):
    sns.barplot(data=df, x='dataset', y=metric, hue='config_flow_type', ax=axes[i])
    axes[i].set_title(f'{metric.upper()} Comparison')
    # Re-apply the existing tick labels, rotated for readability.
    axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

# Print summary: mean of each metric per (dataset, flow type)
comparison = df.groupby(['dataset', 'config_flow_type']).agg({
    'roc_auc': 'mean',
    'pr_auc': 'mean',
    'best_f1': 'mean',
    'training_time': 'mean',
}).round(4)
print("\nComparison Summary:")
print(comparison)

## 10. USFlow Hyperparameter Search

In [None]:
# USFlow-specific hyperparameter grid: each key maps to the values to try.
# flow_type, epochs and patience are single-valued, i.e. held fixed.
usflow_param_grid = {
    'flow_type': ['usflow'],
    'coupling_blocks': [4, 8],
    'hidden_dim': [64, 128],
    'lr': [1e-3, 1e-2],
    'batch_size': [32, 64],
    'lu_transform': [0, 1],
    'householder': [0, 1],
    'affine_conjugation': [True, False],
    'masktype': ['checkerboard', 'channel'],
    'epochs': [50],
    'patience': [5],
}

# Run search on a single dataset with 8 trials.
# NOTE: this rebinds `results_df`, so earlier analysis cells re-run after
# this one will see the USFlow search results.
tune_datasets = ["6_cardio"]
benchmark = ADBenchBenchmark(config)
results_df = benchmark.hyperparameter_search(
    datasets=tune_datasets,
    param_grid=usflow_param_grid,
    n_trials=8,
)

# Show best configurations: the five rows with the highest ROC-AUC
print("Top 5 USFlow configurations:")
top = results_df.nlargest(5, 'roc_auc')
print(top[[
    'roc_auc', 'pr_auc', 'training_time',
    'config_lu_transform', 'config_householder', 
    'config_masktype', 'config_affine_conjugation'
]])