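Trains and saves one anomaly-detection model for each dataset × scaler × model combination listed below (the training cell currently skips the TOTF + MinMax combination).

- Datasets: TOTF, LOBSTER
- Scalers: MinMax, Box-Cox
- Models: Transformer+OCSVM, PRAE, PNN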

In [None]:
import json
import os
import sys
import traceback

import joblib
import numpy as np
import pandas as pd
import torch

sys.path.append("..")

from PDMM import preprocessing as prep
from PDMM import  machine_learning as ml
from PDMM import visualization as viz
from PDMM.pipeline import AnomalyDetectionPipeline, sequential_training_pipeline

# Ensure the output directory for saved artifacts exists; a pre-existing
# directory is not an error.
os.makedirs(name='models', exist_ok=True)

In [2]:
# --- Experiment grid ---------------------------------------------------------
# Every dataset entry carries a 'type' tag. 'lobster' entries name separate
# order-book and message files; the other entry names a single 'path'.
DATASETS = dict(
    TOTF=dict(
        path='data/TOTF.PA-book/2015-01-02-TOTF.PA-book.csv.gz',
        type='standard_csv',
    ),
    LOBSTER=dict(
        orderbook='data/LOBSTER/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv',
        message='data/LOBSTER/AMZN_2012-06-21_34200000_57600000_message_10.csv',
        type='lobster',
    ),
)

# Scaling methods and model types to sweep over.
SCALERS = [
    'minmax',
    'box-cox',
]
MODELS = [
    'transformer_ocsvm',
    'prae',
    'pnn',
]

# --- Hyperparameters ---------------------------------------------------------
# Sequencing / batching
SEQ_LENGTH = 25
BATCH_SIZE = 128

# Optimisation
EPOCHS = 50        # maximum number of training epochs
LR = 1e-3          # learning rate
HIDDEN_DIM = 64
PATIENCE = 5       # presumably the early-stopping patience -- confirm in PDMM

# Data split fractions; whatever is left after train + validation is the test set.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15

In [3]:
# Train and persist one model per (dataset, scaler, model) combination.
#
# For each dataset the raw data is loaded and features are engineered once.
# For each scaler a fresh copy of those features is scaled and sequenced, then
# every model in MODELS is trained and its artifacts are written under models/:
#   <dataset>_<scaler>_<model>_config.json   run configuration
#   <dataset>_<scaler>_<model>_weights.pth   network state_dict
#   <dataset>_<scaler>_<model>_scaler.pkl    fitted feature scaler
# plus the OC-SVM detector and latent scaler for 'transformer_ocsvm'.
#
# A failure in one dataset is reported (with traceback) and the sweep moves on
# to the next dataset.

# Seed applied before every training run so each combination can be reproduced
# on its own. Kept in this cell so the cell does not depend on other edits.
# NOTE: seeding does not by itself guarantee bit-identical results on CUDA.
SEED = 42

# (dataset, scaler) pairs excluded from the sweep.
# NOTE(review): this pair used to be skipped by an unexplained inline condition;
# confirm it is still meant to be excluded, otherwise empty this set.
SKIP_COMBINATIONS = {('TOTF', 'minmax')}

for dataset_name, data_config in DATASETS.items():
    print(f"PROCESSING DATASET: {dataset_name}")

    pipeline = AnomalyDetectionPipeline(seq_length=SEQ_LENGTH, batch_size=BATCH_SIZE)

    try:
        # Load Data
        if data_config['type'] == 'lobster':
            # LOBSTER ships the order book and the message stream as two files;
            # the combined frame is handed to the pipeline directly.
            df = prep.load_lobster_data(
                orderbook_path=data_config['orderbook'],
                message_path=data_config['message'],
                levels=10
            )
            pipeline.raw_df = df
        else:
            pipeline.load_data(data_config['path'])

        print(f"Loaded {len(pipeline.raw_df)} rows.")

        # Engineer Features (once per dataset; each scaler starts from a copy)
        pipeline.engineer_features(feature_sets=['base', 'tao', 'hawkes', 'poutre', 'ofi'])
        master_features_df = pipeline.processed_df.copy()

        for scaler_type in SCALERS:
            if (dataset_name, scaler_type) in SKIP_COMBINATIONS:
                print(f"Skipping scaler {scaler_type} for {dataset_name} (listed in SKIP_COMBINATIONS)")
                continue

            print(f"Applying Scaler: {scaler_type}")

            # Reset to the unscaled features so scalers never stack on each other.
            pipeline.processed_df = master_features_df.copy()
            pipeline.feature_names = pipeline.processed_df.columns.tolist()

            # Scale Features
            pipeline.scale_and_sequence(method=scaler_type, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)

            # Train Models
            for model_type in MODELS:
                print(f"Training Model: {model_type}")

                # Re-seed before each run so weight initialisation and shuffling
                # do not depend on which combinations were trained earlier.
                torch.manual_seed(SEED)
                np.random.seed(SEED)

                # Train Model
                pipeline.train_model(
                    model_type=model_type,
                    epochs=EPOCHS,
                    lr=LR,
                    hidden_dim=HIDDEN_DIM,
                    patience=PATIENCE
                )

                # Save Artifacts
                base_filename = f"models/{dataset_name}_{scaler_type}_{model_type}"

                input_dim = int(pipeline.X_train.shape[2])

                # The feature list was captured before scaling; if scaling dropped
                # columns and the pipeline did not refresh feature_names, the saved
                # names would not line up with the model inputs. Surface that here.
                if len(pipeline.feature_names) != input_dim:
                    print(
                        f"WARNING: {len(pipeline.feature_names)} feature names recorded "
                        f"but model input_dim is {input_dim}; saved 'feature_names' may be stale."
                    )

                # Save Model Architecture together with every setting needed to
                # reproduce the run.
                config = {
                    'dataset': dataset_name,
                    'model_type': model_type,
                    'scaler_type': scaler_type,
                    'input_dim': input_dim,
                    'seq_length': SEQ_LENGTH,
                    'hidden_dim': HIDDEN_DIM,
                    'batch_size': BATCH_SIZE,
                    'epochs': EPOCHS,
                    'learning_rate': LR,
                    'feature_names': pipeline.feature_names,
                    'train_samples': len(pipeline.X_train),
                    'patience': PATIENCE,
                    'train_ratio': TRAIN_RATIO,
                    'val_ratio': VAL_RATIO,
                    'seed': SEED
                }
                with open(f"{base_filename}_config.json", 'w', encoding='utf-8') as f:
                    json.dump(config, f, indent=4)

                # Save Neural Network Weights
                torch.save(pipeline.model.state_dict(), f"{base_filename}_weights.pth")

                # Save Scaler
                joblib.dump(pipeline.scaler, f"{base_filename}_scaler.pkl")

                # Handle OC-SVM: this model type has a detector and a latent-space
                # scaler in addition to the network weights.
                if model_type == 'transformer_ocsvm' and pipeline.detector is not None:
                    joblib.dump(pipeline.detector, f"{base_filename}_ocsvm_detector.pkl")
                    joblib.dump(pipeline.latent_scaler, f"{base_filename}_latent_scaler.pkl")

                print(f"Saved artifacts to {base_filename}")

    except FileNotFoundError as e:
        # Missing input data: report it and move on to the next dataset.
        print(f"File not found for {dataset_name}: {e}")

    except Exception as e:
        # Keep the sweep going, but show the exception type and traceback so
        # the failure can actually be diagnosed.
        print(f"Unexpected error processing {dataset_name}: {type(e).__name__}: {e}")
        traceback.print_exc()

PROCESSING DATASET: TOTF
Pipeline initialized on device: cuda
Loading data from data/TOTF.PA-book/2015-01-02-TOTF.PA-book.csv.gz...
Successfully loaded 640429 rows.
Loaded 640429 rows.
Engineering features: ['base', 'tao', 'hawkes', 'poutre', 'ofi']...
Feature Engineering complete. Total features: 130
Applying Scaler: box-cox
Preprocessing with method: box-cox...
Dropping 2 constant/zero-variance features: ['ask_sweep_cost', 'ask-volume-10']
Data split: Train 448300, Val 96064, Test 96065
Training Model: transformer_ocsvm
Initializing Transformer Autoencoder...
Training Autoencoder (Max Epochs=50)...
Epoch 1/50 - Train Loss: 0.233387 | Val Loss: 0.118255
Validation loss decreased (inf --> 0.118255).  Saving model ...
Epoch 2/50 - Train Loss: 0.098460 | Val Loss: 0.078406
Validation loss decreased (0.118255 --> 0.078406).  Saving model ...
Epoch 3/50 - Train Loss: 0.081290 | Val Loss: 0.069582
Validation loss decreased (0.078406 --> 0.069582).  Saving model ...
Epoch 4/50 - Train Loss: 