- Datasets: TOTF, LOBSTER
- Scalers: MinMax, Box-Cox
- Models: Transformer+OCSVM, PRAE, PNN 

In [None]:
import os
import torch
import joblib
import json
import pandas as pd
import numpy as np

import preprocessing as prep
import machine_learning as ml
from pipeline import AnomalyDetectionPipeline

# Make sure the 'models' output directory exists before any artifact is written;
# exist_ok=True makes re-running this cell a no-op when it is already there.
os.makedirs('models', exist_ok=True)

In [None]:
# Experiment grid
# ---------------
# Each entry of DATASETS describes where one dataset lives on disk and which
# loader ('type') must be used for it. 'lobster' entries carry two files
# (order book + message), every other type carries a single 'path'.
DATASETS = dict(
    TOTF=dict(
        path='data/TOTF.PA-book/2015-01-02-TOTF.PA-book.csv.gz',
        type='standard_csv',
    ),
    LOBSTER=dict(
        orderbook='data/LOBSTER/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv',
        message='data/LOBSTER/AMZN_2012-06-21_34200000_57600000_message_10.csv',
        type='lobster',
    ),
)

# Scaling methods and model families to combine with every dataset.
SCALERS = ['minmax', 'box-cox']
MODELS = ['transformer_ocsvm', 'prae', 'pnn']

# Training settings
# -----------------
# Handed to the AnomalyDetectionPipeline constructor.
SEQ_LENGTH = 25
BATCH_SIZE = 128

# Handed to pipeline.train_model().
EPOCHS = 50
HIDDEN_DIM = 64
LR = 0.001
PATIENCE = 5

# Handed to pipeline.scale_and_sequence(). The two ratios sum to 0.85;
# presumably the remaining 0.15 is the test split -- confirm in the pipeline.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15

In [None]:
# Train and persist one model for every (dataset, scaler, model) combination.
#
# For each dataset the raw data is loaded and the features are engineered once;
# every combination then starts from a fresh copy of those features. Artifacts
# are written to models/<dataset>_<scaler>_<model>_*.
#
# Error handling is split in two stages so that one failure does not hide the
# rest of the grid:
#   * loading / feature engineering fails -> the whole dataset is skipped;
#   * a single combination fails          -> only that combination is skipped.
import traceback  # cell-local import: used to print full stack traces on failure

for dataset_name, data_config in DATASETS.items():
    print(f"PROCESSING DATASET: {dataset_name}")

    pipeline = AnomalyDetectionPipeline(seq_length=SEQ_LENGTH, batch_size=BATCH_SIZE)

    # ---- Stage 1: load data and engineer features (once per dataset) ----
    try:
        if data_config['type'] == 'lobster':
            # LOBSTER data comes as two files and bypasses pipeline.load_data(),
            # so the frame is attached to the pipeline by hand.
            df = prep.load_lobster_data(
                orderbook_path=data_config['orderbook'],
                message_path=data_config['message'],
                levels=10,
            )
            pipeline.raw_df = df
        else:
            pipeline.load_data(data_config['path'])

        print(f"Loaded {len(pipeline.raw_df)} rows.")

        pipeline.engineer_features(feature_sets=['base', 'tao', 'hawkes', 'poutre', 'ofi'])
        # Pristine copy of the engineered features, restored before every run.
        master_features_df = pipeline.processed_df.copy()

    except FileNotFoundError as e:
        print(f"File not found for {dataset_name}: {e}")
        continue

    except Exception as e:
        # Nothing can be trained without features: report in full and move on.
        print(f"Unexpected error processing {dataset_name}: {e}")
        traceback.print_exc()
        continue

    # ---- Stage 2: train and save every scaler x model combination ----
    for scaler_type in SCALERS:
        print(f"Applying Scaler: {scaler_type}")

        for model_type in MODELS:
            print(f"Training Model: {model_type}")

            try:
                # Reset the pipeline to the unscaled engineered features.
                pipeline.processed_df = master_features_df.copy()
                pipeline.feature_names = pipeline.processed_df.columns.tolist()

                # Scale Features
                pipeline.scale_and_sequence(
                    method=scaler_type,
                    train_ratio=TRAIN_RATIO,
                    val_ratio=VAL_RATIO,
                )

                # Train Model
                pipeline.train_model(
                    model_type=model_type,
                    epochs=EPOCHS,
                    lr=LR,
                    hidden_dim=HIDDEN_DIM,
                    patience=PATIENCE,
                )

                # Save Artifacts
                base_filename = f"models/{dataset_name}_{scaler_type}_{model_type}"

                # Everything needed to rebuild the architecture and reproduce
                # the run. The last three keys are additions; existing readers
                # that only look up the original keys are unaffected.
                config = {
                    'dataset': dataset_name,
                    'model_type': model_type,
                    'scaler_type': scaler_type,
                    'input_dim': pipeline.X_train.shape[2],
                    'seq_length': SEQ_LENGTH,
                    'hidden_dim': HIDDEN_DIM,
                    'batch_size': BATCH_SIZE,
                    'epochs': EPOCHS,
                    'learning_rate': LR,
                    'feature_names': pipeline.feature_names,
                    'train_samples': len(pipeline.X_train),
                    'patience': PATIENCE,
                    'train_ratio': TRAIN_RATIO,
                    'val_ratio': VAL_RATIO,
                }
                with open(f"{base_filename}_config.json", 'w') as f:
                    json.dump(config, f, indent=4)

                # Save Neural Network Weights
                torch.save(pipeline.model.state_dict(), f"{base_filename}_weights.pth")

                # Save Scaler
                joblib.dump(pipeline.scaler, f"{base_filename}_scaler.pkl")

                # The OC-SVM variant has extra fitted objects besides the network.
                if model_type == 'transformer_ocsvm' and pipeline.detector is not None:
                    joblib.dump(pipeline.detector, f"{base_filename}_ocsvm_detector.pkl")
                    # NOTE(review): the object stored as "_latent_scaler.pkl" is
                    # pipeline.ocsvm -- attribute name and file name disagree.
                    # Confirm against AnomalyDetectionPipeline which attribute
                    # really holds the latent scaler.
                    joblib.dump(pipeline.ocsvm, f"{base_filename}_latent_scaler.pkl")

                print(f"Saved artifacts to {base_filename}")

            except Exception as e:
                # Isolate the failure to this combination so the remaining
                # scalers/models for the dataset still get trained.
                print(
                    f"Unexpected error processing {dataset_name} "
                    f"({scaler_type}, {model_type}): {e}"
                )
                traceback.print_exc()

In [None]:
# Verification: reload one saved combination and check that the stored weights
# fit a freshly rebuilt architecture.
target_dataset = 'TOTF'
target_scaler = 'box-cox'
target_model = 'prae'
base_path = f"models/{target_dataset}_{target_scaler}_{target_model}"

if os.path.exists(f"{base_path}_config.json"):
    # Load Config
    with open(f"{base_path}_config.json", 'r') as f:
        config = json.load(f)

    # Initialize Pipeline
    test_pipeline = AnomalyDetectionPipeline(seq_length=config['seq_length'], batch_size=config['batch_size'])

    # Load Scaler
    test_pipeline.scaler = joblib.load(f"{base_path}_scaler.pkl")

    # Initialize Model Architecture
    input_dim = config['input_dim']
    model_dim = config['hidden_dim']

    if target_model == 'prae':
        # num_heads and representation_dim are not stored in the config; they
        # are assumed to match the values used at training time -- TODO confirm.
        base_ae = ml.TransformerAutoencoder(
            num_features=input_dim,
            model_dim=model_dim,
            num_heads=2,
            representation_dim=128,
            seq_length=config['seq_length'],
        )
        # Use the training-set size recorded in the config rather than a
        # hard-coded 1: if the PRAE keeps per-sample parameters sized by
        # num_train_samples (TODO confirm in machine_learning), any other value
        # makes load_state_dict fail with a shape mismatch. Falls back to the
        # previous value for configs that lack the key.
        test_pipeline.model = ml.ProbabilisticRobustAutoencoder(
            base_ae,
            num_train_samples=config.get('train_samples', 1),
        )

        # Load Model Weights
        try:
            # map_location='cpu' lets weights saved on a GPU be loaded on a
            # machine without one.
            state_dict = torch.load(f"{base_path}_weights.pth", map_location='cpu')
            test_pipeline.model.load_state_dict(state_dict)
            test_pipeline.model.eval()
            print("Model and artifacts loaded successfully for verification.")

        except Exception as e:
            print(f"Error loading model weights: {type(e).__name__}: {e}")

    else:
        # Only the PRAE architecture is rebuilt here; say so explicitly instead
        # of attempting to load weights into a model that was never constructed.
        print(f"Verification is only implemented for 'prae'; cannot rebuild '{target_model}'.")

else:
    print(f"Configuration file not found at {base_path}_config.json")