In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
from scipy.signal import find_peaks

from itertools import groupby
from operator import itemgetter

import torch
from pipeline import AnomalyDetectionPipeline
import preprocessing as prep
import machine_learning as ml

In [2]:
# LOBSTER files for AMZN on 2012-06-21 (names taken from the files themselves):
# one message file and one order-book file, both with the "_10" suffix.
lobster_msg = "data/LOBSTER/AMZN_2012-06-21_34200000_57600000_message_10.csv"
lobster_ob = "data/LOBSTER/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv"

# Gzipped CSV book file for TOTF.PA dated 2015-01-02 (alternative input).
data = "data/TOTF.PA-book/2015-01-02-TOTF.PA-book.csv.gz"

In [3]:
# Initialize Pipeline
pipeline = AnomalyDetectionPipeline(seq_length=25, batch_size=128)

# Load Data
nrows = 40_000
try:
    df_lobster = prep.load_lobster_data(orderbook_path=lobster_ob, message_path=lobster_msg, levels=10, nrows=nrows)
except Exception as e:
    # Every step below works on the loaded frame, so report the cause and stop
    # the cell here instead of continuing with `pipeline.raw_df` unset (or left
    # over from an earlier run of this cell).
    print(f"Error loading data: {e}")
    raise
else:
    pipeline.raw_df = df_lobster

# Alternative input: let the pipeline read the TOTF.PA book file itself.
# pipeline.load_data(data, nrows=nrows)

# Engineer Features
selected_features = ['base', 'tao', 'hawkes', 'poutre']
pipeline.engineer_features(feature_sets=selected_features)

# Preprocess Data
# Left as the last expression on purpose: the cell displays whatever this call returns.
pipeline.scale_and_sequence(method='box-cox', train_split=0.7)

Pipeline initialized on device: cuda
Loading LOBSTER data from data/LOBSTER/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv...
Successfully loaded 40000 LOBSTER rows.
Engineering features: ['base', 'tao', 'hawkes', 'poutre']...
Feature Engineering complete. Total features: 121
Preprocessing with method: box-cox...
Dropping 12 constant/zero-variance features: ['Hawkes_L_ask_beta10_Eta1.0', 'Hawkes_L_bid_beta100_Eta10.0', 'Hawkes_L_bid_beta1000_Eta1.0', 'Hawkes_L_bid_beta1000_Eta10.0', 'Hawkes_L_bid_beta10_Eta10.0', 'Hawkes_L_ask_beta100_Eta1.0', 'Hawkes_L_bid_beta10_Eta1.0', 'Hawkes_L_bid_beta100_Eta1.0', 'Hawkes_L_ask_beta1000_Eta1.0', 'Hawkes_L_ask_beta1000_Eta10.0', 'Hawkes_L_ask_beta100_Eta10.0', 'Hawkes_L_ask_beta10_Eta10.0']
Data split: Train (27982, 25, 109), Test (11993, 25, 109)


<pipeline.AnomalyDetectionPipeline at 0x1a5e657ebc0>

In [4]:
# Train PNN Model
# Seed numpy and torch before training so that Restart & Run All reproduces
# the reported losses. The model is built inside this call (it logs
# "Initializing Probabilistic Neural Network (PNN)..."), so seeding here covers
# weight initialisation as well.
# NOTE(review): assumes train_model draws only from the numpy/torch global RNGs - confirm.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

pipeline.train_model(
    model_type='pnn',
    epochs=2,
    lr=1e-3,
    hidden_dim=64
)

Initializing Probabilistic Neural Network (PNN)...
Training PNN...
Epoch 1/2 - Loss: 1.396600
Epoch 2/2 - Loss: 0.953557


<pipeline.AnomalyDetectionPipeline at 0x1a5e657ebc0>

In [5]:
# Run the PNN-based anomaly scan for a hypothetical spoof order
spoof_results = pipeline.detect_spoofing(
    Q_spoof=50000,     # size of the hypothetical spoof order
    delta_ticks=5,     # distance from the best quote, in ticks
    maker_fee=-0.002,  # negative fee, i.e. a rebate (common in maker-taker markets)
    taker_fee=0.003,
)

if spoof_results.empty:
    print("No obvious spoofing opportunities detected with current parameters.")
else:
    print(f"Found {len(spoof_results)} potential spoofing signatures.")

    # Scatter of expected gain against time index, one red dot per result row
    time_steps = spoof_results['Index']
    expected_gain = spoof_results['Expected_Gain']

    plt.figure(figsize=(12, 4))
    plt.plot(time_steps, expected_gain, '.', color='red', label='Positive Gain')
    plt.title("Detected Spoofing Opportunities (Positive Expected Gain)")
    plt.xlabel("Time Step")
    plt.ylabel("Estimated Profit")
    plt.legend()
    plt.show()

Scanning for spoofing (Q=50000, dist=5 ticks)...
Found 0 potential spoofing opportunities.
No obvious spoofing opportunities detected with current parameters.


In [6]:
def plot_lob_snapshot(pipeline, index, levels=10):
    """Bar-chart the bid and ask volume at each book level for a single row.

    The row is taken from ``pipeline.raw_df`` by position (``.iloc[index]``)
    and the columns ``bid-volume-<k>`` / ``ask-volume-<k>`` are read for
    ``k = 1..levels``. Bids are drawn as positive green bars and asks as
    negative red bars, so the two sides mirror each other around zero.
    The figure is shown immediately; nothing is returned.
    """
    snapshot = pipeline.raw_df.iloc[index]
    depth = range(1, levels + 1)

    bid_sizes = [snapshot[f'bid-volume-{k}'] for k in depth]
    ask_sizes = [snapshot[f'ask-volume-{k}'] for k in depth]

    # x positions are the level numbers themselves (1 .. levels)
    level_ticks = np.arange(1, levels + 1)

    plt.figure(figsize=(10, 5))
    plt.bar(level_ticks, bid_sizes, color='green', label='Bid Volume (Buy)', alpha=0.7)
    # Ask side is sign-flipped purely for visual contrast with the bid side
    plt.bar(level_ticks, [-size for size in ask_sizes], color='red', label='Ask Volume (Sell)', alpha=0.7)

    plt.axhline(0, color='black', linewidth=0.8)
    plt.xlabel("Level (1 = Best Quote)")
    plt.ylabel("Volume (Shares)")
    plt.title(f"LOB Snapshot at Index {index}")
    plt.legend()
    plt.show()

# Show the order book at the row with the largest expected gain from the PNN scan
# (skipped entirely when the scan returned no rows).
# NOTE(review): 'Index' is handed straight to raw_df.iloc - confirm it is a
# raw-row position and not a sequence index.
if not spoof_results.empty:
    top_gain_label = spoof_results['Expected_Gain'].idxmax()
    best_opportunity_idx = spoof_results.loc[top_gain_label, 'Index']
    print(f"Visualizing Order Book at most profitable moment: Index {best_opportunity_idx}")
    plot_lob_snapshot(pipeline, best_opportunity_idx)

In [7]:
def plot_lob_evolution(pipeline, center_index, offset=10, levels=10):
    """
    Plots LOB snapshots before, during, and after a specific index.

    Three side-by-side panels sharing one y-axis show the per-level bid and
    ask volumes read from ``pipeline.raw_df`` at ``center_index - offset``,
    ``center_index`` and ``center_index + offset``. Bid volume is drawn as
    positive green bars; ask volume is negated and drawn as red bars.

    A panel whose row position falls outside ``raw_df`` is still titled and
    is marked "Index Out of Bounds" in the middle of the panel.

    Args:
        pipeline: Object exposing ``raw_df`` with columns ``bid-volume-<k>``
            and ``ask-volume-<k>`` for ``k = 1..levels``.
        center_index: The time index of the detected anomaly (a row position,
            used with ``.iloc``).
        offset: Number of time steps to look before/after.
        levels: Number of price levels to display.
    """
    # Define time points: Before, Event, After
    panels = [
        ("Before", center_index - offset),
        ("Event", center_index),
        ("After", center_index + offset),
    ]
    n_rows = len(pipeline.raw_df)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
    x = np.arange(1, levels + 1)
    legend_drawn = False

    for ax, (label, idx) in zip(axes, panels):
        # Title and x-label are set for every panel, including out-of-range ones
        ax.set_title(f"{label} (t={idx})")
        ax.set_xlabel("Price Level (1=Best)")

        # Safety check for index bounds (a negative position would otherwise
        # silently wrap around to the end of the frame)
        if not 0 <= idx < n_rows:
            # transAxes puts (0.5, 0.5) at the centre of the panel; the default
            # data coordinates would tie the text to the shared volume scale.
            ax.text(0.5, 0.5, "Index Out of Bounds",
                    ha='center', va='center', transform=ax.transAxes)
            continue

        row = pipeline.raw_df.iloc[idx]

        # Extract volumes
        bids = [row[f'bid-volume-{level}'] for level in range(1, levels + 1)]
        asks = [row[f'ask-volume-{level}'] for level in range(1, levels + 1)]

        # Plot Bid (Green) vs Ask (Red/Negative)
        ax.bar(x, bids, color='green', label='Bid Volume', alpha=0.7)
        ax.bar(x, [-a for a in asks], color='red', label='Ask Volume', alpha=0.7)

        ax.axhline(0, color='black', linewidth=0.8)
        ax.grid(True, alpha=0.3)

        # One legend is enough; attach it to the first panel that has bars
        if not legend_drawn:
            ax.legend()
            legend_drawn = True

    # The y-axis is shared, so label it once on the left-most panel regardless
    # of whether that panel holds data.
    axes[0].set_ylabel("Volume (Shares)")

    fig.suptitle("Order Book Dynamics Around Potential Spoofing Event", fontsize=14)
    fig.tight_layout()
    plt.show()

# Plot the book around the highest-gain row, one step before and one step after
# (nothing is drawn when the scan returned no rows).
if not spoof_results.empty:
    offset = 1

    # Row label of the largest expected gain, then the 'Index' value stored on that row
    peak_gain_label = spoof_results['Expected_Gain'].idxmax()
    best_idx = spoof_results.loc[peak_gain_label, 'Index']

    print(f"Visualizing evolution around index {best_idx} with offset {offset}...")
    plot_lob_evolution(pipeline, best_idx, offset=offset)