# 156 Project Work

The idea is to convert as much of the LOB data as possible into 2D heatmaps. The heatmap conversion needs a lot of work to ensure the result can be properly fed into a convolutional NN. The output of the CNN should be a 1, 2, or 3 that reflects whether the mid-price movement is (1) downward, (2) stationary, or (3) upward. We will evaluate the efficacy of our CNN through win-rate.

The dataset is very large, so we will (probably) only use one of the datasets for training and one for testing. They are already split into normalized cross-fold training and testing data, so we will just use the NoAuction, min-max normalized, CF1 training and testing dataset.

The last five rows of the dataset represent the price movement (1, 2, or 3), while the first 144 rows are the predictors. The first 40 rows are the only rows we will be using. They are in the following order:
Rows 1-10: Bid prices with 1 being the best, 10 being the worst
Rows 11-20: Ask prices with 11 being the best, 20 being the worst
Rows 21-30: Bid volumes associated with each bid price
Rows 31-40: Ask volumes associated with each ask price

Here is the dataset link and the paper associated with it, respectively:
https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649
https://arxiv.org/pdf/1705.03233

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

def load_fi2010_data(file_path, num_levels=10):
    """
    Load the FI-2010 dataset from a .txt file and extract the top 'num_levels' bid-ask levels.

    The file stores one feature per row and one LOB snapshot per column (the
    label loader in this file reads labels from a *row* for the same reason),
    so the first ``num_levels * 4`` rows are selected and transposed to give
    one snapshot per DataFrame row.

    Args:
    - file_path (str): Path to the FI-2010 .txt dataset.
    - num_levels (int): Number of bid-ask levels to extract.

    Returns:
    - DataFrame with one row per snapshot and ``num_levels * 4`` columns named
      ask_price_i, ask_size_i, bid_price_i, bid_size_i for i = 1..num_levels.

    Raises:
    - ValueError: If the file has fewer than ``num_levels * 4`` feature rows.
    """
    # Column names for bid/ask prices and sizes, interleaved per level
    col_names = [
        f"{side}_{field}_{level}"
        for level in range(1, num_levels + 1)
        for side, field in (
            ("ask", "price"),
            ("ask", "size"),
            ("bid", "price"),
            ("bid", "size"),
        )
    ]

    # Load dataset from .txt file (whitespace-separated)
    data = pd.read_csv(file_path, delimiter=r'\s+', header=None)

    num_features = num_levels * 4
    if data.shape[0] < num_features:
        raise ValueError(
            f"Expected at least {num_features} feature rows in {file_path!r}, "
            f"found {data.shape[0]}"
        )

    # Features live in rows: keep the LOB rows, then flip to (snapshots x features)
    lob = data.iloc[:num_features, :].T
    lob.columns = col_names

    return lob

def convert_to_2d_matrix(bid_prices, ask_prices, bid_volumes, ask_volumes):
    """
    Convert the bid/ask data into a (levels x 4) matrix for CNN input.

    The number of levels is taken from the inputs instead of being fixed at
    10, so callers that extract a different number of levels still work.

    Args:
    - bid_prices (np.array): Normalized bid prices (one value per level).
    - ask_prices (np.array): Normalized ask prices (one value per level).
    - bid_volumes (np.array): Normalized bid volumes (one value per level).
    - ask_volumes (np.array): Normalized ask volumes (one value per level).

    Returns:
    - Float matrix of shape (levels, 4) whose columns are, in order:
      Ask Price, Bid Price, Ask Volume, Bid Volume.

    Raises:
    - ValueError: If the inputs have incompatible lengths.
    """
    # Column order of the output matrix
    columns = [
        np.asarray(values, dtype=float)
        for values in (ask_prices, bid_prices, ask_volumes, bid_volumes)
    ]

    # Scalars carry no length; fall back to the historical 10 levels if every
    # input is a scalar so earlier behaviour is preserved.
    num_levels = max((col.shape[0] for col in columns if col.ndim > 0), default=10)

    lob_matrix = np.zeros((num_levels, 4))
    for position, col in enumerate(columns):
        lob_matrix[:, position] = col

    return lob_matrix

def create_lob_heatmap(lob_matrix, save_path=None):
    """
    Convert the LOB data matrix into a 2D heatmap.

    Args:
    - lob_matrix (np.array): (levels x 4) matrix of normalized bid-ask data,
      with columns Ask Price, Bid Price, Ask Volume, Bid Volume.
    - save_path (str): Path to save the generated heatmap image. If falsy,
      the figure is shown instead.

    Returns:
    - None (saves image if save_path is provided).
    """
    num_levels = np.shape(lob_matrix)[0]

    plt.figure(figsize=(6, 6))
    sns.heatmap(
        lob_matrix,
        annot=False,
        cmap="coolwarm",
        xticklabels=["Ask Price", "Bid Price", "Ask Volume", "Bid Volume"],
        # Row 0 of the matrix is level 1 and is drawn at the top, so the
        # labels must count upward to stay attached to the right level.
        yticklabels=np.arange(1, num_levels + 1),
    )
    plt.title("Limit Order Book Heatmap")
    plt.xlabel("Order Type")
    plt.ylabel("Price Level")

    if save_path:
        # A bare file name has an empty dirname, which os.makedirs rejects
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, bbox_inches="tight")
        plt.close()
    else:
        plt.show()

def generate_heatmaps(dataset, save_dir, num_samples=1, num_levels=10):
    """
    Generate heatmaps for a given number of LOB snapshots.

    Args:
    - dataset (DataFrame): FI-2010 dataset containing LOB snapshots, one
      snapshot per row, with columns ordered as load_fi2010_data names them
      (ask price, ask size, bid price, bid size, repeated per level).
    - save_dir (str): Directory to save the heatmaps.
    - num_samples (int): Number of samples to convert to heatmaps.
    - num_levels (int): Number of price levels to read from each row.

    Returns:
    - None (Saves images in save_dir as heatmap_<i>.png).
    """
    os.makedirs(save_dir, exist_ok=True)

    # Never exceed the dataset size; a negative request produces nothing
    count = max(0, min(num_samples, len(dataset)))
    print(f"Generating {count} heatmaps...")
    for i in tqdm(range(count)):
        # Only the first num_levels * 4 values describe the order book
        row = dataset.iloc[i].values[: num_levels * 4]

        # The four feature kinds are interleaved per level, so each kind is
        # every 4th value rather than a contiguous run of columns.
        ask_prices = row[0::4]
        ask_volumes = row[1::4]
        bid_prices = row[2::4]
        bid_volumes = row[3::4]

        # Convert to 2D matrix
        lob_matrix = convert_to_2d_matrix(bid_prices, ask_prices, bid_volumes, ask_volumes)

        # Save heatmap
        save_path = os.path.join(save_dir, f"heatmap_{i}.png")
        create_lob_heatmap(lob_matrix, save_path)

# ---- Step 6: Process Labels ----
def load_labels(file_path, label_row=144):
    """
    Read one label row from the dataset file and one-hot encode it.

    Args:
    - file_path (str): Path to the dataset.
    - label_row (int): Zero-based index of the row holding the labels
      (144 selects the 145th row).

    Returns:
    - np.array with one encoded entry per value found in the label row,
      as produced by one_hot_encode.
    """
    frame = pd.read_csv(file_path, delimiter=r'\s+', header=None)

    # Labels are stored along a row: one value per column
    raw_labels = frame.iloc[label_row].values

    encoded = []
    for raw in raw_labels:
        encoded.append(one_hot_encode(raw))
    return np.array(encoded)

def one_hot_encode(label):
    """
    Map a price-movement label to its one-hot vector.

    Args:
    - label (int): Price movement label (1, 2, or 3).

    Returns:
    - A new 3-element list with a single 1 at position ``label - 1``,
      or None when the label is not equal to 1, 2, or 3.
    """
    for position, code in enumerate((1, 2, 3)):
        if label == code:
            vector = [0, 0, 0]
            vector[position] = 1
            return vector
    return None

# ---- Step 7: Final Code to Prepare Data for CNN ----
def prepare_data_for_cnn(file_path, num_samples=1, num_levels=10):
    """
    Prepare the dataset for CNN training by processing LOB data and labels.

    Args:
    - file_path (str): Path to the FI-2010 dataset.
    - num_samples (int): Number of samples to process.
    - num_levels (int): Number of price levels.

    Returns:
    - X: Feature array of LOB snapshots, one (levels x 4) matrix per sample.
    - y: One-hot encoded labels for the same samples as X (same length as X).
    """
    # Load the data
    dataset = load_fi2010_data(file_path, num_levels)

    # Prepare labels (one per snapshot in the file)
    labels = load_labels(file_path)

    # Never exceed the dataset size; a negative request produces nothing
    count = max(0, min(num_samples, len(dataset)))

    # Prepare features (LOB data converted to matrices)
    X = []
    for i in tqdm(range(count)):
        row = dataset.iloc[i].values[: num_levels * 4]

        # Columns are interleaved per level as named by load_fi2010_data:
        # ask price, ask size, bid price, bid size.
        ask_prices = row[0::4]
        ask_volumes = row[1::4]
        bid_prices = row[2::4]
        bid_volumes = row[3::4]

        lob_matrix = convert_to_2d_matrix(bid_prices, ask_prices, bid_volumes, ask_volumes)
        X.append(lob_matrix)

    X = np.array(X)

    # Keep only the labels of the processed samples so X and y stay aligned
    return X, labels[:count]

# Dataset file, resolved relative to the current working directory.
file_path = 'Train_DST.txt'  
# Build features and labels from the first 5 snapshots only (small trial run).
X, y = prepare_data_for_cnn(file_path, num_samples=5)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Define CNN architecture
def build_cnn(input_shape, num_classes=3, loss='categorical_crossentropy'):
    """
    Build and compile a small 2D CNN classifier.

    Args:
    - input_shape (tuple): Shape of one input sample as (height, width, channels).
    - num_classes (int): Number of output classes.
    - loss (str): Loss passed to ``model.compile``. The default matches the
      one-hot labels produced by the label loader in this file; pass
      'sparse_categorical_crossentropy' when labels are integer class indices.

    Returns:
    - Compiled keras.Sequential model with a softmax output of size num_classes.
    """
    model = keras.Sequential([
        # Declare the input explicitly instead of via Conv2D(input_shape=...)
        keras.Input(shape=input_shape),

        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        # 'same' pooling rounds odd sizes up, so small inputs (e.g. 10x4)
        # are not pooled down to an empty dimension; even sizes are unaffected.
        layers.MaxPooling2D((2, 2), padding='same'),

        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),

        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),

        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax')  # Multi-class classification
    ])

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=loss,
                  metrics=['accuracy'])

    return model

# Shape of one model input as (height, width, channels).
# NOTE(review): prepare_data_for_cnn documents its features as 10x4 matrices,
# which does not match this 64x64x1 shape — confirm which representation the
# model is meant to consume before training/predicting.
input_shape = (64, 64, 1) 
model = build_cnn(input_shape)

# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report

# Make predictions
y_pred = model.predict(X)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert softmax outputs to class indices (0, 1, 2)

# y may hold a label for every snapshot in the file while X holds only the
# requested samples; trim so both describe the same samples.
y_true = np.asarray(y)[:len(X)]

# Evaluate model performance
# NOTE(review): X/y come from the file loaded earlier in the notebook; the
# "Test" wording below is only accurate if that file is a held-out split.
loss, accuracy = model.evaluate(X, y_true, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Labels are one-hot rows, but classification_report compares class indices
y_true_classes = np.argmax(y_true, axis=1)

# Generate classification report
# Explicit labels keep the three target names valid even when a class is
# absent from a small sample.
print("Classification Report:")
print(classification_report(y_true_classes, y_pred_classes, labels=[0, 1, 2], target_names=["Down", "Stationary", "Up"]))