In [47]:
import sys
import os
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import json

# Resolve the project root as the directory two levels above the current
# working directory, and put it at the front of sys.path (once).
working_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.dirname(working_dir)
project_root = os.path.dirname(parent_dir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print(project_root)

/cs/student/projects3/cf/2024/benjhunt/deep-lob-project


In [48]:
# Constants
# Directory holding one results run. The loops below treat it as
# <data_location>/<representation>/<ticker>/*.json.
data_location = os.path.join(project_root, "results", "2025-08-26")

# os.listdir returns entries in arbitrary order; sort so that the order of the
# discovered files (and therefore the DataFrame rows built from them) is
# reproducible from run to run.
folders = sorted(os.listdir(data_location))

# Go one level deeper and get every pair of (folder, subfolder)
folder_pairs = []
for folder in folders:
    folder_path = os.path.join(data_location, folder)
    if not os.path.isdir(folder_path):
        # Plain files sitting next to the representation folders are ignored.
        continue
    subfolders = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, name))
    )
    for subfolder in subfolders:
        folder_pairs.append((folder, subfolder))

# One entry per JSON file, tagged with the folder names it was found under.
# glob.glob also returns matches in arbitrary order, hence the sort.
json_files = [
    {'Representation': representation, 'Ticker': ticker, 'File': f}
    for representation, ticker in folder_pairs
    for f in sorted(glob.glob(os.path.join(data_location, representation, ticker, "*.json")))
]

# Load and flatten all JSON files into a single list of records
def flatten_record(entry):
    """Yield the flattened record(s) stored in one JSON results file.

    Parameters
    ----------
    entry : dict
        Must provide the keys 'File' (path of the JSON file to read),
        'Representation' and 'Ticker'.

    Yields
    ------
    dict
        Each record found in the file (the file may contain a single object
        or a list of objects). Every record is tagged with the entry's
        'Representation' and 'Ticker', and each item of a dict-valued
        'meta', 'metrics' or 'metricsStrength' section is additionally
        copied to a top-level key named "<section>.<item>". The original
        nested section is left in place.
    """
    # Parse the whole file first so the handle is closed before the first
    # yield, instead of staying open for as long as the generator is suspended.
    # JSON text is UTF-8, so do not rely on the locale's default encoding.
    with open(entry['File'], 'r', encoding='utf-8') as file:
        data = json.load(file)

    records = data if isinstance(data, list) else [data]
    for record in records:
        record.update({'Representation': entry['Representation'], 'Ticker': entry['Ticker']})
        # Flatten nested dicts
        for key in ['meta', 'metrics', 'metricsStrength']:
            if key in record and isinstance(record[key], dict):
                for k, v in record[key].items():
                    record[f"{key}.{k}"] = v
        yield record

# Collect every flattened record from every discovered JSON file.
all_data = []
for json_entry in json_files:
    all_data.extend(flatten_record(json_entry))

joined_df = pd.DataFrame(all_data)
# Put the identifying columns first; all remaining columns keep their order.
leading_cols = ['Representation', 'Ticker']
cols = leading_cols + [col for col in joined_df.columns if col not in leading_cols]
joined_df = joined_df[cols]
# Work on a copy so joined_df stays available unchanged.
data = joined_df.copy()
print(data.shape)

(546, 54)


In [49]:
# Remove every pandas display limit so DataFrame and Series output is shown in full.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)


def _sorted_csv(values):
    """Return the values, sorted, as a single comma-separated string."""
    return ','.join(str(value) for value in sorted(values))


# For each (Representation, Ticker, label type) combination, list the
# look-forward horizons that are present in the loaded results.
horizons_by_group = data.groupby(['Representation', 'Ticker', 'meta.labelType'])['meta.lookForwardHorizon']
grouped = horizons_by_group.agg(_sorted_csv)

display(grouped)

Representation  Ticker  meta.labelType
orderbooks      AAPL    CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                BAC     CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                GOOG    CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                JPM     CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                META    CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                MSFT    CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                MTCH    CATEGORICAL       10,20,40,60,80,100,200
                        REGRESSION        10,20,40,60,80,100,200
                NFLX    CATEGORICAL       10,20,40,