In [1]:
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path
import polars as pl


project_root = Path().resolve().parent
sys.path.insert(0, str(project_root / "src"))

from dspy.hdb import get_dataset
from dspy.sim.market_simulator import MarketSimulator
from dspy.utils import to_ns, ts_to_str
from dspy.features.feature_utils import apply_batch_features, extract_features , flatten_features
from dspy.agents.agent_utils import get_agent
from dspy.features.utils import get_products
from dspy.utils import add_ts_dt

# ---------- Load run config file ----------

def load_config(path: Path) -> dict:
    with open(path, "r") as f:
        return json.load(f)

config_path = project_root / "run/run_config.json"
config = load_config(config_path)

dataset_name     = config["dataset"]
product          = config["product"]
depth            = config["depth"]
latency_ns       = config["latency_micros"] * 1_000
max_inventory    = config["max_inventory"]
inv_penalty      = config["inventory_penalty"]
initial_cash     = config["initial_cash"]
agent_config     = config["agent"]
intervals        = config["intervals"]
min_order_size   = config["min_order_size"]
tick_size        = config["tick_size"]
initial_cash     = config["initial_cash"]
cost_in_bps      = config["cost_in_bps"]
fixed_cost       = config["fixed_cost"]
simulator_mode   = config["simulator_mode"]

loader = get_dataset(dataset_name)
all_books, all_ts = [], []
feature_path = project_root / "run/features.json"
feature_config = load_config(feature_path)
inventory_feature_flag = "inventory" in feature_config.keys()

#loaddataframe
for interval in intervals:
        start_str = interval["start"]
        end_str   = interval["end"]
        print('dataframe from:', start_str,'to:',end_str)

        start_ts = datetime.strptime(interval["start"], "%Y-%m-%d %H:%M:%S").strftime("%y%m%d.%H%M%S")
        end_ts   = datetime.strptime(interval["end"],   "%Y-%m-%d %H:%M:%S").strftime("%y%m%d.%H%M%S")

        df = loader.load_book(
            product=product,
            times=[start_ts, end_ts],
            depth=depth,
            type="book_snapshot_25",
            lazy=False
        )
from dspy.features.book_features import add_mid,add_vwap
df=add_mid(df)
df.head()

dataframe from: 2025-04-01 00:00:00 to: 2025-04-02 12:59:59


ts,ts_local,asks[0].price,asks[0].amount,bids[0].price,bids[0].amount,asks[1].price,asks[1].amount,bids[1].price,bids[1].amount,asks[2].price,asks[2].amount,bids[2].price,bids[2].amount,asks[3].price,asks[3].amount,bids[3].price,bids[3].amount,asks[4].price,asks[4].amount,bids[4].price,bids[4].amount,mid
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1743465600287000000,1743465601009280000,82517.5,5.192,82517.4,10.116,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45
1743465602054000000,1743465602057050000,82517.5,5.192,82517.4,10.098,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45
1743465604841000000,1743465604844153000,82517.5,5.174,82517.4,8.993,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.002,82517.45
1743465604892000000,1743465604895330000,82517.5,5.485,82517.4,4.088,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.0,0.002,82518.4,0.002,82515.5,0.002,82517.45
1743465604943000000,1743465604946217000,82525.8,0.159,82525.7,2.129,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.029,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75


In [2]:
df = add_ts_dt(df)
df.head(10)

ts,ts_local,asks[0].price,asks[0].amount,bids[0].price,bids[0].amount,asks[1].price,asks[1].amount,bids[1].price,bids[1].amount,asks[2].price,asks[2].amount,bids[2].price,bids[2].amount,asks[3].price,asks[3].amount,bids[3].price,bids[3].amount,asks[4].price,asks[4].amount,bids[4].price,bids[4].amount,mid,ts_dt
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns]
1743465600287000000,1743465601009280000,82517.5,5.192,82517.4,10.116,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45,2025-04-01 00:00:00.287
1743465602054000000,1743465602057050000,82517.5,5.192,82517.4,10.098,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45,2025-04-01 00:00:02.054
1743465604841000000,1743465604844153000,82517.5,5.174,82517.4,8.993,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.002,82517.45,2025-04-01 00:00:04.841
1743465604892000000,1743465604895330000,82517.5,5.485,82517.4,4.088,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.0,0.002,82518.4,0.002,82515.5,0.002,82517.45,2025-04-01 00:00:04.892
1743465604943000000,1743465604946217000,82525.8,0.159,82525.7,2.129,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.029,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75,2025-04-01 00:00:04.943
1743465604994000000,1743465604997436000,82525.8,0.148,82525.7,5.081,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.002,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75,2025-04-01 00:00:04.994
1743465605046000000,1743465605049051000,82526.4,0.92,82525.7,0.135,82526.5,0.002,82523.6,0.1,82526.6,0.004,82523.1,0.1,82526.7,0.002,82522.7,0.097,82526.8,0.002,82522.4,0.1,82526.05,2025-04-01 00:00:05.046
1743465605099000000,1743465605101874000,82526.4,0.065,82523.6,0.022,82526.5,0.002,82523.1,0.1,82526.6,0.004,82522.7,0.097,82526.7,0.002,82522.4,0.1,82526.8,0.002,82522.0,0.103,82525.0,2025-04-01 00:00:05.099
1743465605150000000,1743465605153810000,82481.5,0.2,82479.4,0.496,82484.0,0.05,82479.3,1.323,82508.5,0.05,82479.2,0.012,82509.2,0.06,82479.0,0.012,82510.4,0.07,82478.9,0.002,82480.45,2025-04-01 00:00:05.150
1743465605201000000,1743465605205113000,82481.5,4.708,82480.3,0.012,82482.3,0.05,82479.6,0.352,82484.0,0.05,82479.4,0.489,82507.6,0.158,82479.3,1.323,82508.5,0.208,82479.2,0.012,82480.9,2025-04-01 00:00:05.201


In [3]:
df=add_vwap(df,1,5)
df.head(10)

ts,ts_local,asks[0].price,asks[0].amount,bids[0].price,bids[0].amount,asks[1].price,asks[1].amount,bids[1].price,bids[1].amount,asks[2].price,asks[2].amount,bids[2].price,bids[2].amount,asks[3].price,asks[3].amount,bids[3].price,bids[3].amount,asks[4].price,asks[4].amount,bids[4].price,bids[4].amount,mid,ts_dt,vwap_level1
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],f64
1743465600287000000,1743465601009280000,82517.5,5.192,82517.4,10.116,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45,2025-04-01 00:00:00.287,82517.433917
1743465602054000000,1743465602057050000,82517.5,5.192,82517.4,10.098,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45,2025-04-01 00:00:02.054,82517.433957
1743465604841000000,1743465604844153000,82517.5,5.174,82517.4,8.993,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.002,82517.45,2025-04-01 00:00:04.841,82517.436521
1743465604892000000,1743465604895330000,82517.5,5.485,82517.4,4.088,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.0,0.002,82518.4,0.002,82515.5,0.002,82517.45,2025-04-01 00:00:04.892,82517.457297
1743465604943000000,1743465604946217000,82525.8,0.159,82525.7,2.129,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.029,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75,2025-04-01 00:00:04.943,82525.706949
1743465604994000000,1743465604997436000,82525.8,0.148,82525.7,5.081,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.002,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75,2025-04-01 00:00:04.994,82525.70283
1743465605046000000,1743465605049051000,82526.4,0.92,82525.7,0.135,82526.5,0.002,82523.6,0.1,82526.6,0.004,82523.1,0.1,82526.7,0.002,82522.7,0.097,82526.8,0.002,82522.4,0.1,82526.05,2025-04-01 00:00:05.046,82526.310427
1743465605099000000,1743465605101874000,82526.4,0.065,82523.6,0.022,82526.5,0.002,82523.1,0.1,82526.6,0.004,82522.7,0.097,82526.7,0.002,82522.4,0.1,82526.8,0.002,82522.0,0.103,82525.0,2025-04-01 00:00:05.099,82525.691954
1743465605150000000,1743465605153810000,82481.5,0.2,82479.4,0.496,82484.0,0.05,82479.3,1.323,82508.5,0.05,82479.2,0.012,82509.2,0.06,82479.0,0.012,82510.4,0.07,82478.9,0.002,82480.45,2025-04-01 00:00:05.150,82480.003448
1743465605201000000,1743465605205113000,82481.5,4.708,82480.3,0.012,82482.3,0.05,82479.6,0.352,82484.0,0.05,82479.4,0.489,82507.6,0.158,82479.3,1.323,82508.5,0.208,82479.2,0.012,82480.9,2025-04-01 00:00:05.201,82481.496949


In [44]:
def add_ret_time(
    df: pl.DataFrame,
    delta: int = 50,
    base_col: str = "mid",
    levels: int = 1,
    depth: int = 1,
    time_col: str = "ts_dt",
    products: list[str] | None = None
) -> pl.DataFrame:
    
    if time_col not in df.columns:
        df = add_ts_dt(df)

    # Ensure time_col is datetime type
    if df[time_col].dtype != pl.Datetime:
        df = df.with_columns(pl.col(time_col).cast(pl.Datetime))


    if base_col == "mid":
        col_prefix = "mid"
        if not any(col.startswith(col_prefix) for col in df.columns):
            df = add_mid(df, products=products)
    elif base_col == "vwap":
        col_prefix = f"vwap_level{levels}"
        if not any(col.startswith(col_prefix) for col in df.columns):
            df = add_vwap(df, levels=levels, depth=depth, products=products)
    else:
        raise ValueError("base_col must be 'mid' or 'vwap'.")

    if products is None:
        products = get_products(df, [col_prefix])

    if products == []:
        price_col = col_prefix
        lagged_df = (
            df.select([time_col, price_col])
            .with_columns([(pl.col(time_col) + pl.duration(milliseconds=delta)).cast(pl.Datetime("ns")).alias(time_col)])
            .rename({price_col: f"{price_col}_past"})
        )

        df = df.join_asof(
            lagged_df,
            left_on=time_col,
            right_on=time_col,
            strategy="backward",
            tolerance=timedelta(milliseconds=1000000)
         ).with_columns([
            ((pl.col(price_col)/pl.col(f"{price_col}_past"))-1).alias(
                f"{col_prefix}_ret_{delta}ms" if base_col == "mid" else f"vwap_ret_{delta}ms_l{levels}"
            )
        ])

    else:
        for product in products:
            col = f"{col_prefix}_{product}"
            lagged_df = df.select([time_col, col]).rename({col: f"{col}_past"})

            df = df.join_asof(
                lagged_df,
                left_on=time_col,
                right_on=time_col,
                strategy="backward",
                tolerance=timedelta(milliseconds=1000000)
            ).with_columns([
                ((pl.col(col)/pl.col(f"{col}_past")) -1).alias(
                    f"{col_prefix}_ret_{delta}ms_{product}" if base_col == "mid"
                    else f"vwap_ret_{delta}ms_l{levels}_{product}"
                )
            ])

    return df.drop_nulls()


In [4]:
#new
def add_ret_time(
    df: pl.DataFrame,
    delta: int = 50,  # in milliseconds
    base_col: str = "mid",  # 'mid' or 'vwap'
    levels: int = 1,
    depth: int = 1,
    time_col: str = "ts_dt",
    products: list[str] | None = None
) -> pl.DataFrame:

    if time_col not in df.columns:
        df = add_ts_dt(df)

    # Ensure time_col is datetime[ns]
    if df[time_col].dtype != pl.Datetime("ns"):
        df = df.with_columns(pl.col(time_col).cast(pl.Datetime("ns")))

    # Ensure price column exists
    if base_col == "mid":
        col_prefix = "mid"
        if not any(col.startswith(col_prefix) for col in df.columns):
            df = add_mid(df, products=products)
    elif base_col == "vwap":
        col_prefix = f"vwap_level{levels}"
        if not any(col.startswith(col_prefix) for col in df.columns):
            df = add_vwap(df, levels=levels, depth=depth, products=products)
    else:
        raise ValueError("base_col must be 'mid' or 'vwap'.")

    if products is None:
        products = get_products(df, [col_prefix])

    if products == []:
        price_col = col_prefix
        lagged_df = (
            df.select([time_col, price_col])
            .with_columns(
                (pl.col(time_col) + pl.duration(milliseconds=delta))
                .cast(pl.Datetime("ns"))
                .alias(time_col)
            )
            .rename({price_col: f"{price_col}_past"})
        )
        ret_col_name = (
                f"ret_{delta}ms_{col_prefix}"
            )
        df = df.join_asof(
            lagged_df,
            left_on=time_col,
            right_on=time_col,
            strategy="backward",
            tolerance=timedelta(milliseconds=1000000)  # generous tolerance
        ).with_columns([
            ((pl.col(price_col) / pl.col(f"{price_col}_past")) - 1).alias(ret_col_name)
        ]).drop([f"{price_col}_past"])

    else:
        for product in products:
            col = f"{col_prefix}_{product}"
            lagged_df = (
                df.select([time_col, col])
                .with_columns(
                    (pl.col(time_col) + pl.duration(milliseconds=delta))
                    .cast(pl.Datetime("ns"))
                    .alias(time_col)
                )
                .rename({col: f"{col}_past"})
            )

            ret_col_name = (
                f"ret_{delta}ms_{col_prefix}_{product}"
            )

            df = df.join_asof(
                lagged_df,
                left_on=time_col,
                right_on=time_col,
                strategy="backward",
                tolerance=timedelta(milliseconds=1000000)
            ).with_columns([
                ((pl.col(col) / pl.col(f"{col}_past")) - 1).alias(ret_col_name)
            ]).drop([f"{col}_past"])

    return df.drop_nulls()

In [5]:
df = add_ret_time(df,200,'vwap',1,5)
df.head(10)

ts,ts_local,asks[0].price,asks[0].amount,bids[0].price,bids[0].amount,asks[1].price,asks[1].amount,bids[1].price,bids[1].amount,asks[2].price,asks[2].amount,bids[2].price,bids[2].amount,asks[3].price,asks[3].amount,bids[3].price,bids[3].amount,asks[4].price,asks[4].amount,bids[4].price,bids[4].amount,mid,ts_dt,vwap_level1,vwap_ret_200ms_l1
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],f64,f64
1743465602054000000,1743465602057050000,82517.5,5.192,82517.4,10.098,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.091,82517.45,2025-04-01 00:00:02.054,82517.433957,4.8388e-10
1743465604841000000,1743465604844153000,82517.5,5.174,82517.4,8.993,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.1,0.185,82518.4,0.002,82516.0,0.002,82517.45,2025-04-01 00:00:04.841,82517.436521,3.108e-08
1743465604892000000,1743465604895330000,82517.5,5.485,82517.4,4.088,82517.6,0.79,82517.3,0.016,82517.9,0.002,82517.2,0.002,82518.3,0.002,82516.0,0.002,82518.4,0.002,82515.5,0.002,82517.45,2025-04-01 00:00:04.892,82517.457297,2.8285e-07
1743465604943000000,1743465604946217000,82525.8,0.159,82525.7,2.129,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.029,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75,2025-04-01 00:00:04.943,82525.706949,0.0001
1743465604994000000,1743465604997436000,82525.8,0.148,82525.7,5.081,82525.9,0.005,82525.3,0.1,82526.4,1.585,82523.6,0.1,82526.5,0.002,82523.1,0.1,82526.6,0.004,82522.7,0.097,82525.75,2025-04-01 00:00:04.994,82525.70283,0.0001
1743465605046000000,1743465605049051000,82526.4,0.92,82525.7,0.135,82526.5,0.002,82523.6,0.1,82526.6,0.004,82523.1,0.1,82526.7,0.002,82522.7,0.097,82526.8,0.002,82522.4,0.1,82526.05,2025-04-01 00:00:05.046,82526.310427,0.000108
1743465605099000000,1743465605101874000,82526.4,0.065,82523.6,0.022,82526.5,0.002,82523.1,0.1,82526.6,0.004,82522.7,0.097,82526.7,0.002,82522.4,0.1,82526.8,0.002,82522.0,0.103,82525.0,2025-04-01 00:00:05.099,82525.691954,0.0001
1743465605150000000,1743465605153810000,82481.5,0.2,82479.4,0.496,82484.0,0.05,82479.3,1.323,82508.5,0.05,82479.2,0.012,82509.2,0.06,82479.0,0.012,82510.4,0.07,82478.9,0.002,82480.45,2025-04-01 00:00:05.150,82480.003448,-0.000554
1743465605201000000,1743465605205113000,82481.5,4.708,82480.3,0.012,82482.3,0.05,82479.6,0.352,82484.0,0.05,82479.4,0.489,82507.6,0.158,82479.3,1.323,82508.5,0.208,82479.2,0.012,82480.9,2025-04-01 00:00:05.201,82481.496949,-0.000536
1743465605252000000,1743465605254936000,82481.5,6.204,82481.1,4.926,82482.3,0.05,82480.3,0.012,82484.0,0.05,82479.6,0.352,82507.6,0.158,82479.3,1.323,82508.5,0.208,82479.2,0.012,82481.3,2025-04-01 00:00:05.252,82481.322965,-0.000545


In [15]:
import numpy as np
arr=[82517.45,82517.45,82517.45,82517.45,82525.75]
arr=np.array(arr)
(82525.75-np.mean(arr))/np.std(arr,ddof=1)
np.std(arr,ddof=1)
np.mean(arr)

np.float64(82519.11)

In [21]:
col_prefix = 'mid'
time_col="ts_dt"
ret_col='retm'
vol_col='tickvoltest'
delta = 4000
lagged_df =df.select([time_col, col_prefix]).with_columns([
            (pl.col(time_col) + pl.duration(milliseconds=-delta)).alias("join_key"),
            pl.col(col_prefix).alias(f"{col_prefix}_past")])
lagged_df

ts_dt,mid,join_key,mid_past
datetime[ns],f64,datetime[μs],f64
2025-04-01 00:00:00.287,82517.45,2025-03-31 23:59:56.287,82517.45
2025-04-01 00:00:02.054,82517.45,2025-03-31 23:59:58.054,82517.45
2025-04-01 00:00:04.841,82517.45,2025-04-01 00:00:00.841,82517.45
2025-04-01 00:00:04.892,82517.45,2025-04-01 00:00:00.892,82517.45
2025-04-01 00:00:04.943,82525.75,2025-04-01 00:00:00.943,82525.75
…,…,…,…
2025-04-02 12:59:58.746,85008.35,2025-04-02 12:59:54.746,85008.35
2025-04-02 12:59:58.799,85008.35,2025-04-02 12:59:54.799,85008.35
2025-04-02 12:59:58.852,85008.85,2025-04-02 12:59:54.852,85008.85
2025-04-02 12:59:58.905,85008.85,2025-04-02 12:59:54.905,85008.85


In [22]:
df.join(lagged_df, left_on=time_col, right_on="join_key", how="left")

SchemaError: datatypes of join keys don't match - `ts_dt`: datetime[ns] on left does not match `join_key`: datetime[μs] on right

In [12]:
c=82526.310427
p=82525.706949
(c-p)/p

7.312606244954507e-06

In [8]:
aq=5.192+0.79
bq=10.116+0.016	
(bq-aq)/(bq+aq)

0.2575400273054486

In [5]:
aq=5.192
ap=82517.5
bq=10.116
bp=82517.4
(bp*aq+ap*bq)/(bq+aq)

82517.46608309382

In [6]:
(bp*bq+ap*aq)/(bq+aq)

82517.43391690619