In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from itertools import combinations

In [46]:
# Load Data
# Read the training CSV and keep only rows with no missing values in any column.
train_data = pd.read_csv("data/train_data.csv").dropna()

In [57]:
def feature_engineering(df):
    """Derive order-book features plus their squares and pairwise products.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bid_volume_1..5``, ``ask_volume_1..5``, ``bid_price_1``,
        ``bid_price_5``, ``ask_price_1``, ``ask_price_5``, ``midprice``,
        ``recent_buy_order_count``, ``recent_sell_order_count`` and
        ``total_order_count``.

    Returns
    -------
    (pandas.DataFrame, list[str])
        A new frame holding the original columns plus every engineered column,
        and the ordered list of model feature names (7 base features, their
        7 squares, and the 21 pairwise interaction terms).

    Notes
    -----
    Rows containing a NaN in *any* column (original or engineered) are dropped
    before the squared/interaction terms are built, so the returned frame can
    have fewer rows than the input.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Small constant added to every denominator to avoid division by zero.
    eps = 1e-10
    bid_volume_cols = [f"bid_volume_{i}" for i in range(1, 6)]
    ask_volume_cols = [f"ask_volume_{i}" for i in range(1, 6)]

    # Work on a copy so the caller's DataFrame never gains columns as a side effect.
    out = df.copy()

    # 1. Volume Imbalance at best bid/ask
    out["volume_imbalance_1"] = (
        out["bid_volume_1"] - out["ask_volume_1"]
    ) / (out["bid_volume_1"] + out["ask_volume_1"] + eps)

    # 2. Cumulative volume imbalance (top 5)
    out["cum_bid_vol"] = out[bid_volume_cols].sum(axis=1)
    out["cum_ask_vol"] = out[ask_volume_cols].sum(axis=1)
    out["volume_imbalance_5"] = (
        out["cum_bid_vol"] - out["cum_ask_vol"]
    ) / (out["cum_bid_vol"] + out["cum_ask_vol"] + eps)

    # 3. Spread
    out["spread"] = out["ask_price_1"] - out["bid_price_1"]

    # 4. Inverted Midprice
    out["inv_midprice"] = 1 / (out["midprice"] + eps)

    # 5. Liquidity slope: price distance between level 1 and level 5,
    #    divided by the level-1 volume on the same side.
    out["bid_liquidity"] = (out["bid_price_1"] - out["bid_price_5"]) / (out["bid_volume_1"] + eps)
    out["ask_liquidity"] = (out["ask_price_5"] - out["ask_price_1"]) / (out["ask_volume_1"] + eps)

    # 6. Order flow imbalance
    out["order_flow_imbalance"] = (
        out["recent_buy_order_count"] - out["recent_sell_order_count"]
    ) / (out["total_order_count"] + eps)

    out = out.dropna()

    # === Base Features (No normalization) ===
    base_features = [
        "volume_imbalance_1", "volume_imbalance_5", "spread", "inv_midprice",
        "bid_liquidity", "ask_liquidity", "order_flow_imbalance"
    ]

    # Collect the derived columns first and attach them in one `assign` call:
    # `assign` returns a fresh frame, so nothing is written into the object
    # returned by `dropna()` (which is what triggers SettingWithCopyWarning).
    derived = {}

    # === Add Squared Features ===
    squared_cols = []
    for f in base_features:
        sq_col = f + "_squared"
        derived[sq_col] = out[f] ** 2
        squared_cols.append(sq_col)

    # === Add Interaction Features ===
    interaction_cols = []
    for f1, f2 in combinations(base_features, 2):
        inter_col = f"{f1}_x_{f2}"
        derived[inter_col] = out[f1] * out[f2]
        interaction_cols.append(inter_col)

    out = out.assign(**derived)

    # Final feature list
    feature_cols = base_features + squared_cols + interaction_cols

    return out, feature_cols


In [61]:
# Build the engineered training frame and the ordered list of model inputs.
train_data, feature_cols = feature_engineering(train_data)
print(train_data.columns)

# Regression target (continuous future return) and design matrix.
y = train_data["actual_returns"]
X = train_data[feature_cols]

# NOTE: no standardization step is applied here; the model sees the
# engineered features on their raw scales.

# === Train Regression Model ===
model = LinearRegression().fit(X, y)

# === In-sample Prediction ===
# Scored on the same rows the model was fitted on, so this is a training
# error, not an estimate of out-of-sample performance.
preds = model.predict(X)
mse = mean_squared_error(y, preds)
print(f"Mean Squared Error (In-sample): {mse:.6f}")


Index(['timestamp_code', 'last_trade_price', 'midprice',
       'recent_buy_order_count', 'recent_sell_order_count',
       'total_order_count', 'net_open_interest_change', 'bid_price_1',
       'bid_price_2', 'bid_price_3', 'bid_price_4', 'bid_price_5',
       'ask_price_1', 'ask_price_2', 'ask_price_3', 'ask_price_4',
       'ask_price_5', 'bid_volume_1', 'bid_volume_2', 'bid_volume_3',
       'bid_volume_4', 'bid_volume_5', 'ask_volume_1', 'ask_volume_2',
       'ask_volume_3', 'ask_volume_4', 'ask_volume_5', 'actual_returns',
       'volume_imbalance_1', 'cum_bid_vol', 'cum_ask_vol',
       'volume_imbalance_5', 'spread', 'inv_midprice', 'bid_liquidity',
       'ask_liquidity', 'order_flow_imbalance', 'volume_imbalance_1_norm',
       'volume_imbalance_5_norm', 'spread_norm', 'inv_midprice_norm',
       'bid_liquidity_norm', 'ask_liquidity_norm', 'order_flow_imbalance_norm',
       'volume_imbalance_1_norm_squared', 'volume_imbalance_5_norm_squared',
       'spread_norm_squared', '

In [62]:
# === Print Feature Coefficients ===
# Pairs each name in `feature_cols` with the value at the same position in
# `model.coef_`. The model was fitted on `train_data[feature_cols]`, so the two
# sequences are presumably aligned -- verify if `feature_cols` is ever rebuilt
# with a different order before this cell runs.
# Names are padded to 35 characters; longer names simply push the colon right.
# NOTE(review): the features are not standardized before fitting, so the
# magnitudes printed here are not directly comparable across features.
print("\nFeature Coefficients:")
for feature, coef in zip(feature_cols, model.coef_):
    print(f"{feature:35s}: {coef:.6f}")


Feature Coefficients:
volume_imbalance_1                 : 0.000120
volume_imbalance_5                 : 0.000037
spread                             : -0.000167
inv_midprice                       : 4.180602
bid_liquidity                      : 0.000185
ask_liquidity                      : -0.000145
order_flow_imbalance               : -0.000020
volume_imbalance_1_squared         : -0.000002
volume_imbalance_5_squared         : -0.000001
spread_squared                     : 0.000054
inv_midprice_squared               : -4139.075991
bid_liquidity_squared              : -0.000023
ask_liquidity_squared              : 0.000018
order_flow_imbalance_squared       : -0.000000
volume_imbalance_1_x_volume_imbalance_5: 0.000006
volume_imbalance_1_x_spread        : 0.000023
volume_imbalance_1_x_inv_midprice  : -0.230807
volume_imbalance_1_x_bid_liquidity : -0.000100
volume_imbalance_1_x_ask_liquidity : -0.000086
volume_imbalance_1_x_order_flow_imbalance: -0.000001
volume_imbalance_5_x_spread     

In [63]:
# ==== [1] Load and transform the test set ====
# Load test data (replace with actual test CSV file)
test_data = pd.read_csv("data/final_test_data.csv")
n_test_rows_loaded = len(test_data)

# Apply the same feature pipeline that was used for training.
test_data, feature_cols = feature_engineering(test_data)

# feature_engineering drops every row that contains a NaN, so the submission
# can end up shorter than the test file. Make that visible instead of silent.
n_test_rows_dropped = n_test_rows_loaded - len(test_data)
if n_test_rows_dropped:
    print(
        f"WARNING: {n_test_rows_dropped} of {n_test_rows_loaded} test rows "
        "contained missing values and were dropped; they will have no "
        "prediction in the submission file."
    )

# Final feature matrix
X_test = test_data[feature_cols]  # feature_cols = list of features used in training

# ==== [2] Predict ====
# The model was fitted on unscaled features, so X_test is passed through as-is.
preds = model.predict(X_test)

# ==== [3] Prepare Submission Format ====
submission = pd.DataFrame({
    "timestamp_code": test_data["timestamp_code"],
    "predicted_returns": preds
})

# ==== [4] Save to CSV ====
submission.to_csv("predicted_returns.csv", index=False)
