In [8]:
import pandas as pd
import numpy as np
import glob # Used to find all the files matching a pattern
import os
from tqdm.auto import tqdm # For showing a progress bar

# --- Configuration ---
DATA_DIR = 'data' # The folder where you saved your data
PRICE_SHIFT = -10 # How many seconds into the future to predict the price
                  # A negative number means we look into the future

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Find all the parquet files in the data directory
files = sorted(glob.glob(os.path.join(DATA_DIR, '*.parquet')))

# Read all files and combine them into a single DataFrame
df_list = []
for file in tqdm(files, desc="Loading data files"):
    try:
        df_list.append(pd.read_parquet(file))
    except Exception as e:
        print(f"Could not read file {file}: {e}")

raw_df = pd.concat(df_list, ignore_index=True)

# Set the timestamp as the index for easier time-series operations
raw_df.set_index('timestamp', inplace=True)

print(f"Loaded {len(raw_df)} snapshots.")
raw_df.head()

Loading data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1207/1207 [00:02<00:00, 501.88it/s]


Loaded 1207 snapshots.


Unnamed: 0_level_0,order_book,trades
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-10-18 13:47:29.857072+00:00,"{'asks': [[106832.0, 4.09724], [106832.01, 0.0...","[{'amount': 0.00108, 'cost': 115.3947492, 'dat..."
2025-10-18 13:47:36.036002+00:00,"{'asks': [[106832.0, 0.31345], [106832.01, 0.0...","[{'amount': 5e-05, 'cost': 5.342, 'datetime': ..."
2025-10-18 13:47:41.397149+00:00,"{'asks': [[106832.0, 0.21276], [106832.01, 0.0...","[{'amount': 6e-05, 'cost': 6.410235, 'datetime..."
2025-10-18 13:47:46.760761+00:00,"{'asks': [[106851.84, 0.7091], [106851.85, 0.0...","[{'amount': 6e-05, 'cost': 6.40992, 'datetime'..."
2025-10-18 13:47:52.118733+00:00,"{'asks': [[106844.8, 8.35012], [106844.81, 0.0...","[{'amount': 0.00511, 'cost': 545.91152, 'datet..."


In [10]:
# These functions will process one row of the raw DataFrame at a time.

def calculate_mid_price(row):
    """Calculates the mid-price from the order book."""
    best_bid = row['order_book']['bids'][0][0] # Highest buy price
    best_ask = row['order_book']['asks'][0][0] # Lowest sell price
    return (best_bid + best_ask) / 2

def calculate_bid_ask_spread(row):
    """Calculates the difference between the best bid and best ask."""
    best_bid = row['order_book']['bids'][0][0]
    best_ask = row['order_book']['asks'][0][0]
    return best_ask - best_bid

def calculate_order_book_imbalance(row, depth=10):
    """
    Calculates the Order Book Imbalance (OBI).
    OBI = (Total Bid Volume) / (Total Bid Volume + Total Ask Volume)
    A value > 0.5 suggests buying pressure, < 0.5 suggests selling pressure.
    """
    bids_vol = sum([vol for price, vol in row['order_book']['bids'][:depth]])
    asks_vol = sum([vol for price, vol in row['order_book']['asks'][:depth]])
    total_vol = bids_vol + asks_vol
    return bids_vol / total_vol if total_vol > 0 else 0.5

def calculate_trade_flow(row, window=10):
    """
    Calculates the recent trade flow.
    Compares the volume of 'buy' vs 'sell' market orders in the last 'window' trades.
    Returns the total volume of buy trades minus the total volume of sell trades.
    """
    trade_list = row['trades']
    if not isinstance(trade_list, list) or len(trade_list) == 0:
        return 0

    # Ensure slicing works correctly
    trade_list = trade_list[:window]

    buys = sum([trade['amount'] for trade in trade_list if trade['side'] == 'buy'])
    sells = sum([trade['amount'] for trade in trade_list if trade['side'] == 'sell'])

    return buys - sells

In [12]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

tqdm.pandas(desc="Calculating Features (Optimized)")

# --- Configuration ---
PRICE_SHIFT = -10  # How many rows into the future to predict

# --- Create empty DataFrame for features ---
features_df = pd.DataFrame(index=raw_df.index)

# --- Mid-price and Spread ---
# Vectorized calculation: take first bid & ask from order book
features_df['mid_price'] = raw_df['order_book'].apply(lambda ob: (ob['bids'][0][0] + ob['asks'][0][0]) / 2)
features_df['spread'] = raw_df['order_book'].apply(lambda ob: ob['asks'][0][0] - ob['bids'][0][0])

# --- Order Book Imbalance (OBI) ---
def obi_vectorized(ob, depth=10):
    bids_vol = sum([vol for price, vol in ob['bids'][:depth]])
    asks_vol = sum([vol for price, vol in ob['asks'][:depth]])
    total = bids_vol + asks_vol
    return bids_vol / total if total > 0 else 0.5

features_df['obi'] = raw_df['order_book'].progress_apply(obi_vectorized)

# --- Trade Flow ---
def trade_flow_vectorized(trades, window=10):
    if not isinstance(trades, list) or len(trades) == 0:
        return 0
    trades = trades[:window]
    buys = sum([t.get('amount', 0) for t in trades if t.get('side') == 'buy'])
    sells = sum([t.get('amount', 0) for t in trades if t.get('side') == 'sell'])
    return buys - sells

features_df['trade_flow'] = raw_df['trades'].progress_apply(trade_flow_vectorized)

# --- Target Calculation ---
features_df['future_mid_price'] = features_df['mid_price'].shift(PRICE_SHIFT)
price_change_pct = (features_df['future_mid_price'] - features_df['mid_price']) / features_df['mid_price']

threshold = 0.0001  # 0.01%
features_df['target'] = 0
features_df.loc[price_change_pct > threshold, 'target'] = 1
features_df.loc[price_change_pct < -threshold, 'target'] = -1

# --- Clean up ---
final_df = features_df.dropna()
final_df['target'] = final_df['target'].astype(int)

# --- Save processed data ---
processed_data_path = 'processed_market_data.parquet'
final_df.to_parquet(processed_data_path)

print("Optimized feature calculation complete!")
print(f"Final dataset shape: {final_df.shape}")
print("\nTarget distribution:")
print(final_df['target'].value_counts())


Calculating Features (Optimized): 100%|████████████████████████████████████████████████████████████████| 1207/1207 [00:00<00:00, 69259.52it/s]
Calculating Features (Optimized): 100%|███████████████████████████████████████████████████████████████| 1207/1207 [00:00<00:00, 733380.40it/s]

Optimized feature calculation complete!
Final dataset shape: (1197, 6)

Target distribution:
target
 1    430
-1    420
 0    347
Name: count, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['target'] = final_df['target'].astype(int)


In [13]:
import pandas as pd
import xgboost as xgb
import joblib

# --- Configuration ---
PROCESSED_DATA_PATH = 'processed_market_data.parquet'
MODEL_SAVE_PATH = 'model.joblib'
TEST_SIZE = 0.2

# --- Load processed data ---
df = pd.read_parquet(PROCESSED_DATA_PATH)
X = df.drop(columns=['mid_price', 'future_mid_price', 'target'])
y = df['target']

# --- Time-series split ---
split_index = int(len(df) * (1 - TEST_SIZE))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# --- Map labels for 3-class classification ---
# {-1, 0, 1} -> {0, 1, 2}
y_train_mapped = y_train + 1

# --- Initialize XGBoost model (clean version, no warning) ---
model = xgb.XGBClassifier(
    objective='multi:softmax',  # Multi-class classification
    num_class=3,                # Number of classes
    eval_metric='mlogloss',     # Evaluation metric
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# --- Train model ---
print("\nTraining XGBoost model...")
model.fit(X_train, y_train_mapped)
print("Model training complete.")

# --- Save model ---
joblib.dump(model, MODEL_SAVE_PATH)
print(f"\nModel saved to {MODEL_SAVE_PATH}")

# --- Optional: Test predictions ---
y_pred = model.predict(X_test)
print(f"\nSample predictions: {y_pred[:10]}")

Training data shape: (957, 3)
Testing data shape: (240, 3)

Training XGBoost model...
Model training complete.

Model saved to model.joblib


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [2]:
print(df['target'].value_counts())

target
0    3
1    1
Name: count, dtype: int64
