SET UP ENVIRONMENT + LOAD DATA

In [2]:
# %% Imports
import pandas as pd
import numpy as np
import json
import sys
import os
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import deque
import re

# Make the parent of the current working directory importable so that the
# `dashboard` package can be resolved when the notebook runs from a subfolder.
# Adjust the number of '..' components if the notebook is nested deeper.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Try importing the historical data loader. The outcome is remembered in
# LOADER_AVAILABLE so the status message at the end of the cell is truthful.
try:
    # Assuming historical_data_loader.py is in the dashboard directory
    from dashboard.historical_data_loader import load_historical_data
    LOADER_AVAILABLE = True
except ImportError as e:
    LOADER_AVAILABLE = False
    print(f"Error importing dashboard.historical_data_loader: {e}")
    print("Ensure the dashboard package exists relative to the project root.")

    def load_historical_data(dir_path):
        """Stand-in used when the real loader cannot be imported.

        Accepts the same single argument as the call site in this notebook,
        ignores it, prints a warning and returns an empty dict so that the
        downstream cells run (with no data) instead of raising NameError.
        """
        print("Warning: historical_data_loader not found. Returning empty dict.")
        return {}

# Set display options for pandas
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

# Only claim success when the real loader was actually imported.
if LOADER_AVAILABLE:
    print("Imports successful and path configured.")
else:
    print("Imports finished, but the data loader is a stub: no historical data will be loaded.")

# %% Configuration - *** SET THIS PATH ***
# The data location can be overridden without editing the notebook by setting
# the HISTORICAL_DATA_DIR environment variable; otherwise the default below
# (a machine-specific absolute path) is used.
HISTORICAL_DATA_DIR = os.environ.get(
    'HISTORICAL_DATA_DIR',
    'C:/Users/Admin/projects/prosperity-poj/strategy/round2/resources/round2',  # <<< ADJUST IF NEEDED
)

# Check that the path exists AND is a directory (a regular file at this path
# would otherwise pass the check).
if not os.path.isdir(HISTORICAL_DATA_DIR):
    print(f"ERROR: Historical data directory not found at '{HISTORICAL_DATA_DIR}'. Please update the path.")


# %% Load Historical Data
print(f"Loading historical data from: {HISTORICAL_DATA_DIR}")
historical_data = load_historical_data(HISTORICAL_DATA_DIR)
print(f"Historical data loaded for days: {list(historical_data.keys())}")

# Combine historical data for easier analysis.
# Per-day frames are collected in lists and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every
# iteration.
price_frames = []
trade_frames = []
for day, data in historical_data.items():
    for key, collected in (('prices', price_frames), ('trades', trade_frames)):
        # Skip days where this kind of data is absent or empty
        if key not in data or data[key].empty:
            continue
        day_frame = data[key].copy()
        # Tag rows with their day only when the loader has not already done so
        if 'day' not in day_frame.columns:
            day_frame['day'] = day
        collected.append(day_frame)

# Fall back to an empty DataFrame so later `.empty` checks keep working
all_historical_prices = pd.concat(price_frames, ignore_index=True) if price_frames else pd.DataFrame()
all_historical_trades = pd.concat(trade_frames, ignore_index=True) if trade_frames else pd.DataFrame()

# Convert types and sort combined dataframes
if not all_historical_prices.empty:
    # Convert columns to numeric, coercing unparseable values to NaN
    numeric_cols = ['timestamp', 'bid_price_1', 'bid_volume_1', 'bid_price_2', 'bid_volume_2',
                    'bid_price_3', 'bid_volume_3', 'ask_price_1', 'ask_volume_1',
                    'ask_price_2', 'ask_volume_2', 'ask_price_3', 'ask_volume_3',
                    'mid_price', 'profit_and_loss']  # Include relevant columns
    for col in numeric_cols:
        if col in all_historical_prices.columns:
            all_historical_prices[col] = pd.to_numeric(all_historical_prices[col], errors='coerce')

    all_historical_prices = all_historical_prices.sort_values(by=['day', 'timestamp']).reset_index(drop=True)

    # Derive mid-price from the best bid/ask when the column is absent, or
    # fill ONLY the missing entries when it is present. Overwriting the whole
    # column would turn existing mid prices into NaN on every row where one
    # side of the book is missing, because (bid + ask) / 2 is NaN there.
    mid_missing = 'mid_price' not in all_historical_prices.columns
    if mid_missing or all_historical_prices['mid_price'].isnull().any():
        book_mid = (all_historical_prices['bid_price_1'] + all_historical_prices['ask_price_1']) / 2
        if mid_missing:
            all_historical_prices['mid_price'] = book_mid
        else:
            all_historical_prices['mid_price'] = all_historical_prices['mid_price'].fillna(book_mid)

    print(f"Combined Historical Prices shape: {all_historical_prices.shape}")
    print(f"Historical Prices Columns: {all_historical_prices.columns.tolist()}")


if not all_historical_trades.empty:
    # Convert columns to numeric, coercing unparseable values to NaN
    numeric_cols_trades = ['timestamp', 'price', 'quantity']
    for col in numeric_cols_trades:
        if col in all_historical_trades.columns:
            all_historical_trades[col] = pd.to_numeric(all_historical_trades[col], errors='coerce')

    all_historical_trades = all_historical_trades.sort_values(by=['day', 'timestamp']).reset_index(drop=True)
    print(f"Combined Historical Trades shape: {all_historical_trades.shape}")
    print(f"Historical Trades Columns: {all_historical_trades.columns.tolist()}")


Imports successful and path configured.
Loading historical data from: C:/Users/Admin/projects/prosperity-poj/strategy/round2/resources/round2
Loading historical data for Round 2 from: C:/Users/Admin/projects/prosperity-poj/strategy/round2/resources/round2
  Loaded prices for day -1 from prices_round_2_day_-1.csv
  Loaded trades for day -1 from trades_round_2_day_-1_nn.csv
  Loaded prices for day 0 from prices_round_2_day_0.csv
  Loaded trades for day 0 from trades_round_2_day_0_nn.csv
  Loaded prices for day 1 from prices_round_2_day_1.csv
  Loaded trades for day 1 from trades_round_2_day_1_nn.csv
Historical data loaded for days: [-1, 0, 1]
Combined Historical Prices shape: (240000, 17)
Historical Prices Columns: ['day', 'timestamp', 'product', 'bid_price_1', 'bid_volume_1', 'bid_price_2', 'bid_volume_2', 'bid_price_3', 'bid_volume_3', 'ask_price_1', 'ask_volume_1', 'ask_price_2', 'ask_volume_2', 'ask_price_3', 'ask_volume_3', 'mid_price', 'profit_and_loss']
Combined Historical Trades 

In [5]:
# %% Analysis 1: Squid Ink Mean Reversion

# --- Parameters ---
ROLLING_WINDOW = 125  # Number of rows averaged for the rolling mid (try other values, e.g. 10, 20, 50)
rolling_window_std = 125  # Number of rows used for the rolling std in the z-score
PRODUCT_TO_ANALYZE = "SQUID_INK"

# --- Filter Data ---
if not all_historical_prices.empty:
    squid_prices = all_historical_prices[all_historical_prices['product'] == PRODUCT_TO_ANALYZE].copy()
    print(f"Filtered {PRODUCT_TO_ANALYZE} prices shape: {squid_prices.shape}")
    if squid_prices.empty:
        print(f"No historical price data found for {PRODUCT_TO_ANALYZE}. Cannot perform analysis.")
else:
    print("Historical price data is empty. Cannot perform analysis.")
    squid_prices = pd.DataFrame()  # Ensure it's an empty df if no data

# Always bind the cleaned frame: the plotting cells test
# `squid_prices_clean.empty` and would raise NameError if it were only
# created inside the branch below.
squid_prices_clean = pd.DataFrame()

# --- Calculate Metrics (if data exists) ---
if not squid_prices.empty:
    # All rolling statistics are computed *within each day* so that windows
    # never straddle a day boundary. min_periods lets values appear once half
    # a window is available at the start of each day.
    rolling_mid = squid_prices.groupby('day')['mid_price'].transform(
        lambda x: x.rolling(window=ROLLING_WINDOW, min_periods=ROLLING_WINDOW // 2).mean()
    )
    rolling_std = squid_prices.groupby('day')['mid_price'].transform(
        lambda x: x.rolling(window=rolling_window_std, min_periods=rolling_window_std // 2).std()
    )
    # NEXT mid price within the same day (last row of each day becomes NaN)
    next_mid_price = squid_prices.groupby('day')['mid_price'].shift(-1)

    squid_prices['rolling_mid'] = rolling_mid
    squid_prices['rolling_std'] = rolling_std
    # A zero rolling std (flat window) would give +/-inf; use NaN instead
    squid_prices['zscore'] = (squid_prices['mid_price'] - rolling_mid) / rolling_std.replace(0, np.nan)

    # Deviation from rolling average
    squid_prices['deviation'] = squid_prices['mid_price'] - rolling_mid

    squid_prices['next_mid_price'] = next_mid_price
    squid_prices['next_price_change'] = next_mid_price - squid_prices['mid_price']

    # Drop rows where rolling mean or next change couldn't be calculated (NaNs)
    squid_prices_clean = squid_prices.dropna(subset=['rolling_mid', 'deviation', 'next_price_change'])
    print(f"Cleaned data shape (after removing NaNs): {squid_prices_clean.shape}")

# %% Visualize Relationship (Scatter Plot)
# Plots deviation-from-rolling-mid against the following mid-price change,
# then reports their correlation with a short textual interpretation.

if squid_prices_clean.empty:
    print("No data available for scatter plot and correlation analysis.")
else:
    scatter_title = (
        f'{PRODUCT_TO_ANALYZE} - Deviation from {ROLLING_WINDOW}-Period Rolling Mid vs. Next Price Change'
    )
    scatter_labels = {
        'deviation': 'Deviation (Mid - Rolling Mid)',
        'next_price_change': 'Next Price Change (Mid[t+1] - Mid[t])',
    }
    fig_scatter = px.scatter(
        squid_prices_clean,
        x='deviation',
        y='next_price_change',
        trendline='ols',  # Ordinary Least Squares regression line
        title=scatter_title,
        labels=scatter_labels,
        hover_data=['day', 'timestamp', 'mid_price', 'rolling_mid'],
    )
    # Dashed grey reference lines through the origin
    reference_style = dict(line_dash="dash", line_color="grey")
    fig_scatter.add_hline(y=0, **reference_style)
    fig_scatter.add_vline(x=0, **reference_style)
    fig_scatter.show()

    # Correlation between the two plotted columns
    correlation = squid_prices_clean['deviation'].corr(squid_prices_clean['next_price_change'])
    print(f"\nCorrelation between Deviation and Next Price Change: {correlation:.4f}")

    # Pick the (result, interpretation) pair; a NaN correlation fails both
    # comparisons and therefore lands in the final "weak" branch.
    if correlation < -0.1:
        verdict = (
            "Result: Negative correlation suggests mean reversion tendency.",
            "Interpretation: When price is above average (positive deviation), it tends to decrease next (negative change), and vice-versa.",
        )
    elif correlation > 0.1:
        verdict = (
            "Result: Positive correlation suggests momentum tendency.",
            "Interpretation: When price is above average (positive deviation), it tends to increase further next (positive change), and vice-versa.",
        )
    else:
        verdict = (
            "Result: Weak correlation.",
            "Interpretation: Deviation from the rolling mean doesn't strongly predict the next immediate price change.",
        )
    for message in verdict:
        print(message)


# %% Visualize Time Series (Example Day)
# Single dual-axis chart: mid price and its rolling average on the left axis,
# z-score on the right axis, for one selected day.

if not squid_prices_clean.empty:
    # Day to visualize (currently day -1)
    day_to_plot = -1
    squid_day_plot = squid_prices_clean[squid_prices_clean['day'] == day_to_plot]

    if not squid_day_plot.empty:
        # One subplot only, so exactly one subplot title is supplied
        fig_ts = make_subplots(
            rows=1, cols=1,
            subplot_titles=(f'{PRODUCT_TO_ANALYZE} Mid-Price vs. Rolling Average (Day {day_to_plot})',),
            specs=[[{"secondary_y": True}]],
        )

        # Left axis: prices
        fig_ts.add_trace(
            go.Scatter(x=squid_day_plot['timestamp'], y=squid_day_plot['mid_price'],
                       name='Mid Price', mode='lines'),
            row=1, col=1, secondary_y=False,
        )
        fig_ts.add_trace(
            go.Scatter(x=squid_day_plot['timestamp'], y=squid_day_plot['rolling_mid'],
                       name=f'Rolling Mid ({ROLLING_WINDOW})', mode='lines', line=dict(dash='dash')),
            row=1, col=1, secondary_y=False,
        )
        # Right axis: z-score
        fig_ts.add_trace(
            go.Scatter(x=squid_day_plot['timestamp'], y=squid_day_plot['zscore'],
                       name='Z-Score', mode='lines'),
            row=1, col=1, secondary_y=True,
        )

        # Label the axes so the two y-scales can be told apart
        fig_ts.update_xaxes(title_text='Timestamp')
        fig_ts.update_yaxes(title_text='Price', secondary_y=False)
        fig_ts.update_yaxes(title_text='Z-Score', secondary_y=True)
        fig_ts.show()
    else:
        print(f"No cleaned data available for Day {day_to_plot} to visualize.")

else:
    print("No data available for time series visualization.")


Filtered SQUID_INK prices shape: (30000, 17)
Cleaned data shape (after removing NaNs): (29814, 23)



Correlation between Deviation and Next Price Change: -0.0512
Result: Weak correlation.
Interpretation: Deviation from the rolling mean doesn't strongly predict the next immediate price change.
