# EDA for Training Data

In [None]:
import pandas as pd
import numpy as np
import polars as pl
import os
import json
import matplotlib.pyplot as plt

# Data Structure

In [None]:
# Load the project configuration. The path is relative to the notebook's
# working directory, so the notebook must be run from a directory that sits
# directly below the one containing config.json.
# The encoding is given explicitly: JSON is UTF-8 by specification, whereas
# open() would otherwise fall back to the platform's locale encoding.
with open("../config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Project root directory, stored under config["path"]["global_path"].
global_path = config["path"]["global_path"]

# Build data paths
data_path = os.path.join(global_path, "data", "raw", "jane-street-real-time-market-data-forecasting")
train_path = os.path.join(data_path, "train.parquet")

# train.parquet is a directory of "partition_id=<n>" sub-directories; point
# at the first one. scan_parquet is lazy, so nothing is read until collect().
partition_0_path = os.path.join(train_path, "partition_id=0")
train_partition_0 = pl.scan_parquet(partition_0_path)

# Materialise only the first 10 rows to inspect the schema cheaply.
sample_data = train_partition_0.limit(10).collect()
print(f"\nPartition 0 size of top 10 rows: {sample_data.shape}")
print(f"rows: {len(sample_data)}")

# List the column names one per line, numbered from 1, for readability.
print(f"\nAll column names (total {len(sample_data.columns)} columns):")
for i, col in enumerate(sample_data.columns, 1):
    print(f"{i:3d}. {col}")

print("\nFirst 10 rows of data:")
print(sample_data)

# Trend of Responders

47127338 rows, 1699 date_ids, over 4.5 years

In [None]:
# Discover every "partition_id=<n>" directory under train_path instead of
# hard-coding how many there are, so that no partition is silently skipped.
# Entries are ordered by their numeric id (a plain string sort would put
# "partition_id=10" before "partition_id=2").
partition_ids = sorted(
    int(entry.split("=", 1)[1])
    for entry in os.listdir(train_path)
    if entry.startswith("partition_id=") and entry.split("=", 1)[1].isdigit()
)
if not partition_ids:
    raise FileNotFoundError(f"No 'partition_id=<n>' directories found under {train_path}")

# Full path of each partition directory.
partitions = [os.path.join(train_path, f"partition_id={pid}") for pid in partition_ids]

# Only the keys and the responder columns are needed for the trend analysis;
# selecting them before collect() avoids loading the remaining columns.
responder_cols = [f'responder_{i}' for i in range(9)]
needed_cols = ['date_id', 'time_id'] + responder_cols

# Each element of `partitions` is already a complete path, so it is scanned
# directly (joining it onto train_path a second time yields a path that does
# not exist).
partitions_data = [
    pl.scan_parquet(partition_path).select(needed_cols).collect()
    for partition_path in partitions
]

data = pl.concat(partitions_data)

# The aggregation and plotting code below works on a pandas DataFrame.
df = data.to_pandas()
# A bare `df.shape` expression in the middle of a cell displays nothing.
print(f"Loaded data shape: {df.shape}")

# Average every responder per date_id, then order the rows by date_id with a
# fresh 0..n-1 index so that row position corresponds to day order.
daily_means_full = (
    df.groupby('date_id')[responder_cols]
    .mean()
    .reset_index()
    .sort_values('date_id')
    .reset_index(drop=True)
)

print(f"Date range: {daily_means_full['date_id'].min()} to {daily_means_full['date_id'].max()}")

# Running total of the daily means: one "<responder>_cumsum" column per
# responder, appended to a copy so daily_means_full itself is left unchanged.
cumulative_data = daily_means_full.assign(
    **{f'{name}_cumsum': daily_means_full[name].cumsum() for name in responder_cols}
)

# Plot the running total of each responder's daily mean, one line per
# responder, using the explicit Figure/Axes API.
fig, ax = plt.subplots(figsize=(15, 10))

# One tab10 colour per responder.
colors = plt.cm.tab10(np.linspace(0, 1, len(responder_cols)))
cumsum_cols = [f'{responder}_cumsum' for responder in responder_cols]

# x is the row position in cumulative_data: one point per date_id.
day_index = range(len(cumulative_data))
for responder, cumsum_col, color in zip(responder_cols, cumsum_cols, colors):
    ax.plot(day_index, cumulative_data[cumsum_col],
            label=responder, color=color, linewidth=2, alpha=0.8)

ax.set_title(f'Cumulative resp and time horizons 1, 2, 3,4,5,6,7 and 8 ({len(cumulative_data)} days) - FULL DATASET',
             fontsize=16, fontweight='bold')
# Each point is a daily mean, not an individual trade, so label the axis as days.
ax.set_xlabel('Day (rows ordered by date_id)', fontsize=14)
ax.set_ylabel('Cumulative Response', fontsize=14)
# Legend sits outside the axes on the right so it never covers the curves.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
ax.grid(True, alpha=0.3)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Pad the y-axis by 10% of the overall range across all cumulative series.
y_min = min(cumulative_data[col].min() for col in cumsum_cols)
y_max = max(cumulative_data[col].max() for col in cumsum_cols)
y_range = y_max - y_min
ax.set_ylim(y_min - y_range*0.1, y_max + y_range*0.1)

# Annotate the number of raw rows behind the plot (axes-fraction coordinates).
ax.text(0.02, 0.98, f'Total samples: {len(df):,}',
        transform=ax.transAxes, fontsize=10, verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Lay out last so the final limits, legend and annotation are all accounted for.
fig.tight_layout()

plt.show()

# Summary Statistics

In [None]:
# Text summary of the responder trend analysis.
separator = "=" * 60
print("\n" + separator)
print(f"Total samples processed: {len(df):,}")
print(f"Total unique days: {len(cumulative_data)}")
first_day = daily_means_full['date_id'].min()
last_day = daily_means_full['date_id'].max()
print(f"Date range: {first_day} to {last_day}")

# The last row of each cumulative column is the total over all days;
# compute these once and reuse them for the listing and the best/worst pick.
final_values = [cumulative_data[f'{name}_cumsum'].iloc[-1] for name in responder_cols]

print("\nFinal cumulative values:")
for name, total in zip(responder_cols, final_values):
    print(f"{name}: {total:.6f}")

print("\nDaily statistics for responders (full dataset):")
print(daily_means_full[responder_cols].describe())

# Rank responders by their final cumulative value.
print("\nAdditional insights:")
best_responder = responder_cols[np.argmax(final_values)]
worst_responder = responder_cols[np.argmin(final_values)]
print(f"Best performing responder: {best_responder}")
print(f"Worst performing responder: {worst_responder}")

# Volatility here is the standard deviation of the daily means themselves
# (not of day-over-day changes).
print("\nVolatility (std of daily means):")
for name in responder_cols:
    spread = daily_means_full[name].std()
    print(f"{name}: {spread:.6f}")

print("\n" + separator)