In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
from typing import List, Dict, Tuple
from collections import defaultdict


In [2]:
# --------------
# CONFIGURATION
# --------------
# NOTE(review): `p` is not referenced anywhere in the visible notebook — confirm before removing.
p = ''
# Path templates for the per-day input CSVs; `{day}` is filled in with str.format.
PRICES_PATH = 'strategy/round5/resources/round5/prices_round_5_day_{day}.csv'
TRADES_PATH = 'strategy/round5/resources/round5/trades_round_5_day_{day}_nn.csv'
# Days that get loaded and concatenated by the analysis.
DAYS = [2, 3, 4]
# Shift distance used by compute_informed_scores for the "before"/"after" mid price.
# It is passed to groupby(...).shift(), so it counts ROWS of the per-symbol table
# being shifted, not timestamp units.
LEAD_LAG_DT = 100

# --------------
# 1. LOAD AND MERGE DATA
# --------------
def load_and_merge(day: int) -> pd.DataFrame:
    """Load one day's trades and attach the matching mid price to each trade.

    Both CSVs are read with ';' as separator. The price table's ``product``
    column is renamed to ``symbol`` and the trade table's ``price`` column to
    ``trade_price``; the trade table is tagged with ``day`` so the two can be
    joined on (day, timestamp, symbol).

    Args:
        day: Day number substituted into PRICES_PATH and TRADES_PATH.

    Returns:
        One row per trade (left join, so ``mid_price`` is NaN when no price row
        matches). ``timestamp`` is shifted by 1,000,000 for every day after
        day 2, which makes timestamps from different days distinct.
    """
    price_rows = pd.read_csv(PRICES_PATH.format(day=day), sep=';')
    trade_rows = pd.read_csv(TRADES_PATH.format(day=day), sep=';')

    # Tag the trades with the requested day so it can serve as a join key.
    trade_rows['day'] = day

    price_rows = price_rows.rename(columns={'product': 'symbol'})
    trade_rows = trade_rows.rename(columns={'price': 'trade_price'})

    join_keys = ['day', 'timestamp', 'symbol']
    merged = trade_rows.merge(
        price_rows[join_keys + ['mid_price']],
        on=join_keys,
        how='left',
    )

    # Applied after the join, so the join itself uses the raw per-day timestamps.
    merged['timestamp'] = merged['timestamp'] + (merged['day'] - 2) * 1000000
    return merged

In [3]:
def compute_informed_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Average the surrounding mid-price return per counterparty, symbol and side.

    For every row, the mid price LEAD_LAG_DT rows earlier and LEAD_LAG_DT rows
    later (within the same symbol, ordered by timestamp) are looked up and the
    relative change between them is stored as ``return``. Rows where that
    return cannot be computed are dropped. Each remaining row then contributes
    its return twice: once to its buyer with side 'buy' and once to its seller
    with side 'sell'.

    Note that the shift is row-based: the look-back/look-ahead distance is a
    number of rows of ``df`` per symbol, not a number of timestamp units.

    Args:
        df: Table with 'symbol', 'timestamp', 'mid_price', 'buyer', 'seller'.

    Returns:
        DataFrame with columns 'counterparty', 'symbol', 'side', 'return',
        where 'return' is the mean over the group.
    """
    ordered = df.sort_values(['symbol', 'timestamp'])

    # Lagged and leading mid price inside each symbol.
    mid_by_symbol = ordered.groupby('symbol')['mid_price']
    mid_before = mid_by_symbol.shift(LEAD_LAG_DT)
    mid_after = mid_by_symbol.shift(-LEAD_LAG_DT)
    ordered = ordered.assign(
        mid_before=mid_before,
        mid_after=mid_after,
        **{'return': (mid_after - mid_before) / mid_before},
    )

    usable = ordered.dropna(subset=['return'])

    # Stack a buyer view and a seller view of the same rows.
    views = []
    for role, label in (('buyer', 'buy'), ('seller', 'sell')):
        view = usable[['symbol', 'return', role]].rename(columns={role: 'counterparty'})
        view['side'] = label
        views.append(view)
    long_df = pd.concat(views, ignore_index=True)

    mean_return = long_df.groupby(['counterparty', 'symbol', 'side'])['return'].mean()
    return mean_return.reset_index()

In [4]:
# --------------
# 3. LIQUIDITY & SLIPPAGE METRICS
# --------------
def compute_liquidity_slippage(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise average trade size and average signed slippage per buyer.

    Slippage of a trade is ``(trade_price - mid_price) * sign(quantity)``, so
    for a positive quantity a positive value means the trade printed above mid.

    Args:
        df: Table with 'mid_price', 'trade_price', 'quantity' and 'buyer'.
            The input frame is not modified.

    Returns:
        DataFrame with columns 'counterparty' (the buyer), 'avg_size' and
        'avg_slippage', one row per buyer.
    """
    direction = np.sign(df['quantity'])

    priced = df.assign(mid_when_traded=df['mid_price'])
    priced['slippage'] = (priced['trade_price'] - priced['mid_when_traded']) * direction

    per_buyer = priced.groupby('buyer').agg(
        avg_size=('quantity', 'mean'),
        avg_slippage=('slippage', 'mean'),
    )
    return per_buyer.reset_index().rename(columns={'buyer': 'counterparty'})


In [5]:
# --------------
# 4. BUILD TRADE FLOW NETWORK
# --------------
def build_trade_graph(df: pd.DataFrame) -> nx.DiGraph:
    """Build a directed buyer -> seller graph weighted by traded volume.

    Every distinct buyer or seller becomes a node whose ``total_volume``
    attribute accumulates the absolute quantity of all trades it took part in
    (on either side). Each buyer -> seller edge carries a ``weight`` equal to
    the summed absolute quantity traded in that direction.

    Args:
        df: Table with 'buyer', 'seller' and 'quantity' columns.

    Returns:
        The populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()

    for _, trade in df.iterrows():
        buyer = trade["buyer"]
        seller = trade["seller"]
        volume = abs(trade["quantity"])

        # Make sure both endpoints exist before touching their attributes.
        for party in (buyer, seller):
            if not graph.has_node(party):
                graph.add_node(party, total_volume=0)
        graph.nodes[buyer]["total_volume"] += volume
        graph.nodes[seller]["total_volume"] += volume

        if not graph.has_edge(buyer, seller):
            graph.add_edge(buyer, seller, weight=volume)
        else:
            graph[buyer][seller]["weight"] += volume

    return graph


In [6]:
# --------------
# 5. COUNTERPARTY IMBALANCE SIGNAL
# --------------
def compute_imbalance_signal(df: pd.DataFrame) -> pd.DataFrame:
    """Sum score-weighted trade directions per timestamp and symbol.

    Each row contributes ``sign(quantity) * informed_score``; the contributions
    are summed over all rows sharing a (timestamp, symbol) pair.

    Args:
        df: Table with 'timestamp', 'symbol', 'quantity' and 'informed_score'.
            The input frame is not modified.

    Returns:
        DataFrame with columns 'timestamp', 'symbol', 'weighted_side'.
    """
    direction = np.sign(df['quantity'])
    scored = df.assign(
        sign=direction,
        weighted_side=direction * df['informed_score'],
    )
    totals = scored.groupby(['timestamp', 'symbol'])['weighted_side'].sum()
    return totals.reset_index()


In [7]:
def compute_fill_distribution(df: pd.DataFrame) -> pd.DataFrame:
    """Compute each counterparty's share of the summed quantity per symbol.

    Args:
        df: Table with 'symbol', 'counterparty' and 'quantity' columns.

    Returns:
        DataFrame with columns 'symbol', 'counterparty', 'filled_qty' (summed
        quantity of that counterparty in that symbol), 'total_qty' (summed
        quantity of the symbol) and 'share' (= filled_qty / total_qty).
    """
    per_party = (
        df.groupby(['symbol', 'counterparty'])['quantity']
        .sum()
        .rename('filled_qty')
        .reset_index()
    )
    per_symbol = (
        per_party.groupby('symbol')['filled_qty']
        .sum()
        .rename('total_qty')
        .reset_index()
    )

    combined = per_party.merge(per_symbol, on='symbol')
    combined['share'] = combined['filled_qty'] / combined['total_qty']
    return combined


In [8]:
def compute_time_of_day_activity(df: pd.DataFrame, ticks_per_day: int = 1_000_000) -> pd.DataFrame:
    """Count trades per counterparty, symbol and intra-day "hour" bucket.

    The bucket is ``((timestamp % ticks_per_day) // 10000) % 24``. Reducing the
    timestamp modulo the day length first matters because load_and_merge adds
    1,000,000 per day to ``timestamp``: without it, the same time of day maps
    to a different bucket on each day (1,000,000 // 10000 = 100, and
    100 % 24 = 4, so every extra day shifted the bucket by 4).

    Args:
        df: Table with 'timestamp', 'counterparty' and 'symbol' columns.
            The input frame is not modified.
        ticks_per_day: Timestamp span of one day; the default matches the
            per-day offset applied by load_and_merge.

    Returns:
        DataFrame with columns 'counterparty', 'symbol', 'hour', 'num_trades'.
    """
    bucketed = df.copy()
    # Strip the per-day offset so the bucket depends only on time within the day.
    intraday = bucketed["timestamp"] % ticks_per_day
    bucketed["hour"] = (intraday // 10000) % 24
    grouped = (
        bucketed
        .groupby(["counterparty", "symbol", "hour"])
        .size()
        .reset_index(name="num_trades")
    )
    return grouped


In [None]:
# -- --------------
#  MAIN ANALYSIS
# -- --------------
if __name__ == "__main__":
    # Runs the whole pipeline and writes one CSV (or GEXF) per step under
    # strategy/round5/. Statement order matters: df_all gains columns along the
    # way and later steps read the columns added by earlier ones.

    # 1) Load every configured day and stack them into one trade table.
    all_days: List[pd.DataFrame] = []
    for d in DAYS:
        all_days.append(load_and_merge(d))
    df_all = pd.concat(all_days, ignore_index=True)

    # The "counterparty" used for grouping below is the buyer of each trade, so
    # counterparty-keyed outputs describe the buying side only.
    df_all['counterparty'] = df_all['buyer']

    # Trade side used as a join key for the informed scores:
    # 'buy' when quantity > 0, otherwise 'sell'.
    # NOTE(review): if quantity is never <= 0 in the input, every row is 'buy'
    # and the 'sell' scores are never joined — confirm against the data.
    df_all['side'] = np.where(df_all['quantity'] > 0, 'buy', 'sell')

    print(df_all.shape)

    print(df_all.head())
    df_all.to_csv("strategy/round5/df_all.csv", index=False)
    # 2) Informed scores per (counterparty, symbol, side).
    scores = compute_informed_scores(df_all)
    scores.to_csv("strategy/round5/informed_scores.csv", index=False)

    # 3) Liquidity & slippage summary (the function groups by buyer).
    liq = compute_liquidity_slippage(df_all)
    liq.to_csv("strategy/round5/liquidity_slippage.csv", index=False)


    # 4) Trade-flow network, saved as GEXF.
    G = build_trade_graph(df_all)
    nx.write_gexf(G, "strategy/round5/trade_flow_network.gexf")

    # 5) Imbalance signal: first attach each row's score (the 'return' column of
    #    `scores`, renamed to 'informed_score'); rows without a matching score
    #    get NaN because this is a left join.
    df_all = df_all.merge(
        scores[["counterparty","symbol","side","return"]],
        on=["counterparty","symbol","side"],
        how="left"
    ).rename(columns={"return":"informed_score"})
    imbalance = compute_imbalance_signal(df_all)
    imbalance.to_csv("strategy/round5/imbalance_signal.csv", index=False)


    # 6) Share of quantity per counterparty within each symbol.
    fill = compute_fill_distribution(df_all)
    fill.to_csv("strategy/round5/fill_distribution.csv", index=False)

    # 7) Trade counts per counterparty / symbol / hour bucket.
    tod = compute_time_of_day_activity(df_all)
    tod.to_csv("strategy/round5/counterparty_time_activity.csv", index=False)

    print("✅ Analysis complete, CSV/GEXF outputs generated.")

In [None]:
import pandas as pd
import networkx as nx
import plotly.express as px
import plotly.graph_objects as go

# ——————————————————————————————
# 1) Price & Trade Overview (rebuild df_all)
# ——————————————————————————————
# Same path templates and day list as the configuration cell near the top of the
# notebook, repeated here for the load_and_merge defined in this cell.
PRICES_PATH = 'strategy/round5/resources/round5/prices_round_5_day_{day}.csv'
TRADES_PATH = 'strategy/round5/resources/round5/trades_round_5_day_{day}_nn.csv'
DAYS = [2,3,4]

def load_and_merge(day: int) -> pd.DataFrame:
    """Load one day's trades joined with the mid price at the same timestamp.

    This redefines the loader from the analysis cell. Unlike that version, the
    per-day timestamp offset (1,000,000 per day after day 2) is applied to both
    tables before the join instead of to the joined result.

    Args:
        day: Day number substituted into PRICES_PATH and TRADES_PATH.

    Returns:
        One row per trade with 'trade_price' and (possibly NaN) 'mid_price'.
    """
    price_rows = pd.read_csv(PRICES_PATH.format(day=day), sep=';').rename(columns={'product':'symbol'})
    trade_rows = pd.read_csv(TRADES_PATH.format(day=day), sep=';')
    trade_rows['day'] = day
    trade_rows = trade_rows.rename(columns={'price':'trade_price'})

    # Shift both tables identically so they still line up on 'timestamp'.
    for table in (price_rows, trade_rows):
        table['timestamp'] = table['timestamp'] + (table['day'] - 2) * 1000000

    join_keys = ['day','timestamp','symbol']
    return trade_rows.merge(price_rows[join_keys + ['mid_price']], on=join_keys, how='left')

# Rebuild the combined trade table from all configured days.
all_days = list(map(load_and_merge, DAYS))
# quantity is cast to float because it drives the marker size below.
df_all = pd.concat(all_days, ignore_index=True).astype({'quantity': float})

# One mid-price line per product ...
overview_labels = {"mid_price":"Mid Price","timestamp":"Timestamp","symbol":"Product"}
fig1 = px.line(
    df_all,
    x="timestamp",
    y="mid_price",
    color="symbol",
    title="Mid‑Price Time Series by Product",
    labels=overview_labels,
)

# ... overlaid with one marker per trade, sized by sqrt(|quantity|).
trade_markers = go.Scatter(
    x=df_all["timestamp"],
    y=df_all["mid_price"],
    mode="markers",
    marker={
        "size": (df_all["quantity"].abs())**0.5,
        "color": "rgba(255,0,0,0.4)",
    },
    name="Trade Size",
)
fig1.add_trace(trade_markers)
fig1.show()


# ——————————————————————————————
# 2) Informed‑Trader Scores
# ——————————————————————————————
# Grouped bars: mean return ("informed score") per counterparty, coloured by product.
df_scores = pd.read_csv("strategy/round5/informed_scores.csv")
score_bar_options = dict(
    x="counterparty",
    y="return",
    color="symbol",
    barmode="group",
    title="Informed‑Trader Scores by Counterparty & Product",
    labels={"return":"Informed Score","counterparty":"Counterparty"},
)
fig2 = px.bar(df_scores, **score_bar_options)
fig2.update_layout(xaxis_tickangle=-45)
fig2.show()


# 3) Liquidity & slippage metrics: one bubble per counterparty.
df_liq = pd.read_csv("strategy/round5/liquidity_slippage.csv")
liquidity_scatter_options = dict(
    x="avg_size",
    y="avg_slippage",
    color="counterparty",
    size="avg_size",
    title="Average Trade‑Size vs. Slippage by Counterparty",
    labels={"avg_size":"Avg. Trade Size","avg_slippage":"Avg. Slippage"},
)
fig3 = px.scatter(df_liq, **liquidity_scatter_options)
fig3.show()


# ——————————————————————————————
# 4) Trade‑Flow Network
# ——————————————————————————————
G = nx.read_gexf("strategy/round5/trade_flow_network.gexf")
# spring_layout starts from random positions; a fixed seed makes the picture
# identical on every run of the notebook.
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# All edges are drawn as one line trace; a None entry ends each segment so
# consecutive edges are not joined.
edge_x, edge_y = [], []
for u, v in G.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    mode="lines",
    line=dict(color="#888", width=1),
    hoverinfo="none"
)

# Node coordinates and hover labels, in G.nodes() order.
node_x, node_y, node_text = [], [], []
for n in G.nodes():
    x, y = pos[n]
    node_x.append(x)
    node_y.append(y)
    vol = G.nodes[n].get("total_volume", 0)
    node_text.append(f"{n}<br>Total Volume: {vol}")

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode="markers+text",
    marker=dict(
        # Marker size grows with total_volume**0.3, with a floor of 5.
        size=[max(5, G.nodes[n].get("total_volume",0)**0.3) for n in G.nodes()],
        color="skyblue", line_width=1
    ),
    text=list(G.nodes()),
    textposition="bottom center",
    hovertext=node_text,
    hoverinfo="text"
)

fig4 = go.Figure([edge_trace, node_trace])
fig4.update_layout(
    title="Trade‑Flow Network (Counterparty as Nodes)",
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
)
fig4.show()


# ——————————————————————————————
# 5) Counterparty Imbalance Signal (plotted per symbol here)
# ——————————————————————————————
df_imb = pd.read_csv("strategy/round5/imbalance_signal.csv")


def _strip_facet_prefix(annotation):
    """Reduce a facet title such as 'symbol=XYZ' to the text after the last '='."""
    return annotation.update(text=annotation.text.split("=")[-1])


imbalance_line_options = dict(
    x="timestamp",
    y="weighted_side",
    facet_col="symbol",  # one panel per product
    facet_col_wrap=2,
    title="Imbalance Signal Over Time by Product",
    labels={
        "weighted_side": "Imbalance Signal",
        "timestamp":    "Timestamp",
        "symbol":       "Product"
    },
)
fig5 = px.line(df_imb, **imbalance_line_options)
fig5.update_layout(showlegend=False)
fig5.for_each_annotation(_strip_facet_prefix)
fig5.show()


# 6) Fill-distribution heatmap: products on the rows, counterparties on the columns.
df_fill = pd.read_csv("strategy/round5/fill_distribution.csv")
pivot = df_fill.pivot_table(
    index="counterparty",
    columns="symbol",
    values="share",
    aggfunc="sum",
    fill_value=0,
)
heatmap_labels = dict(x="Counterparty", y="Product", color="Fill Share")
fig6 = px.imshow(
    pivot.T,
    labels=heatmap_labels,
    title="Fill‑Share Distribution by Counterparty & Product",
)
fig6.show()


# 7) Temporal patterns: trade counts per hour bucket, one line per counterparty.
df_temp = pd.read_csv("strategy/round5/counterparty_time_activity.csv")
activity_line_options = dict(
    x="hour",
    y="num_trades",
    color="counterparty",
    facet_col="symbol",
    facet_col_wrap=2,
    title="Hourly Trade Counts by Counterparty & Product",
    labels={"num_trades":"# Trades","hour":"Hour of Day"},
)
fig7 = px.line(df_temp, **activity_line_options)
fig7.update_layout(legend=dict(title="Counterparty"))
fig7.show()


In [11]:
import pandas as pd
from collections import defaultdict

_MTM_COLUMNS = ['timestamp', 'participant', 'symbol', 'mtm_pnl', 'position', 'avg_cost_basis']


def _mtm_snapshot(ts, participant, symbol, state: dict, mid_price) -> dict:
    """Build one output record by marking ``state`` (position, cost) to ``mid_price``."""
    position = state['position']
    cost = state['cost']
    return {
        'timestamp': ts,
        'participant': participant,
        'symbol': symbol,
        # Market value of the open position minus the net cash paid for it.
        'mtm_pnl': position * mid_price - cost,
        'position': position,
        # For short or flipped positions this ratio is not a conventional
        # cost basis; it is reported as-is and is 0 when the position is flat.
        'avg_cost_basis': (cost / position) if position != 0 else 0,
    }


def compute_mtm_pnl(df_trades: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the Mark-to-Market (MtM) PnL for each participant after each trade.

    Trades are replayed in timestamp order. For every trade the buyer's position
    and net cost increase by ``quantity`` and ``quantity * trade_price`` and the
    seller's decrease by the same amounts. If the trade has a mid price, one
    record for the buyer and one for the seller is emitted with
    ``mtm_pnl = position * mid_price - cost``. Trades without a mid price still
    update the running state but emit no records.

    Args:
        df_trades: DataFrame containing trade data, MUST include columns:
                   'timestamp', 'buyer', 'seller', 'symbol',
                   'trade_price', 'quantity', 'mid_price'.
                   It does not need to be pre-sorted and is not modified.

    Returns:
        DataFrame with columns: 'timestamp', 'participant', 'symbol',
                                'mtm_pnl', 'position', 'avg_cost_basis',
        in replay order. When no record is produced (empty input or no usable
        mid price) an empty frame with exactly these columns is returned.
    """
    # mergesort is stable: trades sharing a timestamp are replayed in their
    # input order rather than in an arbitrary one.
    ordered = df_trades.sort_values(by='timestamp', kind='mergesort')

    # participant_state[participant][symbol] = {'position': ..., 'cost': ...}
    participant_state = defaultdict(lambda: defaultdict(lambda: {'position': 0, 'cost': 0.0}))
    pnl_records = []

    for trade in ordered.itertuples():
        qty = trade.quantity
        notional = qty * trade.trade_price

        buyer_state = participant_state[trade.buyer][trade.symbol]
        buyer_state['position'] += qty
        buyer_state['cost'] += notional

        seller_state = participant_state[trade.seller][trade.symbol]
        seller_state['position'] -= qty
        seller_state['cost'] -= notional

        # pd.isna also covers None, so one check is enough.
        if pd.isna(trade.mid_price):
            continue

        # Buyer first, then seller — both marked at the same mid price.
        for participant, state in ((trade.buyer, buyer_state), (trade.seller, seller_state)):
            pnl_records.append(
                _mtm_snapshot(trade.timestamp, participant, trade.symbol, state, trade.mid_price)
            )

    # Passing the column list keeps the schema stable even with zero records;
    # without it dropna(subset=...) raises KeyError on an empty frame.
    pnl_df = pd.DataFrame(pnl_records, columns=_MTM_COLUMNS)
    return pnl_df.dropna(subset=['mtm_pnl'])


In [None]:
# Order trades chronologically; mergesort is stable, so trades that share a
# timestamp keep their existing relative order.
df_all = df_all.sort_values(by='timestamp', kind='mergesort')

participant_pnl_df = compute_mtm_pnl(df_all)
print(participant_pnl_df.head())
participant_pnl_df.to_csv("strategy/round5/participant_pnl_df.csv", index=False)

# Final PnL per participant per symbol = the LAST record of each group.
# compute_mtm_pnl emits records in replay order, so tail(1) is the latest state.
# idxmax on 'timestamp' would return the FIRST record carrying the maximum
# timestamp, which is a stale state whenever a participant trades the same
# symbol more than once at that timestamp.
final_pnl = (
    participant_pnl_df
    .groupby(['participant', 'symbol'])
    .tail(1)
    .sort_values(['participant', 'symbol'])
)
print("\nFinal PnL:")
print(final_pnl)

# Total PnL per participant across all symbols, best first.
total_final_pnl = final_pnl.groupby('participant')['mtm_pnl'].sum().sort_values(ascending=False)
print("\nTotal Final PnL per Participant:")
print(total_final_pnl)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd # Make sure pandas is imported

# Per-symbol figures: every participant's MtM PnL on the primary axis and the
# symbol's mid price on a secondary axis.
participant_pnl_df = participant_pnl_df.sort_values(by=['participant', 'symbol', 'timestamp'])

if 'df_all' in locals():
    # One mid price per (timestamp, symbol), in time order, without missing values.
    prices_unique = (
        df_all[['timestamp', 'symbol', 'mid_price']]
        .drop_duplicates(subset=['timestamp', 'symbol'])
        .sort_values('timestamp')
        .dropna(subset=['mid_price'])
    )

    unique_symbols = participant_pnl_df['symbol'].unique()
    print(f"Generating separate PnL plots with price overlay for symbols: {list(unique_symbols)}")

    for symbol in unique_symbols:
        print(f"--- Processing symbol: {symbol} ---")
        symbol_pnl = participant_pnl_df[participant_pnl_df['symbol'] == symbol]
        symbol_prices = prices_unique[prices_unique['symbol'] == symbol]

        # Nothing to draw unless both the PnL and the price series have rows.
        print(f"PnL Data shape for {symbol}: {symbol_pnl.shape}")
        print(f"Price Data shape for {symbol}: {symbol_prices.shape}")
        if symbol_pnl.empty:
            print(f"Skipping plot for {symbol} as no PnL data was found.")
            continue
        if symbol_prices.empty:
            print(f"Skipping plot for {symbol} as no Price data was found.")
            continue

        fig = make_subplots(specs=[[{"secondary_y": True}]])

        # Primary axis: one PnL line per participant.
        participants = symbol_pnl['participant'].unique()
        print(f"Plotting PnL for {len(participants)} participants for {symbol}...")
        for participant in participants:
            participant_rows = symbol_pnl[symbol_pnl['participant'] == participant]
            pnl_line = go.Scatter(
                x=participant_rows['timestamp'],
                y=participant_rows['mtm_pnl'],
                mode='lines',
                name=f"{participant} (PnL)",
            )
            fig.add_trace(pnl_line, secondary_y=False)

        # Secondary axis: the symbol's mid price as a faint dashed line.
        print(f"Adding price trace for {symbol}...")
        price_line = go.Scatter(
            x=symbol_prices['timestamp'],
            y=symbol_prices['mid_price'],
            mode='lines',
            name=f"{symbol} Mid Price",
            line=dict(color='rgba(0,0,0,0.3)', dash='dash'),
        )
        fig.add_trace(price_line, secondary_y=True)

        print(f"Configuring layout for {symbol} plot...")
        fig.update_layout(
            title_text=f"Mark-to-Market PnL and Price for {symbol}",
            xaxis_title="Timestamp",
            legend_title_text='Participant / Series',
        )
        fig.update_yaxes(title_text="Participant MtM PnL", secondary_y=False)
        fig.update_yaxes(title_text=f"{symbol} Mid Price", secondary_y=True)

        # A rendering failure for one symbol is reported and the loop continues.
        print(f"Displaying plot for {symbol}...")
        try:
            fig.show()
            print(f"Successfully displayed plot for {symbol}.")
        except Exception as e:
            print(f"ERROR displaying plot for {symbol}: {e}")
else:
    print("Error: df_all not found. Please ensure the main analysis block creating df_all has been run.")


# --- Plot 2: Total PnL per Participant over Time ---

# px is used below but is only imported by an earlier plotting cell; importing
# it here keeps this cell runnable on its own.
import plotly.express as px

# Sort so that, within each (participant, symbol), rows are in time order.
# NOTE(review): this relies on pandas' multi-key sort keeping rows with equal
# keys in their existing relative order, so records that share a timestamp stay
# in the order compute_mtm_pnl emitted them — confirm for the pandas version in use.
pnl_ordered = participant_pnl_df.sort_values(['participant', 'symbol', 'timestamp'])

# Final state per (participant, symbol) = last row of each group. idxmax on
# 'timestamp' would pick the FIRST row at the maximum timestamp, i.e. a stale
# state when several trades share that timestamp.
last_pnl_per_symbol = pnl_ordered.groupby(['participant', 'symbol']).tail(1)

# Total *final* PnL per participant (summed over symbols).
total_final_pnl_agg = last_pnl_per_symbol.groupby('participant')['mtm_pnl'].sum().reset_index()

# Evolution over time: one row per (timestamp, participant), one column per
# symbol. aggfunc='last' keeps the latest running PnL when a participant has
# several records for the same symbol at one timestamp; the default ('mean')
# would average intermediate states.
pnl_pivot = pnl_ordered.pivot_table(
    index=['timestamp', 'participant'],
    columns='symbol',
    values='mtm_pnl',
    aggfunc='last'
)

# Carry each symbol's PnL forward until that participant's next record in it.
pnl_pivot_ffill = pnl_pivot.groupby(level='participant').ffill()

# Sum across symbols; symbols the participant has not traded yet contribute 0
# because sum() skips NaN.
total_pnl_over_time = pnl_pivot_ffill.sum(axis=1).reset_index(name='total_mtm_pnl')

# All participants are plotted; filter this frame first if the chart is too crowded.
plot_df_total = total_pnl_over_time

fig_pnl_total = px.line(
    plot_df_total,
    x="timestamp",
    y="total_mtm_pnl",
    color="participant",
    title="Total Mark-to-Market PnL Evolution by Participant",
    labels={"total_mtm_pnl": "Total MtM PnL", "timestamp": "Timestamp"}
)
fig_pnl_total.update_layout(showlegend=True)
fig_pnl_total.show()

# Bar chart of the final total PnL per participant, best first.
fig_pnl_final_bar = px.bar(
    total_final_pnl_agg.sort_values('mtm_pnl', ascending=False),
    x='participant',
    y='mtm_pnl',
    title='Total Final MtM PnL per Participant',
    labels={'mtm_pnl': 'Total Final PnL', 'participant': 'Participant'}
)
fig_pnl_final_bar.update_layout(xaxis_tickangle=-45)
fig_pnl_final_bar.show()



In [None]:
# Block 1: Load Data and Identify Traders

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict

# Reload every configured day with the notebook's load_and_merge; DAYS,
# PRICES_PATH and TRADES_PATH must already be defined by an earlier cell.
print("Loading and merging data for all specified days...")
all_days_data = []
for d in DAYS:
    print(f" Loading day {d}...")
    try:
        day_df = load_and_merge(d)
    except Exception as e:
        # A day that fails to load is reported and skipped.
        print(f"  Error loading data for day {d}: {e}")
    else:
        all_days_data.append(day_df)

if all_days_data:
    df_all = pd.concat(all_days_data, ignore_index=True)
    print(f"Loaded combined data. Shape: {df_all.shape}")
    print(df_all.head())

    # Every name that appears as a buyer or a seller, sorted.
    print("\nIdentifying unique traders...")
    buyers = df_all['buyer'].dropna().unique()
    sellers = df_all['seller'].dropna().unique()
    traders = sorted(set(buyers).union(sellers))
    print(f"Found {len(traders)} unique traders:")
    print(traders)

    # Distinct (timestamp, symbol, mid_price) rows in time order, used as the
    # price line in the later plotting cells.
    print("\nPreparing unique price data for plots...")
    prices_unique_df = (
        df_all[['timestamp', 'symbol', 'mid_price']]
        .drop_duplicates()
        .sort_values('timestamp')
        .dropna(subset=['mid_price'])
    )
    print(f"Prepared unique price data. Shape: {prices_unique_df.shape}")
else:
    print("Error: No data loaded. Cannot proceed.")

# Optional: Save df_all if needed for later cells, matching the notebook style
# df_all.to_csv("strategy/round5/df_all_reloaded_for_plotting.csv", index=False)

In [None]:
# Block 2 (Revised): Visualize Trades by Individual Trader (Dots on Mid-Price Line)

# Requires df_all, traders, and prices_unique_df from Block 1
# Requires plotly.graph_objects as go, plotly.express as px, make_subplots

if 'df_all' in locals() and 'traders' in locals() and 'prices_unique_df' in locals():
    print("\nGenerating plots for trades by individual trader (markers on mid-price)...")
    available_products = sorted(df_all['symbol'].unique())

    for product in available_products:
        print(f"  Plotting for Product: {product}...")
        # Trades of this product that have a mid price, and the product's price series.
        is_product_trade = (df_all['symbol'] == product) & (df_all['mid_price'].notna())
        product_trades = df_all[is_product_trade].copy()
        product_prices = prices_unique_df[prices_unique_df['symbol'] == product].copy()

        if product_trades.empty and product_prices.empty:
            print(f"    No trade or price data with valid mid_price for product {product}. Skipping plot.")
            continue

        fig = make_subplots()  # single y-axis

        # Mid-price line.
        if product_prices.empty:
            print(f"    No price data available for {product} line plot.")
        else:
            mid_line = go.Scatter(
                x=product_prices["timestamp"],
                y=product_prices['mid_price'],
                name="Mid Price",
                mode='lines',
                line={"color": "gray"},
            )
            fig.add_trace(mid_line)

        # Trade markers, drawn at the mid price of the trade's timestamp; the
        # actual trade price is shown on hover. All marker traces start hidden.
        if product_trades.empty:
            print(f"    No trade data with valid mid_price available for {product} markers.")
        else:
            palette = px.colors.qualitative.Plotly
            for i, trader in enumerate(traders):
                # One colour per trader, cycling through the palette.
                color = palette[i % len(palette)]

                # Buys (triangle up) first, then sells (triangle down).
                trace_specs = (
                    ("buyer", f"Buyer = {trader}", "triangle-up"),
                    ("seller", f"Seller = {trader}", "triangle-down"),
                )
                for role, legend_name, marker_shape in trace_specs:
                    role_trades = product_trades[product_trades[role] == trader]
                    if role_trades.empty:
                        continue
                    fig.add_trace(go.Scatter(
                        x=role_trades["timestamp"],
                        y=role_trades["mid_price"],
                        mode="markers",
                        name=legend_name,
                        visible="legendonly",
                        marker={"color": color, "symbol": marker_shape, "size": 7},
                        hovertext=role_trades['trade_price'].apply(lambda p: f'Trade Price: {p:.2f}'),
                        hoverinfo='x+text+name'
                    ))

        fig.update_layout(
            title_text=f"Product: {product} - Mid Price and Trades (by Trader)",
            xaxis_title="Timestamp",
            yaxis_title="Price Level",
            legend_title="Series / Trades (Click to Show/Hide)"
        )
        fig.show()
else:
    print("Error: Required DataFrames (df_all, prices_unique_df) or 'traders' list not found. Please run Block 1 first.")


In [None]:
# Block 3 (Revised): Visualize Trades by Buyer-Seller Pair (Dots on Mid-Price Line)

# Requires df_all, traders, and prices_unique_df from Block 1
# Requires plotly.graph_objects as go, plotly.express as px, make_subplots

if 'df_all' in locals() and 'traders' in locals() and 'prices_unique_df' in locals():
    print("\nGenerating plots for trades by buyer-seller pair (markers on mid-price)...")
    available_products = sorted(df_all['symbol'].unique())

    for product in available_products:
        print(f"  Plotting for Product: {product}...")
        # Trades of this product that have a mid price, and the product's price series.
        is_product_trade = (df_all['symbol'] == product) & (df_all['mid_price'].notna())
        product_trades = df_all[is_product_trade].copy()
        product_prices = prices_unique_df[prices_unique_df['symbol'] == product].copy()

        if product_trades.empty and product_prices.empty:
            print(f"    No trade or price data with valid mid_price for product {product}. Skipping plot.")
            continue

        fig = make_subplots()  # single y-axis

        # Mid-price line.
        if product_prices.empty:
            print(f"    No price data available for {product} line plot.")
        else:
            mid_line = go.Scatter(
                x=product_prices["timestamp"],
                y=product_prices['mid_price'],
                name="Mid Price",
                mode='lines',
                line={"color": "gray"},
            )
            fig.add_trace(mid_line)

        # One hidden marker trace per ordered (buyer, seller) pair. The number of
        # pairs plotted so far also selects the next palette colour.
        pairs_plotted = 0

        if product_trades.empty:
            print(f"    No trade data with valid mid_price available for {product} markers.")
        else:
            palette = px.colors.qualitative.Plotly
            # Only pairs whose trade count lies within these bounds are drawn.
            min_trades_for_pair = 2
            max_trades_for_pair = 200

            for buyer in traders:
                for seller in traders:
                    if buyer == seller:
                        # A trader paired with itself is skipped.
                        continue

                    is_pair = (product_trades["buyer"] == buyer) & (product_trades["seller"] == seller)
                    pair_trades = product_trades[is_pair]

                    n_pair_trades = len(pair_trades)
                    if n_pair_trades < min_trades_for_pair or n_pair_trades > max_trades_for_pair:
                        continue

                    color = palette[pairs_plotted % len(palette)]
                    fig.add_trace(go.Scatter(
                        x=pair_trades["timestamp"],
                        y=pair_trades["mid_price"],
                        mode="markers",
                        name=f"{buyer} -> {seller}",
                        visible="legendonly",
                        marker={"color": color, "size": 6},
                        hovertext=pair_trades['trade_price'].apply(lambda p: f'Trade Price: {p:.2f}'),
                        hoverinfo='x+text+name'
                    ))
                    pairs_plotted += 1

            print(f"    Plotted {pairs_plotted} buyer-seller pairs for {product} (filtered by trade count).")

        fig.update_layout(
            title_text=f"Product: {product} - Mid Price and Trades (by Buyer/Seller Pair)",
            xaxis_title="Timestamp",
            yaxis_title="Price Level",
            legend_title="Series / Buyer -> Seller Pairs (Click to Show/Hide)"
        )
        fig.show()
else:
    print("Error: Required DataFrames (df_all, prices_unique_df) or 'traders' list not found. Please run Block 1 first.")
