In [1]:
!pip install wandb pandas matplotlib scipy plotly



In [2]:
import wandb
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from scipy import stats

ENTITY = "no-organization-for-signup"

# Baseline
BASELINE_TRAINING = "hypergrid_v5"
BASELINE_EVALUATE = "hypergrid_v5_eval"
# PIC
NEW_TRAINING = "hypergrid_v6"
NEW_EVALUATE = "hypergrid_v6_eval"

## Localize Data

In [3]:
## Download Training Information
time_series_keys = [
    '_step', 
    'time_this_iter_s', 
    'time_total_s', 
    'timers/env_runner_sampling_timer', 
    'env_runners/episode_return_mean', 
    'env_runners/episode_return_min', 
    'env_runners/episode_return_max', 
    'config/induced_hom',
]

# Collect information from one run
def export_run(run):
    try:
        # Save entire config data
        config = {k: v for k, v in run.config.items() if not k.startswith('_')}
        extra_data = {
            'id': run.id,
            'name': run.name,
            'created_at': run.created_at,
            'state': run.state,
            # Run aggregate metrics
            'avg_ram_use': np.mean(run.history()["perf/ram_util_percent"]),
            'avg_cpu_use': np.mean(run.history()["perf/cpu_util_percent"]),
        }
        # Combine summary and config into a single dictionary
        run_data = {**extra_data, **config}

        # Add more run metadata
        hist = run.history()[time_series_keys]
        hist['id'] = run.id
        hist['sensor_conf'] = run.config['sensor_conf'] 

        return run_data, hist
    except Exception as e:
        return f"Error exporting run {run.id}: {e}"

# Initialize W&B API
def pull_training(target):
    print(f"Pulling training data from: {target}")
    api = wandb.Api()
    runs = api.runs(f'{ENTITY}/{target}')

    # Initialize lists to hold run data and history
    runs_data = []
    histories = []

    # Set up the ThreadPoolExecutor to parallelize the process
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit export tasks to the executor
        for run_data, history in executor.map(export_run, runs):
            runs_data.append(run_data)
            histories.append(history)
    print("Data Collected. Saving Data Now.")

    # Convert the list of dictionaries to a DataFrame
    runs_df = pd.DataFrame(runs_data)
    hist_df = pd.concat(histories, keys=[f'run_{i}' for i in range(len(histories))])

    # Export the DataFrame to CSV
    runs_df.to_csv(f"{target}.csv", index=False)
    hist_df.to_csv(f"{target}_history.csv", index=True)

    print(f"Data has been successfully exported to '{target}.csv'.")

for target in [BASELINE_TRAINING, NEW_TRAINING]:
    pull_training(target)

[34m[1mwandb[0m: Currently logged in as: [33mbhosley[0m ([33mno-organization-for-signup[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Pulling training data from: hypergrid_v5
Data Collected. Saving Data Now.
Data has been successfully exported to 'hypergrid_v5.csv'.
Pulling training data from: hypergrid_v6
Data Collected. Saving Data Now.
Data has been successfully exported to 'hypergrid_v6.csv'.


In [4]:
## Import Evaluation Data
eval_metrics_keys = [
    'metrics/returns/policy_0', 
    'metrics/returns/policy_1', 
    'metrics/returns/policy_2',
    'metrics/returns/policy_3',
    'metrics/returns/min',
    'metrics/returns/max',
    'metrics/returns/mean',
]

# Collect information from one run
def export_eval(run):
    try:
        evals = run.history()[eval_metrics_keys]
        evals['eval_type'] = run.config['eval_type']
        evals['policy_type'] = run.config['policy_type']
        evals['sensor_config'] = run.config['sensor_config']
        return evals
    except Exception as e:
        return f"Error exporting run {run.id}: {e}"

def pull_evaluation(target):
    # Initialize W&B API
    api = wandb.Api()
    evals = api.runs(f'{ENTITY}/{target}')

    # Initialize lists to hold eval data
    evals_data = []

    # Set up the ThreadPoolExecutor to parallelize the process
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit export tasks to the executor
        for eval_data in executor.map(export_eval, evals):
            evals_data.append(eval_data)

    # Convert the list of dictionaries to a DataFrame
    evals_df = pd.concat(evals_data, keys=[f'eval_{i}' for i in range(len(evals_data))])

    # Export the DataFrame to CSV
    evals_df.to_csv(f"{target}.csv", index=False)

    print(f"Data has been successfully exported to '{target}.csv'.")

for target in [BASELINE_EVALUATE, NEW_EVALUATE]:
    pull_evaluation(target)

Data has been successfully exported to 'hypergrid_v5_eval.csv'.
Data has been successfully exported to 'hypergrid_v6_eval.csv'.


Hardware

In [10]:
import plotly.express as px
import plotly.graph_objects as go

# Known Metrics: "time_this_iter_s", "time_total_s", 
# "timers/env_runner_sampling_timer", "env_runners/episode_return_min", 
# "env_runners/episode_return_max", "env_runners/episode_return_mean"
metric = "env_runners/episode_return_mean"

# df_tr = pd.read_csv(f"{TRAINING}_history.csv")
li = []

# for f in [BASELINE_EVALUATE, NEW_EVALUATE]:
#     df = pd.read_csv(f"{f}.csv", header=0)
#     li.append(df)

df = pd.read_csv(f"{BASELINE_TRAINING}.csv", header=0)
li.append(df)
df = pd.read_csv(f"{NEW_TRAINING}.csv", header=0)
df["policy_type"] = "pic"
li.append(df)
df_tr = pd.concat(li, ignore_index=True)

def hex_to_rgba(hex_color, alpha=0.20):
    hex_color = hex_color.lstrip("#")
    r, gg, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
    return f"rgba({r},{gg},{b},{alpha})"

# ---- Aggregate mean + band (std or CI) ----

g = (
    df_tr[["sensor_conf", "config/induced_hom", "_step", metric]]
    .groupby(["sensor_conf", "config/induced_hom", "_step"], observed=True)
    .agg(
        mean=(metric, "mean"),
        n=(metric, "size"),
        std=(metric, "std")
    )
    .reset_index()
)
g["std"] = g["std"].fillna(0.0)
g["se"] = g["std"] / np.sqrt(g["n"].clip(lower=1))

# Choose your band: 1*std (wider, intuitive) or 1.96*se (95% CI)
USE_STD_BAND = True
g["band"] = g["std"] if USE_STD_BAND else 1.96 * g["se"]
g["lower"] = g["mean"] - g["band"]
g["upper"] = g["mean"] + g["band"]

# ---- Figure (mean lines ± CI band) ----

fig = px.line(
    g,
    x="_step",
    y="mean",
    color="config/induced_hom",
    facet_row="sensor_conf",
)

# ---- Add Error Bands ---- 

# Match Facet Order
cats_order = list(pd.unique(g["sensor_conf"]))
cats_order.reverse()

# Match Correct Colors
colorway = fig.layout.colorway or px.colors.qualitative.Plotly
color_levels = list(g["config/induced_hom"].drop_duplicates())
color_map = {lvl: colorway[i % len(colorway)] for i, lvl in enumerate(color_levels)}
fill_map  = {lvl: hex_to_rgba(color_map[lvl], 0.20) for lvl in color_levels}

# Add bands by sensor config x algo
for (conf, ih), df_grp in g.groupby(["sensor_conf", "config/induced_hom"], observed=True):
    df_grp = df_grp.sort_values("_step")
    fig.add_trace(
        go.Scatter(
            x=df_grp["_step"], y=df_grp["upper"],
            mode="lines", line=dict(width=0),
            showlegend=False, hoverinfo="skip", legendgroup=f"{ih}",
        ),
        col=1, row=cats_order.index(conf) + 1,
    )
    fig.add_trace(
        go.Scatter(
            x=df_grp["_step"], y=df_grp["lower"],
            mode="lines", line=dict(width=0),
            fill="tonexty", 
            fillcolor=fill_map[ih],
            showlegend=False, hoverinfo="skip", legendgroup=f"{ih}",
        ),
        col=1, row=cats_order.index(conf) + 1,
    )
# Per Facet updates
fig.for_each_annotation(lambda a: a.update(
    text=(
        a.text.split("=")[-1]   # Remove "sensor_conf=__"
        .replace("_"," ").title() # Make title
    ), textangle = 45))
fig.for_each_yaxis(lambda y: y.update(title=""))

# Overall updates
fig.update_layout(
    template = "plotly_white",
    title_text = "Mean Episode Return",
    margin = dict(t=40, r=10, b=10, l=10),
    xaxis_title = "Training Step",
    legend_title_text = "Implicit Indication",
    legend=dict(
        orientation="h",
        yanchor="top",
        y=1.1,
        xanchor="right",
        x=1.1
    ),
    width=750,
    height=600,
)
fig.update_xaxes(tickangle=45)

fig.write_image(f"training_curves.png")
fig.show()

KeyError: "['config/induced_hom', '_step', 'env_runners/episode_return_mean'] not in index"

In [7]:
import plotly.express as px
import plotly.graph_objects as go

li = []

# for f in [BASELINE_EVALUATE, NEW_EVALUATE]:
#     df = pd.read_csv(f"{f}.csv", header=0)
#     li.append(df)

df = pd.read_csv(f"{BASELINE_EVALUATE}.csv", header=0)
li.append(df)
df = pd.read_csv(f"{NEW_EVALUATE}.csv", header=0)
df["policy_type"] = "pic"
li.append(df)

df_ev = pd.concat(li, ignore_index=True)

TOP_K  = 15          # keep top-K runs per (eval_type, policy_type, sensor_config)
N_AGENTS = 4/2
# M: ['metrics/returns/policy_0', 'metrics/returns/policy_1', 'metrics/returns/policy_2', 'metrics/returns/policy_3', 'metrics/returns/min', 'metrics/returns/max', 'metrics/returns/mean']
METRIC = 'metrics/returns/mean'
FACTORS = ["eval_type", "policy_type", "sensor_config"]

# Sort by shifted metric and keep top-K per bucket
df_top = (
    df_ev.sort_values(METRIC, ascending=False)
      .groupby(FACTORS, group_keys=False)
      .head(TOP_K)
)

# Fix Metric
df_top["metric"] = (pd.to_numeric(df_top[METRIC], errors="coerce") + 5) 
df_top.loc[df_top["policy_type"] == "induced_hom", "metric"] *= N_AGENTS

# Set Orders
eval_order = ['baseline', 'agent_loss', 'sensor_degradation', 'sensor_improvement', 
    'degrade_coverage', 'improve_coverage', 'shuffled_set', 'novel_span']
sense_order = ["complete", "intersecting_span", "disjoint_span", "incomplete"]
df_top["eval_type"] = pd.Categorical(
    df_top["eval_type"], categories=eval_order, ordered=True)
df_top["sensor_config"] = pd.Categorical(
    df_top["sensor_config"], categories=sense_order, ordered=True)

# Group and Summarize
summary = (
    df_top
    .groupby(FACTORS)["metric"]
    .agg(mean="mean", n="count", sd=lambda x: x.std(ddof=1))
    .reset_index())
summary["se"]   = summary["sd"] / np.sqrt(summary["n"].clip(lower=1))
summary["ci95"] = np.where(summary["n"] > 1, 1.96 * summary["se"], np.nan)

# Adjustments for graphics
summary["sensor_config"] = (
    summary["sensor_config"].map(lambda t: t.replace("_"," ").title())
)
summary["eval_type"] = (
    summary["eval_type"].map(lambda t: t.replace("_"," ").title())
)
summary = summary.replace({"induced_hom": "Implicit Ind.", "default_het": "Heterogeneous"})
invalid_pairs = [
    ["Complete", "Sensor Improvement"],
    ["Complete", "Improve Coverage"],
    ["Intersecting Span", "Improve Coverage"],
    ["Disjoint Span", "Improve Coverage"],
]
for sc,et in invalid_pairs:
    summary = summary[~((summary["eval_type"] == et) & (summary["sensor_config"] == sc))]
# summary





In [8]:
summary

Unnamed: 0,eval_type,policy_type,sensor_config,mean,n,sd,se,ci95
0,Baseline,Heterogeneous,Complete,5.015000,15,0.264330,0.068250,0.133770
1,Baseline,Heterogeneous,Intersecting Span,4.341667,15,0.324439,0.083770,0.164189
2,Baseline,Heterogeneous,Disjoint Span,4.271667,15,0.286546,0.073986,0.145012
3,Baseline,Heterogeneous,Incomplete,4.751667,15,0.417418,0.107777,0.211243
4,Baseline,Implicit Ind.,Complete,11.793333,15,1.471074,0.379830,0.744466
...,...,...,...,...,...,...,...,...
91,Novel Span,Implicit Ind.,Incomplete,5.865000,15,1.950819,0.503699,0.987251
92,Novel Span,pic,Complete,4.174167,15,1.048537,0.270731,0.530633
93,Novel Span,pic,Intersecting Span,2.070000,15,0.872264,0.225217,0.441426
94,Novel Span,pic,Disjoint Span,2.305000,15,0.631728,0.163111,0.319699


In [9]:
## Bar Comparisons
fig = px.bar(
    summary,
    x="sensor_config",
    y="mean",
    color="policy_type",
    barmode="group",
    facet_row="eval_type",
    error_y="ci95",
    hover_data=["n","se"],
    title="Evaluation of Policies under Different Training Conditions"
)
fig.for_each_annotation(
    # Remove "sensor_conf=__"
    lambda a: a.update(text=(a.text.split("=")[-1]), textangle = 0)
)

fig.update_layout(
    legend=dict(
        x=1.02,   # just outside the plotting area
        y=1.12,
        xanchor="left",
        yanchor="top"
    ),
    xaxis_title = "Sensor Config",
    legend_title_text = "Policies",
    width=800, height=800,
)
fig.for_each_yaxis(lambda y: y.update(title=""))
# fig.write_image(f"eval_scale.png")
fig.show()

In [18]:
!pip show nbformat

Name: nbformat
Version: 5.10.4
Summary: The Jupyter Notebook format
Home-page: https://jupyter.org
Author: 
Author-email: Jupyter Development Team <jupyter@googlegroups.com>
License: BSD 3-Clause License

- Copyright (c) 2001-2015, IPython Development Team
- Copyright (c) 2015-, Jupyter Development Team

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior writte

In [13]:
!pip install --upgrade kaleido

Collecting kaleido
  Using cached kaleido-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting choreographer>=1.1.1 (from kaleido)
  Using cached choreographer-1.2.1-py3-none-any.whl.metadata (6.8 kB)
Collecting logistro>=1.0.8 (from kaleido)
  Using cached logistro-2.0.1-py3-none-any.whl.metadata (3.9 kB)
Collecting orjson>=3.10.15 (from kaleido)
  Downloading orjson-3.11.5-cp313-cp313-macosx_15_0_arm64.whl.metadata (41 kB)
Collecting pytest-timeout>=2.4.0 (from kaleido)
  Using cached pytest_timeout-2.4.0-py3-none-any.whl.metadata (20 kB)
Collecting simplejson>=3.19.3 (from choreographer>=1.1.1->kaleido)
  Using cached simplejson-3.20.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting pytest>=7.0.0 (from pytest-timeout>=2.4.0->kaleido)
  Downloading pytest-9.0.2-py3-none-any.whl.metadata (7.6 kB)
Collecting iniconfig>=1.0.1 (from pytest>=7.0.0->pytest-timeout>=2.4.0->kaleido)
  Using cached iniconfig-2.3.0-py3-none-any.whl.metadata (2.5 kB)
Collecting pluggy<2,>=1.5 (fro