## Data Analysis

In [1]:
import wandb
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from scipy import stats

# W&B entity that owns the projects; combined with a project name as "<entity>/<project>" in api.runs().
ENTITY = "no-organization-for-signup"
# W&B project holding the training runs; also used as the stem of the exported training CSV file names.
TRAINING = "hypergrid_v5"
# W&B project holding the evaluation runs; also used as the stem of the exported evaluation CSV file name.
EVALUATE = "hypergrid_v5_eval"

## Localize Data

In [2]:
## Download Training Information
# Columns kept from each run's history for the per-step training export.
time_series_keys = [
    '_step', 
    'time_this_iter_s', 
    'time_total_s', 
    'timers/env_runner_sampling_timer', 
    'env_runners/episode_return_mean', 
    'env_runners/episode_return_min', 
    'env_runners/episode_return_max', 
    'config/induced_hom',
]

# Collect information from one run
def export_run(run):
    """Export one training run as a ``(run_data, hist)`` pair.

    Args:
        run: Object exposing ``id``, ``name``, ``created_at``, ``state``,
            a ``config`` mapping and a ``history()`` method that returns a
            DataFrame (a W&B run in this notebook).

    Returns:
        On success, a tuple ``(run_data, hist)``:
          * ``run_data`` - dict with run metadata, the mean RAM/CPU
            utilisation, and every config entry whose key does not start
            with an underscore (config entries win on key collisions).
          * ``hist`` - DataFrame with the ``time_series_keys`` columns plus
            ``id`` and ``sensor_conf`` columns identifying the run.
        On failure, a ``str`` of the form ``"Error exporting run <id>: <err>"``;
        callers must check for this before unpacking.
    """
    try:
        # Fetch the history once and reuse it: every call to run.history()
        # is a separate request, and the original made three per run.
        history = run.history()

        # Save entire config data
        config = {k: v for k, v in run.config.items() if not k.startswith('_')}
        extra_data = {
            'id': run.id,
            'name': run.name,
            'created_at': run.created_at,
            'state': run.state,
            # Run aggregate metrics
            'avg_ram_use': np.mean(history["perf/ram_util_percent"]),
            'avg_cpu_use': np.mean(history["perf/cpu_util_percent"]),
        }
        # Combine summary and config into a single dictionary
        run_data = {**extra_data, **config}

        # Copy the column subset so the assignments below write to an
        # independent frame instead of a slice of `history`.
        hist = history[time_series_keys].copy()
        hist['id'] = run.id
        hist['sensor_conf'] = run.config['sensor_conf']

        return run_data, hist
    except Exception as e:
        return f"Error exporting run {run.id}: {e}"

# Initialize W&B API
api = wandb.Api()
runs = api.runs(f'{ENTITY}/{TRAINING}')

# Initialize lists to hold run data and history
runs_data = []
histories = []
# Error messages for runs that could not be exported
export_errors = []

# Set up the ThreadPoolExecutor to parallelize the process
with ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map yields results in the same order as `runs`
    for result in executor.map(export_run, runs):
        # export_run returns an error string instead of a tuple on failure;
        # unpacking it directly would raise a misleading ValueError and
        # abort the whole export, so failed runs are collected and skipped.
        if isinstance(result, str):
            export_errors.append(result)
            continue
        run_data, history = result
        runs_data.append(run_data)
        histories.append(history)

# Surface skipped runs instead of dropping them silently
for message in export_errors:
    print(message)

# pd.concat cannot build a frame from an empty list; fail with a clear message
if not histories:
    raise RuntimeError(f"No runs could be exported from '{ENTITY}/{TRAINING}'.")

# Convert the per-run dictionaries / history frames to DataFrames
runs_df = pd.DataFrame(runs_data)
hist_df = pd.concat(histories, keys=[f'run_{i}' for i in range(len(histories))])

# Export the DataFrames to CSV (run-level table and per-step history)
runs_df.to_csv(f"{TRAINING}.csv", index=False)
hist_df.to_csv(f"{TRAINING}_history.csv", index=True)

print(f"Data has been successfully exported to '{TRAINING}.csv'.")

[34m[1mwandb[0m: Currently logged in as: [33mbhosley[0m ([33mno-organization-for-signup[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Data has been successfully exported to 'hypergrid_v5.csv'.


In [160]:
## Import Evaluation Data
# Return columns kept from each evaluation run's history.
eval_metrics_keys = [
    'metrics/returns/policy_0', 
    'metrics/returns/policy_1', 
    'metrics/returns/policy_2',
    'metrics/returns/policy_3',
    'metrics/returns/min',
    'metrics/returns/max',
    'metrics/returns/mean',
]

# Collect information from one run
def export_eval(run):
    """Export the evaluation metrics of one run, labelled with its config.

    Args:
        run: Object exposing ``id``, a ``config`` mapping containing
            ``eval_type``, ``policy_type`` and ``sensor_config``, and a
            ``history()`` method returning a DataFrame (a W&B run in this
            notebook).

    Returns:
        On success, a DataFrame with the ``eval_metrics_keys`` columns plus
        ``eval_type``, ``policy_type`` and ``sensor_config`` columns.
        On failure, a ``str`` of the form ``"Error exporting run <id>: <err>"``;
        callers must filter these out before concatenating.
    """
    try:
        # Copy the column subset so the label columns are written to an
        # independent frame (avoids chained-assignment warnings). The local
        # is named `eval_hist` to avoid shadowing the notebook-level `evals`.
        eval_hist = run.history()[eval_metrics_keys].copy()
        eval_hist['eval_type'] = run.config['eval_type']
        eval_hist['policy_type'] = run.config['policy_type']
        eval_hist['sensor_config'] = run.config['sensor_config']
        return eval_hist
    except Exception as e:
        return f"Error exporting run {run.id}: {e}"

# Initialize W&B API
api = wandb.Api()
evals = api.runs(f'{ENTITY}/{EVALUATE}')

# Initialize lists to hold eval data
evals_data = []
# Error messages for runs that could not be exported
eval_errors = []

# Set up the ThreadPoolExecutor to parallelize the process
with ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map yields results in the same order as `evals`
    for eval_data in executor.map(export_eval, evals):
        # export_eval returns an error string on failure; passing a string
        # to pd.concat raises TypeError, so failed runs are set aside.
        if isinstance(eval_data, str):
            eval_errors.append(eval_data)
            continue
        evals_data.append(eval_data)

# Surface skipped runs instead of dropping them silently
for message in eval_errors:
    print(message)

# pd.concat cannot build a frame from an empty list; fail with a clear message
if not evals_data:
    raise RuntimeError(f"No runs could be exported from '{ENTITY}/{EVALUATE}'.")

# Stack the per-run DataFrames into a single DataFrame
evals_df = pd.concat(evals_data, keys=[f'eval_{i}' for i in range(len(evals_data))])

# Export the DataFrame to CSV (index=False: the eval_<i> keys are not written)
evals_df.to_csv(f"{EVALUATE}.csv", index=False)

print(f"Data has been successfully exported to '{EVALUATE}.csv'.")

Data has been successfully exported to 'hypergrid_v5_eval.csv'.


## Hardware Utilization

In [22]:
# Compare average RAM / CPU utilisation between runs with and without
# induced_hom, using three two-sample tests per metric.
hard_df = pd.read_csv(f"{TRAINING}.csv")
metrics = ["avg_ram_use", "avg_cpu_use"]

# Row masks for the two groups being compared
without_ih = hard_df["induced_hom"] == False
with_ih = hard_df["induced_hom"] == True

# (output template, scipy two-sample test) pairs, printed in this order
two_sample_tests = (
    ("t = {:.3f}, p = {:.3g}", stats.ttest_ind),
    ("U = {}, p = {:.3g}", stats.mannwhitneyu),
    ("KS = {:.3f}, p = {:.3g}", stats.ks_2samp),
)

for metric in metrics:
    samples = [hard_df.loc[without_ih, metric], hard_df.loc[with_ih, metric]]
    print(f"\nFor {metric}, the comparitave stats are:")
    for template, run_test in two_sample_tests:
        # Each result unpacks to (statistic, p-value)
        print(template.format(*run_test(*samples)))


For avg_ram_use, the comparitave stats are:
t = -1.294, p = 0.197
U = 3438.0, p = 0.161
KS = 0.153, p = 0.212

For avg_cpu_use, the comparitave stats are:
t = 0.560, p = 0.576
U = 4054.0, p = 0.687
KS = 0.107, p = 0.633


## Training

In [185]:
import plotly.express as px
import plotly.graph_objects as go

# Per-step training history written by the download cell.
df_tr = pd.read_csv(f"{TRAINING}_history.csv")

# Column to plot. Known metrics:
#   "time_this_iter_s", "time_total_s", "timers/env_runner_sampling_timer",
#   "env_runners/episode_return_min", "env_runners/episode_return_max",
#   "env_runners/episode_return_mean"
metric = "env_runners/episode_return_mean"

def hex_to_rgba(hex_color, alpha=0.20):
    """Convert a hex colour string to an ``rgba(r,g,b,alpha)`` string.

    Args:
        hex_color: Colour such as ``"#636EFA"`` or ``"636EFA"``; the leading
            ``#`` is optional. Three-digit shorthand (``"#abc"``) is expanded
            to its six-digit form (``"#aabbcc"``).
        alpha: Opacity written verbatim into the result.

    Returns:
        A string of the form ``"rgba(R,G,B,alpha)"`` with integer channels.

    Raises:
        ValueError: If the string does not contain valid hexadecimal digits.
    """
    hex_color = hex_color.lstrip("#")
    if len(hex_color) == 3:
        # Shorthand form: double each digit ("abc" -> "aabbcc")
        hex_color = "".join(digit * 2 for digit in hex_color)
    r, gg, b = (int(hex_color[i:i+2], 16) for i in (0, 2, 4))
    return f"rgba({r},{gg},{b},{alpha})"

# ---- Aggregate mean + band (std or CI) ----
# For every (sensor_conf, induced_hom, step) combination, compute the mean of
# `metric`, the number of rows in the group, and the sample standard deviation.

g = (
    df_tr[["sensor_conf", "config/induced_hom", "_step", metric]]
    .groupby(["sensor_conf", "config/induced_hom", "_step"], observed=True)
    .agg(
        mean=(metric, "mean"),
        n=(metric, "size"),
        std=(metric, "std")
    )
    .reset_index()
)
# std is NaN where it cannot be computed (e.g. a single row); use a zero-width band there
g["std"] = g["std"].fillna(0.0)
# Standard error of the mean; the clip guards against dividing by zero
g["se"] = g["std"] / np.sqrt(g["n"].clip(lower=1))

# Choose your band: 1*std (wider, intuitive) or 1.96*se (95% CI)
USE_STD_BAND = True
g["band"] = g["std"] if USE_STD_BAND else 1.96 * g["se"]
g["lower"] = g["mean"] - g["band"]
g["upper"] = g["mean"] + g["band"]

# ---- Figure (mean lines ± band) ----
# One facet row per sensor_conf, one coloured line per induced_hom value.

fig = px.line(
    g,
    x="_step",
    y="mean",
    color="config/induced_hom",
    facet_row="sensor_conf",
)

# ---- Add Error Bands ---- 

# Match Facet Order
# NOTE(review): the reversal presumably reflects plotly express numbering
# facet rows from the bottom of the figure - confirm against the rendered plot.
cats_order = list(pd.unique(g["sensor_conf"]))
cats_order.reverse()

# Match Correct Colors
# Rebuild the level -> colour mapping by cycling through the colorway in
# order of first appearance of each induced_hom value.
# NOTE(review): assumes px.line assigned its line colours the same way - confirm.
colorway = fig.layout.colorway or px.colors.qualitative.Plotly
color_levels = list(g["config/induced_hom"].drop_duplicates())
color_map = {lvl: colorway[i % len(colorway)] for i, lvl in enumerate(color_levels)}
fill_map  = {lvl: hex_to_rgba(color_map[lvl], 0.20) for lvl in color_levels}

# Add bands by sensor config x algo
# Each band is two traces: an invisible upper-bound line, followed by an
# invisible lower-bound line that fills up to the previous trace ("tonexty").
for (conf, ih), df_grp in g.groupby(["sensor_conf", "config/induced_hom"], observed=True):
    df_grp = df_grp.sort_values("_step")
    fig.add_trace(
        go.Scatter(
            x=df_grp["_step"], y=df_grp["upper"],
            mode="lines", line=dict(width=0),
            showlegend=False, hoverinfo="skip", legendgroup=f"{ih}",
        ),
        col=1, row=cats_order.index(conf) + 1,
    )
    fig.add_trace(
        go.Scatter(
            x=df_grp["_step"], y=df_grp["lower"],
            mode="lines", line=dict(width=0),
            fill="tonexty", 
            fillcolor=fill_map[ih],
            showlegend=False, hoverinfo="skip", legendgroup=f"{ih}",
        ),
        col=1, row=cats_order.index(conf) + 1,
    )
# Per Facet updates
# Turn each facet label into a title-cased name and rotate it 45 degrees.
fig.for_each_annotation(lambda a: a.update(
    text=(
        a.text.split("=")[-1]   # Remove "sensor_conf=__"
        .replace("_"," ").title() # Make title
    ), textangle = 45))
# Blank the per-facet y-axis titles
fig.for_each_yaxis(lambda y: y.update(title=""))

# Overall updates
fig.update_layout(
    template = "plotly_white",
    title_text = "Mean Episode Return",
    margin = dict(t=40, r=10, b=10, l=10),
    xaxis_title = "Training Step",
    legend_title_text = "Implicit Indication",
    legend=dict(
        orientation="h",
        yanchor="top",
        y=1.1,
        xanchor="right",
        x=1.1
    ),
    width=750,
    height=600,
)
fig.update_xaxes(tickangle=45)

# Save a static copy of the figure, then render it inline
fig.write_image(f"training_curves.png")
fig.show()

## Evaluation

- [ ] Pull performance metrics

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Evaluation rows written by the evaluation download cell.
df_ev = pd.read_csv(f"{EVALUATE}.csv")

TOP_K  = 15          # keep the top-K rows per (eval_type, policy_type, sensor_config)
# Evaluates to 2.0; multiplies the induced_hom metric below.
# NOTE(review): confirm why this is 4/2 while the heatmap cell divides by 4.
N_AGENTS = 4/2
# M: ['metrics/returns/policy_0', 'metrics/returns/policy_1', 'metrics/returns/policy_2', 'metrics/returns/policy_3', 'metrics/returns/min', 'metrics/returns/max', 'metrics/returns/mean']
METRIC = 'metrics/returns/mean'
FACTORS = ["eval_type", "policy_type", "sensor_config"]

# Rank rows by the raw metric and keep the top-K per bucket. The shift and
# scaling are applied afterwards, within the selected rows only.
# .copy() makes df_top an independent frame so the column assignments below
# do not act on a slice of df_ev (chained-assignment warning).
df_top = (
    df_ev.sort_values(METRIC, ascending=False)
      .groupby(FACTORS, group_keys=False)
      .head(TOP_K)
      .copy()
)

# Fix Metric: shift by +5, then scale the induced_hom rows by N_AGENTS
df_top["metric"] = (pd.to_numeric(df_top[METRIC], errors="coerce") + 5)
df_top.loc[df_top["policy_type"] == "induced_hom", "metric"] *= N_AGENTS

# Set Orders (ordered categoricals fix the order used when grouping/plotting;
# values not listed here become NaN)
eval_order = ['baseline', 'agent_loss', 'sensor_degradation', 'sensor_improvement', 
    'degrade_coverage', 'improve_coverage', 'shuffled_set', 'novel_span']
sense_order = ["complete", "intersecting_span", "disjoint_span", "incomplete"]
df_top["eval_type"] = pd.Categorical(
    df_top["eval_type"], categories=eval_order, ordered=True)
df_top["sensor_config"] = pd.Categorical(
    df_top["sensor_config"], categories=sense_order, ordered=True)

# Group and Summarize
# NOTE(review): grouping on categorical keys relies on the pandas-version
# default for `observed`; pin it once the intended behavior is confirmed.
summary = (
    df_top
    .groupby(FACTORS)["metric"]
    .agg(mean="mean", n="count", sd="std")   # "std" is the sample std (ddof=1)
    .reset_index())
summary["se"]   = summary["sd"] / np.sqrt(summary["n"].clip(lower=1))
# Normal-approximation 95% interval; undefined for groups with a single sample
summary["ci95"] = np.where(summary["n"] > 1, 1.96 * summary["se"], np.nan)

# Adjustments for graphics: human-readable labels
summary["sensor_config"] = (
    summary["sensor_config"].map(lambda t: t.replace("_"," ").title())
)
summary["eval_type"] = (
    summary["eval_type"].map(lambda t: t.replace("_"," ").title())
)
summary = summary.replace({"induced_hom": "Implicit Ind.", "default_het": "Heterogeneous"})
# (sensor_config, eval_type) combinations removed from the summary
invalid_pairs = [
    ["Complete", "Sensor Improvement"],
    ["Complete", "Improve Coverage"],
    ["Intersecting Span", "Improve Coverage"],
    ["Disjoint Span", "Improve Coverage"],
]
for sc,et in invalid_pairs:
    summary = summary[~((summary["eval_type"] == et) & (summary["sensor_config"] == sc))]
# summary





Unnamed: 0,eval_type,policy_type,sensor_config,mean,n,sd,se,ci95
0,Baseline,Heterogeneous,Complete,5.015,15,0.26433,0.06825,0.13377
1,Baseline,Heterogeneous,Intersecting Span,4.341667,15,0.324439,0.08377,0.164189
2,Baseline,Heterogeneous,Disjoint Span,3.8075,15,0.1052,0.027162,0.053238
3,Baseline,Heterogeneous,Incomplete,4.671667,15,0.434851,0.112278,0.220065
4,Baseline,Implicit Ind.,Complete,8.88,15,1.26999,0.32791,0.642704
5,Baseline,Implicit Ind.,Intersecting Span,6.32,15,1.186614,0.306383,0.60051
6,Baseline,Implicit Ind.,Disjoint Span,6.666667,15,2.216389,0.572269,1.121648
7,Baseline,Implicit Ind.,Incomplete,5.32,15,0.65166,0.168258,0.329785
8,Agent Loss,Heterogeneous,Complete,4.941667,15,0.216798,0.055977,0.109715
9,Agent Loss,Heterogeneous,Intersecting Span,4.2625,15,0.35308,0.091165,0.178683


In [213]:
## Bar Comparisons
# Grouped bars of the mean metric per sensor config, coloured by policy type,
# with one facet row per evaluation type and ci95 error bars.
bar_options = dict(
    x="sensor_config",
    y="mean",
    color="policy_type",
    barmode="group",
    facet_row="eval_type",
    error_y="ci95",
    hover_data=["n","se"],
    title="Evaluation of Policies under Different Training Conditions",
)
fig = px.bar(summary, **bar_options)


def _facet_value_only(annotation):
    """Keep only the text after the last '=' in a facet label and leave it unrotated."""
    return annotation.update(text=annotation.text.split("=")[-1], textangle=0)


fig.for_each_annotation(_facet_value_only)

# Legend anchored by its top-left corner, just outside the plotting area
legend_position = dict(x=1.02, y=1.12, xanchor="left", yanchor="top")
fig.update_layout(
    legend=legend_position,
    xaxis_title="Sensor Config",
    legend_title_text="Policies",
    width=800,
    height=800,
)

# Blank the per-facet y-axis titles
fig.for_each_yaxis(lambda axis: axis.update(title=""))

# Save a static copy, then render inline
fig.write_image("eval_scale.png")
fig.show()

In [214]:
# Heatmap
# Mean metric per (eval_type, sensor_config), one column per policy type.
piv = summary.pivot_table(
    index=["eval_type", "sensor_config"],
    columns="policy_type",
    values="mean",
)
# Implicit-indication mean minus heterogeneous mean, divided by 4
piv["difference"] = piv["Implicit Ind."].sub(piv["Heterogeneous"]).div(4)

# Diverging colour scale centred on zero; cell values printed on the map.
# (An alternative scale tried here was 'YlGn'.)
heatmap_options = dict(
    color_continuous_scale="RdBu",
    color_continuous_midpoint=0,
    labels=dict(x="Sensor Config", y="Evaluation Task"),
    text_auto=True,
)
# Rows are eval_type, columns are sensor_config after unstacking
fig = px.imshow(piv["difference"].unstack(), **heatmap_options)

# Hide the colour bar
fig.update_coloraxes(showscale=False)
fig.update_layout(template="plotly_white", width=500, height=600)

# Save a static copy, then render inline
fig.write_image("performance_delta.png")
fig.show()





In [191]:
# Display the evaluation summary table (mean, n, sd, se, ci95 per eval_type / policy_type / sensor_config).
summary

# Ad-hoc filter kept for reference: drops the "Sensor Improvement" x "Complete" rows.
# summary[~((summary["eval_type"] == "Sensor Improvement") & (summary["sensor_config"] == "Complete"))]

Unnamed: 0,eval_type,policy_type,sensor_config,mean,n,sd,se,ci95
0,Baseline,Heterogeneous,Complete,5.015000,15,0.264330,0.068250,0.133770
1,Baseline,Heterogeneous,Intersecting Span,4.341667,15,0.324439,0.083770,0.164189
2,Baseline,Heterogeneous,Disjoint Span,3.807500,15,0.105200,0.027162,0.053238
3,Baseline,Heterogeneous,Incomplete,4.671667,15,0.434851,0.112278,0.220065
4,Baseline,Implicit Ind.,Complete,8.880000,15,1.269990,0.327910,0.642704
...,...,...,...,...,...,...,...,...
59,Novel Span,Heterogeneous,Incomplete,4.487500,15,0.442068,0.114141,0.223717
60,Novel Span,Implicit Ind.,Complete,7.880000,15,1.212627,0.313099,0.613674
61,Novel Span,Implicit Ind.,Intersecting Span,5.890000,15,2.040654,0.526895,1.032713
62,Novel Span,Implicit Ind.,Disjoint Span,5.313333,15,0.879472,0.227079,0.445074
