# Results

In [1]:
import wandb
import pandas as pd
import numpy as np
from datetime import datetime
import yaml

In [65]:
def get_runs(entity="jccamml", project="L65-project", algorithm="find_maximum_subarray_kadane",
             min_steps=80000, eval_every=500, created_after="2025-03-22T01"):
    """Fetch W&B runs for one algorithm and index them by experimental condition.

    Args:
        entity: W&B entity (user or team) that owns the project.
        project: W&B project name.
        algorithm: A run is kept only if its ``algorithms`` config equals ``[algorithm]``.
        min_steps: Minimum value of the run summary's ``_step`` for a run to be kept.
        eval_every: Required value of the run's ``eval_every`` config.
        created_after: Timestamp string; only runs created after it are requested.

    Returns:
        dict mapping ``(processor_type, hint_teacher_forcing, seed)`` to a run.
        When several runs share a key, the first one returned by the API is kept.
    """
    api = wandb.Api(timeout=30)
    # Only the creation date is filtered server-side; everything else is checked below.
    wandb_filters = {
        "created_at": {"$gt": created_after},  # greater than
    }
    runs = api.runs(f"{entity}/{project}", filters=wandb_filters)
    print(f"Found {len(runs)} runs.")
    data = {}
    for run in runs:
        config = run.config
        # .get(): runs lacking these config entries are skipped instead of raising KeyError.
        algorithms = config.get('algorithms')
        steps = run.summary.get('_step', 0)
        if algorithms != [algorithm] or steps < min_steps or config.get('eval_every') != eval_every:
            print(f"Skipping run {run.id} with algorithm {algorithms} and steps {steps}")
            continue

        key = (config['processor_type'], config['hint_teacher_forcing'], config['seed'])
        # Keep the first run seen for each condition; later duplicates are ignored.
        data.setdefault(key, run)

    return data

In [35]:
# Query W&B with the get_runs defaults (Kadane runs); takes a few seconds.
data = get_runs()

Found 147 runs.
Skipping run mf2en0ra with algorithm ['bfs'] and steps 0
Skipping run f8aowemb with algorithm ['bfs'] and steps 1181
Skipping run wskeew1n with algorithm ['bfs'] and steps 0
Skipping run ah31c565 with algorithm ['bfs'] and steps 0
Skipping run nbxcff2v with algorithm ['bfs'] and steps 0
Skipping run wyl28e32 with algorithm ['bfs'] and steps 0
Skipping run c40nf94c with algorithm ['bfs'] and steps 0
Skipping run r0tdmrsk with algorithm ['bfs'] and steps 0
Skipping run l4hssvng with algorithm ['bfs'] and steps 2
Skipping run qv2dvuaj with algorithm ['bfs'] and steps 3
Skipping run 140lw5y2 with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run 6q5qu1tn with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run 7o8sf2j5 with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run chz6czcc with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run drsfg7y7 with algorithm ['find_maximum_subarray_kadane'] and steps 0
S

In [36]:
# List the (processor_type, hint_teacher_forcing, seed) keys that were collected.
for run_key in sorted(data):
    print(run_key)

('differential_mpnn_maxmax', 0, 0)
('differential_mpnn_maxmax', 0, 1)
('differential_mpnn_maxmax', 0, 2)
('differential_mpnn_maxmax-ablate_outer', 0, 0)
('differential_mpnn_maxmax-ablate_outer', 0, 1)
('differential_mpnn_maxmax-ablate_outer', 0, 2)
('differential_mpnn_maxmax-ablate_subtract', 0, 0)
('differential_mpnn_maxmax-ablate_subtract', 0, 1)
('differential_mpnn_maxmax-ablate_subtract', 0, 2)
('differential_mpnn_maxmax-sum', 0, 0)
('differential_mpnn_maxmax-sum', 0, 1)
('differential_mpnn_maxmax-sum', 0, 2)
('differential_mpnn_msgdiff', 0, 0)
('differential_mpnn_msgdiff', 0, 1)
('differential_mpnn_msgdiff', 0, 2)
('mpnn', 0, 0)
('mpnn', 0, 1)
('mpnn', 0, 2)
('mpnn', 0.5, 0)
('mpnn', 0.5, 1)
('mpnn', 0.5, 2)


# Analysis - Kadane

In [46]:
# Filtered history frames keyed like `data`; filled by the Kadane analysis cell
# for runs with a complete history. Re-running this cell empties it.
cache = {}



In [53]:
algorithm = "find_maximum_subarray_kadane"
metric = f"test_score_current_model_{algorithm}"
# Each reported score averages the 20 evaluations (one every 500 steps) ending at the checkpoint.
window_steps = 20 * 500
results = list()
for (name, run) in data.items():
    # NOTE(review): run.history() returns a sampled history (W&B default: 500 rows), which is
    # enough for 161 evaluation points; switch to run.scan_history() if more rows are ever logged.
    history_data = run.history(keys=[metric])
    history_data = history_data[(history_data["_step"] <= 80000) & (history_data["_step"] % 500 == 0)]  # Keep only <= 80k steps
    if len(history_data) == 161:
        cache[name] = history_data
    else:
        print(f"Warning: run {run.id} has {len(history_data)} evaluation rows, expected 161")
    # name is (processor_type, hint_teacher_forcing, seed).
    method = name[0] if name[1] == 0 else f"{name[0]}-TeacherForced"
    steps = history_data["_step"]
    for train_steps in (40000, 80000):
        # Select by step value rather than row position, so a history with missing rows
        # cannot shift the window. For a complete history this matches iloc[61:81] / iloc[-20:].
        in_window = (steps > train_steps - window_steps) & (steps <= train_steps)
        results.append(
            {
                "method": method,
                "Train Steps": train_steps,
                "Seed": name[2],
                "OOD Accuracy": history_data.loc[in_window, metric].mean(),
            }
        )

In [54]:
# Mean OOD accuracy over seeds: one row per method, one column per training budget.
(
    pd.DataFrame(results)
    .groupby(["method", "Train Steps"])["OOD Accuracy"]
    .mean()
    .reset_index()
    .pivot_table(index="method", columns="Train Steps", values="OOD Accuracy")
)

Train Steps,40000,80000
method,Unnamed: 1_level_1,Unnamed: 2_level_1
differential_mpnn_maxmax,0.694255,0.725342
differential_mpnn_maxmax-ablate_outer,0.689559,0.709937
differential_mpnn_maxmax-ablate_subtract,0.64467,0.68715
differential_mpnn_maxmax-sum,0.704517,0.702173
differential_mpnn_msgdiff,0.669808,0.699902
mpnn,0.669784,0.711483
mpnn-TeacherForced,0.729915,0.749862


# Analysis - Insertion Sort

In [66]:
# Insertion-sort runs live under a different W&B entity than the get_runs default.
data_carla = get_runs(
    entity="biermann-carla-university-of-cambridge",
    project="L65-project",
    algorithm="insertion_sort",
)

Found 106 runs.
Skipping run eaegztbg with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run h051r8qz with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run o05bop9e with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run x523mjg6 with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run 36gc4ukd with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run fjubtgaj with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run otit1iji with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run q2reu2oz with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run 0r51j0ll with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run 2kvcesxt with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run 7gdz2f4d with algorithm ['find_maximum_subarray_kadane'] and steps 0
Skipping run rct3f7a8 with algorithm ['find_maximum_subarray_kadane'] and steps 49

In [67]:
# Number of distinct (processor_type, hint_teacher_forcing, seed) runs kept by get_runs.
len(data_carla)

18

In [68]:
algorithm2 = "insertion_sort"
metric2 = f"test_score_current_model_{algorithm2}"
# Each reported score averages the 20 evaluations (one every 500 steps) ending at the checkpoint.
window_steps = 20 * 500
results2 = list()
cache2 = {}

for (name, run) in data_carla.items():
    # NOTE(review): run.history() returns a sampled history (W&B default: 500 rows), which is
    # enough for 161 evaluation points; switch to run.scan_history() if more rows are ever logged.
    history_data = run.history(keys=[metric2])
    history_data = history_data[(history_data["_step"] <= 80000) & (history_data["_step"] % 500 == 0)]  # Keep only <= 80k steps
    if len(history_data) == 161:
        cache2[name] = history_data
    else:
        print(f"Warning: run {run.id} has {len(history_data)} evaluation rows, expected 161")
    # name is (processor_type, hint_teacher_forcing, seed).
    method = name[0] if name[1] == 0 else f"{name[0]}-TeacherForced"
    steps = history_data["_step"]
    for train_steps in (40000, 80000):
        # Select by step value rather than row position, so a history with missing rows
        # cannot shift the window. For a complete history this matches iloc[61:81] / iloc[-20:].
        in_window = (steps > train_steps - window_steps) & (steps <= train_steps)
        results2.append(
            {
                "method": method,
                "Train Steps": train_steps,
                "Seed": name[2],
                "OOD Accuracy": history_data.loc[in_window, metric2].mean(),
            }
        )

In [69]:
# Per-run rows (method, Train Steps, Seed, OOD Accuracy) before averaging over seeds.
pd.DataFrame(results2)

Unnamed: 0,method,Train Steps,Seed,OOD Accuracy
0,differential_mpnn_maxmax-ablate_subtract,40000,42,0.75332
1,differential_mpnn_maxmax-ablate_subtract,80000,42,0.743335
2,differential_mpnn_msgdiff,40000,42,0.757886
3,differential_mpnn_msgdiff,80000,42,0.787012
4,differential_mpnn_maxmax-ablate_outer,40000,42,0.641504
5,differential_mpnn_maxmax-ablate_outer,80000,42,0.680225
6,mpnn,40000,42,0.585938
7,mpnn,80000,42,0.7573
8,differential_mpnn_maxmax,40000,42,0.669727
9,differential_mpnn_maxmax,80000,42,0.711206


In [70]:
# Mean OOD accuracy over seeds: one row per method, one column per training budget.
(
    pd.DataFrame(results2)
    .groupby(["method", "Train Steps"])["OOD Accuracy"]
    .mean()
    .reset_index()
    .pivot_table(index="method", columns="Train Steps", values="OOD Accuracy")
)

Train Steps,40000,80000
method,Unnamed: 1_level_1,Unnamed: 2_level_1
differential_mpnn_maxmax,0.673828,0.758268
differential_mpnn_maxmax-ablate_outer,0.639819,0.662858
differential_mpnn_maxmax-ablate_subtract,0.710856,0.703556
differential_mpnn_msgdiff,0.638892,0.740649
mpnn,0.635221,0.730534
mpnn-TeacherForced,0.530119,0.556087


# Line Plots

In [71]:
from scipy.stats import sem
def lineplot(ax, histories, timesteps, label, color, linestyle="--", xaxis="#Iterations",
             n_eval_seeds=5, z_score=1.96):
    """Plot the mean of several score histories with a shaded confidence band.

    Args:
        ax: Axes-like object providing ``plot``, ``fill_between``, ``set_xlabel``
            and ``set_ylabel``.
        histories: Sequence of score sequences, one per run.
        timesteps: x values, one per evaluation point.
        label: Legend label for the mean curve.
        color: Colour used for both the curve and the band.
        linestyle: Line style of the mean curve.
        xaxis: Label of the x axis.
        n_eval_seeds: The across-run standard error is divided by
            ``sqrt(n_eval_seeds)`` to take the evaluation seeds into account.
        z_score: Half-width of the band in standard errors (1.96 for a 95% interval).

    Returns:
        None; the curve and band are drawn on ``ax``.
    """
    # Convert list of arrays to DataFrame (aligns different lengths with NaN padding)
    score_df = pd.DataFrame(histories).T  # Transpose so rows = timesteps
    mean_scores = score_df.mean(axis=1)
    # Standard error across runs (NaNs ignored), scaled down to take the eval seeds into account.
    stderr = sem(score_df, axis=1, nan_policy="omit") / (n_eval_seeds ** 0.5)
    ci_upper = mean_scores + z_score * stderr  # confidence upper bound
    ci_lower = mean_scores - z_score * stderr  # confidence lower bound
    # Plot the mean score with confidence intervals
    ax.plot(timesteps, mean_scores, color=color, label=label, linestyle=linestyle)
    ax.fill_between(timesteps, ci_lower, ci_upper, color=color, alpha=0.2)
    ax.set_xlabel(xaxis)
    ax.set_ylabel("OOD Accuracy")

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt 
# TEMPLATE LINE PLOTTING CODE: one iteration per curve to draw.
for series_idx in range(10):
    # Colour for this curve: insert a number in 0 - 1 to pick from the "gnuplot2" palette.
    # Kept in its own variable so the loop index is no longer overwritten.
    color = mpl.colormaps["gnuplot2"](0.5)
    # MAKE YOUR LINEPLOT HERE

plt.legend(loc="best")