In [2]:
import json

# Read the baseline sampler results from disk and parse them into `data`.
with open('/Users/ianm/projects/olmo-cookbook/baseline_sampler_data.json', mode='r') as source_file:
    data = json.loads(source_file.read())


In [7]:
# Collect every key that starts with "mt_mbpp" except the two named in the
# tuple below, then drop them from `data`.
keys_to_remove = [
    name
    for name in data
    if name.startswith("mt_mbpp") and name not in ("mt_mbpp", "mt_mbpp:bpb")
]
# Deletion runs as a second pass so the dict is never resized while it is
# being iterated.
for stale_key in keys_to_remove:
    data.pop(stale_key)

In [None]:
# Write `data` back to the JSON file it was loaded from.
# Serialise *before* opening the file: mode 'w' truncates it immediately, so
# an exception raised while encoding would otherwise leave the original file
# empty or half-written.
serialized = json.dumps(data, indent=2)
with open('/Users/ianm/projects/olmo-cookbook/baseline_sampler_data.json','w') as fout:
    fout.write(serialized)

In [8]:
# List every top-level key currently in `data`, one per line.
for task_key in data:
    print(task_key)

mt_mbpp
mt_mbpp:bpb
codex_humaneval:3shot:bpb::none
lab_bench_dbqa
lab_bench_dbqa:bpb
lab_bench_protocolqa
lab_bench_protocolqa:bpb
lambada
lambada:bpb
mbpp:3shot:bpb::none
minerva_math_algebra::olmes
minerva_math_algebra:bpb::olmes
minerva_math_counting_and_probability::olmes
minerva_math_counting_and_probability:bpb::olmes
minerva_math_geometry::olmes
minerva_math_geometry:bpb::olmes
minerva_math_intermediate_algebra::olmes
minerva_math_intermediate_algebra:bpb::olmes
minerva_math_number_theory::olmes
minerva_math_number_theory:bpb::olmes
minerva_math_prealgebra::olmes
minerva_math_prealgebra:bpb::olmes
minerva_math_precalculus::olmes
minerva_math_precalculus:bpb::olmes
arc_challenge:rc::olmes:full
arc_challenge:bpb::olmes:full
arc_easy:rc::olmes:full
arc_easy:bpb::olmes:full
basic_skills_arithmetic:rc::olmes
basic_skills_arithmetic:bpb::olmes
basic_skills_coding:rc::olmes
basic_skills_coding:bpb::olmes
basic_skills_common_knowledge:rc::olmes
basic_skills_common_knowledge:bpb::olmes


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from pathlib import Path
import numpy as np

# Construct data set fresh
# Each row is (model id, accuracy, parameter count, training tokens); the rows
# are expanded into dicts so `models_data` keeps its list-of-records shape.
_MODEL_FIELDS = ("model", "accuracy", "params", "tokens")
_MODEL_ROWS = [
    ("olmo2-1b", 0.584, 1e9, 4e12),
    ("olmo2-7b", 0.687, 7e9, 4e12),
    ("olmo2-13b", 0.73, 13e9, 5e12),
    ("meta-llama/Llama-2-13b-hf", 0.749, 13e9, 2e12),
    ("deepseek-ai/deepseek-llm-7b-base", 0.723, 7e9, 2e12),
    ("Qwen/Qwen2.5-7B", 0.679, 7e9, 18e12),
    ("allenai/DataDecide-dclm-baseline-1B", 0.614, 1e9, 1e11),
    ("allenai/DataDecide-dclm-baseline-50p-dolma1.7-50p-1B", 0.604, 1e9, 1e11),
    ("allenai/DataDecide-dolma1_7-1B", 0.561, 1e9, 1e11),
    ("olmo2-1b-step20000", 0.58, 1e9, 42e9),
    ("olmo2-1b-step21000", 0.596, 1e9, 45e9),
    ("olmo2-1b-step22000", 0.599, 1e9, 47e9),
    ("olmo2-1b-step23000", 0.598, 1e9, 49e9),
    ("meta-llama/Meta-Llama-3-8B", 0.751, 8e9, 15e12),
    ("meta-llama/Meta-Llama-3.1-8B", 0.749, 8e9, 15e12),
]
models_data = [dict(zip(_MODEL_FIELDS, row)) for row in _MODEL_ROWS]

# Metrics
# compute = 6 * params * tokens; t2p_ratio = tokens per parameter.
df = pd.DataFrame(models_data).assign(
    compute=lambda frame: 6 * frame["params"] * frame["tokens"],
    t2p_ratio=lambda frame: frame["tokens"] / frame["params"],
)

# Label renaming
def rename(model):
    """Return the short plot label for a model identifier.

    Identifiers listed in the override table get a hand-picked label. Any
    other identifier is reduced to the text after its last "/" (the whole
    string when it contains no "/").
    """
    overrides = {
        "allenai/DataDecide-dclm-baseline-1B": "DataDecide-DCLM",
        "allenai/DataDecide-dclm-baseline-50p-dolma1.7-50p-1B": "DataDecide-DCLM-Dolma-even-mix",
        "allenai/DataDecide-dolma1_7-1B": "DataDecide-Dolma",
        "meta-llama/Meta-Llama-3-8B": "Llama-3-8B",
        "meta-llama/Meta-Llama-3.1-8B": "Llama-3.1-8B",
    }
    try:
        return overrides[model]
    except KeyError:
        # No override: strip the organisation prefix, if any.
        return model.rsplit("/", 1)[-1]

# Derive the display label for every row from its model identifier.
df["label"] = df["model"].map(rename)

# Marker mapping
# Labels in the same family share a marker shape, so the families are built
# with dict.fromkeys; the remaining models each get their own shape.
marker_map = {
    **dict.fromkeys(
        ("DataDecide-DCLM", "DataDecide-DCLM-Dolma-even-mix", "DataDecide-Dolma"),
        '^',
    ),
    **dict.fromkeys(("olmo2-1b", "olmo2-7b", "olmo2-13b"), 's'),
    "Llama-2-13b-hf": 'o',
    "deepseek-llm-7b-base": 'X',
    "Qwen2.5-7B": 'D',
    "Llama-3-8B": 'v',
    "Llama-3.1-8B": 'P',
}

# Filter checkpoints
# Intermediate "olmo2-1b-step*" checkpoints are not plotted as points; they
# only feed the error bar drawn on the final olmo2-1b point.
ckpt_mask = df["model"].str.contains("olmo2-1b-step")
plot_df = df[~ckpt_mask].copy()

# Compute accuracy std for olmo2-1b
# Population std (ddof=0) over the checkpoint accuracies plus the final
# model's accuracy. Series.append was removed in pandas 2.0, so the values
# are combined with pd.concat instead.
olmo1b_final_acc = plot_df.loc[plot_df["model"] == "olmo2-1b", "accuracy"].iloc[0]
acc_std = pd.concat(
    [df.loc[ckpt_mask, "accuracy"], pd.Series([olmo1b_final_acc])]
).std(ddof=0)

# Color normalization
norm = Normalize(vmin=plot_df["t2p_ratio"].min(), vmax=plot_df["t2p_ratio"].max())
# matplotlib.cm.get_cmap was removed in matplotlib 3.9; pyplot.get_cmap is
# the lookup that works across versions.
cmap = plt.get_cmap("viridis")

# Explicit figure/axes handles so the colorbar can be attached unambiguously.
fig, ax = plt.subplots(figsize=(8, 6))

# Scatter each point
for _, row in plot_df.iterrows():
    label = row["label"]
    marker = marker_map.get(label, '*')  # '*' for labels with no assigned shape
    color = cmap(norm(row["t2p_ratio"]))
    ax.scatter(row["compute"], row["accuracy"], marker=marker, color=color, s=70, edgecolor='k')
    ax.annotate(label, (row["compute"], row["accuracy"]), fontsize=8, xytext=(4, 2),
                textcoords='offset points')

# Error bar
olmo1b = plot_df[plot_df["model"] == "olmo2-1b"].iloc[0]
ax.errorbar(olmo1b["compute"], olmo1b["accuracy"], yerr=acc_std, fmt=marker_map["olmo2-1b"],
            color=cmap(norm(olmo1b["t2p_ratio"])), markersize=7, capsize=5, markeredgecolor='k')

# Connect olmo2 line
olmo_series = plot_df[plot_df["model"].isin(["olmo2-1b", "olmo2-7b", "olmo2-13b"])].sort_values("compute")
ax.plot(olmo_series["compute"], olmo_series["accuracy"], linestyle='--', color='black', alpha=0.6)

# Colorbar
# The ScalarMappable is not attached to any Axes, so `ax=` must be given;
# current matplotlib raises if it has to guess where to put the colorbar.
cbar = fig.colorbar(cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax)
cbar.set_label("Token / Parameter Ratio")

ax.set_xscale("log")
ax.set_xlabel("Compute (6ND, FLOPs, log scale)")
ax.set_ylabel("LAMBADA Accuracy")
fig.tight_layout()

# NOTE(review): absolute output path; the directory must already exist.
plot_path = Path("/mnt/data/compute_vs_lambada_final.png")
fig.savefig(plot_path, dpi=300)
plt.close(fig)

# This cell never imported display_dataframe_to_user (only a later cell does),
# so on a fresh kernel the call below raised NameError. Import it here and
# fall back to a plain-text dump when ace_tools is not installed.
try:
    from ace_tools import display_dataframe_to_user
except ImportError:
    def display_dataframe_to_user(name, dataframe):
        """Plain-text stand-in used when ace_tools is unavailable."""
        print(name)
        print(dataframe.to_string(index=False))

display_dataframe_to_user("Final plot data", plot_df[["label", "accuracy", "compute", "t2p_ratio"]])

plot_path


In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from pathlib import Path
from ace_tools import display_dataframe_to_user

# Try to import adjustText for label adjustment
# adjustText is optional: record whether it could be imported so the plotting
# code can pick between adjust_text and its manual fallback.
try:
    from adjustText import adjust_text
except ImportError:
    ADJUST_AVAILABLE = False
else:
    ADJUST_AVAILABLE = True

# Full JSON data from user (pasted entirely)
json_str = """
{"lambada": {"Meta-Llama-3-8B": 0.7634387735299826, "Meta-Llama-3.1-8B": 0.7545119347952649, "allenai--OLMo-2-1124-13B--stage2-ingredient4-step35773-tokens300B": 0.727149233456239, "allenai--OLMo-2-1124-13B--stage2-ingredient4-step34000-tokens286B": 0.725790801474869, "deepseek-llm-7b-base": 0.725790801474869, "allenai--OLMo-2-1124-13B--stage2-ingredient4-step33000-tokens277B": 0.7254026780516204, "allenai--OLMo-2-1124-13B--stage2-ingredient4-step35000-tokens294B": 0.7248204929167475, "allenai--OLMo-2-1124-13B--stage2-ingredient4-step32000-tokens269B": 0.724432369493499, "Qwen2.5-7B": 0.7026974577915778, "allenai--OLMo-2-1124-7B--stage2-ingredient3-step9000-tokens38B": 0.7021152726567048, "allenai--OLMo-2-1124-7B--stage2-ingredient3-step11931-tokens50B": 0.701533087521832, "allenai--OLMo-2-1124-7B--stage2-ingredient3-step10000-tokens42B": 0.6988162235590918, "allenai--OLMo-2-1124-7B--stage2-ingredient3-step11000-tokens47B": 0.6964874830196003, "allenai--OLMo-2-1124-7B--stage2-ingredient3-step8000-tokens34B": 0.6951290510382302, "DataDecide-dclm-baseline-1B": 0.6155637492722685, "DataDecide-dclm-baseline-50p-dolma1.7-50p-1B": 0.6134290704444013, "allenai--OLMo-2-0425-1B--stage2-ingredient3-step21000-tokens45B": 0.6015913060353192, "allenai--OLMo-2-0425-1B--stage2-ingredient3-step22000-tokens47B": 0.5973219483795847, "allenai--OLMo-2-0425-1B--stage2-ingredient3-step23000-tokens49B": 0.5971278866679605, "allenai--OLMo-2-0425-1B--stage2-ingredient3-step23852-tokens51B": 0.5969338249563361, "allenai--OLMo-2-0425-1B--stage2-ingredient3-step20000-tokens42B": 0.5835435668542597, "DataDecide-dolma1_7-1B": 0.562196778575587}}
"""
# Parse into a dedicated name instead of rebinding `data`. `data` is the name
# the first cell uses for the contents of baseline_sampler_data.json, and the
# save cell writes `data` back to that file; rebinding it here would make a
# later run of the save cell overwrite the file with this pasted snippet.
lambada_payload = json.loads(json_str)
# Mapping of model / checkpoint name -> accuracy under the "lambada" key.
acc_dict = lambada_payload["lambada"]

# Separate OLMo checkpoint groups
# (name prefix, size bucket) pairs, tested in order; a name matching none of
# the prefixes is kept as a standalone model in `other`.
_CKPT_PREFIXES = (
    ("allenai--OLMo-2-0425-1B", "olmo2-1b"),
    ("allenai--OLMo-2-1124-7B", "olmo2-7b"),
    ("allenai--OLMo-2-1124-13B", "olmo2-13b"),
)
olmo_ckpts = {bucket: [] for _, bucket in _CKPT_PREFIXES}
other = {}

for name, acc in acc_dict.items():
    bucket = next(
        (size for prefix, size in _CKPT_PREFIXES if name.startswith(prefix)),
        None,
    )
    if bucket is None:
        other[name] = acc
    else:
        olmo_ckpts[bucket].append(acc)

# Base parameter/token values (original, fixed)
# Each value is (parameter count, training tokens). The Llama-2 entry carries
# a third element that is later read as its accuracy.
BILLION = 1e9
TRILLION = 1e12

base_info = {
    "olmo2-1b": (1 * BILLION, 4 * TRILLION),
    "olmo2-7b": (7 * BILLION, 4 * TRILLION),
    "olmo2-13b": (13 * BILLION, 5 * TRILLION),
    "Meta-Llama-3-8B": (8 * BILLION, 15 * TRILLION),
    "Meta-Llama-3.1-8B": (8 * BILLION, 15 * TRILLION),
    "deepseek-llm-7b-base": (7 * BILLION, 2 * TRILLION),
    "Qwen2.5-7B": (7 * BILLION, 18 * TRILLION),
    "DataDecide-dclm-baseline-1B": (1 * BILLION, 100 * BILLION),
    "DataDecide-dclm-baseline-50p-dolma1.7-50p-1B": (1 * BILLION, 100 * BILLION),
    "DataDecide-dolma1_7-1B": (1 * BILLION, 100 * BILLION),
    "Llama-2-13b-hf": (13 * BILLION, 2 * TRILLION, 0.749),
}

# Build dataframe
def _plot_record(label, accuracy, std, params, tokens):
    """Bundle one plotted point into a row dict; key order fixes column order."""
    return {
        "label": label,
        "accuracy": accuracy,
        "std": std,
        "params": params,
        "tokens": tokens,
    }


records = []

# One aggregated row per OLMo size that has at least one checkpoint: mean
# accuracy and population std (ddof=0) across that size's checkpoints.
for size, accs in olmo_ckpts.items():
    if not accs:
        continue
    n_params, n_tokens = base_info[size]
    records.append(_plot_record(size, np.mean(accs), np.std(accs, ddof=0), n_params, n_tokens))

label_map = {
    "Meta-Llama-3-8B": "Llama-3-8B",
    "Meta-Llama-3.1-8B": "Llama-3.1-8B",
    "deepseek-llm-7b-base": "deepseek-llm-7b-base",
    "Qwen2.5-7B": "Qwen2.5-7B",
    "DataDecide-dclm-baseline-1B": "DataDecide-DCLM",
    "DataDecide-dclm-baseline-50p-dolma1.7-50p-1B": "DataDecide-DCLM-Dolma-even-mix",
    "DataDecide-dolma1_7-1B": "DataDecide-Dolma"
}

# Standalone models: a single accuracy value each, so std is recorded as 0.0.
for model, acc in other.items():
    # Only the first two elements are used; some base_info tuples are longer.
    n_params, n_tokens = base_info[model][:2]
    short_label = label_map.get(model, model.split("/")[-1])
    records.append(_plot_record(short_label, acc, 0.0, n_params, n_tokens))

# Llama-2 is added from base_info directly, including its accuracy.
llama2_params, llama2_tokens, llama2_acc = (
    base_info["Llama-2-13b-hf"][0],
    base_info["Llama-2-13b-hf"][1],
    base_info["Llama-2-13b-hf"][2],
)
records.append(_plot_record("Llama-2-13b-hf", llama2_acc, 0.0, llama2_params, llama2_tokens))

# compute = 6 * params * tokens; t2p_ratio = tokens per parameter.
df = pd.DataFrame(records).assign(
    compute=lambda frame: 6 * frame["params"] * frame["tokens"],
    t2p_ratio=lambda frame: frame["tokens"] / frame["params"],
)

# Marker mapping
# Labels in the same family share a marker shape, so the families are built
# with dict.fromkeys; the remaining models each get their own shape.
marker_map = {
    **dict.fromkeys(("olmo2-1b", "olmo2-7b", "olmo2-13b"), 's'),
    **dict.fromkeys(
        ("DataDecide-DCLM", "DataDecide-DCLM-Dolma-even-mix", "DataDecide-Dolma"),
        '^',
    ),
    "Llama-2-13b-hf": 'o',
    "deepseek-llm-7b-base": 'X',
    "Qwen2.5-7B": 'D',
    "Llama-3-8B": 'v',
    "Llama-3.1-8B": 'P',
}

# Plot
norm = Normalize(vmin=df["t2p_ratio"].min(), vmax=df["t2p_ratio"].max())
# matplotlib.cm.get_cmap was removed in matplotlib 3.9; pyplot.get_cmap is
# the lookup that works across versions.
cmap = plt.get_cmap("viridis")

# Explicit figure/axes handles so the colorbar can be attached unambiguously.
fig, ax = plt.subplots(figsize=(8, 6))
# Switch to the log x-axis before anything is drawn: label adjustment works
# from on-screen positions, so doing it on a linear axis and changing the
# scale afterwards leaves labels and leader lines misplaced.
ax.set_xscale("log")

texts = []
points = []

# Separate olmo df
olmo_df = df[df["label"].isin(["olmo2-1b", "olmo2-7b", "olmo2-13b"])].sort_values("compute")
x_olmo = olmo_df["compute"].values
y_olmo = olmo_df["accuracy"].values
y_std = olmo_df["std"].values

# Plot OLMo shaded band (mean accuracy +/- one std across checkpoints)
ax.fill_between(x_olmo, y_olmo - y_std, y_olmo + y_std, color='gray', alpha=0.25)
ax.plot(x_olmo, y_olmo, linestyle='--', color='black', alpha=0.8)

# Scatter all points
for _, row in df.iterrows():
    color = cmap(norm(row["t2p_ratio"]))
    marker = marker_map.get(row["label"], '*')  # '*' for labels with no assigned shape
    sct = ax.scatter(row["compute"], row["accuracy"], marker=marker, color=color, s=70, edgecolor='k')
    points.append(sct)
    txt = ax.text(row["compute"], row["accuracy"], row["label"], fontsize=8)
    texts.append(txt)

# Colorbar
# The ScalarMappable is not attached to any Axes, so `ax=` must be given;
# current matplotlib raises if it has to guess where to put the colorbar.
cbar = fig.colorbar(cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax)
cbar.set_label("Token / Parameter Ratio")
ax.set_xlabel("Compute (6ND, FLOPs, log scale)")
ax.set_ylabel("LAMBADA Accuracy")
fig.tight_layout()

# Adjust labels last, once the scale, colorbar and layout are final, so the
# label positions are computed against the geometry that actually gets saved.
if ADJUST_AVAILABLE:
    adjust_text(texts, x=df["compute"].values, y=df["accuracy"].values, ax=ax,
                arrowprops=dict(arrowstyle="-", color='gray', lw=0.5))
else:
    # fallback: offset text slightly and draw simple line
    for txt in texts:
        x, y = txt.get_position()
        txt.set_position((x*1.05, y*1.02))
        ax.annotate("", xy=(x, y), xytext=(x*1.05, y*1.02),
                    arrowprops=dict(arrowstyle="-", color='gray', lw=0.5))

# NOTE(review): absolute output path; the directory must already exist.
plot_path = Path("/mnt/data/compute_vs_lambada_leaderlines.png")
fig.savefig(plot_path, dpi=300)
plt.close(fig)

display_dataframe_to_user("Dataset for leader line plot", df[["label", "accuracy", "std", "compute"]])

plot_path
