In [None]:
import glob
import os
import pandas as pd
import wandb

In [None]:
# Maps a sequence-layer name (as logged under model.layer._name_) to the
# experiment config passed to eval.py as `experiment=<value>`.
# Note that "retention" is the only entry whose config name differs ("retnet").
layer_name_to_exp = dict(
    h3="dfa/h3",
    linear_transformer="dfa/linear_transformer",
    hyena="dfa/hyena",
    transformer="dfa/transformer",
    rwkv="dfa/rwkv",
    s4d="dfa/s4d",
    lstm="dfa/lstm",
    retention="dfa/retnet",
)

In [None]:
# Find every local wandb run directory below experiments/ (at any depth).
run_folders = glob.glob("experiments/**/wandb/run-*/", recursive=True)


def _run_key_and_ckpt(run_dir):
    """Return (key, checkpoint path) for one wandb run directory.

    The key is the last "-"-separated token of the run directory name. The
    checkpoint path is <dir two levels above the run dir>/checkpoints/val/loss.ckpt.
    """
    parts = run_dir.replace("//", "/").strip("/").split("/")
    run_key = parts[-1].split("-")[-1]
    experiment_root = "/".join(parts[:-2])
    return run_key, os.path.join(experiment_root, "checkpoints", "val", "loss.ckpt")


# key -> checkpoint path; if a key occurs more than once the last match wins.
name_to_folder = dict(_run_key_and_ckpt(run_dir) for run_dir in run_folders)

In [None]:
# Inspect the mapping from run-folder key to checkpoint path built from the local wandb folders.
name_to_folder

In [None]:
# Fetch every run of the evaluation project through the W&B public API and
# collect, per run, its summary metrics, config, display name and raw attributes.
api = wandb.Api()
entity, project = "akyurek", "associative_recall_learning_curves_eval"
runs = api.runs(entity + "/" + project)

summary_list, config_list, name_list, attr_list = [], [], [], []
for run in runs:
    # .summary contains output keys/values for metrics such as accuracy.
    # We call ._json_dict to omit large files.
    summary_list.append(run.summary._json_dict)

    # .config contains the hyperparameters.
    # Special top-level entries whose key starts with "_" are removed, as the
    # comment in the original cell intended (the filter itself was missing).
    config_list.append(
        {k: v for k, v in run.config.items() if not k.startswith("_")}
    )

    # .name is the human-readable name of the run.
    name_list.append(run.name)

    # Raw attribute dict of the run; get_ckpt reads attr["name"] from it to
    # look the run up in name_to_folder.
    attr_list.append(run._attrs)


# One row per run; the dict-valued columns are flattened in a later cell.
runs_df = pd.DataFrame(
    {
        "summary": summary_list,
        "config": config_list,
        "name": name_list,
        "attr": attr_list,
    }
)

In [None]:
def get_ckpt(x, root="/raid/lingo/akyurek/git/iclmodels/"):
    """Return the checkpoint path for one row of ``runs_df``.

    Args:
        x: A row exposing ``config`` (the run's config dict) and ``attr``
            (the run's raw attribute dict).
        root: Directory that the relative paths stored in ``name_to_folder``
            are joined onto. Defaults to the location used so far.

    Returns:
        ``config["train"]["ckpt"]`` when it is set. Otherwise the checkpoint
        found on disk for this run (``name_to_folder[attr["name"]]`` joined
        onto ``root``), or ``None`` when neither is available.
    """
    # Runs with an incomplete config (no "train" section or no "ckpt" entry)
    # are treated like runs whose ckpt is None instead of raising KeyError.
    train_cfg = x.config.get("train")
    ckpt = train_cfg.get("ckpt") if isinstance(train_cfg, dict) else None
    if ckpt is None:
        # name_to_folder values already end in checkpoints/val/loss.ckpt.
        folder = name_to_folder.get(x.attr.get("name"), None)
        if folder:
            ckpt = os.path.join(root, folder)

    return ckpt

In [None]:
def get_nested_arg(x, args):
    """Follow a sequence of keys/indices into a nested container.

    Args:
        x: The outermost container (e.g. a nested config dict).
        args: Keys or indices applied one after another (``x[a0][a1]...``).

    Returns:
        The value reached after applying every key, or ``None`` as soon as one
        step cannot be resolved (missing key, index out of range, or an
        intermediate value that is not subscriptable). With an empty ``args``
        the input itself is returned.
    """
    for arg in args:
        try:
            x = x[arg]
        except (KeyError, IndexError, TypeError):
            # Only lookup failures mean "not present"; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return None
    return x

In [None]:
# Inspect the raw runs table: one row per W&B run with summary, config, name and attr columns.
runs_df

In [None]:
# Flatten the dict-valued summary/config columns of runs_df into plain columns.
summary_metrics = ("dfa_accuracy", "model_dfa_diff", "loss", "accuracy_ignore_index")
config_paths = (
    "model.layer._name_",
    "dataset.num_examples",
    "model.n_layer",
    "model.d_model",
    "optimizer.lr",
    "optimizer.weight_decay",
    "experiment",
    "hydra.run.dir",
    "model.attn_cfg.n_heads",
    "model.attn_cfg.num_heads",
    "dataset.vocab_size",
    "dataset.input_seq_len",
    "dataset.batch_size",
    "model.attn_layer_idx",
)

# One "final_test/<metric>" column per metric; runs that never logged it get None.
for metric_name in summary_metrics:
    metric_col = f"final_test/{metric_name}"
    runs_df[metric_col] = runs_df["summary"].map(
        lambda summary, key=metric_col: summary.get(key, None)
    )

# One column per dotted config path, resolved key by key (None when absent).
for dotted_path in config_paths:
    path_keys = dotted_path.split(".")
    runs_df[dotted_path] = runs_df["config"].map(
        lambda cfg, keys=path_keys: get_nested_arg(cfg, keys)
    )

# Runs without a model.layer._name_ entry are labelled as linear attention.
layer_col = "model.layer._name_"
runs_df[layer_col] = runs_df[layer_col].fillna("linear_transformer")

# Resolve the checkpoint path of every run (row-wise).
runs_df["ckpt"] = runs_df.apply(get_ckpt, axis=1)

# Lowest test loss first; rows with a missing loss are kept, not dropped.
runs_df.sort_values(by="final_test/loss", ascending=True, inplace=True)

# The raw dict columns are no longer needed once flattened.
runs_df.drop(columns=["summary", "config", "attr"], inplace=True)

In [None]:
# Inspect the flattened table: metric, config and ckpt columns, sorted by final_test/loss.
runs_df

In [None]:
# make neurips conference quality plots
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker  # used explicitly below as matplotlib.ticker.ScalarFormatter

# The custom style sheet lives at a machine-specific absolute path. Allow it to
# be overridden through the MPLSTYLE_PATH environment variable, and fall back
# to matplotlib's default style instead of failing when the file is absent.
MPLSTYLE_PATH = os.environ.get("MPLSTYLE_PATH", "/raid/lingo/akyurek/mplstyle")
if os.path.exists(MPLSTYLE_PATH):
    plt.style.use(MPLSTYLE_PATH)
else:
    print(f"matplotlib style file not found at {MPLSTYLE_PATH!r}; using the default style")

plt.rc('font', serif='Times')
plt.rc('text', usetex=False)
plt.rcParams['figure.dpi'] = 250
plt.rcParams['figure.facecolor'] = 'white'

In [None]:
# Display names used in figure legends, keyed by the layer name stored in the
# "model.layer._name_" column.
model_names = dict(
    transformer="Transformer",
    transformer_2="Transformer (2 layers)",
    transformer_1="Transformer (1 layers)",
    lstm="LSTM",
    hyena="Hyena",
    h3="H3",
    s4d="S4D",
    linear_transformer="Linear Transformer",
    rwkv="RWKV",
    retention="RetNet",
)

In [None]:
# fig size
plt.rcParams.update({"figure.figsize": (6, 4)})

# Pick, for every (layer type, training-set size) pair, the run with the
# highest test accuracy.
# Runs that never logged the metric are excluded before grouping: a group whose
# metric is entirely NaN has no valid idxmax label, and the .loc lookup below
# would fail on it.
accuracy_col = "final_test/accuracy_ignore_index"
scored_runs = runs_df.dropna(subset=[accuracy_col])
best_run_idx = scored_runs.groupby(["model.layer._name_", "dataset.num_examples"])[
    accuracy_col
].idxmax()
data = runs_df.loc[best_run_idx]

# Swap internal layer names for the display names shown in the legend.
data = data.replace({"model.layer._name_": model_names})

ax = sns.lineplot(
    data=data,
    x="dataset.num_examples",
    y=accuracy_col,
    hue="model.layer._name_",
    marker="o",
    hue_order=["Transformer", "LSTM", "Hyena", "H3", "S4D", "Linear Transformer", "RWKV", "RetNet"],
)
ax.set_xlabel("# Training Examples")
ax.set_ylabel("Accuracy")
ax.set(xscale="log")
ax.set_xticks([150, 300, 625, 1250, 2500, 5000])
ax.legend(title='Model')
# Plain numbers on the log-scaled x axis instead of powers of ten.
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())

In [None]:
import math

In [None]:
# Build one shell command per selected run that re-evaluates its checkpoint
# with eval.py, writing into experiments/hiddens_<num_examples>/.
NUM_GPUS = 16  # commands are assigned to CUDA devices by row index modulo this value

# `data` carries legend display names in "model.layer._name_" (e.g. "RetNet")
# after the plotting cell, but layer_name_to_exp and the "retention" check
# below are keyed by the internal layer name. Map display names back; values
# that are already internal names pass through unchanged.
display_to_layer = {display: layer for layer, display in model_names.items()}


def _is_missing(value):
    """Return True for None or NaN; any other value (including dicts/lists) is present."""
    if value is None:
        return True
    try:
        return math.isnan(value)
    except TypeError:
        return False


runs_to_run = []

data = data.sort_values(by="dataset.num_examples", ascending=False)
for i, row in data.iterrows():
    logged_name = row["model.layer._name_"]
    model_type = display_to_layer.get(logged_name, logged_name)
    exp_name = layer_name_to_exp[model_type]
    num_examples = row["dataset.num_examples"]

    cmd = (
        f"export PYTHONHASHSEED=0; export CUDA_VISIBLE_DEVICES={i % NUM_GPUS}; "
        f"python eval.py wandb.project=dfa_best_runs "
        f"hydra.run.dir='experiments/hiddens_{num_examples}/{model_type}/' "
        f"experiment={exp_name} train.test=True dataset.num_test_examples=1000 "
        f"train.ckpt='{row['ckpt']}' "
    )

    for k in ["dataset.num_examples", "model.n_layer", "model.d_model", "optimizer.lr", "optimizer.weight_decay"]:
        cmd += f"{k}={row[k]} "

    # The head count is logged under either of two keys depending on the model;
    # forward whichever is present (the value may be None or NaN when absent).
    has_heads = False
    for heads_key in ("model.attn_cfg.n_heads", "model.attn_cfg.num_heads"):
        if not _is_missing(row[heads_key]):
            cmd += f"{heads_key}={int(row[heads_key])} "
            has_heads = True
    # Add the flag once, even if both head keys are set.
    if has_heads and model_type != "retention":
        cmd += "+model.return_attention=True "

    # NaN is truthy, so test for "missing" explicitly before using the value.
    attn_layer_idx = row["model.attn_layer_idx"]
    if not _is_missing(attn_layer_idx) and attn_layer_idx:
        # The original code called .values(), so this is presumably a dict in
        # the logged config; plain sequences are accepted as well.
        if isinstance(attn_layer_idx, dict):
            attn_layer_idx = attn_layer_idx.values()
        cmd += f"model.attn_layer_idx='{list(attn_layer_idx)}' "

    # Run in the background, with stdout and stderr sent to a per-run log file.
    cmd += f"> experiments/hiddens_{num_examples}/logs/{i}  2>&1 &"

    runs_to_run.append(cmd.strip())



In [None]:
# Emit the generated commands one per line, ready to paste into a shell.
print(*runs_to_run, sep="\n")

In [None]:
# fig size
plt.rcParams.update({"figure.figsize": (6, 4)})

# Pick, for every (layer type, training-set size) pair, the run with the
# smallest final_test/model_dfa_diff.
# Runs that never logged the metric are excluded before grouping: a group whose
# metric is entirely NaN has no valid idxmin label, and the .loc lookup below
# would fail on it.
diff_col = "final_test/model_dfa_diff"
scored_runs = runs_df.dropna(subset=[diff_col])
best_run_idx = scored_runs.groupby(["model.layer._name_", "dataset.num_examples"])[
    diff_col
].idxmin()
data = runs_df.loc[best_run_idx]

ax = sns.lineplot(
    data=data,
    x="dataset.num_examples",
    y=diff_col,
    hue="model.layer._name_",
    marker="o",
)
# update x label
ax.set_xlabel("# Training Examples")
ax.set_ylabel("L1")
ax.set(xscale="log")
ax.set_xticks([1000, 2500, 5000, 10000, 20000, 40000])
# Plain numbers on the log-scaled x axis instead of powers of ten.
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.legend(title='Model')

In [None]:
import pickle