# Task Evaluations for OLMo and other LLMs

This notebook shows how to run LM task evaluations in Beaker against HF and OLMo models, using tasks defined in catwalk.

Personal settings:

In [270]:
# Personal settings: edit both paths for your own machine before running the rest of the notebook.
# Local checkout of the catwalk repository; run_gantry() launches the gantry command from here.
REPO_DIR = "/Users/oyvindt/gitroot/catwalk"

# Local directory where downloaded Beaker result datasets are cached; the
# experiment-tracker .jsonl files are saved here as well.
CACHE_DIR = "/Users/oyvindt/evals/olmo/beaker-result-cache"

## Setup

Setup instructions:
   * Clone latest catwalk repo from https://github.com/OyvindTafjord/catwalk 
   * Install [beaker-gantry](https://github.com/allenai/beaker-gantry) and [beaker-py](https://github.com/allenai/beaker-py) (`pip install beaker-gantry` and `pip install beaker-py`)
   * See [this document](https://docs.google.com/document/d/1HahVawRR2Nf_J_B5Adsxierp4HK01tV8NR9o6NFUgMo/edit?usp=sharing) for how to run local catwalk evaluations (to vet new tasks, etc), otherwise catwalk dependencies do not need to be installed

## Code (run this section first)

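This is just some rough example code to help streamline beaker experiment management. Run the "Personal settings" cell above before this one, since these helpers read `REPO_DIR` and `CACHE_DIR`.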

In [209]:
import json
import os
import random
import re
import shlex
import subprocess
import time

from beaker import Beaker

def run_gantry(exp_name, template, args, description=None, overrides=None):
    """Fill in a gantry command template and run it from REPO_DIR.

    Args:
        exp_name: Experiment name, passed to gantry as ``--name``.
        template: Shell command containing ``$$KEY$$`` placeholders. It should
            contain ``$$GANTRY_ARGS$$`` at the position where extra gantry
            options belong (before the ``--`` that starts the job's command).
        args: Mapping of placeholder name (any case) to its replacement; values
            are converted with ``str`` and inserted verbatim.
        description: Optional experiment description, passed to gantry as
            ``--description``.
        overrides: Optional mapping ``{option: value}``. Every existing
            ``--option <value>`` in the filled-in command gets its value
            replaced; options that do not occur in the command are ignored.

    Returns:
        ``{"command": <shell command that was run>, "output": <its stdout>}``.

    Raises:
        AssertionError: if a ``$$`` placeholder marker is left in the command.
    """
    # --name and --description are gantry options, so both are injected at the
    # $$GANTRY_ARGS$$ placeholder. Appending them to the end of the command
    # would hand them to the job's own command line (after the "--") instead.
    gantry_args = " --name " + shlex.quote(exp_name)
    if description is not None:
        gantry_args += " --description " + shlex.quote(description)
    args_full = dict(args)
    args_full['gantry_args'] = gantry_args
    command = template
    for key, value in args_full.items():
        command = command.replace("$$" + key.upper() + "$$", str(value))
    assert "$$" not in command, f"Unfilled placeholder in command: {command}"
    if overrides is not None:
        for key, value in overrides.items():
            replacement = f"--{key} {value}"
            # Escape the key and use a function replacement so that neither the
            # key nor the value is interpreted as regex / backreference syntax.
            command = re.sub(
                "--" + re.escape(key) + r" \S+",
                lambda _match, _text=replacement: _text,
                command,
            )
    # NOTE: the command goes through the shell and the values from `args` are
    # inserted unquoted, so only use trusted templates and arguments.
    # cwd= runs the command in REPO_DIR without changing this process' working
    # directory, so nothing has to be restored if the command fails.
    completed = subprocess.run(
        command, shell=True, cwd=REPO_DIR, stdout=subprocess.PIPE, text=True
    )
    return {"command": command, "output": completed.stdout}

def make_exp_name(model_name, checkpoint=None):
    """Build an experiment name ``lmeval-<model>[-<checkpoint>]-<10 hex digits>``.

    The part of ``model_name`` matched by ``.*/`` (e.g. an ``org/`` prefix) is
    dropped, the checkpoint is included only when it is truthy, and a random
    10-digit hex suffix is appended. Finally every character outside
    ``[-_.a-zA-Z0-9]`` is removed so the name conforms to valid Beaker
    experiment names.
    """
    suffix = '%010x' % random.randrange(16**10)
    parts = ["lmeval", re.sub(".*/", "", model_name)]
    if checkpoint:
        parts.append(checkpoint)
    parts.append(suffix)
    return re.sub("[^-_.a-zA-Z0-9]", "", "-".join(parts))

def get_experiment_status(exp_data):
    """Summarize the status of the experiment's first job.

    Returns a dict with ``"finalized"`` (the status' ``finalized`` value
    formatted as a string) and ``"exit_code"`` (passed through unchanged).
    """
    first_job = resolve_exp(exp_data).jobs[0]
    job_status = first_job.status
    return {
        "finalized": f"{job_status.finalized}",
        "exit_code": job_status.exit_code,
    }

def resolve_exp(exp_data):
    """Fetch the Beaker experiment for a tracker entry, caching its ids in the entry.

    The experiment is looked up by its stored id when the entry has one (key
    ``"id"``, which is what this function writes, or ``"exp_id"``); otherwise
    it is looked up as ``<account name>/<exp_name>``.

    Args:
        exp_data: Tracker entry (dict) with at least ``"exp_name"`` or a stored
            id. Mutated in place: ``"id"`` is set after a lookup by name, and
            ``"result_id"`` is set from the first job's result when missing.

    Returns:
        The experiment object returned by ``BEAKER.experiment.get``.
    """
    # The id is stored under 'id', so that is the key that has to be checked
    # here; 'exp_id' is accepted too for entries that carry that key.
    exp_id = exp_data.get('id') or exp_data.get('exp_id')
    if exp_id:
        exp = BEAKER.experiment.get(exp_id)
    else:
        exp = BEAKER.experiment.get(BEAKER.account.name + "/" + exp_data['exp_name'])
        exp_data['id'] = exp.id
    if 'result_id' not in exp_data:
        exp_data['result_id'] = exp.jobs[0].result.beaker
    return exp

def get_result_path(exp_data, cache_dir=None, force_download=False):
    """Return the local directory holding an experiment's result files.

    The result dataset is cached in ``<cache_dir>/<result_id>``. It is fetched
    from Beaker when that directory is missing or empty, or when
    ``force_download`` is true.

    Args:
        exp_data: Tracker entry; ``resolve_exp`` fills in ``"result_id"`` if needed.
        cache_dir: Cache root. Defaults to the current value of ``CACHE_DIR``,
            read at call time so later changes to the setting are picked up.
        force_download: Fetch again even if a cached copy exists.

    Returns:
        Path of the directory containing the result files.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR
    # Called for its side effect of filling in exp_data['result_id'].
    resolve_exp(exp_data)
    result_id = exp_data['result_id']
    exp_dir = os.path.join(cache_dir, result_id)
    is_cached = os.path.isdir(exp_dir) and len(os.listdir(exp_dir)) > 0
    if is_cached and not force_download:
        return exp_dir
    # force=True lets beaker-py overwrite files already present in the target
    # directory; without it a forced re-download of a populated cache fails.
    BEAKER.dataset.fetch(result_id, target=exp_dir, force=force_download)
    return exp_dir

def run_gantry_from_model_spec(model_spec, checkpoint, template, task_set):
    """Launch one evaluation for a model spec and return a tracker entry.

    Args:
        model_spec: Dict with ``"name"`` and optionally ``"beaker_model"``
            (forwarded as a template argument) and ``"overrides"`` (forwarded
            to ``run_gantry``).
        checkpoint: Revision to evaluate; when falsy no ``revision=`` is added
            to the model argument.
        template: Command template handed to ``run_gantry``.
        task_set: Value for the template's task placeholder.

    Returns:
        Dict with ``"exp_name"``, ``"model"``, ``"checkpoint"`` and
        ``"run_gantry"`` (the result of the ``run_gantry`` call).
    """
    name = model_spec['name']
    experiment_name = make_exp_name(name, checkpoint)
    revision_suffix = f",revision={checkpoint}" if checkpoint else ""
    model_arg = f"lm::pretrained={name}" + revision_suffix
    print(f"Running {experiment_name}")
    template_args = {"model": model_arg, "task": task_set}
    if "beaker_model" in model_spec:
        template_args['beaker_model'] = model_spec['beaker_model']
    launch_result = run_gantry(
        experiment_name,
        template,
        template_args,
        overrides=model_spec.get("overrides"),
    )
    return {
        "exp_name": experiment_name,
        "model": name,
        "checkpoint": checkpoint,
        "run_gantry": launch_result,
    }

def load_jsonl(file_name):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON value. Blank lines (for example
    a trailing empty line) are skipped instead of raising a decode error.
    The file is read as UTF-8.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

def load_json(file_name):
    """Parse the whole file as a single JSON document and return the result."""
    with open(file_name, 'r') as handle:
        parsed = json.load(handle)
    return parsed
    

def save_jsonl(file_name, data):
    """Write each item of ``data`` as one JSON line and return ``file_name``.

    An existing file is overwritten.
    """
    with open(file_name, 'w') as out:
        out.writelines(json.dumps(record) + "\n" for record in data)
    return file_name

## Init

Setup beaker

In [131]:
# Global Beaker client; resolve_exp() and get_result_path() use it for all Beaker calls.
BEAKER = Beaker.from_env()

In [132]:
BEAKER.account.name

'oyvindt'

## Start experiments

Here we'll show an example of running a few Pythia checkpoints against a set of 20 catwalk tasks in default mode (zero-shot). Note that tasks can also be defined in a task_file with individual options for each task.

In [196]:
# Space-separated task names; substituted for $$TASK$$ in the command template.
TASK_SET = "arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc"
# One entry per model:
#   "name"        -> used as "lm::pretrained=<name>" in the --model argument
#   "checkpoints" -> one experiment is launched per entry, passed as "revision=<checkpoint>"
#   "overrides"   -> optional {option: value}; replaces the value of "--<option>" in the command
#                    (here a smaller max_batch_tokens for the 6.9b model)
MODELS_TO_RUN = [ 
    {"name": "EleutherAI/pythia-6.9b", "checkpoints": ["step1000","step10000", "step50000", "step140000"], "overrides": {"max_batch_tokens": 2048*2}},
    {"name": "EleutherAI/pythia-160m", "checkpoints": ["step1000","step10000", "step50000", "step140000"]},
]

In [191]:
experiment_tracker = []

Launch the experiments, using this command with a few variables to fill in (most of the other variables should be self-explanatory, but see [document](https://docs.google.com/document/d/1HahVawRR2Nf_J_B5Adsxierp4HK01tV8NR9o6NFUgMo/edit?usp=sharing) for more details):

In [192]:
# Command template; run_gantry() fills in the $$...$$ placeholders:
#   $$GANTRY_ARGS$$ -> extra gantry options (the experiment name)
#   $$MODEL$$       -> model argument built by run_gantry_from_model_spec()
#   $$TASK$$        -> space-separated task names
# Everything after the standalone "--" is the command run inside the job.
# The trailing backslashes continue the string literal, so the command is a single line.
GANTRY_TEMPLATE_ZEROSHOT_1 = "gantry run --gpus 1 --venv base --workspace ai2/lm-eval \
--cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 \
--env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache' $$GANTRY_ARGS$$ \
-- python catwalk/run_lm_eval.py --model $$MODEL$$ --task $$TASK$$ --split validation \
--full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 \
--num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 20480"

In [193]:
# Launch one experiment per (model, checkpoint) pair and record each launch in experiment_tracker.
# Note: re-running this cell submits new experiments (fresh random names) and appends them again.
for model_spec in MODELS_TO_RUN:
    for checkpoint in model_spec["checkpoints"]:
        res = run_gantry_from_model_spec(model_spec, checkpoint, GANTRY_TEMPLATE_ZEROSHOT_1, TASK_SET)
        experiment_tracker.append(res)
        # Short pause between consecutive submissions.
        time.sleep(0.5)      

Running lmeval-pythia-6.9b-step1000-9113d68b6a
Running lmeval-pythia-6.9b-step10000-cacb9eb3c6
Running lmeval-pythia-6.9b-step50000-715023ac71
Running lmeval-pythia-6.9b-step140000-a8eb0a3c7c
Running lmeval-pythia-160m-step1000-af94704947
Running lmeval-pythia-160m-step10000-3b698b190a
Running lmeval-pythia-160m-step50000-0784ce9ab1
Running lmeval-pythia-160m-step140000-18fc9c1c2f


Check status of experiments

In [216]:
# Print (experiment name, status) for every tracked experiment.
# Each get_experiment_status() call looks the experiment up in Beaker.
for e in experiment_tracker:
    print((e['exp_name'], get_experiment_status(e)))

('lmeval-pythia-6.9b-step1000-9113d68b6a', {'finalized': '2023-05-09 03:31:25.277995+00:00', 'exit_code': 0})
('lmeval-pythia-6.9b-step10000-cacb9eb3c6', {'finalized': '2023-05-09 03:32:03.977317+00:00', 'exit_code': 0})
('lmeval-pythia-6.9b-step50000-715023ac71', {'finalized': '2023-05-09 03:31:10.948283+00:00', 'exit_code': 0})
('lmeval-pythia-6.9b-step140000-a8eb0a3c7c', {'finalized': '2023-05-09 03:32:08.617660+00:00', 'exit_code': 0})
('lmeval-pythia-160m-step1000-af94704947', {'finalized': '2023-05-09 02:26:21.667439+00:00', 'exit_code': 0})
('lmeval-pythia-160m-step10000-3b698b190a', {'finalized': '2023-05-09 02:26:35.412889+00:00', 'exit_code': 0})
('lmeval-pythia-160m-step50000-0784ce9ab1', {'finalized': '2023-05-09 02:23:52.417012+00:00', 'exit_code': 0})
('lmeval-pythia-160m-step140000-18fc9c1c2f', {'finalized': '2023-05-09 02:23:52.654960+00:00', 'exit_code': 0})


Save for later use (see cached version of this variable at end of notebook):

In [217]:
save_jsonl(os.path.join(CACHE_DIR, "lmeval-experiments-2.jsonl"), experiment_tracker)

'/Users/oyvindt/evals/olmo/beaker-result-cache/lmeval-experiments-2.jsonl'

## Running OLMo

Running OLMo models is very similar: a few more arguments are needed in the command, and for now the model must be available as a Beaker dataset (the actual model name doesn't matter here):

In [203]:
# Same task list as for the Pythia runs above.
TASK_SET = "arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc"
# OLMo specs have no "checkpoints" list. "beaker_model" is the Beaker dataset
# substituted for $$BEAKER_MODEL$$ in the OLMo command template.
OLMO_MODELS_TO_RUN = [ 
    {"name": "olmo-c4-small-euox4j8q-step7300", "beaker_model": "oyvindt/olmo-c4-small-euox4j8q-step7300", "overrides": {"max_batch_tokens": 2048*4}},
]

In [199]:
experiment_tracker_olmo = []

Launch the experiments, using this tweaked template:

In [206]:
# OLMo variant of the command template. Compared with GANTRY_TEMPLATE_ZEROSHOT_1 it adds:
#   --dataset '$$BEAKER_MODEL$$:/model'  (gantry option: attaches the model dataset at /model)
#   --model_path /model --model_class olmo_eval.olmo_pretrained.OlmoPretrained  (job arguments)
# The trailing backslashes continue the string literal, so the command is a single line.
GANTRY_TEMPLATE_ZEROSHOT_OLMO_1 = "gantry run --gpus 1 --dataset '$$BEAKER_MODEL$$:/model' \
--venv base --workspace ai2/lm-eval \
--cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 \
--env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache' $$GANTRY_ARGS$$ \
-- python catwalk/run_lm_eval.py --model $$MODEL$$ --task $$TASK$$ --split validation \
--full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 \
--num_recorded_inputs 3 --model_path /model --model_class olmo_eval.olmo_pretrained.OlmoPretrained \
--batch_size 32 --model_max_length 2048 --max_batch_tokens 20480"

In [210]:
# Launch one experiment per OLMo model; checkpoint is None, so no revision is added to the model argument.
# Note: re-running this cell submits new experiments and appends them again.
for model_spec in OLMO_MODELS_TO_RUN:
    res = run_gantry_from_model_spec(model_spec, None, GANTRY_TEMPLATE_ZEROSHOT_OLMO_1, TASK_SET)
    experiment_tracker_olmo.append(res)
    # Short pause between consecutive submissions.
    time.sleep(0.5)      

Running lmeval-olmo-c4-small-euox4j8q-step7300-30eca3ce7d


In [223]:
get_experiment_status(experiment_tracker_olmo[0])

{'finalized': '2023-05-09 03:10:42.727976+00:00', 'exit_code': 0}

In [227]:
save_jsonl(os.path.join(CACHE_DIR, "lmeval-experiments-3.jsonl"), experiment_tracker_olmo)

'/Users/oyvindt/evals/olmo/beaker-result-cache/lmeval-experiments-3.jsonl'

## Experiment Analysis

First load experiment_tracker again, either from above or grab example output from bottom of this notebook to run on these example experiments

In [None]:
# experiment_tracker = load_jsonl(os.path.join(CACHE_DIR, "lmeval-experiments-2.jsonl"))
# experiment_tracker_olmo = load_jsonl(os.path.join(CACHE_DIR, "lmeval-experiments-3.jsonl"))

In [225]:
experiments = experiment_tracker + experiment_tracker_olmo

Download result files to cache

In [260]:
# Make sure every experiment's result dataset is in the local cache.
# get_result_path() only fetches when the cached directory is missing or empty,
# so the message below is printed even when nothing has to be downloaded.
for e in experiments:
    print(f"Downloading results for {e['exp_name']}...")
    get_result_path(e)

Downloading results for lmeval-pythia-6.9b-step1000-9113d68b6a...
Downloading results for lmeval-pythia-6.9b-step10000-cacb9eb3c6...
Downloading results for lmeval-pythia-6.9b-step50000-715023ac71...
Downloading results for lmeval-pythia-6.9b-step140000-a8eb0a3c7c...
Downloading results for lmeval-pythia-160m-step1000-af94704947...
Downloading results for lmeval-pythia-160m-step10000-3b698b190a...
Downloading results for lmeval-pythia-160m-step50000-0784ce9ab1...
Downloading results for lmeval-pythia-160m-step140000-18fc9c1c2f...
Downloading results for lmeval-olmo-c4-small-euox4j8q-step7300-30eca3ce7d...


Let's first look at the "best" pythia-6.9b-step140000 model a bit

In [228]:
# Index 3 is lmeval-pythia-6.9b-step140000 in the launch order shown above.
experiment = experiments[3]

In [229]:
os.listdir(get_result_path(experiment))

['metrics.json', 'predictions.jsonl', '.gantry']

In [230]:
metrics = load_json(os.path.join(get_result_path(experiment), "metrics.json"))

Brief description of metrics:
   * acc_raw: use total probability of completion
   * acc_per_token: use probability per token
   * acc_per_char: use probability per character (doesn't make much sense, but EleutherAI uses this for some tasks)
   * acc_uncond: total probability of completion divided by "unconditioned" probability (completion prefixed only by something like "Answer:")
   * predicted_indices_..: frequency of each predicted answer choice, by decreasing frequency (to catch cases where only one answer, like "yes", is almost always chosen)
   * avg_total_probability_mass: total probability for all answer choices (averaged over instances)
   * primary_metric: which of the scoring methods is used for the primary "acc" metric
   * acc: primary accuracy, using one of the above methods (usually acc_per_token or acc_uncond)
   * total_token_count: total number of tokens across all instances
   * max_token_count: max token count for a single input (e.g., to see if max model input length is hit)

In [271]:
metrics['metrics'][0]

{'task': 'arc_challenge',
 'model': 'lm::pretrained=EleutherAI/pythia-6.9b,revision=step140000',
 'task_options': {'unconditioned_prompt': 'Answer:',
  'limit': 1000,
  'split': 'validation',
  'batch_size': 32,
  'model_max_length': 2048,
  'max_batch_tokens': 4096,
  'num_recorded_inputs': 3},
 'metrics': {'rc_metrics': {'acc_raw': 0.3177257525083612,
   'predicted_indices_raw': [[0, 0.3612040133779264],
    [1, 0.25418060200668896],
    [3, 0.20735785953177258],
    [2, 0.17725752508361203]],
   'acc_per_token': 0.3612040133779264,
   'predicted_indices_per_token': [[3, 0.2842809364548495],
    [1, 0.25418060200668896],
    [2, 0.23411371237458195],
    [0, 0.22742474916387959]],
   'acc_per_char': 0.36454849498327757,
   'predicted_indices_per_char': [[3, 0.3210702341137124],
    [2, 0.2408026755852843],
    [0, 0.22408026755852842],
    [1, 0.2140468227424749]],
   'acc_uncond': 0.4414715719063545,
   'predicted_indices_uncond': [[2, 0.3076923076923077],
    [1, 0.2408026755852843

Main metric for each task:

In [231]:
# Print the primary accuracy ("acc" under rc_metrics) of every task, rounded to 3 decimals.
for task in metrics['metrics']:
    print(f"{task['task']}: {task['metrics']['rc_metrics']['acc']:.3f}")

arc_challenge: 0.441
arc_easy: 0.619
boolq: 0.405
copa: 0.840
headqa_en: 0.381
hellaswag: 0.560
logiqa: 0.215
mathqa: 0.257
mrpc: 0.654
openbookqa: 0.450
piqa: 0.762
qnli: 0.515
qqp: 0.608
rte: 0.606
sciq: 0.911
sst: 0.623
wic: 0.450
winogrande: 0.595
wnli: 0.620
wsc: 0.635


Load full predictions file

In [237]:
predictions = load_jsonl(os.path.join(get_result_path(experiment), "predictions.jsonl"))

In [238]:
predictions[0].keys()

dict_keys(['task', 'model', 'task_options', 'metrics', 'num_instances', 'processing_time_seconds', 'per_instance'])

In [239]:
predictions[0]['per_instance'][10]

{'instance': {'id': 'Mercury_SC_412337'},
 'prediction': {'model_output': [{'sum_logits': -28.799480438232422,
    'num_tokens': 14,
    'num_chars': 69,
    'num_tokens_all': 43,
    'sum_logits_uncond': -43.893951416015625},
   {'sum_logits': -31.447595596313477,
    'num_tokens': 14,
    'num_chars': 71,
    'num_tokens_all': 43,
    'sum_logits_uncond': -45.0609130859375},
   {'sum_logits': -29.84939193725586,
    'num_tokens': 14,
    'num_chars': 69,
    'num_tokens_all': 43,
    'sum_logits_uncond': -45.9809455871582},
   {'sum_logits': -32.15296936035156,
    'num_tokens': 14,
    'num_chars': 71,
    'num_tokens_all': 43,
    'sum_logits_uncond': -45.82860565185547}],
  'correct_choice': 3,
  'metrics': {'acc_raw': 0,
   'predicted_index_raw': 0,
   'acc_per_token': 0,
   'predicted_index_per_token': 0,
   'acc_per_char': 0,
   'predicted_index_per_char': 0,
   'acc_uncond': 0,
   'predicted_index_uncond': 2,
   'probability_mass': 4.525025555660871e-13,
   'acc': 0}}}

The first 3 (num_recorded_inputs) instances come with the full model inputs, the prompt and continuation for each answer choice:

In [240]:
predictions[0]['per_instance'][0]['model_input']

[['Question: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?\nAnswer:',
  ' Put the objects in groups.'],
 ['Question: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?\nAnswer:',
  ' Change the height of the ramp.'],
 ['Question: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?\nAnswer:',
  ' Choose different objects to roll.'],
 ['Question: Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?\nAnswer:',
  ' Record the details of the investigation.']]

Not a pandas expert, but some hacky code to visualize a few things:

In [241]:
import pandas

In [267]:
# Build the columns of the summary table: a "model" label column plus one
# column of primary accuracies per task, with one row per experiment.
by_task = {"model":[]}
# The loop variable is deliberately not called `experiment`, so the experiment
# selected for inspection in the earlier cells is not overwritten.
for tracked_exp in experiments:
    metrics1 = load_json(os.path.join(get_result_path(tracked_exp), "metrics.json"))
    model_label = tracked_exp['model'].replace("EleutherAI/","").replace("-euox4j8q","")
    # OLMo entries have checkpoint=None; only append a checkpoint when there is one
    # (concatenating None to the label would raise a TypeError).
    if tracked_exp['checkpoint']:
        model_label += "-" + tracked_exp['checkpoint']
    by_task["model"].append(model_label)
    for task in metrics1['metrics']:
        task_name = task['task']
        # Append in place instead of rebuilding the list for every value.
        by_task.setdefault(task_name, []).append(task['metrics']['rc_metrics']['acc'])

In [268]:
pandas.DataFrame.from_dict(by_task)

Unnamed: 0,model,arc_challenge,arc_easy,boolq,copa,headqa_en,hellaswag,logiqa,mathqa,mrpc,...,piqa,qnli,qqp,rte,sciq,sst,wic,winogrande,wnli,wsc
0,pythia-6.9b-step1000,0.230769,0.250877,0.414,0.54,0.266,0.281,0.211982,0.205,0.683824,...,0.538,0.473,0.327,0.530686,0.424,0.470183,0.5,0.516,0.56338,0.634615
1,pythia-6.9b-step10000,0.327759,0.447368,0.44,0.67,0.3,0.42,0.213518,0.223,0.683824,...,0.678,0.476,0.327,0.527076,0.835,0.591743,0.50627,0.495,0.56338,0.634615
2,pythia-6.9b-step50000,0.361204,0.564912,0.373,0.72,0.346,0.506,0.201229,0.248,0.681373,...,0.732,0.531,0.349,0.523466,0.887,0.676606,0.479624,0.59,0.422535,0.634615
3,pythia-6.9b-step140000,0.441472,0.619298,0.405,0.84,0.381,0.56,0.215054,0.257,0.654412,...,0.762,0.515,0.608,0.606498,0.911,0.622706,0.449843,0.595,0.619718,0.634615
4,pythia-160m-step1000,0.254181,0.285965,0.391,0.51,0.24,0.3,0.228879,0.198,0.683824,...,0.546,0.473,0.327,0.527076,0.452,0.494266,0.5,0.51,0.56338,0.634615
5,pythia-160m-step10000,0.250836,0.384211,0.407,0.61,0.248,0.345,0.201229,0.214,0.683824,...,0.602,0.473,0.327,0.501805,0.758,0.511468,0.5,0.526,0.591549,0.634615
6,pythia-160m-step50000,0.257525,0.431579,0.387,0.66,0.276,0.358,0.196621,0.223,0.683824,...,0.632,0.473,0.328,0.552347,0.788,0.510321,0.49373,0.521,0.56338,0.634615
7,pythia-160m-step140000,0.284281,0.407018,0.401,0.65,0.298,0.358,0.205837,0.227,0.676471,...,0.627,0.475,0.327,0.527076,0.744,0.509174,0.5,0.512,0.56338,0.634615
8,olmo-c4-small-step7300,0.287625,0.522807,0.456,0.77,0.291,0.506,0.184332,0.223,0.64951,...,0.756,0.53,0.498,0.472924,0.824,0.557339,0.504702,0.582,0.521127,0.557692


There's a lot more data available for analysis, such as catching "degenerate" tasks (always answering "yes") or comparing different normalization approaches to the completions in ranked classification tasks.

## Cached experiment_tracker and experiment_tracker_olmo variables

Full experiment_tracker and experiment_tracker_olmo variables, so you can go directly to the analysis

In [232]:
experiment_tracker = [{'exp_name': 'lmeval-pythia-6.9b-step1000-9113d68b6a',
  'model': 'EleutherAI/pythia-6.9b',
  'checkpoint': 'step1000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-6.9b-step1000-9113d68b6a -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-6.9b,revision=step1000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 4096",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ6GKETZJWQSCXZ2S0N4RVM\x1b[0m\n'},
  'id': '01GZZ6GKETZJWQSCXZ2S0N4RVM',
  'result_id': '01GZZ6GNX72PWMHKERDP8EKJ9S'},
 {'exp_name': 'lmeval-pythia-6.9b-step10000-cacb9eb3c6',
  'model': 'EleutherAI/pythia-6.9b',
  'checkpoint': 'step10000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-6.9b-step10000-cacb9eb3c6 -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-6.9b,revision=step10000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 4096",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ6GPCVCSXXPZ02Z7RWS5J8\x1b[0m\n'},
  'id': '01GZZ6GPCVCSXXPZ02Z7RWS5J8',
  'result_id': '01GZZ6H7P43ZFN5KEZN0F1EWXR'},
 {'exp_name': 'lmeval-pythia-6.9b-step50000-715023ac71',
  'model': 'EleutherAI/pythia-6.9b',
  'checkpoint': 'step50000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-6.9b-step50000-715023ac71 -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-6.9b,revision=step50000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 4096",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ6GS83CJRRGCNY5SCR63D8\x1b[0m\n'},
  'id': '01GZZ6GS83CJRRGCNY5SCR63D8',
  'result_id': '01GZZ6H7TWJR2Y4NJDJQ7DWF5Z'},
 {'exp_name': 'lmeval-pythia-6.9b-step140000-a8eb0a3c7c',
  'model': 'EleutherAI/pythia-6.9b',
  'checkpoint': 'step140000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-6.9b-step140000-a8eb0a3c7c -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-6.9b,revision=step140000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 4096",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ6GW3CQQ3W2TY66MZNG6AY\x1b[0m\n'},
  'id': '01GZZ6GW3CQQ3W2TY66MZNG6AY',
  'result_id': '01GZZ6H7ZJD5S9TQE02E6JSAZN'},
 {'exp_name': 'lmeval-pythia-160m-step1000-af94704947',
  'model': 'EleutherAI/pythia-160m',
  'checkpoint': 'step1000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-160m-step1000-af94704947 -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-160m,revision=step1000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 20480",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ4YQ3CMVABGSJRN85BC00Q\x1b[0m\n'},
  'id': '01GZZ4YQ3CMVABGSJRN85BC00Q',
  'result_id': '01GZZ4Z76EBDX4K9BYB40CBDYK'},
 {'exp_name': 'lmeval-pythia-160m-step10000-3b698b190a',
  'model': 'EleutherAI/pythia-160m',
  'checkpoint': 'step10000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-160m-step10000-3b698b190a -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-160m,revision=step10000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 20480",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ4YRTK4EV277WCT6QQBB64\x1b[0m\n'},
  'id': '01GZZ4YRTK4EV277WCT6QQBB64',
  'result_id': '01GZZ4Z7B2V7NMM81RHGHD0WYH'},
 {'exp_name': 'lmeval-pythia-160m-step50000-0784ce9ab1',
  'model': 'EleutherAI/pythia-160m',
  'checkpoint': 'step50000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-160m-step50000-0784ce9ab1 -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-160m,revision=step50000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 20480",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ4YTHWCBAZVJNYG042BG7M\x1b[0m\n'},
  'id': '01GZZ4YTHWCBAZVJNYG042BG7M',
  'result_id': '01GZZ4Z7FKJBRHHF8NW5FNFJDX'},
 {'exp_name': 'lmeval-pythia-160m-step140000-18fc9c1c2f',
  'model': 'EleutherAI/pythia-160m',
  'checkpoint': 'step140000',
  'run_gantry': {'command': "gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-pythia-160m-step140000-18fc9c1c2f -- python catwalk/run_lm_eval.py --model lm::pretrained=EleutherAI/pythia-160m,revision=step140000 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --batch_size 32 --model_max_length 2048 --max_batch_tokens 20480",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ4YW7S555PBRH31SJ463A3\x1b[0m\n'},
  'id': '01GZZ4YW7S555PBRH31SJ463A3',
  'result_id': '01GZZ4Z7M4G263SK6GZN7K1AF8'}]

In [259]:
experiment_tracker_olmo = [{'exp_name': 'lmeval-olmo-c4-small-euox4j8q-step7300-30eca3ce7d',
  'model': 'olmo-c4-small-euox4j8q-step7300',
  'checkpoint': None,
  'run_gantry': {'command': "gantry run --gpus 1 --dataset 'oyvindt/olmo-c4-small-euox4j8q-step7300:/model' --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale --beaker-image oyvindt/OLMoEvalV4 --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache'  --name lmeval-olmo-c4-small-euox4j8q-step7300-30eca3ce7d -- python catwalk/run_lm_eval.py --model lm::pretrained=olmo-c4-small-euox4j8q-step7300 --task arc_challenge arc_easy boolq copa headqa_en hellaswag logiqa mathqa mrpc openbookqa piqa qnli qqp rte sciq sst wic winogrande wnli wsc --split validation --full_output_file /results/predictions.jsonl --metrics_file /results/metrics.json --limit 1000 --num_recorded_inputs 3 --model_path /model --model_class olmo_eval.olmo_pretrained.OlmoPretrained --batch_size 32 --model_max_length 2048 --max_batch_tokens 8192",
   'output': '\n\x1b[1;36m                                             o=======[]   \x1b[0m\n\x1b[1;36m   __ _                    _               _ |_      []   \x1b[0m\n\x1b[1;36m  / _` |  __ _    _ _     | |_      _ _   | || |     []   \x1b[0m\n\x1b[1;36m  \\__, | / _` |  | \' \\    |  _|    | \'_|   \\_, |   _/ ]_  \x1b[0m\n\x1b[1;36m  |___/  \\__,_|  |_||_|   _\\__|   _|_|_   _|__/   |_____| \x1b[0m\n\x1b[1;34m_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| \x1b[0m\n\x1b[1;34m `---------------------------------------------\' \x1b[0m\n\nExperiment submitted, see progress at \n\x1b[4;94mhttps://beaker.org/ex/01GZZ75J196QP1JEYTPG0YEEB9\x1b[0m\n'},
  'id': '01GZZ75J196QP1JEYTPG0YEEB9',
  'result_id': '01GZZ760XEJ6AXZEKFM1B9TT98'}]
