# Set up lm_evaluation_harness

In [None]:
# see https://github.com/EleutherAI/lm-evaluation-harness for details and other tasks for evaluation

In [None]:
!git clone https://github.com/EleutherAI/lm-evaluation-harness
!cd lm-evaluation-harness
!pip install -e .

In [5]:
from lm_eval import api
import os
import json
import pandas as pd

# Get access to huggingface models

To access the phi and pythia models that have been continued-pretrained on BeanCounter, follow these steps:

To access the continued-pretrained phi model, go to https://huggingface.co/bradfordlevy/phi-1_5-bc-cp and agree to share your contact information; once you are logged in to Hugging Face (see below) you will be able to download the model. Learn how to generate an access token here: https://huggingface.co/docs/hub/en/security-tokens

To access the continued-pretrained pythia model, follow the same steps at https://huggingface.co/bradfordlevy/pythia-1.4b-bc-cp.

Afterwards, make sure you are logged in to Hugging Face by running the cell below and entering your access token.

In [None]:
# Authenticate this session with the Hugging Face Hub so that the bc-cp models,
# which require accepting an access agreement, can be downloaded.
from huggingface_hub import login
login()

# Big bench hard

In [8]:
# Group definition for the BIG-Bench Hard leaderboard suite: one group name
# (`leaderboard_bbh`) that lists the individual sub-tasks.
YAML_leaderboard_bbh_string = '''
group: leaderboard_bbh
task:
  - leaderboard_bbh_boolean_expressions
  - leaderboard_bbh_causal_judgement
  - leaderboard_bbh_date_understanding
  - leaderboard_bbh_disambiguation_qa
  - leaderboard_bbh_formal_fallacies
  - leaderboard_bbh_geometric_shapes
  - leaderboard_bbh_hyperbaton
  - leaderboard_bbh_logical_deduction_five_objects
  - leaderboard_bbh_logical_deduction_seven_objects
  - leaderboard_bbh_logical_deduction_three_objects
  - leaderboard_bbh_movie_recommendation
  - leaderboard_bbh_navigate
  - leaderboard_bbh_object_counting
  - leaderboard_bbh_penguins_in_a_table
  - leaderboard_bbh_reasoning_about_colored_objects
  - leaderboard_bbh_ruin_names
  - leaderboard_bbh_salient_translation_error_detection
  - leaderboard_bbh_snarks
  - leaderboard_bbh_sports_understanding
  - leaderboard_bbh_temporal_sequences
  - leaderboard_bbh_tracking_shuffled_objects_five_objects
  - leaderboard_bbh_tracking_shuffled_objects_seven_objects
  - leaderboard_bbh_tracking_shuffled_objects_three_objects
  - leaderboard_bbh_web_of_lies
'''

# open(..., 'w') does not create missing parent directories, so make sure the
# config directory exists before writing (no-op when it is already there).
os.makedirs('./yaml_configs', exist_ok=True)
with open('./yaml_configs/leaderboard_bbh.yaml', 'w') as f:
    f.write(YAML_leaderboard_bbh_string)

In [None]:
# Run leaderboard_bbh on the BeanCounter continued-pretrained phi-1.5; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/phi-1_5-bc-cp \
    --include_path ./yaml_configs/leaderboard_bbh.yaml \
    --tasks leaderboard_bbh \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_bbh/phi1_5_bc_cp/ \
    --output_path ./resulting_scores/learderboard_bbh/phi1_5_bc_cp/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.01it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1078.49it/s]
100%|███████████████████████████████████████| 187/187 [00:00<00:00, 1075.92it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1066.04it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1074.12it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1075.04it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1056.76it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1069.63it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1046.19it/s]
100%|████████████████████████████████████████| 250/250 [00:00<00:00, 690.80it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1051.83it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1068.06it/s]
100%|███████████████████████

In [3]:
# Run leaderboard_bbh on the base microsoft/phi-1_5 model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=microsoft/phi-1_5 \
    --include_path ./yaml_configs/leaderboard_bbh.yaml \
    --tasks leaderboard_bbh \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_bbh/phi1_5/ \
    --output_path ./resulting_scores/learderboard_bbh/phi1_5/ \
    --log_samples

100%|████████████████████████████████████████| 250/250 [00:00<00:00, 944.90it/s]
100%|████████████████████████████████████████| 187/187 [00:00<00:00, 937.03it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1005.75it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1046.95it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1046.32it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1031.74it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1048.06it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1024.17it/s]
100%|████████████████████████████████████████| 250/250 [00:00<00:00, 685.19it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1038.58it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1036.41it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1050.92it/s]
100%|███████████████████████

In [4]:
# Run leaderboard_bbh on the base EleutherAI/pythia-1.4b model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-1.4b \
    --include_path ./yaml_configs/leaderboard_bbh.yaml \
    --tasks leaderboard_bbh \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_bbh/pythia-1_4/ \
    --output_path ./resulting_scores/learderboard_bbh/pythia-1_4/ \
    --log_samples

100%|████████████████████████████████████████| 250/250 [00:00<00:00, 927.09it/s]
100%|████████████████████████████████████████| 187/187 [00:00<00:00, 927.77it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1006.90it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1039.76it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1038.82it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1026.31it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1042.09it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1037.69it/s]
100%|████████████████████████████████████████| 250/250 [00:00<00:00, 678.36it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1030.80it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1039.65it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1043.75it/s]
100%|███████████████████████

In [None]:
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/pythia-1.4b-bc-cp \
    --include_path yaml_configs/leaderboard_bbh.yaml \
    --tasks leaderboard_bbh \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_bbh/pythia-1_4b-bc/ \
    --output_path ./resulting_scores/learderboard_bbh/pythia-1_4b-bc/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.02it/s]
100%|████████████████████████████████████████| 250/250 [00:00<00:00, 931.18it/s]
100%|████████████████████████████████████████| 187/187 [00:00<00:00, 939.13it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1000.07it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1035.59it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1041.38it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1028.87it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1046.23it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1049.58it/s]
100%|████████████████████████████████████████| 250/250 [00:00<00:00, 671.48it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1032.53it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 1038.82it/s]
100%|███████████████████████

## compile bbh results

In [53]:
# BBH result files from the recorded runs, one per model. Each path is
# <output dir>/<model directory>/<timestamped results file>.
# NOTE(review): the timestamps and the `__mnt__...` model directories come from
# the original runs and presumably will not match a fresh run — update as needed.
phi1_5_res = (
    './resulting_scores/learderboard_bbh/phi1_5/'
    'microsoft__phi-1_5/'
    'results_2024-08-14T17-42-50.718050.json'
)
phi1_5_cp_bc_res = (
    './resulting_scores/learderboard_bbh/phi1_5_bc_cp/'
    '__mnt__spinning__beancounter_replication__continually_pretrained_models__phi-1_5-bc-cp-highlr-hf/'
    'results_2024-08-14T17-33-24.500762.json'
)
pythia_res = (
    './resulting_scores/learderboard_bbh/pythia-1_4/'
    'EleutherAI__pythia-1.4b/'
    'results_2024-08-15T00-07-39.589084.json'
)
pythia_bc_res = (
    './resulting_scores/learderboard_bbh/pythia-1_4b-bc/'
    '__mnt__spinning__beancounter_replication__continually_pretrained_models__pythia-1_4b-bc/'
    'results_2024-08-15T00-37-51.179928.json'
)

In [1]:
def get_avg_accuracy(res_path, metric='acc_norm,none'):
    """Return the unweighted mean of one metric over an lm_eval results file.

    Parameters
    ----------
    res_path : str
        Path to a JSON results file whose top-level ``"results"`` key maps
        entry names to dictionaries of metric values.
    metric : str, optional
        Metric key to average. Defaults to ``'acc_norm,none'``, the key this
        function has always used.

    Returns
    -------
    float
        Arithmetic mean of ``metric`` over every entry that reports it.
        Entries without the key are skipped.

    Raises
    ------
    ValueError
        If no entry under ``"results"`` reports ``metric`` (previously this
        surfaced as an uninformative ``ZeroDivisionError``).
    """
    with open(res_path) as results_file:
        res = json.load(results_file)

    # NOTE(review): every entry under "results" that carries the metric is
    # counted; if the file also holds a group-level aggregate row, it is
    # averaged together with the sub-tasks — confirm that is intended.
    scores = [vals[metric] for vals in res['results'].values() if metric in vals]
    if not scores:
        raise ValueError(f'no entry in {res_path!r} reports metric {metric!r}')
    return sum(scores) / len(scores)

In [58]:
# Mean acc_norm over the BBH results for base phi-1.5.
get_avg_accuracy(phi1_5_res)

0.33455561253143856

In [60]:
# Mean acc_norm over the BBH results for the continued-pretrained phi-1.5.
get_avg_accuracy(phi1_5_cp_bc_res)

0.3123777656052416

In [62]:
# Mean acc_norm over the BBH results for base pythia-1.4b.
get_avg_accuracy(pythia_res)

0.3173759830740473

In [64]:
# Mean acc_norm over the BBH results for the continued-pretrained pythia-1.4b.
get_avg_accuracy(pythia_bc_res)

0.31354264974071405

# MMLU-pro

In [7]:
# Task definition for MMLU-Pro: 5-shot multiple choice scored with mean accuracy.
YAML_leaderboard_mmlu_pro_string = '''
dataset_path: TIGER-Lab/MMLU-Pro # a copy of `cais/leaderboard_mmlu` with no auxiliary_train split
task: leaderboard_mmlu_pro
test_split: test
fewshot_split: validation
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
num_fewshot: 5
metadata:
  version: 0.1
'''

# open(..., 'w') does not create missing parent directories, so make sure the
# config directory exists before writing (no-op when it is already there).
os.makedirs('./yaml_configs', exist_ok=True)
with open('./yaml_configs/leaderboard_mmlu_pro.yaml', 'w') as f:
    f.write(YAML_leaderboard_mmlu_pro_string)

In [None]:
# Run leaderboard_mmlu_pro on the BeanCounter continued-pretrained phi-1.5; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/phi-1_5-bc-cp \
    --include_path ./yaml_configs/leaderboard_mmlu_pro.yaml \
    --tasks leaderboard_mmlu_pro \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_mmlu_pro/phi1_5_bc_cp/ \
    --output_path ./resulting_scores/learderboard_mmlu_pro/phi1_5_bc_cp/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.08it/s]
100%|██████████████████████████████████| 12032/12032 [00:00<00:00, 21990.42it/s]
Checking cached requests: 100%|██████| 113995/113995 [00:05<00:00, 22128.61it/s]
Running loglikelihood requests:   0%|                | 0/113995 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|██| 113995/113995 [15:23<00:00, 123.45it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/phi-1_5-bc-cp-highlr-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|       Tasks        |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|--------------------|------:|------|-----:|------|---|-----:|---|-----:|
|leaderboard_mmlu_pro|    0.1|none  |     5|acc   |↑  |0.1252|±  | 0.003|



In [10]:
# Run leaderboard_mmlu_pro on the base microsoft/phi-1_5 model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=microsoft/phi-1_5 \
    --include_path ./yaml_configs/leaderboard_mmlu_pro.yaml \
    --tasks leaderboard_mmlu_pro \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_mmlu_pro/phi1_5/ \
    --output_path ./resulting_scores/learderboard_mmlu_pro/phi1_5/ \
    --log_samples

100%|██████████████████████████████████| 12032/12032 [00:00<00:00, 21668.29it/s]
Checking cached requests: 100%|██████| 113995/113995 [00:08<00:00, 13302.86it/s]
Running loglikelihood requests:   0%|                | 0/113995 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|██| 113995/113995 [05:35<00:00, 339.48it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=microsoft/phi-1_5), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|       Tasks        |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|--------------------|------:|------|-----:|------|---|----:|---|-----:|
|leaderboard_mmlu_pro|    0.1|none  |     5|acc   |↑  | 0.17|±  |0.0034|



In [11]:
# Run leaderboard_mmlu_pro on the base EleutherAI/pythia-1.4b model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-1.4b \
    --include_path ./yaml_configs/leaderboard_mmlu_pro.yaml \
    --tasks leaderboard_mmlu_pro \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_mmlu_pro/pythia-1_4/ \
    --output_path ./resulting_scores/learderboard_mmlu_pro/pythia-1_4/ \
    --log_samples

100%|██████████████████████████████████| 12032/12032 [00:00<00:00, 21587.46it/s]
Checking cached requests: 100%|██████| 113995/113995 [00:05<00:00, 19143.21it/s]
Running loglikelihood requests:   0%|                | 0/113995 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|██| 113995/113995 [06:55<00:00, 274.32it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=EleutherAI/pythia-1.4b), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|       Tasks        |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|--------------------|------:|------|-----:|------|---|-----:|---|-----:|
|leaderboard_mmlu_pro|    0.1|none  |     5|acc   |↑  |0.1125|±  |0.0029|



In [None]:
# Run leaderboard_mmlu_pro on the BeanCounter continued-pretrained pythia-1.4b; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/pythia-1.4b-bc-cp \
    --include_path ./yaml_configs/leaderboard_mmlu_pro.yaml \
    --tasks leaderboard_mmlu_pro \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_mmlu_pro/pythia-1_4b-bc/ \
    --output_path ./resulting_scores/learderboard_mmlu_pro/pythia-1_4b-bc/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.00it/s]
100%|██████████████████████████████████| 12032/12032 [00:00<00:00, 22567.92it/s]
Checking cached requests: 100%|██████| 113995/113995 [00:05<00:00, 22190.97it/s]
Running loglikelihood requests:   0%|                | 0/113995 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 4
Running loglikelihood requests: 100%|██| 113995/113995 [14:29<00:00, 131.05it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/pythia-1_4b-bc), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4)
|       Tasks        |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|--------------------|------:|------|-----:|------|---|-----:|---|-----:|
|leaderboard_mmlu_pro|    0.1|none  |     5|acc   |↑  |0.1134|±  |0.0029|



# IFeval

In [13]:
# Task definition for IFEval: zero-shot greedy generation scored with
# prompt-level and instruction-level strict/loose accuracy.
YAML_leaderboard_ifeval_string = '''
task: leaderboard_ifeval
dataset_path: wis-k/instruction-following-eval
dataset_name: null
output_type: generate_until
test_split: train
num_fewshot: 0
doc_to_text: prompt
doc_to_target: 0
generation_kwargs:
  until: []
  do_sample: false
  temperature: 0.0
  max_gen_toks: 1280
process_results: !function utils.process_results
metric_list:
  - metric: prompt_level_strict_acc
    aggregation: mean
    higher_is_better: true
  - metric: inst_level_strict_acc
    aggregation: !function utils.agg_inst_level_acc
    higher_is_better: true
  - metric: prompt_level_loose_acc
    aggregation: mean
    higher_is_better: true
  - metric: inst_level_loose_acc
    aggregation: !function utils.agg_inst_level_acc
    higher_is_better: true
metadata:
  version: 2.0
fewshot_config:
  sampler: first_n
'''

# The IFEval runs pass --include_path ./yaml_configs/leaderboard_ifeval.yaml,
# so the config is written there (not into the current directory). The
# directory is created first because open(..., 'w') will not create it.
os.makedirs('./yaml_configs', exist_ok=True)
with open('./yaml_configs/leaderboard_ifeval.yaml', 'w') as f:
    f.write(YAML_leaderboard_ifeval_string)

In [None]:
# Run leaderboard_ifeval on the BeanCounter continued-pretrained phi-1.5; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/phi-1_5-bc-cp \
    --include_path ./yaml_configs/leaderboard_ifeval.yaml \
    --tasks leaderboard_ifeval \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_ifeval/phi1_5_bc_cp/ \
    --output_path ./resulting_scores/learderboard_ifeval/phi1_5_bc_cp/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.01it/s]
100%|█████████████████████████████████████| 541/541 [00:00<00:00, 193991.49it/s]
Checking cached requests: 100%|████████████| 541/541 [00:00<00:00, 21159.45it/s]
Running generate_until requests:   0%|                  | 0/541 [00:00<?, ?it/s]Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 8
Running generate_until requests: 100%|████████| 541/541 [24:32<00:00,  2.72s/it]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/phi-1_5-bc-cp-highlr-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto
|      Tasks       |Version|Filter|n-shot|        Metric         |   |Value |   |Stderr|
|------------------|------:|------|-----:|-----------------------|---|-----:|---|------|
|leaderboard_ifeval|      2|none  |     0|inst_level_loose_acc   |↑  |0.25

In [15]:
# Run leaderboard_ifeval on the base microsoft/phi-1_5 model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=microsoft/phi-1_5 \
    --include_path ./yaml_configs/leaderboard_ifeval.yaml \
    --tasks leaderboard_ifeval \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_ifeval/phi1_5/ \
    --output_path ./resulting_scores/learderboard_ifeval/phi1_5/ \
    --log_samples

100%|█████████████████████████████████████| 541/541 [00:00<00:00, 213303.11it/s]
Checking cached requests: 100%|████████████| 541/541 [00:00<00:00, 13778.12it/s]
Running generate_until requests:   0%|                  | 0/541 [00:00<?, ?it/s]Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 8
Running generate_until requests: 100%|████████| 541/541 [12:47<00:00,  1.42s/it]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=microsoft/phi-1_5), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto
|      Tasks       |Version|Filter|n-shot|        Metric         |   |Value |   |Stderr|
|------------------|------:|------|-----:|-----------------------|---|-----:|---|------|
|leaderboard_ifeval|      2|none  |     0|inst_level_loose_acc   |↑  |0.2734|±  |   N/A|
|                  |       |none  |     0|inst_level_strict_acc  |↑  |0.2698|±  |   N/A|
|                  |       |none  |     0|prompt_le

In [16]:
# Run leaderboard_ifeval on the base EleutherAI/pythia-1.4b model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-1.4b \
    --include_path ./yaml_configs/leaderboard_ifeval.yaml \
    --tasks leaderboard_ifeval \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_ifeval/pythia-1_4/ \
    --output_path ./resulting_scores/learderboard_ifeval/pythia-1_4/ \
    --log_samples

100%|█████████████████████████████████████| 541/541 [00:00<00:00, 121317.28it/s]
Checking cached requests: 100%|████████████| 541/541 [00:00<00:00, 11394.30it/s]
Running generate_until requests:   0%|                  | 0/541 [00:00<?, ?it/s]Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 8
Running generate_until requests: 100%|████████| 541/541 [17:50<00:00,  1.98s/it]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=EleutherAI/pythia-1.4b), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto
|      Tasks       |Version|Filter|n-shot|        Metric         |   |Value |   |Stderr|
|------------------|------:|------|-----:|-----------------------|---|-----:|---|------|
|leaderboard_ifeval|      2|none  |     0|inst_level_loose_acc   |↑  |0.3070|±  |   N/A|
|                  |       |none  |     0|inst_level_strict_acc  |↑  |0.2986|±  |   N/A|
|                  |       |none  |     0|prom

In [None]:
# Run leaderboard_ifeval on the BeanCounter continued-pretrained pythia-1.4b; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/pythia-1.4b-bc-cp \
    --include_path ./yaml_configs/leaderboard_ifeval.yaml \
    --tasks leaderboard_ifeval \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_ifeval/pythia-1_4b-bc/ \
    --output_path ./resulting_scores/learderboard_ifeval/pythia-1_4b-bc/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.06it/s]
100%|█████████████████████████████████████| 541/541 [00:00<00:00, 230180.41it/s]
Checking cached requests: 100%|████████████| 541/541 [00:00<00:00, 23329.48it/s]
Running generate_until requests:   0%|                  | 0/541 [00:00<?, ?it/s]Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 4
Running generate_until requests: 100%|████████| 541/541 [36:51<00:00,  4.09s/it]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/pythia-1_4b-bc), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto
|      Tasks       |Version|Filter|n-shot|        Metric         |   |Value |   |Stderr|
|------------------|------:|------|-----:|-----------------------|---|-----:|---|------|
|leaderboard_ifeval|      2|none  |     0|inst_level_loose_acc   |↑  |0.3022|±  |  

# MUSR

In [18]:
# Group definition for MuSR: one group name (`leaderboard_musr`) listing its
# three sub-tasks.
YAML_leaderboard_musr_string = '''group: leaderboard_musr
task:
  - leaderboard_musr_murder_mysteries
  - leaderboard_musr_object_placements
  - leaderboard_musr_team_allocation
'''

# open(..., 'w') does not create missing parent directories, so make sure the
# config directory exists before writing (no-op when it is already there).
os.makedirs('./yaml_configs', exist_ok=True)
with open('./yaml_configs/leaderboard_musr.yaml', 'w') as f:
    f.write(YAML_leaderboard_musr_string)

In [None]:
# Run leaderboard_musr on the BeanCounter continued-pretrained phi-1.5; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/phi-1_5-bc-cp \
    --include_path ./yaml_configs/leaderboard_musr.yaml \
    --tasks leaderboard_musr \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_musr/phi1_5_bc_cp/ \
    --output_path ./resulting_scores/learderboard_musr/phi1_5_bc_cp/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.05it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6674.07it/s]
100%|███████████████████████████████████████| 256/256 [00:00<00:00, 6343.22it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6477.93it/s]
Checking cached requests: 100%|██████████| 2198/2198 [00:00<00:00, 11701.54it/s]
Running loglikelihood requests:   0%|                  | 0/2198 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|███████| 2198/2198 [02:49<00:00, 12.94it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/phi-1_5-bc-cp-highlr-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|                Tasks                |Version|Filter|n-shot| Metric |   |Value |   

In [20]:
# Run leaderboard_musr on the base microsoft/phi-1_5 model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=microsoft/phi-1_5 \
    --include_path ./yaml_configs/leaderboard_musr.yaml \
    --tasks leaderboard_musr \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_musr/phi1_5/ \
    --output_path ./resulting_scores/learderboard_musr/phi1_5/ \
    --log_samples

100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6532.61it/s]
100%|███████████████████████████████████████| 256/256 [00:00<00:00, 6422.94it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6542.40it/s]
Checking cached requests: 100%|██████████| 2198/2198 [00:00<00:00, 11562.40it/s]
Running loglikelihood requests:   0%|                  | 0/2198 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|███████| 2198/2198 [01:00<00:00, 36.39it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=microsoft/phi-1_5), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|                Tasks                |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|-------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|leaderboard_musr                     |    N/A|      

In [21]:
# Run leaderboard_musr on the base EleutherAI/pythia-1.4b model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-1.4b \
    --include_path ./yaml_configs/leaderboard_musr.yaml \
    --tasks leaderboard_musr \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_musr/pythia-1_4/ \
    --output_path ./resulting_scores/learderboard_musr/pythia-1_4/ \
    --log_samples

100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6861.24it/s]
100%|███████████████████████████████████████| 256/256 [00:00<00:00, 6654.90it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6711.10it/s]
Checking cached requests: 100%|██████████| 2198/2198 [00:00<00:00, 11549.11it/s]
Running loglikelihood requests:   0%|                  | 0/2198 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|███████| 2198/2198 [01:24<00:00, 26.01it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=EleutherAI/pythia-1.4b), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|                Tasks                |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|-------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|leaderboard_musr                     |    N/A| 

In [None]:
# Run leaderboard_musr on the BeanCounter continued-pretrained pythia-1.4b; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/pythia-1.4b-bc-cp \
    --include_path ./yaml_configs/leaderboard_musr.yaml \
    --tasks leaderboard_musr \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_musr/pythia-1_4b-bc/ \
    --output_path ./resulting_scores/learderboard_musr/pythia-1_4b-bc/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.08it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6924.90it/s]
100%|███████████████████████████████████████| 256/256 [00:00<00:00, 6803.50it/s]
100%|███████████████████████████████████████| 250/250 [00:00<00:00, 6871.54it/s]
Checking cached requests: 100%|██████████| 2198/2198 [00:00<00:00, 18391.81it/s]
Running loglikelihood requests:   0%|                  | 0/2198 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|███████| 2198/2198 [02:49<00:00, 12.95it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/pythia-1_4b-bc), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|                Tasks                |Version|Filter|n-shot| Metric |   |Value |   |Stderr|


In [12]:
# MuSR result files from the recorded runs, one per model. Each path is
# <output dir>/<model directory>/<timestamped results file>.
# NOTE(review): the timestamps and the `__mnt__...` model directories come from
# the original runs and presumably will not match a fresh run — update as needed.
phi1_5_res = (
    './resulting_scores/learderboard_musr/phi1_5/'
    'microsoft__phi-1_5/'
    'results_2024-08-15T15-04-50.744667.json'
)
phi1_5_cp_bc_res = (
    './resulting_scores/learderboard_musr/phi1_5_bc_cp/'
    '__mnt__spinning__beancounter_replication__continually_pretrained_models__phi-1_5-bc-cp-highlr-hf/'
    'results_2024-08-15T15-01-16.452693.json'
)
pythia_res = (
    './resulting_scores/learderboard_musr/pythia-1_4/'
    'EleutherAI__pythia-1.4b/'
    'results_2024-08-15T15-11-25.678050.json'
)
pythia_bc_res = (
    './resulting_scores/learderboard_musr/pythia-1_4b-bc/'
    '__mnt__spinning__beancounter_replication__continually_pretrained_models__pythia-1_4b-bc/'
    'results_2024-08-15T15-30-02.650601.json'
)

In [6]:
# Mean acc_norm over the MuSR results for base phi-1.5.
get_avg_accuracy(phi1_5_res)

0.34041666666666665

In [9]:
# Mean acc_norm over the MuSR results for the continued-pretrained phi-1.5.
get_avg_accuracy(phi1_5_cp_bc_res)

0.34711458333333334

In [11]:
# Mean acc_norm over the MuSR results for base pythia-1.4b.
get_avg_accuracy(pythia_res)

0.34978125

In [13]:
# Mean acc_norm over the MuSR results for the continued-pretrained pythia-1.4b.
get_avg_accuracy(pythia_bc_res)

0.3644791666666667

# GPQA

In [36]:
# Group definition for GPQA: one group name (`leaderboard_gpqa`) listing its
# three sub-tasks.
YAML_leaderboard_gpqa_string = '''
group: leaderboard_gpqa
task:
  - leaderboard_gpqa_diamond
  - leaderboard_gpqa_extended
  - leaderboard_gpqa_main
'''

# The GPQA runs pass --include_path ./yaml_configs/leaderboard_gpqa.yaml, so
# the config is written there (not into the current directory). The directory
# is created first because open(..., 'w') will not create it.
os.makedirs('./yaml_configs', exist_ok=True)
with open('./yaml_configs/leaderboard_gpqa.yaml', 'w') as f:
    f.write(YAML_leaderboard_gpqa_string)

In [None]:
# Run leaderboard_gpqa on the BeanCounter continued-pretrained phi-1.5; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/phi-1_5-bc-cp \
    --include_path ./yaml_configs/leaderboard_gpqa.yaml \
    --tasks leaderboard_gpqa \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_gpqa/phi1_5_bc_cp/ \
    --output_path ./resulting_scores/learderboard_gpqa/phi1_5_bc_cp/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.07it/s]
100%|███████████████████████████████████████| 198/198 [00:00<00:00, 3144.41it/s]
100%|███████████████████████████████████████| 546/546 [00:00<00:00, 3376.76it/s]
100%|███████████████████████████████████████| 448/448 [00:00<00:00, 3529.50it/s]
Checking cached requests: 100%|██████████| 4768/4768 [00:00<00:00, 23614.40it/s]
Running loglikelihood requests:   0%|                  | 0/4768 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|███████| 4768/4768 [01:38<00:00, 48.21it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/phi-1_5-bc-cp-highlr-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|           Tasks            |Version|Filter|n-shot| Metric |   |Value |   |Stderr|


In [38]:
# Run leaderboard_gpqa on the base microsoft/phi-1_5 model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=microsoft/phi-1_5 \
    --include_path ./yaml_configs/leaderboard_gpqa.yaml \
    --tasks leaderboard_gpqa \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_gpqa/phi1_5/ \
    --output_path ./resulting_scores/learderboard_gpqa/phi1_5/ \
    --log_samples

100%|███████████████████████████████████████| 198/198 [00:00<00:00, 3115.88it/s]
100%|███████████████████████████████████████| 546/546 [00:00<00:00, 3278.15it/s]
100%|███████████████████████████████████████| 448/448 [00:00<00:00, 3454.39it/s]
Checking cached requests: 100%|██████████| 4768/4768 [00:00<00:00, 13723.19it/s]
Running loglikelihood requests:   0%|                  | 0/4768 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|██████| 4768/4768 [00:37<00:00, 128.75it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=microsoft/phi-1_5), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|           Tasks            |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|----------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|leaderboard_gpqa            |    N/A|      |      |        |   |      

In [39]:
# Run leaderboard_gpqa on the base EleutherAI/pythia-1.4b model; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-1.4b \
    --include_path ./yaml_configs/leaderboard_gpqa.yaml \
    --tasks leaderboard_gpqa \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_gpqa/pythia-1_4b/ \
    --output_path ./resulting_scores/learderboard_gpqa/pythia-1_4b/ \
    --log_samples

100%|███████████████████████████████████████| 198/198 [00:00<00:00, 3121.72it/s]
100%|███████████████████████████████████████| 546/546 [00:00<00:00, 3371.51it/s]
100%|███████████████████████████████████████| 448/448 [00:00<00:00, 3477.29it/s]
Checking cached requests: 100%|██████████| 4768/4768 [00:00<00:00, 13532.79it/s]
Running loglikelihood requests:   0%|                  | 0/4768 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 8
Running loglikelihood requests: 100%|██████| 4768/4768 [00:34<00:00, 138.97it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=EleutherAI/pythia-1.4b), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8)
|           Tasks            |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|----------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|leaderboard_gpqa            |    N/A|      |      |        |   | 

In [None]:
# Run leaderboard_gpqa on the BeanCounter continued-pretrained pythia-1.4b; results are written under --output_path.
!lm_eval \
    --model hf \
    --model_args pretrained=bradfordlevy/pythia-1.4b-bc-cp \
    --include_path ./yaml_configs/leaderboard_gpqa.yaml \
    --tasks leaderboard_gpqa \
    --batch_size auto \
    --use_cache ./resulting_scores/learderboard_gpqa/pythia-1_4b_bc/ \
    --output_path ./resulting_scores/learderboard_gpqa/pythia-1_4b_bc/ \
    --log_samples

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:00<00:00,  2.05it/s]
100%|███████████████████████████████████████| 198/198 [00:00<00:00, 3132.97it/s]
100%|███████████████████████████████████████| 546/546 [00:00<00:00, 3418.92it/s]
100%|███████████████████████████████████████| 448/448 [00:00<00:00, 3511.22it/s]
Checking cached requests: 100%|██████████| 4768/4768 [00:00<00:00, 23441.37it/s]
Running loglikelihood requests:   0%|                  | 0/4768 [00:00<?, ?it/s]Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 4
Running loglikelihood requests: 100%|███████| 4768/4768 [01:26<00:00, 55.16it/s]
fatal: not a git repository (or any of the parent directories): .git
hf (pretrained=/mnt/spinning/beancounter_replication/continually_pretrained_models/pythia-1_4b-bc), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4)
|           Tasks            |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|--------

In [15]:
# GPQA result files from the recorded runs, one per model. Each path is
# <output dir>/<model directory>/<timestamped results file>.
# NOTE(review): the timestamps and the `__mnt__...` model directories come from
# the original runs and presumably will not match a fresh run — update as needed.
phi1_5_res = (
    './resulting_scores/learderboard_gpqa/phi1_5/'
    'microsoft__phi-1_5/'
    'results_2024-08-15T16-04-07.002330.json'
)
phi1_5_cp_bc_res = (
    './resulting_scores/learderboard_gpqa/phi1_5_bc_cp/'
    '__mnt__spinning__beancounter_replication__continually_pretrained_models__phi-1_5-bc-cp-highlr-hf/'
    'results_2024-08-15T16-02-56.861924.json'
)
pythia_res = (
    './resulting_scores/learderboard_gpqa/pythia-1_4b/'
    'EleutherAI__pythia-1.4b/'
    'results_2024-08-15T16-05-24.665140.json'
)
pythia_bc_res = (
    './resulting_scores/learderboard_gpqa/pythia-1_4b_bc/'
    '__mnt__spinning__beancounter_replication__continually_pretrained_models__pythia-1_4b-bc/'
    'results_2024-08-15T17-15-19.349928.json'
)

In [16]:
# Mean acc_norm over the GPQA results for base phi-1.5.
get_avg_accuracy(phi1_5_res)

0.27324758574758573

In [17]:
# Mean acc_norm over the GPQA results for the continued-pretrained phi-1.5.
get_avg_accuracy(phi1_5_cp_bc_res)

0.2479279979279979

In [18]:
# Mean acc_norm over the GPQA results for base pythia-1.4b.
get_avg_accuracy(pythia_res)

0.24952073389573393

In [19]:
# Mean acc_norm over the GPQA results for the continued-pretrained pythia-1.4b.
get_avg_accuracy(pythia_bc_res)

0.267439158064158