In [None]:
# Install the pinned `datasets` release (the linked issue is the reason for the pin), then
# clone the bigscience/evaluation-results dataset repository into the working directory.
!pip install datasets==2.5.0 # https://github.com/huggingface/datasets/issues/5111
!git clone https://huggingface.co/datasets/bigscience/evaluation-results


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==2.5.0
  Downloading datasets-2.5.0-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 5.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 57.9 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 69.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 60.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 4.5 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.

In [None]:
from datasets import disable_progress_bar
from datasets.utils.logging import set_verbosity_error

# Keep the notebook output compact: restrict `datasets` logging to errors and
# switch off its progress bars before any dataset is loaded.
set_verbosity_error()
disable_progress_bar()

In [None]:
from datasets import load_dataset


def _load_results(config_name):
    """Load one model's results, i.e. one configuration of "evaluation-results".

    NOTE(review): the dataset path is relative, so this presumably resolves to the
    repository cloned into the working directory by the first cell - confirm.
    """
    return load_dataset("evaluation-results", config_name)


# BLOOM models
bloom = _load_results("bloom")
bloom_7b1 = _load_results("bloom-7b1")
bloom_3b = _load_results("bloom-3b")
bloom_1b7 = _load_results("bloom-1b7")
bloom_1b1 = _load_results("bloom-1b1")
bloom_560m = _load_results("bloom-560m")

# BLOOMZ models
bloomz = _load_results("bloomz")
bloomz_7b1 = _load_results("bloomz-7b1")
bloomz_3b = _load_results("bloomz-3b")
bloomz_1b7 = _load_results("bloomz-1b7")
bloomz_1b1 = _load_results("bloomz-1b1")
bloomz_560m = _load_results("bloomz-560m")

# BLOOMZ "-mt" variants
bloomz_mt = _load_results("bloomz-mt")
bloomz_7b1_mt = _load_results("bloomz-7b1-mt")

# BLOOMZ "-p3" variants
bloomz_7b1_p3 = _load_results("bloomz-7b1-p3")
bloomz_p3 = _load_results("bloomz-p3")

# mT0 (and its "-mt" variant) plus the mT5 counterpart
mt0_xxl = _load_results("mt0-xxl")
mt0_xxl_mt = _load_results("mt0-xxl-mt")

mt5_xxl = _load_results("mt5-xxl")

Downloading and preparing dataset evaluation-results/bloom-1b1 to /root/.cache/huggingface/datasets/evaluation-results/bloom-1b1/1.0.0/94bec96f1bf52030cd1d63cbd84b8620d21e2c9d9127c1bc7575ddc71b31932e...
Dataset evaluation-results downloaded and prepared to /root/.cache/huggingface/datasets/evaluation-results/bloom-1b1/1.0.0/94bec96f1bf52030cd1d63cbd84b8620d21e2c9d9127c1bc7575ddc71b31932e. Subsequent calls will reuse this data.
Downloading and preparing dataset evaluation-results/bloom-560m to /root/.cache/huggingface/datasets/evaluation-results/bloom-560m/1.0.0/94bec96f1bf52030cd1d63cbd84b8620d21e2c9d9127c1bc7575ddc71b31932e...
Dataset evaluation-results downloaded and prepared to /root/.cache/huggingface/datasets/evaluation-results/bloom-560m/1.0.0/94bec96f1bf52030cd1d63cbd84b8620d21e2c9d9127c1bc7575ddc71b31932e. Subsequent calls will reuse this data.
Downloading and preparing dataset evaluation-results/bloomz to /root/.cache/huggingface/datasets/evaluation-results/bloomz/1.0.0/94bec9

In [None]:
import numpy as np
# Column label -> loaded results. The insertion order is the column order of
# every printed row.
EVAL_MODELS = {
    "BLOOMZ": bloomz,
    "BLOOMZ-MT": bloomz_mt,
    "mT0": mt0_xxl,
    "mT0-MT": mt0_xxl_mt,
}

# Task-name prefix used to select result rows -> row label printed in the table.
EVAL_TASKS = {
    "xnli": "XNLI",
    "xcopa": "XCOPA",
    "Muennighoff/xstory_cloze": "XStoryCloze",
    "Muennighoff/xwinograd": "XWinograd",
}

# Only results whose task name ends in one of these two-letter codes are averaged.
EVAL_LANGS = [
    'en', 'es', 'pt', 'fr', 'ar', 'id',
    'zh', 'hi', 'vi', 'ur', 'ta', 'eu',
]

# Model families compared against each other when picking the score to embolden.
_MODEL_FAMILIES = (("BLOOMZ", "BLOOMZ-MT"), ("mT0", "mT0-MT"))


def _mean_scores_by_prompt(rows, task_prefix, langs):
    """Average the scores of one task per prompt language in a single pass.

    Args:
        rows: iterable of result records exposing "task_name", "prompt_name"
            and "score".
        task_prefix: only records whose task name starts with this prefix count.
        langs: only records whose task name ends in one of these two-letter
            codes count.

    Returns:
        Dict with the keys "EN", "MT" and "HT". Each value is the mean score of
        the matching records, or None when no record matched (so a missing
        result is never rendered as a number).
    """
    buckets = {"EN": [], "MT": [], "HT": []}
    for row in rows:
        task = row["task_name"]
        if not task.startswith(task_prefix) or task[-2:] not in langs:
            continue
        prompt = row["prompt_name"]
        # Prompt names ending in "ht" / "mt" feed the HT / MT rows; every other
        # prompt counts as EN.
        if prompt.endswith("ht"):
            kind = "HT"
        elif prompt.endswith("mt"):
            kind = "MT"
        else:
            kind = "EN"
        buckets[kind].append(row["score"])
    return {kind: (np.mean(vals) if vals else None) for kind, vals in buckets.items()}


def print_line(scores):
    """Print the score cells of one LaTeX table row, terminated by a row break.

    One cell is printed per model in EVAL_MODELS (in that order), as a
    percentage rounded to two decimals. The best score of each family in
    _MODEL_FAMILIES is wrapped in \\textbf{}; a score is only compared with
    the best of its own family. Missing scores (None) are printed as "-".

    Args:
        scores: dict mapping model label -> mean score in [0, 1] or None.
    """
    best_in_family = {}
    for family in _MODEL_FAMILIES:
        available = [scores[member] for member in family if scores.get(member) is not None]
        family_best = max(available) if available else None
        for member in family:
            best_in_family[member] = family_best

    cells = []
    for name in EVAL_MODELS:
        score = scores[name]
        if score is None:
            cells.append(" & -")
            continue
        text = str(round(score * 100, 2))
        if score == best_in_family.get(name):
            text = "\\textbf{" + text + "}"
        cells.append(" & " + text)
    print("".join(cells) + " \\\\")


# NOTE(review): rows are not restricted by "evaluation_framework" here, unlike
# the detailed table built later in this notebook - confirm that is intended.
for task_name in EVAL_TASKS:
    score_en = {}
    score_mt = {}
    score_ht = {}
    for name, model_type in EVAL_MODELS.items():
        means = _mean_scores_by_prompt(model_type['test'], task_name, EVAL_LANGS)
        score_en[name] = means["EN"]
        score_mt[name] = means["MT"]
        score_ht[name] = means["HT"]

    print("\\midrule")
    print(EVAL_TASKS[task_name], end='')
    print(" & EN", end='')
    print_line(score_en)

    print(" & MT", end='')
    print_line(score_mt)

    # The HT row is only printed for XNLI.
    if task_name == "xnli":
        print(" & HT", end='')
        print_line(score_ht)


\midrule
XNLI & EN & \textbf{53.58} & 49.74 & 48.43 & \textbf{51.52} \\
 & MT & 37.87 & \textbf{42.03} & 39.83 & \textbf{42.64} \\
 & HT & 41.13 & \textbf{44.55} & 45.19 & \textbf{47.03} \\
\midrule
XCOPA & EN & 75.5 & \textbf{75.75} & \textbf{84.45} & 81.6 \\
 & MT & 71.95 & \textbf{74.25} & \textbf{82.9} & 80.2 \\
\midrule
XStoryCloze & EN & \textbf{84.42} & 84.07 & 82.52 & \textbf{82.58} \\
 & MT & 84.37 & \textbf{85.31} & \textbf{84.01} & 83.31 \\
\midrule
XWinograd & EN & \textbf{60.07} & 59.15 & 70.49 & \textbf{73.24} \\
 & MT & 58.48 & \textbf{60.14} & 66.89 & \textbf{72.33} \\


In [None]:
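# Detailed results table, laid out after Appendix H of the GPT-3 paper.
# Columns: Task, Dataset, Config, Split, Prompt, Metric, then one column per model.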

import numpy as np


# Languages that were not part of pretraining, listed per evaluation dataset.
l2_xwino = ["jp", "ru"]
l2_xstory = ["ru", "my"]
l2_xcopa = ["et", "ht", "it", "qu", "tr"]
l2_xnli = ["bg", "de", "el", "ru", "th", "tr"]
# Union of the per-dataset lists above.
L2 = {*l2_xwino, *l2_xstory, *l2_xcopa, *l2_xnli}

# Languages ordered by PCTG in xP3.
LANGS = [
    'en', 'es', 'pt', 'fr', 'ar', 'id', 'zh', 'hi', 'code', 'vi',
    'ur', 'te', 'ta', 'bn', 'mr', 'sw', 'gu', 'pa', 'ne', 'yo',
    'ig', 'ny', 'zu', 'xh', 'sn', 'ts', 'rw', 'lg', 'tn', 'nso',
    'rn', 'ml', 'kn', 'or', 'as', 'ln', 'wo', 'tum', 'ki', 'st',
    'fon', 'ca', 'eu', 'ak', 'bm', 'tw',
]

# Evaluation languages, followed by a final "Avg" entry for the average score.
EVAL_LANGS = ['en', 'es', 'pt', 'fr', 'ar', 'id', 'zh', 'hi', 'vi', 'ur', 'ta', 'eu']
EVAL_LANGS.append("Avg")

# Task-name prefix of each evaluated dataset -> display name. The insertion
# order is the order in which the datasets are processed.
DS_TO_NAME = {
    "anli": "ANLI",
    "xnli": "XNLI",
    "super_glue": "SuperGLUE",
    "xcopa": "XCOPA",
    "story_cloze": "StoryCloze",
    "Muennighoff/xstory_cloze": "XStoryCloze",
    "winogrande": "Winogrande XL",
    "Muennighoff/xwinograd": "XWinograd",
}

# Split reported in the table for each dataset: "validation" for all of these
# except XWinograd, which reports "test".
DS_TO_SPLIT = dict.fromkeys(
    [
        "super_glue",
        "xnli",
        "story_cloze",
        "Muennighoff/xstory_cloze",
        "xcopa",
        "winogrande",
    ],
    "validation",
)
DS_TO_SPLIT["Muennighoff/xwinograd"] = "test"

# Column label -> loaded results, one mapping per model group. The insertion
# order of each mapping is the column order of that group in the table.

# Not rendered at the moment: the loop that would add this column to the table
# rows is commented out.
MT5_MODEL_TO_RES = {
    "mT5 XXL": mt5_xxl,
}

MT0_MODEL_TO_RES = {
    "mT0-13B": mt0_xxl,
}

# Columns under the "Pretrained" header group.
BLOOM_MODEL_TO_RES = {
    "BLOOM-560M": bloom_560m,
    "BLOOM-1.1B": bloom_1b1,
    "BLOOM-1.7B": bloom_1b7,
    "BLOOM-3B": bloom_3b,
    "BLOOM-7.1B": bloom_7b1,
    "BLOOM": bloom,
}

# Together with mT0, the columns under "Pretrained + Multitask finetuned".
BLOOMZ_MODEL_TO_RES = {
    "BLOOMZ-560M": bloomz_560m,
    "BLOOMZ-1.1B": bloomz_1b1,
    "BLOOMZ-1.7B": bloomz_1b7,
    "BLOOMZ-3B": bloomz_3b,
    "BLOOMZ-7.1B": bloomz_7b1,
    "BLOOMZ-7.1B-MT": bloomz_7b1_mt,
    "BLOOMZ-7.1B-P3": bloomz_7b1_p3,
    "BLOOMZ": bloomz,
    "BLOOMZ-MT": bloomz_mt,
    "BLOOMZ-P3": bloomz_p3,
}

def get_task(task_name):
    """Return the task family used in the first table column for a task name.

    Names containing "wino" map to "Coref. Res.", names containing "story" or
    "copa" map to "Completion", and every other name maps to "NLI". The "wino"
    check takes precedence.
    """
    if "wino" in task_name:
        return "Coref. Res."
    if any(marker in task_name for marker in ("story", "copa")):
        return "Completion"
    return "NLI"

def _multicolumn(width, title):
    """Return a centered LaTeX \\multicolumn cell spanning `width` columns."""
    return "\\multicolumn{%d}{c}{%s}" % (width, title)


# Leading descriptive columns, followed by one column per model.
_META_COLUMNS = ["Task", "Dataset", "Config", "Split", "Prompt", "Metric"]
_PRETRAINED_COLUMNS = list(BLOOM_MODEL_TO_RES)
_FINETUNED_COLUMNS = list(BLOOMZ_MODEL_TO_RES) + list(MT0_MODEL_TO_RES)

# First header row: group titles. The spans are derived from the model mappings
# so they stay correct when a model is added or removed. Backslashes are escaped
# explicitly because "\m" is not a valid escape sequence in a normal string.
HEADER = " & ".join([
    _multicolumn(len(_META_COLUMNS), ""),
    _multicolumn(len(_PRETRAINED_COLUMNS), "Pretrained"),
    _multicolumn(len(_FINETUNED_COLUMNS), "Pretrained + Multitask finetuned"),
]) + " \\\\"
# Second header row: the individual column names.
HEADER += "\n" + " & ".join(_META_COLUMNS + _PRETRAINED_COLUMNS + _FINETUNED_COLUMNS)
HEADER += " \\\\"

# The table body is appended to this string row by row.
TABLE = HEADER

# Number of prompts expected per task and prompt language; other counts are
# reported but still aggregated.
EXPECTED_PROMPTS = 5


def _prompt_language(prompt_name):
    """Classify a prompt as "HT", "MT" or "EN" from the suffix of its name."""
    if prompt_name.endswith("ht"):
        return "HT"
    if prompt_name.endswith("mt"):
        return "MT"
    return "EN"


def _scores_by_task_and_prompt(split_data, framework="bigscience/bloomz"):
    """Group one model's scores by task name and prompt language in one pass.

    Args:
        split_data: results split exposing the columns "evaluation_framework",
            "task_name", "prompt_name" and "score".
        framework: only rows produced by this evaluation framework are used.

    Returns:
        Nested dict {task_name: {"EN" | "HT" | "MT": [score, ...]}} with scores
        kept in row order.
    """
    grouped = {}
    columns = zip(
        split_data["evaluation_framework"],
        split_data["task_name"],
        split_data["prompt_name"],
        split_data["score"],
    )
    for row_framework, task_name, prompt_name, score in columns:
        # Rows of other frameworks are skipped before any other field is used.
        if row_framework != framework:
            continue
        per_task = grouped.setdefault(task_name, {})
        per_task.setdefault(_prompt_language(prompt_name), []).append(score)
    return grouped


# Scan every model's test split once up front instead of re-filtering the
# dataset for each dataset / task / prompt-language combination.
_GROUPED_SCORES = {
    name: _scores_by_task_and_prompt(res_data["test"])
    for name, res_data in {**BLOOM_MODEL_TO_RES, **BLOOMZ_MODEL_TO_RES, **MT0_MODEL_TO_RES}.items()
}

# RES_DICT[task][ds][config][split][prompt][metric][model] -> formatted percentage
RES_DICT = {}
for ds, ds_name in DS_TO_NAME.items():
    print(f"Running dataset {ds}")
    split = DS_TO_SPLIT.get(ds, "validation")
    for name, grouped in _GROUPED_SCORES.items():
        # Iterate through subdatasets; sorted so that the warnings and the
        # resulting table are reproducible between runs.
        for task_name in sorted(t for t in grouped if t.startswith(ds)):
            config = task_name.split("_")[-1]
            task = get_task(task_name)

            for prompt in ("EN", "HT", "MT"):
                scores = grouped[task_name].get(prompt, [])
                if not scores:
                    continue
                if len(scores) != EXPECTED_PROMPTS:
                    print(f"Unexpected len {len(scores)} for {ds} for {task_name} for {prompt} for {name}.")

                metrics = (
                    RES_DICT.setdefault(task, {})
                    .setdefault(ds, {})
                    .setdefault(config, {})
                    .setdefault(split, {})
                    .setdefault(prompt, {})
                )
                metrics.setdefault("Median Acc", {})[name] = format(np.median(scores) * 100, '.2f')
                metrics.setdefault("Max Acc", {})[name] = format(np.max(scores) * 100, '.2f')


# Code-generation rows: all of them share one fixed position in RES_DICT and are
# taken from the "bloom-code-evaluation" framework.
task, ds, config, split, prompt = "Program Synthesis", "openai_humaneval", "None", "test", "EN"
_code_eval_models = {**BLOOM_MODEL_TO_RES, **BLOOMZ_MODEL_TO_RES, **MT0_MODEL_TO_RES}
for name in _code_eval_models:
    res_data = _code_eval_models[name]
    ds_data = res_data["test"].filter(
        lambda x: (x["evaluation_framework"]  == "bloom-code-evaluation"),
        load_from_cache_file=False,
    )
    for k in (1, 10, 100):
        # The trailing "-" keeps "pass@1-" from also matching "pass@10-" / "pass@100-".
        k_data = ds_data.filter(lambda x: x["metric"].startswith(f"pass@{k}-"))
        if len(k_data) > 0:
            per_model = (
                RES_DICT.setdefault(task, {})
                .setdefault(ds, {})
                .setdefault(config, {})
                .setdefault(split, {})
                .setdefault(prompt, {})
                .setdefault(f"Pass@{k}", {})
            )
            # Report the highest score among the matching rows, as a percentage.
            per_model[name] = format(np.max(k_data["score"]) * 100, '.2f')


# Model columns in header order: BLOOM, then BLOOMZ, then mT0. The mT5 results
# are not rendered.
_model_columns = [*BLOOM_MODEL_TO_RES, *BLOOMZ_MODEL_TO_RES, *MT0_MODEL_TO_RES]

# One table row per (task, dataset, config, split, prompt, metric). Tasks keep
# their insertion order; every deeper level is sorted by key.
for task, by_dataset in RES_DICT.items():
    for ds, by_config in sorted(by_dataset.items()):
        for config, by_split in sorted(by_config.items()):
            for split, by_prompt in sorted(by_split.items()):
                for prompt, by_metric in sorted(by_prompt.items()):
                    for metric, score_by_model in sorted(by_metric.items()):
                        cells = [task, ds, config, split, prompt, metric]
                        # Models without a result for this row get a "-".
                        cells.extend(score_by_model.get(model, "-") for model in _model_columns)
                        TABLE += "\n" + " & ".join(cells) + "\\\\"

# Escape underscores for LaTeX. The replacement is a backslash followed by an
# underscore; the backslash is escaped because "\_" is not a valid escape
# sequence in a normal string literal.
TABLE = TABLE.replace("_", "\\_")
# Drop the "Muennighoff/" namespace prefix from dataset names.
TABLE = TABLE.replace("Muennighoff/", "")


In [None]:
# Emit the finished LaTeX table source as plain text.
print(TABLE)