# Preparing datasets

In [1]:
from datasets import load_dataset
import pandas as pd
import json
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## MMLU

In [2]:
def load_mmlu_subset(subset):
    """Load one MMLU subject and reshape it into the notebook's common schema.

    Args:
        subset: Configuration name of the ``lukaemon/mmlu`` dataset to load.

    Returns:
        A DataFrame stacking the ``train``, ``test`` and ``validation`` splits
        with the columns ``question``, ``variables``, ``target``, ``dataset``,
        ``formatter``, ``subset`` and ``split``. ``variables`` is a JSON object
        string holding the answer options under the keys ``A``-``D``.
    """
    print(f"LOADING MMLU SUBSET {subset}")
    raw = load_dataset("lukaemon/mmlu", subset)

    # One frame per split, tagged with its split name and the prefixed subset.
    per_split = [
        pd.DataFrame(raw[split_name]).assign(split=split_name, subset=f"mmlu-{subset}")
        for split_name in ("train", "test", "validation")
    ]
    merged = pd.concat(per_split, ignore_index=True)

    merged = merged.assign(dataset="mmlu", formatter="mmlu", question=merged["input"])

    # Pack the four answer options of every row into one JSON string.
    option_columns = ["A", "B", "C", "D"]
    merged["variables"] = [
        json.dumps(options)
        for options in merged[option_columns].to_dict(orient="records")
    ]

    output_columns = ["question", "variables", "target", "dataset", "formatter", "subset", "split"]
    return merged[output_columns]

In [3]:
# MMLU subjects to include, in the order they are loaded and stacked.
mmlu_subset_names = (
    "high_school_european_history", "business_ethics", "clinical_knowledge",
    "medical_genetics", "high_school_us_history", "high_school_physics",
    "high_school_world_history", "virology", "high_school_microeconomics",
    "econometrics", "college_computer_science", "high_school_biology",
    "abstract_algebra", "professional_accounting", "philosophy",
    "professional_medicine", "nutrition", "global_facts",
    "machine_learning", "security_studies", "public_relations",
    "professional_psychology", "prehistory", "anatomy",
    "human_sexuality", "college_medicine", "high_school_government_and_politics",
    "college_chemistry", "logical_fallacies", "high_school_geography",
    "elementary_mathematics", "human_aging", "college_mathematics",
    "high_school_psychology", "formal_logic", "high_school_statistics",
    "international_law", "high_school_mathematics", "high_school_computer_science",
    "conceptual_physics", "miscellaneous", "high_school_chemistry",
    "marketing", "professional_law", "management",
    "college_physics", "jurisprudence", "world_religions",
    "sociology", "us_foreign_policy", "high_school_macroeconomics",
    "computer_security", "moral_scenarios", "moral_disputes",
    "electrical_engineering", "astronomy", "college_biology",
)

# Load every subject and stack the results under one fresh 0..n-1 index.
df_mmlu = pd.concat(
    [load_mmlu_subset(subset_name) for subset_name in mmlu_subset_names],
    ignore_index=True,
)
df_mmlu.head()

LOADING MMLU SUBSET high_school_european_history
LOADING MMLU SUBSET business_ethics
LOADING MMLU SUBSET clinical_knowledge
LOADING MMLU SUBSET medical_genetics
LOADING MMLU SUBSET high_school_us_history
LOADING MMLU SUBSET high_school_physics
LOADING MMLU SUBSET high_school_world_history
LOADING MMLU SUBSET virology
LOADING MMLU SUBSET high_school_microeconomics
LOADING MMLU SUBSET econometrics
LOADING MMLU SUBSET college_computer_science
LOADING MMLU SUBSET high_school_biology
LOADING MMLU SUBSET abstract_algebra
LOADING MMLU SUBSET professional_accounting
LOADING MMLU SUBSET philosophy
LOADING MMLU SUBSET professional_medicine
LOADING MMLU SUBSET nutrition
LOADING MMLU SUBSET global_facts
LOADING MMLU SUBSET machine_learning
LOADING MMLU SUBSET security_studies
LOADING MMLU SUBSET public_relations
LOADING MMLU SUBSET professional_psychology
LOADING MMLU SUBSET prehistory
LOADING MMLU SUBSET anatomy
LOADING MMLU SUBSET human_sexuality
LOADING MMLU SUBSET college_medicine
LOADING MMLU

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
0,This question refers to the following informat...,"{""A"": ""The ideas of personal liberty and natio...",A,mmlu,mmlu,mmlu-high_school_european_history,train
1,This question refers to the following informat...,"{""A"": ""Capitalist"", ""B"": ""Scientific"", ""C"": ""C...",C,mmlu,mmlu,mmlu-high_school_european_history,train
2,This question refers to the following informat...,"{""A"": ""They served as a catalyst for the growt...",A,mmlu,mmlu,mmlu-high_school_european_history,train
3,This question refers to the following informat...,"{""A"": ""give the English king a new position of...",D,mmlu,mmlu,mmlu-high_school_european_history,train
4,This question refers to the following informat...,"{""A"": ""In ancient Rome, religious worship was ...",A,mmlu,mmlu,mmlu-high_school_european_history,test


## GSM8K

In [4]:
# Load the "main" configuration of the gsm8k dataset.
dataset_gsm8k = load_dataset("gsm8k", "main")
# Bare expression: display the available splits and their row counts.
dataset_gsm8k

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [5]:
# Stack the two GSM8K splits, remembering which split each row came from.
# The per-split row labels are kept as they are (no re-indexing here).
gsm8k_raw = pd.concat(
    [
        pd.DataFrame(dataset_gsm8k[split_name]).assign(split=split_name)
        for split_name in ("train", "test")
    ]
)

# Each answer is split on "####": the text before it is the reasoning,
# the text after it is the final answer.
gsm8k_answer_parts = [answer.split("####") for answer in gsm8k_raw["answer"]]

df_gsm8k = gsm8k_raw.assign(
    dataset="gsm8k",
    formatter="gsm8k",
    subset="gsm8k",
    variables=[
        json.dumps({"chain_of_thoughts": parts[0].strip()})
        for parts in gsm8k_answer_parts
    ],
    target=[parts[1].strip() for parts in gsm8k_answer_parts],
)[["question", "variables", "target", "dataset", "formatter", "subset", "split"]]
df_gsm8k.head()

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
0,Natalia sold clips to 48 of her friends in Apr...,"{""chain_of_thoughts"": ""Natalia sold 48/2 = <<4...",72,gsm8k,gsm8k,gsm8k,train
1,Weng earns $12 an hour for babysitting. Yester...,"{""chain_of_thoughts"": ""Weng earns 12/60 = $<<1...",10,gsm8k,gsm8k,gsm8k,train
2,Betty is saving money for a new wallet which c...,"{""chain_of_thoughts"": ""In the beginning, Betty...",5,gsm8k,gsm8k,gsm8k,train
3,"Julie is reading a 120-page book. Yesterday, s...","{""chain_of_thoughts"": ""Maila read 12 x 2 = <<1...",42,gsm8k,gsm8k,gsm8k,train
4,James writes a 3-page letter to 2 different fr...,"{""chain_of_thoughts"": ""He writes each friend 3...",624,gsm8k,gsm8k,gsm8k,train


## BBH

In [6]:
def load_bbh_subset(subset):
    """Load one BBH task and reshape it into the notebook's common schema.

    Args:
        subset: Configuration name of the ``lukaemon/bbh`` dataset to load.

    Returns:
        A DataFrame built from the task's ``test`` split with the columns
        ``question``, ``variables``, ``target``, ``dataset``, ``formatter``,
        ``subset`` and ``split``. ``variables`` is always the empty JSON object
        ``"{}"``; ``formatter`` and ``subset`` are both ``bbh-<subset>``.
    """
    print(f"LOAD BBH {subset} SUBSET")
    frame = pd.DataFrame(load_dataset("lukaemon/bbh", subset)["test"])

    # The same tag names both the subset and its prompt formatter.
    tag = f"bbh-{subset}"
    frame = frame.assign(
        question=frame["input"],
        variables="{}",
        dataset="bbh",
        formatter=tag,
        subset=tag,
        split="test",
    )

    output_columns = ["question", "variables", "target", "dataset", "formatter", "subset", "split"]
    return frame[output_columns]

In [7]:
# BBH tasks to include, in the order they are loaded and stacked.
bbh_subset_names = (
    "boolean_expressions", "causal_judgement", "date_understanding",
    "disambiguation_qa", "dyck_languages", "formal_fallacies",
    "geometric_shapes", "hyperbaton", "logical_deduction_five_objects",
    "logical_deduction_seven_objects", "logical_deduction_three_objects", "movie_recommendation",
    "multistep_arithmetic_two", "navigate", "object_counting",
    "penguins_in_a_table", "reasoning_about_colored_objects", "ruin_names",
    "salient_translation_error_detection", "snarks", "sports_understanding",
    "temporal_sequences", "tracking_shuffled_objects_five_objects", "tracking_shuffled_objects_seven_objects",
    "tracking_shuffled_objects_three_objects", "web_of_lies", "word_sorting",
)

# Load every task and stack the results under one fresh 0..n-1 index.
df_bbh = pd.concat(
    [load_bbh_subset(subset_name) for subset_name in bbh_subset_names],
    ignore_index=True,
)
df_bbh.head()

LOAD BBH boolean_expressions SUBSET
LOAD BBH causal_judgement SUBSET
LOAD BBH date_understanding SUBSET
LOAD BBH disambiguation_qa SUBSET
LOAD BBH dyck_languages SUBSET
LOAD BBH formal_fallacies SUBSET
LOAD BBH geometric_shapes SUBSET
LOAD BBH hyperbaton SUBSET
LOAD BBH logical_deduction_five_objects SUBSET
LOAD BBH logical_deduction_seven_objects SUBSET
LOAD BBH logical_deduction_three_objects SUBSET
LOAD BBH movie_recommendation SUBSET
LOAD BBH multistep_arithmetic_two SUBSET
LOAD BBH navigate SUBSET
LOAD BBH object_counting SUBSET
LOAD BBH penguins_in_a_table SUBSET
LOAD BBH reasoning_about_colored_objects SUBSET
LOAD BBH ruin_names SUBSET
LOAD BBH salient_translation_error_detection SUBSET
LOAD BBH snarks SUBSET
LOAD BBH sports_understanding SUBSET
LOAD BBH temporal_sequences SUBSET
LOAD BBH tracking_shuffled_objects_five_objects SUBSET
LOAD BBH tracking_shuffled_objects_seven_objects SUBSET
LOAD BBH tracking_shuffled_objects_three_objects SUBSET
LOAD BBH web_of_lies SUBSET
LOAD BB

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
0,not ( True ) and ( True ) is,{},False,bbh,bbh-boolean_expressions,bbh-boolean_expressions,test
1,True and not not ( not False ) is,{},True,bbh,bbh-boolean_expressions,bbh-boolean_expressions,test
2,not True or False or ( False ) is,{},False,bbh,bbh-boolean_expressions,bbh-boolean_expressions,test
3,False or not ( True ) and False is,{},False,bbh,bbh-boolean_expressions,bbh-boolean_expressions,test
4,True or not False and True and False is,{},True,bbh,bbh-boolean_expressions,bbh-boolean_expressions,test


## HumanEval

In [8]:
# Load the openai_humaneval dataset.
dataset_humaneval = load_dataset("openai_humaneval")
# Bare expression: display the available splits and their row counts.
dataset_humaneval

DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})

In [9]:
humaneval_raw = pd.DataFrame(dataset_humaneval["test"])

# Map HumanEval's fields onto the common schema: the prompt is the question,
# the canonical solution is the target, and the test code travels along as a
# JSON-encoded template variable.
df_humaneval = humaneval_raw.assign(
    question=humaneval_raw["prompt"],
    target=humaneval_raw["canonical_solution"],
    variables=[json.dumps({"test": test_code}) for test_code in humaneval_raw["test"]],
    dataset="humaneval",
    formatter="humaneval",
    subset="main",
    split="test",
)[["question", "variables", "target", "dataset", "formatter", "subset", "split"]]
df_humaneval.head()

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
0,from typing import List\n\n\ndef has_close_ele...,"{""test"": ""\n\nMETADATA = {\n 'author': 'jt'...","for idx, elem in enumerate(numbers):\n ...",humaneval,humaneval,main,test
1,from typing import List\n\n\ndef separate_pare...,"{""test"": ""\n\nMETADATA = {\n 'author': 'jt'...",result = []\n current_string = []\n ...,humaneval,humaneval,main,test
2,\n\ndef truncate_number(number: float) -> floa...,"{""test"": ""\n\nMETADATA = {\n 'author': 'jt'...",return number % 1.0\n,humaneval,humaneval,main,test
3,from typing import List\n\n\ndef below_zero(op...,"{""test"": ""\n\nMETADATA = {\n 'author': 'jt'...",balance = 0\n\n for op in operations:\n...,humaneval,humaneval,main,test
4,from typing import List\n\n\ndef mean_absolute...,"{""test"": ""\n\nMETADATA = {\n 'author': 'jt'...",mean = sum(numbers) / len(numbers)\n re...,humaneval,humaneval,main,test


## Join

In [10]:
# Stack the four per-dataset frames into one table with a fresh 0..n-1 index.
dataset_frames = [df_mmlu, df_gsm8k, df_bbh, df_humaneval]
df = pd.concat(dataset_frames, ignore_index=True)
df.head()

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
0,This question refers to the following informat...,"{""A"": ""The ideas of personal liberty and natio...",A,mmlu,mmlu,mmlu-high_school_european_history,train
1,This question refers to the following informat...,"{""A"": ""Capitalist"", ""B"": ""Scientific"", ""C"": ""C...",C,mmlu,mmlu,mmlu-high_school_european_history,train
2,This question refers to the following informat...,"{""A"": ""They served as a catalyst for the growt...",A,mmlu,mmlu,mmlu-high_school_european_history,train
3,This question refers to the following informat...,"{""A"": ""give the English king a new position of...",D,mmlu,mmlu,mmlu-high_school_european_history,train
4,This question refers to the following informat...,"{""A"": ""In ancient Rome, religious worship was ...",A,mmlu,mmlu,mmlu-high_school_european_history,test


## Rearrange splits

Some datasets have only a test split. That is fine in itself, but the later processing needs train, validation and test splits, so part of the data is randomly reassigned to other splits below.

In [11]:
# Seed handed to rearrange_split so the random split reassignment is repeatable.
REARRANGE_RANDOM_STATE = 42

In [12]:
# Non-null value counts per (dataset, split) before any split is rearranged.
df.groupby(["dataset", "split"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,question,variables,target,formatter,subset
dataset,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bbh,test,6511,6511,6511,6511,6511
gsm8k,test,1319,1319,1319,1319,1319
gsm8k,train,7473,7473,7473,7473,7473
humaneval,test,164,164,164,164,164
mmlu,test,13985,13985,13985,13985,13985
mmlu,train,228,228,228,228,228
mmlu,validation,1474,1474,1474,1474,1474


In [13]:
def rearrange_split(df, dataset, rearranging, random_state):
    """Randomly move rows of one dataset from a source split into other splits.

    Args:
        df: Frame with at least the columns ``dataset`` and ``split``. It is
            modified in place (``split`` is overwritten for the selected rows)
            and also returned.
        dataset: Value of the ``dataset`` column whose rows are reassigned.
        rearranging: Mapping ``{source_split: {destination_split: fraction}}``.
            Every row of ``source_split`` is independently sent to a
            destination with the given probability; the remaining probability
            keeps the row in ``source_split``. The fractions are therefore
            expected proportions, not exact ones.
        random_state: Seed of the private random generator used for the draws.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If a fraction is negative or the fractions of one source
            split add up to more than 1.
    """
    # Select the source rows before anything is reassigned, so rows moved by
    # one entry of `rearranging` are not picked up again by a later entry.
    source_masks = {
        source_split: (df["dataset"] == dataset) & (df["split"] == source_split)
        for source_split in rearranging
    }

    # A private generator yields the same draws as seeding the module-level one
    # but leaves the process-wide random state untouched.
    rng = random.Random(random_state)

    for source_split, dst_splits in rearranging.items():
        if any(fraction < 0 for fraction in dst_splits.values()):
            raise ValueError(
                f"Negative split fraction for source split {source_split!r}: {dst_splits}"
            )
        remainder = 1.0 - sum(dst_splits.values())
        # Small tolerance so float round-off in fractions that sum to 1 passes.
        if remainder < -1e-9:
            raise ValueError(
                f"Split fractions for source split {source_split!r} sum to more than 1: {dst_splits}"
            )

        source_mask = source_masks[source_split]
        source_count = int(source_mask.sum())
        if source_count == 0:
            # Nothing to reassign for this source split.
            continue

        # Whatever probability is not handed out stays with the source split.
        new_splits = dict(dst_splits)
        new_splits[source_split] = max(remainder, 0.0)

        # Sorted names make the draw independent of dict insertion order.
        new_split_names = sorted(new_splits)
        new_split_probas = [new_splits[name] for name in new_split_names]

        df.loc[source_mask, "split"] = rng.choices(
            new_split_names,
            weights=new_split_probas,
            k=source_count,
        )

    return df

In [14]:
# NOTE: rearrange_split rewrites the "split" column of df in place, so running
# this cell a second time reassigns rows that were already rearranged.
# BBH starts with only a test split: send ~33% to train and ~33% to validation.
df = rearrange_split(df, "bbh", {"test": {"train": 0.33, "validation": 0.33}}, REARRANGE_RANDOM_STATE)
# GSM8K starts without a validation split: move about half of train into it.
df = rearrange_split(df, "gsm8k", {"train": {"validation": 0.5}}, REARRANGE_RANDOM_STATE)
# HumanEval starts with only a test split: same treatment as BBH.
df = rearrange_split(df, "humaneval", {"test": {"train": 0.33, "validation": 0.33}}, REARRANGE_RANDOM_STATE)

In [15]:
# Non-null value counts per (dataset, split) after the splits were rearranged.
df.groupby(["dataset", "split"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,question,variables,target,formatter,subset
dataset,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bbh,test,2204,2204,2204,2204,2204
bbh,train,2147,2147,2147,2147,2147
bbh,validation,2160,2160,2160,2160,2160
gsm8k,test,1319,1319,1319,1319,1319
gsm8k,train,3725,3725,3725,3725,3725
gsm8k,validation,3748,3748,3748,3748,3748
humaneval,test,65,65,65,65,65
humaneval,train,51,51,51,51,51
humaneval,validation,48,48,48,48,48
mmlu,test,13985,13985,13985,13985,13985


## Subsampling

There is too much data for such a small experiment, so for some datasets I keep only a subsample of the original rows.

In [16]:
# Seed handed to subsample so the stratified subsampling is repeatable.
SUBSAMPLE_RANDOM_STATE = 42

In [17]:
def subsample(df, dataset, split, keep_count, random_state):
    """Keep at most ``keep_count`` rows of one (dataset, split) pair.

    Rows belonging to other datasets or other splits are always kept. Within
    the selected pair, the surviving rows are drawn with ``train_test_split``,
    stratified by the ``subset`` column.

    Args:
        df: Frame with the columns ``dataset``, ``split`` and ``subset``. It is
            not modified.
        dataset: Value of the ``dataset`` column to subsample.
        split: Value of the ``split`` column to subsample.
        keep_count: Number of rows of the selected pair to keep. If the pair
            has this many rows or fewer (including none), all of them are kept.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A new DataFrame holding the kept rows in their original order and with
        their original index labels.

    Raises:
        ValueError: Propagated from ``train_test_split``, e.g. when the
            stratified draw is impossible for the given ``keep_count``
            (NOTE(review): exact conditions are scikit-learn's; confirm there).
    """
    in_target = (df["dataset"] == dataset) & (df["split"] == split)
    candidate_labels = list(df.index[in_target])

    # train_test_split rejects a test_size that is not smaller than the number
    # of samples, and there is nothing to drop in that case anyway.
    if len(candidate_labels) <= keep_count:
        return df.copy()

    # The "test" part of the stratified split is the part that is kept.
    _, kept_labels = train_test_split(
        candidate_labels,
        stratify=list(df.loc[in_target, "subset"]),
        random_state=random_state,
        test_size=keep_count,
    )

    # Keep everything outside the target pair plus the drawn rows inside it.
    keep_mask = ~in_target | df.index.isin(kept_labels)
    return df.loc[keep_mask]

In [18]:
# Reduce every GSM8K split to 100 rows.
df = subsample(df, "gsm8k", "train", 100, SUBSAMPLE_RANDOM_STATE)
df = subsample(df, "gsm8k", "validation", 100, SUBSAMPLE_RANDOM_STATE)
df = subsample(df, "gsm8k", "test", 100, SUBSAMPLE_RANDOM_STATE)

In [19]:
# Reduce every BBH split to 100 rows, stratified across the BBH subsets.
df = subsample(df, "bbh", "train", 100, SUBSAMPLE_RANDOM_STATE)
df = subsample(df, "bbh", "validation", 100, SUBSAMPLE_RANDOM_STATE)
df = subsample(df, "bbh", "test", 100, SUBSAMPLE_RANDOM_STATE)

In [20]:
# Reduce the MMLU test and validation splits to 300 rows each; the MMLU train
# split is not subsampled.
df = subsample(df, "mmlu", "test", 300, SUBSAMPLE_RANDOM_STATE)
df = subsample(df, "mmlu", "validation", 300, SUBSAMPLE_RANDOM_STATE)

In [21]:
# Non-null value counts per (dataset, split) after subsampling.
df.groupby(["dataset", "split"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,question,variables,target,formatter,subset
dataset,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bbh,test,100,100,100,100,100
bbh,train,100,100,100,100,100
bbh,validation,100,100,100,100,100
gsm8k,test,100,100,100,100,100
gsm8k,train,100,100,100,100,100
gsm8k,validation,100,100,100,100,100
humaneval,test,65,65,65,65,65
humaneval,train,51,51,51,51,51
humaneval,validation,48,48,48,48,48
mmlu,test,300,300,300,300,300


## Saving dataset and LLM prompt templates

In [22]:
# Print one randomly chosen (but seeded) example row per subset as a reference
# for writing the prompt templates.
for subset, subset_rows in df.groupby("subset"):
    index = subset_rows.sample(1, random_state=42).index[0]
    print(subset)
    print(df.loc[index])
    print("-" * 80)

bbh-boolean_expressions
question     False and not ( not not False ) is
variables                                    {}
target                                    False
dataset                                     bbh
formatter               bbh-boolean_expressions
subset                  bbh-boolean_expressions
split                                     train
Name: 24705, dtype: object
--------------------------------------------------------------------------------
bbh-causal_judgement
question     How would a typical person answer each of the ...
variables                                                   {}
target                                                      No
dataset                                                    bbh
formatter                                 bbh-causal_judgement
subset                                    bbh-causal_judgement
split                                               validation
Name: 24801, dtype: object
-------------------------------------------

In [23]:
# Prompt templates keyed by formatter name. Every base name X comes in three
# variants:
#   X         - the user prompt followed by "Let's think step by step. "
#   X-cot     - the base prompt, a {chain_of_thoughts} slot and the answer
#   X-no-cot  - the user prompt asking for an immediate answer, and the answer
# All variants are produced by one helper so the wording cannot drift between
# subsets.


def _formatter_variants(name, user_prompt, answer="{target}", immediate_separator=" "):
    """Return the ``name``, ``name-cot`` and ``name-no-cot`` templates.

    Args:
        name: Base formatter name.
        user_prompt: The ``USER: ...`` part of the prompt.
        answer: How the final answer is rendered in the assistant turn.
        immediate_separator: Text placed between the user prompt and
            "Give an immediate answer." in the ``-no-cot`` variant.
    """
    base = user_prompt + " <s> ASSISTANT: Let's think step by step. "
    closing = " <s> ASSISTANT: " + answer + " <s>"
    return {
        name: base,
        name + "-cot": base + " {chain_of_thoughts}" + closing,
        name + "-no-cot": user_prompt + immediate_separator + "Give an immediate answer." + closing,
    }


# BBH tasks that need a template, in insertion order.
_BBH_FORMATTER_SUBSETS = (
    "boolean_expressions", "causal_judgement", "date_understanding",
    "disambiguation_qa", "dyck_languages", "formal_fallacies",
    "geometric_shapes", "hyperbaton", "logical_deduction_five_objects",
    "logical_deduction_seven_objects", "logical_deduction_three_objects", "movie_recommendation",
    "multistep_arithmetic_two", "navigate", "object_counting",
    "penguins_in_a_table", "reasoning_about_colored_objects", "ruin_names",
    "salient_translation_error_detection", "snarks", "sports_understanding",
    "temporal_sequences", "tracking_shuffled_objects_five_objects", "tracking_shuffled_objects_seven_objects",
    "tracking_shuffled_objects_three_objects", "web_of_lies", "word_sorting",
)

# Most BBH tasks pass the question through verbatim; these two wrap it.
_BBH_DEFAULT_USER_PROMPT = "USER: {question}."
_BBH_USER_PROMPT_OVERRIDES = {
    "boolean_expressions": "USER: Calculate the following expression {question}.",
    "multistep_arithmetic_two": "USER: I have the following math expression, calculate it `{question}`.",
}

formatters = {}

for _bbh_subset in _BBH_FORMATTER_SUBSETS:
    formatters.update(
        _formatter_variants(
            f"bbh-{_bbh_subset}",
            _BBH_USER_PROMPT_OVERRIDES.get(_bbh_subset, _BBH_DEFAULT_USER_PROMPT),
        )
    )

# GSM8K is written out by hand because its base template differs in shape.
# NOTE(review): unlike every other base template, this one already contains
# {chain_of_thoughts} - presumably because the GSM8K rows carry reference
# reasoning in their variables; confirm that this is intended.
formatters["gsm8k"] = "USER: {question}. <s> ASSISTANT: Let's think step by step. {chain_of_thoughts} "
formatters["gsm8k-cot"] = formatters["gsm8k"] + " <s> ASSISTANT: {target} <s>"
formatters["gsm8k-no-cot"] = "USER: {question}. Give an immediate answer. <s> ASSISTANT: {target} <s>"

# HumanEval shows the test code in the prompt and fences the answer as code.
formatters.update(
    _formatter_variants(
        "humaneval",
        "USER: We have the following code ```\n{question}\n``` and tests ```{test}```, help me write main code.",
        answer="```\n{target}\n```",
    )
)

# MMLU lists the four options one per line, so the "immediate answer" request
# goes on its own line as well.
formatters.update(
    _formatter_variants(
        "mmlu",
        "USER: {question}\nA: {A}\nB: {B}\nC: {C}\nD: {D}",
        immediate_separator="\n",
    )
)

In [24]:
# Every base formatter must come with both a "-cot" and a "-no-cot" variant.
# Names ending in "-cot" (which includes "-no-cot") are the variants themselves.
base_formatter_names = [name for name in formatters if not name.endswith("-cot")]
for base_name in base_formatter_names:
    for suffix in ("-cot", "-no-cot"):
        assert f"{base_name}{suffix}" in formatters, f"No {base_name}{suffix}"

In [25]:
# Write the templates as a two-column table: formatter name and template text.
formatters_table = pd.DataFrame(
    {
        "formatter": list(formatters.keys()),
        "text": list(formatters.values()),
    }
)
formatters_table.to_csv("formatters.csv", index=False)

In [26]:
# Write the combined, subsampled dataset without the row index.
df.to_csv("llama-reasoning.csv", index=False)