In [2]:
import datasets
import os
import openai
import numpy as np
with open(os.path.expanduser('~/.openai_api_key'), 'r') as file:
    openai.api_key = file.read().replace('\n', '')

import adatest
import re
import json
import jsonlines
import seqio
import os
os.environ['CURL_CA_BUNDLE'] = "/etc/ssl/certs/ca-bundle.crt"
# from bigbench.bbseqio import tasks
vocabulary=seqio.SentencePieceVocabulary("/gscratch/zlab/bparan/projects/cascades/models/t5-spiece.model")
from sklearn.metrics import accuracy_score
from typing import List
import tqdm

with open(os.path.expanduser('~/.openai_api_key'), 'r') as file:
    openai.api_key = file.read().replace('\n', '')
print(openai.api_key)
cache_dir = '/gscratch/zlab/bparan/projects/cascades/data'

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


### GPT-3 Model for prompting

In [3]:
class OpenAIModel(adatest.Model):
    def __init__(self, model="text-davinci-002", quote="\"", temperature=0.7, top_p=1, max_length=30, n=1):
        self.model = model
        self.api_key = openai.api_key
        self.quote = quote
        self.temperature = temperature
        self.top_p = top_p
        self.max_length = max_length
        self.n = n
    def __call__(self, strings):
        resp = openai.Completion.create(
            model=self.model,
            prompt=strings,
            max_tokens=self.max_length,
            temperature=self.temperature,
            top_p=self.top_p,
            n=self.n,
            stop=self.quote,
        )
        return [x["text"] for x in resp['choices']]

gpt3 = OpenAIModel(model="text-davinci-002",  max_length=200, quote='', n=1)


def get_bb_data(task_name, task_sequence_length, zero=False, few=False, explain=False):
    seq_length = task_sequence_length
    # By default seqio returns 3_shot.
    if zero:
        task = seqio.get_mixture_or_task(task_name)
    elif few:
        task = seqio.get_mixture_or_task(task_name.replace('0_shot', '3_shot'))

    ds = task.get_dataset(split="all", sequence_length={"inputs": seq_length, "targets": seq_length})
    instances = []
    for enum, ex in enumerate(ds):
        instances.append({"guid": enum, 
                          "input": ex['inputs_pretokenized'].numpy().decode(),
                          "label": ex["targets_pretokenized"].numpy().decode()})
    return instances

### Prompt to propose an instruction

In [4]:
def propose_decomposition(decomp_prompt, io_pairs, n=20):
    gpt3 = OpenAIModel(model="text-davinci-002",  max_length=400, quote='---', n=n)
    prompt = '''%s. Here are examples of input-output pairs for the task I'm trying to break down.
----
%s
----
Steps:
1.'''%(decomp_prompt, io_pairs)
    return gpt3(prompt)

In [5]:
def propose_instruction(instruct_prompt, io_pairs, n=20):
    gpt3 = OpenAIModel(model="text-davinci-002",  max_length=400, quote='---', n=n)
    prompt = '''%s. Here are examples of input-output pairs for this task.
----
%s
----
I can do this task by'''%(instruct_prompt, io_pairs)
    return gpt3(prompt)

### Automatic Decomposition Helper functions

In [6]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [7]:
def get_subset(inputs, labels, n=100):
    idxs = np.random.choice(len(inputs), n, replace=False)
    labs = np.array([labels[i] for i in idxs])
    subset = [inputs[i] for i in idxs]
    return labs, subset

# Tasks 

For each tasks, we compute:
* Best current performance using BigBench instruction
* Best human decomposition performance over N runs: Known decomps or ones that we come up with. A further variant of this is (a) Decompositing into individual GPT-3 calls with few-shot prompting (decompositional prompting) and (b) Making and integrating external affordance calls when needed.
* Automatic instruction generation (APE): Reporting on top-K instructions. APE reports average over top-10 for 200 instructions. They also have an efficient score estimation technique whereby promising candidates (evaluated based on a small subset) are given more compute resource. 
* Automatic decomposition generation, followed by zero-shot application to downstream task. Reporting average performance over top-k decompositions
* Automtic decomposition: Instruction refinement and decomposition ensembling
* Potential affordance calls and decompsoitions with those calls. 
* Human-LLM collaborative decompositions

Things to keep track of:
* Evaluation metric computation
* Generated sequence length 
* Fitting in as many decompositions into the promtp as possible

Things to think about:
* Affordance calls and their integration into the decomposition
* Human GPT-3 Collaboration for decompositions

#### Anachronisms

In [12]:
# Get data
d = datasets.load_dataset('bigbench', 'anachronisms')
inputs = d['train']['inputs'] + d['validation']['inputs']
inputs = [x.split('\n')[0] for x in inputs]
labels = np.array([int(x[0] == 'Yes') for x in d['train']['targets'] + d['validation']['targets']])

Downloading and preparing dataset bigbench/anachronisms (download: Unknown size, generated: 95.60 KiB, post-processed: Unknown size, total: 95.60 KiB) to /mmfs1/home/bparan/.cache/huggingface/datasets/bigbench/anachronisms/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/230 [00:00<?, ? examples/s]

2022-10-16 11:36:33.156433: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Generating train split:   0%|          | 0/184 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/46 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /mmfs1/home/bparan/.cache/huggingface/datasets/bigbench/anachronisms/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Human Decomp 
def anachronism(x):
    gpt3 = OpenAIModel(model="text-davinci-002",  max_length=200, quote='---', n=1)
    prompt = '''Given a sentence and the time periods of each entity in it, tell me if it could have happened or not.
Sentence: I wrote about shakespeare
Entities and dates:
I -> 21st century
Shakespeare -> 16th century
Could the sentence be true based on the dates alone: Yes
----
Sentence: Shakespeare wrote about me

Entities and dates:
Shakespeare -> 16th century
I -> 21st century

Could the sentence be true based on the dates alone: No
----
Sentence: %s''' % x
    return gpt3(prompt)

perf_array = []
runs = 2
for run in range(runs): 
    answers = []
    for x in inputs:
        answers.append(anachronism(x))
    preds = np.array([int(x[0].endswith('No')) for x in answers])
    perf_array.append((preds == labels).mean())
print("Human Performance:")
print("Mean", np.mean(perf_array))
print("Std. Dev", np.mean(perf_array))

Human Performance:
Mean 0.7826086956521738
Std. Dev 0.7826086956521738


In [25]:
# Automatic instruction runs.

instruct_prompt = 'I want to figure out whether a sentence contains anachronisms or not. An anachronism is a mistake in chronology, or a person, thing, or event that is out of its proper time.'
io_pairs = """Input: George Washington fought in the American Civil War.
Output: No
Input: The Mongolian horse rider used his bow to hunt the velociraptor.
Output: Yes
Input: Beats from the MPC3000 helped inspire many original blues artists.
Output: No
Input: Attila the Hun acted in the live-action remake of Mulan.
Output: Yes
Input: Kurt Cobain starred in the 1990 television show "Twin Peaks".
Output: Yes"""

instructions = propose_instruction(instruct_prompt, io_pairs, 50)

def get_anachronism_ape_fn(instruction, batch_size=10):
#     decomposition = '1.'+ decomposition
#     last_n = int(re.findall(r'(\d+)\.', decomposition)[-1])
#     decomposition += '\n%s. Output YES if there is an anachronism, and NO otherwise' % (last_n + 1)
    instruction = instruction.strip()
    def decomposition_ape_fn(sentences):
        gpt3 = OpenAIModel(model="text-davinci-002",  max_length=400, quote='---', n=1)
        out = []
        for chunk in chunks(sentences, batch_size):
            prompts = ['''An anachronism is a mistake in chronology, or a person, thing, or event that is out of its proper time. Figure out whether a sentence contains anachronisms or not, using this instruction.
Instruction:
%s
----
Sentence: %s
Is this an Anachronism? Output YES if there is an anachronism, and NO otherwise.''' % (instruction, x) for x in chunk]
            out.extend(gpt3(prompts))
        return out
    return decomposition_ape_fn

labs, subset = get_subset(inputs, labels, n=100)
all_preds = []
pps = []
accs = []
for z, instruction in enumerate(instructions):
    print('Instruction', z)
    fn = get_anachronism_ape_fn(instruction, batch_size=20)
    this_preds = fn(subset)
    pp = np.array([1 if 'yes' in x.lower() else 0 for x in this_preds])
    all_preds.append(this_preds)
    pps.append(pp)
    accs.append((pp==labs).mean())
    print((pp==labs).mean())
    


Instruction 0
0.63
Instruction 1
0.61
Instruction 2
0.5
Instruction 3
0.58
Instruction 4
0.63
Instruction 5
0.62
Instruction 6
0.62
Instruction 7
0.58
Instruction 8
0.66
Instruction 9
0.68


In [None]:
all_preds[0]

In [28]:
# Automatic decomposition runs

decomp_prompt = 'I want to break down the task of figuring out whether a sentence contains anachronisms or not, into individual steps. An anachronism is a mistake in chronology, or a person, thing, or event that is out of its proper time.'
decompositions = propose_decomposition(decomp_prompt, io_pairs, 10)

def get_anachronism_fn(decomposition, batch_size=10):
    decomposition = '1.'+ decomposition
    last_n = int(re.findall(r'(\d+)\.', decomposition)[-1])
#     decomposition += '\n%s. Output YES if there is an anachronism, and NO otherwise' % (last_n + 1)
    def decomposition_fn(sentences):
        gpt3 = OpenAIModel(model="text-davinci-002",  max_length=400, quote='---', n=1)
        out = []
        for chunk in chunks(sentences, batch_size):
            prompts = ['''Figure out whether a sentence contains anachronisms or not, using the following steps
Steps:
%s
----
Sentence: %s
Is this an Anachronism? Show me how you arrived at this answer step-wise. Output YES if there is an anachronism, and NO otherwise.''' % (decomposition, x) for x in chunk]
            out.extend(gpt3(prompts))
        return out
    return decomposition_fn


labs, subset = get_subset(inputs, labels, n=100)
preds = []
pps = []
accs = []
all_preds = []
for z, decomposition in enumerate(decompositions):
    print('Decomposition', z)
    fn = get_anachronism_fn(decomposition, batch_size=20)
    this_preds = fn(subset)
#     pp = np.array([1 if 'contains an anachronism' in x.lower() else 0 for x in this_preds])
    pp = np.array([1 if 'yes' in x.lower() else 0 for x in this_preds])
    preds.append(this_preds)
    pps.append(pp)
    accs.append((pp==labs).mean())
    print((pp==labs).mean())
    all_preds.append(this_preds)

Decomposition 0
0.55
Decomposition 1
0.66
Decomposition 2
0.59
Decomposition 3
0.57
Decomposition 4
0.54
Decomposition 5
0.64
Decomposition 6
0.6
Decomposition 7
0.59
Decomposition 8
0.66
Decomposition 9
0.59


#### Dataset from decomposed prompting (K'th letter concatenation)

In [23]:
# load data 
import urllib.request
url = 'https://raw.githubusercontent.com/allenai/DecomP/main/datasets/letter_cat/n5_eg100_pos2_space.json'
response = urllib.request.urlopen(url)
data = json.loads(response.read())
dev_inputs = [d['question'] for d in data['1']['qa_pairs']]
dev_labels = [d['answer']['spans'][0] for d in data['1']['qa_pairs']]

url = 'https://raw.githubusercontent.com/allenai/DecomP/main/datasets/letter_cat/n4_eg100_pos2_space.json'
response = urllib.request.urlopen(url)
data = json.loads(response.read())
train_inputs = [d['question'] for d in data['1']['qa_pairs']]
train_labels = [d['answer']['spans'][0] for d in data['1']['qa_pairs']]

len(dev_inputs)
# dev_inputs[0]

100

In [47]:
idx = np.random.randint(0, len(train_inputs), 1)
for id_ in idx:
    print(train_inputs[id_])
    words = train_inputs[id_].split(' "')[1].split('" ')[0].split()
    word_list = ' '.join(words)
    for word in words:
        print("Word: %s"%word)
        print("3rd Letter: %s"%word[2])
    print("Concatenation: \n%s" %train_labels[id_])
    print('----')

Take the letters at position 3 of the words in "Dorothy Eric Mensah Moyo" and concatenate them using a space.
Word: Dorothy
3rd Letter: r
Word: Eric
3rd Letter: i
Word: Mensah
3rd Letter: n
Word: Moyo
3rd Letter: y
Concatenation: 
r i n y
----


In [42]:
# manual decomposition
out = []
batch_size = 10
for chunk in tqdm.tqdm(chunks(inputs, batch_size)):
    prompts = [x for x in chunk]
#     print(prompts)
    out.extend(gpt3(prompts))
pp = np.array([1 if p.strip().lower() == l else 0 for p, l in zip(out, labels)])
pp.sum()/len(inputs)

10it [00:13,  1.36s/it]


0.0

#### Dataset from decomposed prompting (List reversal)

In [9]:
# load data 
import urllib.request
url = 'https://raw.githubusercontent.com/allenai/DecomP/main/datasets/reverse/test_4_normal_words.json'
response = urllib.request.urlopen(url)
data = json.loads(response.read())
inputs = [d['question'] for d in data['alg_qa']['qa_pairs']]
labels = [d['answer']['spans'][0] for d in data['alg_qa']['qa_pairs']]
# len(data['1']['qa_pairs'])
len(data['alg_qa']['qa_pairs'])

90

In [10]:
idx = np.random.randint(0, len(inputs), 1)
for id_ in idx:
    print(inputs[id_])
    print(labels[id_])
    print('----')

Reverse the sequence "camera, file, light bulb, notebook".
notebook, light bulb, file, camera
----


In [241]:
# manual decomposition
out = []
batch_size = 10
for chunk in tqdm.tqdm(chunks(inputs, batch_size)):
    prompts = [x for x in chunk]
#     print(prompts)
    out.extend(gpt3(prompts))
pp = np.array([1 if p.strip().lower() == l else 0 for p, l in zip(out, labels)])
pp.sum()/len(inputs)

4it [00:07,  1.99s/it]


0.0

#### Tasks in Self-prompt (Ofir's paper) 
Musique and 2wikimultihop

In [40]:
# Musique
data_files = {split:os.path.join(cache_dir, 'musique', 'data', 'musique_full_v1.0_%s.jsonl'%split) for split in ['train', 'dev']}
d = datasets.load_dataset('json', data_files=data_files)
len(d['dev'])



  0%|          | 0/2 [00:00<?, ?it/s]

4834

In [35]:
from collections import Counter
# Counter([ex['answerable'] for ex in d['dev']])
idx = np.random.randint(0, len(d['train']), 1)
for num, ex in enumerate(d['train']):
    if num not in idx:
        continue
    print("Question:", ex['question'])
    prev_answer = None
    for subnum, decomp in enumerate(ex['question_decomposition']):
        if prev_answer:
            print("Follow up question:", decomp['question'].replace("#%d"%subnum, prev_answer))
        else:
            print("Follow up question:", decomp['question'])
        print("Intermediate answer:", decomp['answer'])
        prev_answer = decomp['answer']
    print("So the final answer is %s"% ex['answer'])
    print('----')

Question: What is the name of the southern tip of the continent that has the country where The Legend of the Lost was filmed?
Follow up question: where was the legend of the lost filmed
Intermediate answer: Libya
Follow up question: What continent is Libya located on?
Intermediate answer: Africa
Follow up question: what is the name of the southern tip of Africa
Intermediate answer: Cape Agulhas
So the final answer is Cape Agulhas
----


In [None]:
# 2wikimultihop
data_files = {split:os.path.join(cache_dir, '2wikimultihop', 'data', '%s.json'%split) for split in ['train', 'dev']}
d = datasets.load_dataset('json', data_files=data_files)



Downloading and preparing dataset json/default to /mmfs1/home/bparan/.cache/huggingface/datasets/json/default-8da56daef1cdd353/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

#### Tasks in Flipped learning - Known Unknown 

In [112]:
# load data
d = datasets.load_dataset('bigbench', 'known_unknowns', cache_dir=cache_dir)
inputs = d['train']['inputs'] #+ d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] #+ d['validation']['targets']
len(d['validation']['inputs'])



  0%|          | 0/3 [00:00<?, ?it/s]

16

In [113]:
idx = np.random.randint(0, len(inputs), 10)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

Q: What was the approximate population of Cheyenne, Wyoming in 2020?
  choice: Unknown
  choice: 65,000
A:
65,000
----
Q: What is a kind of bird that eats fish?
  choice: Unknown
  choice: Heron
A:
Heron
----
Q: How many calories worth of food does a 20-pound cat eat per day?
  choice: Unknown
  choice: 310 calories
A:
310 calories
----
Q: What tree’s leaf is on the Canadian flag?
  choice: Unknown
  choice: Maple tree
A:
Maple tree
----
Q: On what date in the past hundred years did humans collectively sleep the most?
  choice: Unknown
  choice: July 31, 2009
A:
Unknown
----
Q: How many hours of sleep a night is considered healthy for an adult?
  choice: Unknown
  choice: Between 7 and 9 hours
A:
Between 7 and 9 hours
----
Q: Who was the first person to whistle?
  choice: Unknown
  choice: T.J. Cummings
A:
Unknown
----
Q: What is the color of Jesse Engel's second-favorite guitar?
  choice: Unknown
  choice: Blue
A:
Unknown
----


#### Tasks in Flipped learning - Strategy QA

In [60]:
# load data
d = datasets.load_dataset('bigbench', 'strategyqa', cache_dir=cache_dir)
inputs = d['train']['inputs'] #+ d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] #+ d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [62]:
idx = np.random.randint(0, len(inputs), 15)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

Q: Did Queen Elizabeth The Queen Mother and her daughter share name with Tudor queen?
A:
Yes. Queen Elizabeth the Queen Mother gave birth to Queen Elizabeth II in 1926. The Tudor dynasty had a number of Queens including: Mary I of England, Elizabeth I of England, and Margaret Tudor, Queen of Scots.
----
Q: Are selfies more dangerous than plague in modern times?
A:
Yes. There are an average of 7 human plague cases reported each year according to the CDC. Selfies have caused people to fall off of cliffs while trying to get the perfect picture. From October 2011 and November 2017, there were 259 selfie deaths in 137 incidents.
----
Q: Is 3D printing able to make adenovirus?
A:
No. 3D printers come with limitations in terms of precision. The standard nozzle output is about 0.4 mm. Therefore, if you are planning to print something that is below 0.4mm, you will not get a useful item. Adenoviruses are medium-sized (90–100 nm), nonenveloped (without an outer lipid bilayer) viruses with an icos

#### Tasks in Flipped learning - Hindu Knowledge

In [63]:
d = datasets.load_dataset('bigbench', 'hindu_knowledge', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [65]:
idx = np.random.randint(0, len(inputs), 1)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

Q: In Hindu mythology, the avatar of Vishnu known as Vamana asks for three pieces of land from which ruler?
  choice: Dhritasashthra
  choice: Virata
  choice: Bali
  choice: Janamejaya
A:
Bali
----


#### Tasks in Flipped learning - Movie Dialogue

In [118]:
d = datasets.load_dataset('bigbench', 'movie_dialog_same_or_different', cache_dir=cache_dir)
inputs = d['train']['inputs'] #+ d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] #+ d['validation']['targets']
len(inputs)



  0%|          | 0/3 [00:00<?, ?it/s]

40000

In [121]:
idx = np.random.randint(0, len(inputs), 1)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

The following is a conversation between two people, but the transcript doesn't mark who said what: I doubt if a few bombs would wake up Dawn on a Sunday morning. This phoney war get's on my nerves. If we're going to have a war, I wish they'd get it started. Just ignore her, Mac. ----- In the preceding conversation, were the sentences "If we're going to have a war, I wish they'd get it started." and "Just ignore her, Mac." said by the same or different individuals?
Answer:
different
----


#### Tasks in Flipped learning - Code Description

In [72]:
d = datasets.load_dataset('bigbench', 'code_line_description', cache_dir=cache_dir)
inputs = d['train']['inputs'] #+ d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] #+ d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [73]:
idx = np.random.randint(0, len(inputs), 15)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

Python code:
j,b = False,True
if j and b:
    print('pass')
else:
    print('fail')

  choice: prints fail since j is false
  choice: since both and b are true prints pass
  choice: combines a and b
  choice: prints value of both a and b

English language description:
prints fail since j is false
----
Python code:
numbers_list = [2, 6, 8, 10, 11, 4, 12, 7, 13, 17, 0, 3, 21]
filtered_list = list(filter(lambda num: (num > 7), numbers_list))
print(filtered_list)

  choice: prints lambda
  choice: returns a filtered list
  choice: prints a list of numbers greater than 7 from numbers_list
  choice: prints numbers from 2 to 6

English language description:
prints a list of numbers greater than 7 from numbers_list
----
Python code:
if num > 5:
    print('negative')
else:
    print('positive')

  choice: checks if a number is greater than zero
  choice: if number is greater than 5 prints 'negative' otherwise prints 'positive'
  choice: assigns 'positive' or 'negative' to variable num
  choice:

#### Tasks in Flipped learning - conceptual_combinations

In [76]:
d = datasets.load_dataset('bigbench', 'conceptual_combinations', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [77]:
idx = np.random.randint(0, len(inputs), 15)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

concept: folding companies. Question: Which of the following sentences best characterizes folding companies?
 option: Folding companies contribute to unemployment.
 option: Folding companies produce folds.
 option: Folding companies can fit in small spaces.
 option: Folding companies manufacture origami.
Answer:
Folding companies contribute to unemployment.
----
Concept: hissing tires. Question: Which of the following sentences best characterizes hissing tires?
 option: Hissing tires are to be expected in warm weather.
 option: Hissing tires disapprove of your driving.
 option: Hissing tires are designed by sound engineers.
 option: Hissing tires are unsafe for driving.
Answer:
Hissing tires are unsafe for driving.
----
Concept: Wicked witches. Question: Which of the following sentences best characterizes wicked witches?
 option: Wicked witches eat all the sugar.
 option: Wicked witches wear white gowns.
 option: Wicked witches are skateboarders with magical powers.
 option: Wicked wit

#### Tasks in Flipped learning - language_identification

In [189]:
d = datasets.load_dataset('bigbench', 'language_identification', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/language_identification (download: Unknown size, generated: 14.10 MiB, post-processed: Unknown size, total: 14.10 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/language_identification/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/language_identification/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks in Flipped learning - vitaminc_fact_verification

In [78]:
d = datasets.load_dataset('bigbench', 'vitaminc_fact_verification', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [79]:
idx = np.random.randint(0, len(inputs), 15)
for id_, (inp, label) in enumerate(zip(inputs, labels)):
#     if label[0] != 'Unknown':
#         continue
    if id_ not in idx:
        continue
    print(inp)
    print(label[0])
    print('----')

Based only on the information contained in a brief quote from Wikipedia, answer whether the related claim is True, False or Neither. Use Neither when the Wikipedia quote does not provide the necessary information to resolve the question.


Passage: Boeing 787 Dreamliner: The aircraft 's initial designation was the 7E7 , prior to its renaming in January 2005 .
Claim: Boeing 787 Dreamliner has always had the same name .
True, False, or Neither?
False
----
Based only on the information contained in a brief quote from Wikipedia, answer whether the related claim is True, False or Neither. Use Neither when the Wikipedia quote does not provide the necessary information to resolve the question.


Passage: Golden Horseshoe: The primary airport of the region is Toronto Pearson International Airport ( officially Lester B. Pearson International Airport ) , located in Mississauga , which is the busiest in Canada and the 33rd busiest in the world , handling over 41 million passengers in 2015 , and o

#### Tasks in Flipped learning - formal_fallacies_syllogisms_negation

In [80]:
d = datasets.load_dataset('bigbench', 'formal_fallacies_syllogisms_negation', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks in Flipped learning - misconceptions

In [81]:
d = datasets.load_dataset('bigbench', 'misconceptions', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [199]:
inputs[10]

'Based only on the information contained in a brief quote from Wikipedia, answer whether the related claim is True, False or Neither. Use Neither when the Wikipedia quote does not provide the necessary information to resolve the question.\n\n\nPassage: Garfield-COLON- A Tail of Two Kitties: The film received negative reviews from critics and it earned $ 141.7 million on a $ 60 million budget.\nClaim: Garfield : A Tail of Two Kitties received bleak reviews .\nTrue, False, or Neither?'

#### Tasks in Flipped learning - logical_deduction

In [82]:
d = datasets.load_dataset('bigbench', 'logical_deduction', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks in Flipped learning - winowhy

In [83]:
d = datasets.load_dataset('bigbench', 'winowhy', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks in Flipped learning - novel_concepts

In [84]:
d = datasets.load_dataset('bigbench', 'novel_concepts', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks in Auto-Cot - MAWPS 

In [85]:
data = datasets.load_dataset('omarxadel/MaWPS-ar', 'test', cache_dir=cache_dir)
inputs = [list(d.values())[0] for d in data['validation']]
labels = []
for d in data['validation']:
    try:
        ans = eval(list(d.values())[1].split("=")[-1].strip())
        if isinstance(ans, int):
            labels.append(ans)
        elif (ans).is_integer():
            labels.append(int(ans))
        else:
            labels.append(float("%.2f" % ans))
        
    except:
        ans = eval(list(d.values())[1].split("=")[0].strip())
        if isinstance(ans, int):
            labels.append(ans)
        elif (ans).is_integer():
            labels.append(int(ans))
        else:
            labels.append(float("%.2f" % ans))
    



  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks in Auto-CoT (GSM8K) 

In [91]:
data = datasets.load_dataset('gsm8k', 'main', cache_dir=cache_dir)['test']
inputs = [d['question'] for d in data]
labels = [d['answer'].split('#### ')[-1] for d in data]



  0%|          | 0/2 [00:00<?, ?it/s]

#### Tasks on Auto-CoT (AQUA-RAT)

In [90]:
data = datasets.load_dataset('aqua_rat', 'raw', cache_dir=cache_dir)['validation']
inputs = [d['question'] + " ".join(d['options']) for d in data]
labels = [d['correct'] for d in data]



  0%|          | 0/3 [00:00<?, ?it/s]

#### Tasks on Auto-CoT (Commonsense QA)

In [88]:
data = datasets.load_dataset('commonsense_qa', cache_dir=cache_dir)['validation']
inputs = [d['question']+ " " + " ".join([k + ") " + v for k, v in zip(d['choices']['label'], d['choices']['text'])]) for d in data]
labels = [d['answerKey'] for d in data]



  0%|          | 0/3 [00:00<?, ?it/s]

#### AMA Tasks (From Super-Glue they include boolQ, cb, copa, multirc, record, rte, wsc, WiC)

In [94]:
# BoolQ
data = datasets.load_dataset('super_glue', 'boolq', cache_dir=cache_dir)['validation']
inputs = [d['passage']+ " " + d['question'][0].title() + d['question'][1:]  + "?" for d in data]
label_dict = {0:'False', 1:'True'}
labels = [label_dict[d['label']] for d in data]
# Similar transformations to be made for other Superglue tasks: cb, copa, multirc, record, rte, wsc, wic



  0%|          | 0/3 [00:00<?, ?it/s]

#### AMA Tasks (From Adversarial NLI)

In [96]:
# Can also look at dev_r2, dev_r2
data = datasets.load_dataset('anli', cache_dir=cache_dir)['dev_r3']
inputs = ["Sentence1: " + d['premise'] + "\nSentence2: " +d['hypothesis'] for d in data]
label_dict = {0:"entailment", 1:'neutral', 2:'contradiction'}
labels = [label_dict[d['label']] for d in data]



  0%|          | 0/9 [00:00<?, ?it/s]

#### Flipped Learning tasks Storycloze, PIQA, HellaSwag, ARC challenge and openbookQA

In [99]:
# story_cloze
validation_path = os.path.join(cache_dir, 'story_cloze') #, 'cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv')
data = datasets.load_dataset('story_cloze', data_dir=validation_path)
len(data['validation'])



  0%|          | 0/1 [00:00<?, ?it/s]

1571

In [101]:
data = datasets.load_dataset('piqa', cache_dir=cache_dir)['validation']
data[0]



  0%|          | 0/3 [00:00<?, ?it/s]

{'goal': "How do I ready a guinea pig cage for it's new occupants?",
 'sol1': 'Provide the guinea pig with a cage full of a few inches of bedding made of ripped paper strips, you will also need to supply it with a water bottle and a food dish.',
 'sol2': 'Provide the guinea pig with a cage full of a few inches of bedding made of ripped jeans material, you will also need to supply it with a water bottle and a food dish.',
 'label': 0}

In [103]:
data = datasets.load_dataset('hellaswag', cache_dir=cache_dir)['validation']
data[0]



  0%|          | 0/3 [00:00<?, ?it/s]

{'ind': 24,
 'activity_label': 'Roof shingle removal',
 'ctx_a': 'A man is sitting on a roof.',
 'ctx_b': 'he',
 'ctx': 'A man is sitting on a roof. he',
 'endings': ['is using wrap to wrap a pair of skis.',
  'is ripping level tiles off.',
  "is holding a rubik's cube.",
  'starts pulling up roofing on a roof.'],
 'source_id': 'activitynet~v_-JhWjGDPHMY',
 'split': 'val',
 'split_type': 'indomain',
 'label': '3'}

In [105]:
# There is also the ARC-easy split
data = datasets.load_dataset('ai2_arc', 'ARC-Challenge', cache_dir=cache_dir)['validation']
data[10]



  0%|          | 0/3 [00:00<?, ?it/s]

{'id': 'Mercury_SC_412337',
 'question': 'During an investigation, heat transferred from a liquid to the environment. Which pair of explanations can best account for this result?',
 'choices': {'text': ['The temperature of the liquid increased, or the liquid became a gas.',
   'The temperature of the liquid increased, or the liquid became a solid.',
   'The temperature of the liquid decreased, or the liquid became a gas.',
   'The temperature of the liquid decreased, or the liquid became a solid.'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'D'}

In [106]:
# OpenbookQa
data = datasets.load_dataset('openbookqa', 'main', cache_dir=cache_dir)['validation']
data[0]

Downloading builder script:   0%|          | 0.00/6.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.55k [00:00<?, ?B/s]



  0%|          | 0/3 [00:00<?, ?it/s]

{'id': '8-376',
 'question_stem': 'Frilled sharks and angler fish live far beneath the surface of the ocean, which is why they are known as',
 'choices': {'text': ['Deep sea animals',
   'fish',
   'Long Sea Fish',
   'Far Sea Animals'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'A'}

#### Other tasks from AMA (Classification) : Agnews, DBPedia, Amazon movie reivew and SST

In [108]:
## For others use the strings : dbpedia_14, sst2
data = datasets.load_dataset('ag_news', cache_dir=cache_dir)['test']
inputs = [d['text'] for d in data]
label_dict = {0:'World', 1:'Sports', 2:'Business', 3: 'Sci/Tech'}
labels = [label_dict[d['label']] for d in data]



  0%|          | 0/2 [00:00<?, ?it/s]

#### Tasks from reframing natural language instructions

MC-Taco (question generation), QASC (question generation), Quoref, Winogrande, CosmosQA, MultiRC, MC-TACO (Incorrect answer generation for duration and transcience questions), QASC (overlapping words), Essential terms

In [109]:
# Download natural instructions

def transform_example(example):
    # Very simple transformation of the example
    return example["input"] + "\n" + example["output"]

NQ_REWRITE_TASKS = ['task003_mctaco_question_generation_event_duration.json', 
           'task040_qasc_question_generation.json',
           'task002_quoref_answer_generation.json',
           'task033_winogrande_answer_generation.json',
           'task024_cosmosqa_answer_generation.json',
           'task056_multirc_classify_correct_answer.json',
           'task005_mctaco_wrong_answer_generation_event_duration.json', 
           'task008_mctaco_wrong_answer_generation_transient_stationary.json',
           'task039_qasc_find_overlapping_words.json',
           'task044_essential_terms_identifying_essential_words.json']
for task_name in NQ_REWRITE_TASKS:
    nq_path = "/mmfs1/gscratch/zlab/bparan/projects/cascades/src/natural-instructions/tasks"
    task_path = os.path.join(nq_path, task_name)
    task_data = json.loads(open(task_path).read())
    instances = []
    for instance in task_data["Instances"]:
        instances.append({"guid": instance['id'], "input": instance["input"], "label": instance["output"][0]})
    instruction = task_data["Definition"]
    examples = ""
    for ex in task_data["Positive Examples"]:
        examples += transform_example(ex)
    
    break

#### Additional promising datasets from BigBench

* Word Sorting
* Word Unscrambling
* Which Wikipedia Edit
* Unnatural incontext learning
* Unit Interpretation
* ...

In [7]:
d = datasets.load_dataset('bigbench', 'word_sorting', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/word_sorting (download: Unknown size, generated: 959.64 KiB, post-processed: Unknown size, total: 959.64 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/word_sorting/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/1900 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1520 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/380 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/word_sorting/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
d = datasets.load_dataset('bigbench', 'word_unscrambling', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
d = datasets.load_dataset('bigbench', 'which_wiki_edit', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/which_wiki_edit (download: Unknown size, generated: 12.08 MiB, post-processed: Unknown size, total: 12.08 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/which_wiki_edit/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/571 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/457 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/114 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/which_wiki_edit/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [35]:
d = datasets.load_dataset('bigbench', 'unnatural_in_context_learning', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [39]:
d = datasets.load_dataset('bigbench', 'unit_interpretation', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [45]:
d = datasets.load_dataset('bigbench', 'understanding_fables', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/understanding_fables (download: Unknown size, generated: 445.17 KiB, post-processed: Unknown size, total: 445.17 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/understanding_fables/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/189 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/37 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/understanding_fables/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [53]:
d = datasets.load_dataset('bigbench', 'tracking_shuffled_objects', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
d = datasets.load_dataset('bigbench', 'temporal_sequences', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/temporal_sequences (download: Unknown size, generated: 1.31 MiB, post-processed: Unknown size, total: 1.31 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/temporal_sequences/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/temporal_sequences/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [27]:
d = datasets.load_dataset('bigbench', 'symbol_interpretation', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/symbol_interpretation (download: Unknown size, generated: 2.19 MiB, post-processed: Unknown size, total: 2.19 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/symbol_interpretation/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/990 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/795 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/195 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/symbol_interpretation/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 09:34:18.109367: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled
2022-10-13 09:34:18.131926: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [90]:
d = datasets.load_dataset('bigbench', 'sufficient_information', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [37]:
d = datasets.load_dataset('bigbench', 'sports_understanding', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/sports_understanding (download: Unknown size, generated: 443.48 KiB, post-processed: Unknown size, total: 443.48 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/sports_understanding/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/986 [00:00<?, ? examples/s]



Generating train split:   0%|          | 0/789 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/197 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/sports_understanding/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [46]:
d = datasets.load_dataset('bigbench', 'snarks', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/snarks (download: Unknown size, generated: 89.50 KiB, post-processed: Unknown size, total: 89.50 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/snarks/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/181 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/145 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/36 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/snarks/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [48]:
d = datasets.load_dataset('bigbench', 'simple_text_editing', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/simple_text_editing (download: Unknown size, generated: 54.52 KiB, post-processed: Unknown size, total: 54.52 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/simple_text_editing/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/47 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/simple_text_editing/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 09:50:32.037137: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [56]:
d = datasets.load_dataset('bigbench', 'sentence_ambiguity', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/sentence_ambiguity (download: Unknown size, generated: 16.07 KiB, post-processed: Unknown size, total: 16.07 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/sentence_ambiguity/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/44 [00:00<?, ? examples/s]

2022-10-13 09:54:48.161775: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/sentence_ambiguity/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 09:54:48.245917: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [69]:
d = datasets.load_dataset('bigbench', 'salient_translation_error_detection', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/salient_translation_error_detection (download: Unknown size, generated: 2.18 MiB, post-processed: Unknown size, total: 2.18 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/salient_translation_error_detection/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/998 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/799 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/199 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/salient_translation_error_detection/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [74]:
d = datasets.load_dataset('bigbench', 'riddle_sense', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/riddle_sense (download: Unknown size, generated: 30.43 KiB, post-processed: Unknown size, total: 30.43 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/riddle_sense/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/49 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/33 [00:00<?, ? examples/s]

2022-10-13 10:15:59.806252: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/riddle_sense/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 10:15:59.889900: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [78]:
d = datasets.load_dataset('bigbench', 'repeat_copy_logic', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/repeat_copy_logic (download: Unknown size, generated: 13.13 KiB, post-processed: Unknown size, total: 13.13 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/repeat_copy_logic/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/32 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/16 [00:00<?, ? examples/s]

2022-10-13 10:18:25.874866: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/repeat_copy_logic/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 10:18:25.961445: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [80]:
d = datasets.load_dataset('bigbench', 'reasoning_about_colored_objects', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/reasoning_about_colored_objects (download: Unknown size, generated: 1.74 MiB, post-processed: Unknown size, total: 1.74 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/reasoning_about_colored_objects/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1600 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/reasoning_about_colored_objects/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [82]:
d = datasets.load_dataset('bigbench', 'real_or_fake_text', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/real_or_fake_text (download: Unknown size, generated: 102.39 MiB, post-processed: Unknown size, total: 102.39 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/real_or_fake_text/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/15088 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/12072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3016 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/real_or_fake_text/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [84]:
d = datasets.load_dataset('bigbench', 'question_selection', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/question_selection (download: Unknown size, generated: 4.75 MiB, post-processed: Unknown size, total: 4.75 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/question_selection/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/1582 [00:00<?, ? examples/s]



Generating train split:   0%|          | 0/1266 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/316 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/question_selection/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [105]:
d = datasets.load_dataset('bigbench', 'presuppositions_as_nli', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/presuppositions_as_nli (download: Unknown size, generated: 982.28 KiB, post-processed: Unknown size, total: 982.28 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/presuppositions_as_nli/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/735 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/588 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/147 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/presuppositions_as_nli/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [108]:
d = datasets.load_dataset('bigbench', 'physics_questions', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/physics_questions (download: Unknown size, generated: 35.98 KiB, post-processed: Unknown size, total: 35.98 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/physics_questions/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/54 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/physics_questions/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 12:47:28.847296: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [114]:
# physics too
d = datasets.load_dataset('bigbench', 'physical_intuition', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [181]:
# physics too
d = datasets.load_dataset('bigbench', 'penguins_in_a_table', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [131]:
# physics too
d = datasets.load_dataset('bigbench', 'operators', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/operators (download: Unknown size, generated: 63.48 KiB, post-processed: Unknown size, total: 63.48 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/operators/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/210 [00:00<?, ? examples/s]



Generating train split:   0%|          | 0/168 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/42 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/operators/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [133]:
# physics too
d = datasets.load_dataset('bigbench', 'odd_one_out', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/odd_one_out (download: Unknown size, generated: 27.24 KiB, post-processed: Unknown size, total: 27.24 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/odd_one_out/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/86 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/69 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/odd_one_out/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 12:57:57.267219: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [135]:
# physics too
d = datasets.load_dataset('bigbench', 'object_counting', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/object_counting (download: Unknown size, generated: 292.43 KiB, post-processed: Unknown size, total: 292.43 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/object_counting/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/object_counting/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [140]:
# physics too
d = datasets.load_dataset('bigbench', 'novel_concepts', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [143]:
# physics too
d = datasets.load_dataset('bigbench', 'nonsense_words_grammar', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [145]:
# physics too
d = datasets.load_dataset('bigbench', 'navigate', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/navigate (download: Unknown size, generated: 441.84 KiB, post-processed: Unknown size, total: 441.84 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/navigate/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/navigate/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [160]:
# physics too
d = datasets.load_dataset('bigbench', 'modified_arithmetic', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/modified_arithmetic (download: Unknown size, generated: 2.33 MiB, post-processed: Unknown size, total: 2.33 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/modified_arithmetic/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/6000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/4800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/modified_arithmetic/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [165]:
# physics too
d = datasets.load_dataset('bigbench', 'minute_mysteries_qa', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/minute_mysteries_qa (download: Unknown size, generated: 6.19 MiB, post-processed: Unknown size, total: 6.19 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/minute_mysteries_qa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/477 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/383 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/94 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/minute_mysteries_qa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [171]:
# physics too
d = datasets.load_dataset('bigbench', 'metaphor_boolean', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/metaphor_boolean (download: Unknown size, generated: 418.24 KiB, post-processed: Unknown size, total: 418.24 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/metaphor_boolean/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/680 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/136 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/metaphor_boolean/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [178]:
# physics too
d = datasets.load_dataset('bigbench', 'logical_sequence', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/logical_sequence (download: Unknown size, generated: 44.50 KiB, post-processed: Unknown size, total: 44.50 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/logical_sequence/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/39 [00:00<?, ? examples/s]

2022-10-13 14:10:53.155483: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating train split:   0%|          | 0/23 [00:00<?, ? examples/s]

2022-10-13 14:10:53.242602: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/logical_sequence/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-13 14:10:53.324030: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [183]:
# physics too
d = datasets.load_dataset('bigbench', 'logical_fallacy_detection', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/logical_fallacy_detection (download: Unknown size, generated: 1.38 MiB, post-processed: Unknown size, total: 1.38 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/logical_fallacy_detection/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/2800 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/2240 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/560 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/logical_fallacy_detection/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [185]:
# physics too
d = datasets.load_dataset('bigbench', 'logical_deduction', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [187]:
# physics too
d = datasets.load_dataset('bigbench', 'logical_args', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/logical_args (download: Unknown size, generated: 85.24 KiB, post-processed: Unknown size, total: 85.24 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/logical_args/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/32 [00:00<?, ? examples/s]

2022-10-13 20:52:20.466628: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating train split:   0%|          | 0/16 [00:00<?, ? examples/s]

2022-10-13 20:52:20.564030: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

2022-10-13 20:52:20.662655: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/logical_args/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [195]:
# physics too
d = datasets.load_dataset('bigbench', 'linguistics_puzzles', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/linguistics_puzzles (download: Unknown size, generated: 3.33 MiB, post-processed: Unknown size, total: 3.33 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/linguistics_puzzles/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1600 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/linguistics_puzzles/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [201]:
# physics too
d = datasets.load_dataset('bigbench', 'language_games', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [215]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'intent_recognition', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/intent_recognition (download: Unknown size, generated: 631.06 KiB, post-processed: Unknown size, total: 631.06 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/intent_recognition/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/693 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/555 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/138 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/intent_recognition/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [220]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'implicit_relations', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/implicit_relations (download: Unknown size, generated: 156.30 KiB, post-processed: Unknown size, total: 156.30 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/implicit_relations/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/85 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/68 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/implicit_relations/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [222]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'implicatures', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/implicatures (download: Unknown size, generated: 179.50 KiB, post-processed: Unknown size, total: 179.50 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/implicatures/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/492 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/394 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/98 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/implicatures/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [227]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'identify_odd_metaphor', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading builder script:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/284k [00:00<?, ?B/s]

Downloading and preparing dataset bigbench/identify_odd_metaphor (download: Unknown size, generated: 54.04 KiB, post-processed: Unknown size, total: 54.04 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/identify_odd_metaphor/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/47 [00:00<?, ? examples/s]

2022-10-14 20:43:27.791681: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating train split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/identify_odd_metaphor/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-14 20:43:27.967707: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [231]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'hyperbaton', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/hyperbaton (download: Unknown size, generated: 17.93 MiB, post-processed: Unknown size, total: 17.93 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/hyperbaton/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/40000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/hyperbaton/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [259]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'formal_fallacies_syllogisms_negation', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [264]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'few_shot_nlg', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/few_shot_nlg (download: Unknown size, generated: 148.44 KiB, post-processed: Unknown size, total: 148.44 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/few_shot_nlg/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/153 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/123 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/30 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/few_shot_nlg/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [271]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'evaluating_information_essentiality', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/evaluating_information_essentiality (download: Unknown size, generated: 151.52 KiB, post-processed: Unknown size, total: 151.52 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/evaluating_information_essentiality/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/68 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/52 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/evaluating_information_essentiality/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-15 12:47:46.684679: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [273]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'epistemic_reasoning', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/epistemic_reasoning (download: Unknown size, generated: 1.69 MiB, post-processed: Unknown size, total: 1.69 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/epistemic_reasoning/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1600 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/epistemic_reasoning/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [282]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'empirical_judgments', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [290]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'elementary_math_qa', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/elementary_math_qa (download: Unknown size, generated: 25.69 MiB, post-processed: Unknown size, total: 25.69 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/elementary_math_qa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/38160 [00:00<?, ? examples/s]



Generating train split:   0%|          | 0/30531 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7629 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/elementary_math_qa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [297]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'discourse_marker_prediction', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/discourse_marker_prediction (download: Unknown size, generated: 3.99 MiB, post-processed: Unknown size, total: 3.99 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/discourse_marker_prediction/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/857 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/686 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/171 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/discourse_marker_prediction/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [299]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'disambiguation_qa', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [301]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'date_understanding', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/date_understanding (download: Unknown size, generated: 186.06 KiB, post-processed: Unknown size, total: 186.06 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/date_understanding/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/369 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/296 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/73 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/date_understanding/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [306]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'cs_algorithms', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/cs_algorithms (download: Unknown size, generated: 533.76 KiB, post-processed: Unknown size, total: 533.76 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/cs_algorithms/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/1320 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1056 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/264 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/cs_algorithms/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [310]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'crash_blossom', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/crash_blossom (download: Unknown size, generated: 23.93 KiB, post-processed: Unknown size, total: 23.93 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/crash_blossom/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/22 [00:00<?, ? examples/s]

2022-10-15 15:45:16.755606: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/crash_blossom/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-15 15:45:16.841486: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [316]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'contextual_parametric_knowledge_conflicts', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/contextual_parametric_knowledge_conflicts (download: Unknown size, generated: 27.84 MiB, post-processed: Unknown size, total: 27.84 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/contextual_parametric_knowledge_conflicts/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/17528 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/14023 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3505 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/contextual_parametric_knowledge_conflicts/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [319]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'conceptual_combinations', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [321]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'common_morpheme', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']



  0%|          | 0/3 [00:00<?, ?it/s]

In [322]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'codenames', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/codenames (download: Unknown size, generated: 49.31 KiB, post-processed: Unknown size, total: 49.31 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/codenames/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/85 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/68 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/codenames/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-15 15:49:50.762256: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [323]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'bridging_anaphora_resolution_barqa', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/bridging_anaphora_resolution_barqa (download: Unknown size, generated: 3.76 MiB, post-processed: Unknown size, total: 3.76 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/bridging_anaphora_resolution_barqa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/648 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/519 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/129 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/bridging_anaphora_resolution_barqa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [323]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'analogical_similarity', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/bridging_anaphora_resolution_barqa (download: Unknown size, generated: 3.76 MiB, post-processed: Unknown size, total: 3.76 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/bridging_anaphora_resolution_barqa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/648 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/519 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/129 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/bridging_anaphora_resolution_barqa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [323]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'abstract_narrative_understanding', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/bridging_anaphora_resolution_barqa (download: Unknown size, generated: 3.76 MiB, post-processed: Unknown size, total: 3.76 MiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/bridging_anaphora_resolution_barqa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/648 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/519 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/129 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/bridging_anaphora_resolution_barqa/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [325]:
# international_phonetic_alphabet_transliterate too 
d = datasets.load_dataset('bigbench', 'auto_debugging', cache_dir=cache_dir)
inputs = d['train']['inputs'] + d['validation']['inputs']
# inputs = [x.split('\n')[0] for x in inputs]
labels = d['train']['targets'] + d['validation']['targets']

Downloading and preparing dataset bigbench/auto_debugging (download: Unknown size, generated: 10.08 KiB, post-processed: Unknown size, total: 10.08 KiB) to /gscratch/zlab/bparan/projects/cascades/data/bigbench/auto_debugging/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0...


Generating default split:   0%|          | 0/34 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset bigbench downloaded and prepared to /gscratch/zlab/bparan/projects/cascades/data/bigbench/auto_debugging/1.0.0/7d2f6e537fa937dfaac8b1c1df782f2055071d3fd8e4f4ae93d28012a354ced0. Subsequent calls will reuse this data.


2022-10-15 15:52:31.091462: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


  0%|          | 0/3 [00:00<?, ?it/s]

In [326]:
for i in range(10):
    print(inputs[i])
    print(labels[i])
    print('-'*10 + "\n")

```
if x < 5:
	pass
```
What error does this program surface?
["NameError: name 'x' is not defined"]
----------

```
d = {1: 'hello', 2: 'bye', 3: 'good morning'}
str_ = ''
for i in range(len(d)):
	if i % 2 == 1:
		str_ += d[i]
```
What is the value of str_ after this program executes?
['hellogood morning']
----------

```
class MyClass():
	def __init__(self, param):
		self.param = param
x = MyClass(5)
y = MyClass(x.param)
```
What is the value of x.param at the end of this program?
['5']
----------

```
def sq(x):
	return x*x
y = sq(5)
x = sq(3)
```
What is the value of x at the end of this program?
['9']
----------

```
class MyClass():
	def __init__(self):
		print(param)
x = MyClass(5)
```
What is the value of x.param at the end of this program?
['None']
----------

```
while True
	print('hello world')
```
What type of exception does this program produce?
['SyntaxError: invalid syntax']
----------

```
sum = 0
for i in range(100):
	sum += i
```
What is the value of sum immediately a

In [153]:
len(inputs)

7854