In [2]:
import os

# Directory used as the Hugging Face datasets cache; it is also passed
# explicitly as `cache_dir=` by the loading code in this notebook.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
CACHE_DIR = "/share/edc/home/antonis/datasets/huggingface"

# The `datasets` package resolves HF_DATASETS_CACHE when it is first imported,
# so the variable has to be exported *before* the imports below; setting it
# afterwards has no effect on the already-imported library.
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR

from datasets import load_dataset, load_from_disk, concatenate_datasets
from promptsource import templates

# # Get a list of all supported datasets
# datasets = templates.get_dataset_names()
# print(datasets)

In [1]:
import os
import json
import logging
from datasets import load_dataset, Dataset
from promptsource.templates import DatasetTemplates, TemplateCollection
# Emit INFO-level messages from this notebook's logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# When no handler exists anywhere on the logger hierarchy, the logging module
# falls back to its last-resort handler, which only emits WARNING and above,
# so logger.info(...) calls would be silently dropped. Attach a stream handler
# once; the hasHandlers() guard keeps re-running this cell (or an already
# configured root logger) from producing duplicate output.
if not logger.hasHandlers():
    logger.addHandler(logging.StreamHandler())

# Datasets making up the mixture, as (dataset name, subset/config name) pairs;
# a subset of None means the dataset is loaded without a config name.
# get_T0MixtureDatasets passes each pair to load_dataset and DatasetTemplates.
# Only the uncommented entries are loaded; a trailing comment marks the task
# category that starts at that entry.
# NOTE(review): the identifier is spelled with the letter "O" (TOMixture), not
# the digit zero used in get_T0MixtureDatasets -- presumably unintentional, but
# renaming it here would break the function that reads it.
TOMixture = [
    # ("glue","mrpc"), # Paraphrase identification
    # ("glue","qqp"),
    # ("paws","labeled_final"),
    # ("kilt_tasks", "hotpotqa"), # Closed-book QA
    # ("wiki_qa", None),
    # ("adversarial_qa", "dbidaf"), # Extractive QA
    # ("adversarial_qa","dbert"),
    # ("adversarial_qa","droberta"),
    # ("duorc","SelfRC"),
    # ("duorc","ParaphraseRC"),
    # ("ropes",None),
    # ("quoref",None),
    # ("cos_e","v1.11"), # Multiple-choice QA
    # ("cosmos_qa",None),
    # ("dream",None),
    # ("qasc",None),
    # ("quail",None),
    # ("quarel",None),
    # ("quartz",None),
    # ("sciq",None),
    # ("social_i_qa",None),
    # ("wiki_hop","original"),
    # ("wiqa",None),
    # ("amazon_polarity",None), # Sentiment
    # ("app_reviews",None),
    ("sst","default"), # Sentiment classification
    ("imdb",None),
    # ("rotten_tomatoes",None),
    # ("yelp_review_full",None),
    # ("common_gen",None), # Structure-to-text
    # ("wiki_bio",None),
    # ("cnn_dailymail","3.0.0"), # Summarization
    # ("gigaword",None),
    # ("multi_news",None),
    # ("samsum",None),
    # ("xsum",None),
    # ("ag_news",None), # Topic Classification
    # ("dbpedia_14",None),
    # ("trec",None),
]

def get_dataset_name(name: str, subset: str):
    """Return the canonical identifier of a dataset.

    The result is ``"<name>/<subset>"``, or just ``name`` when ``subset`` is None.
    """
    if subset is None:
        return name
    return f"{name}/{subset}"

def get_T0MixtureDatasets(split, max_samples=None, return_as_dict=True):
    """
    Load every dataset listed in ``TOMixture`` together with its promptsource templates.

    Each dataset is loaded separately (they are not concatenated). The list of
    templates and the canonical name are attached to each dataset object as
    the ad-hoc attributes ``templates`` and ``name``.

    Args:
        split: Split passed through to ``load_dataset`` (e.g. ``"test"``).
        max_samples: If truthy, keep only the rows selected by the slice
            ``[:max_samples]`` of each dataset (so a value larger than the
            dataset keeps everything). ``None`` or ``0`` disables truncation.
        return_as_dict: When true, return a dict keyed by the canonical name
            from ``get_dataset_name``; otherwise return a list in
            ``TOMixture`` order.

    Returns:
        A dict or list (see ``return_as_dict``) of the loaded datasets.

    Raises:
        AssertionError: If a dataset has no promptsource templates.
    """
    datasets = {} if return_as_dict else []
    for name, subset in TOMixture:
        canonical_name = get_dataset_name(name, subset)
        dataset = load_dataset(name, subset, split=split, cache_dir=CACHE_DIR)
        if max_samples:
            # select() keeps the dataset's declared features and metadata,
            # which rebuilding it via Dataset.from_dict(dataset[:n]) discards.
            # Slicing the range reproduces plain slice semantics (clamping to
            # the dataset length, negative values) of the previous code.
            dataset = dataset.select(range(len(dataset))[:max_samples])

        templates = list(DatasetTemplates(name, subset).templates.values())
        assert len(templates) > 0, "No templates"

        # Plain Python attributes set on this object only.
        # NOTE(review): datasets derived from this one (e.g. via map/select)
        # presumably will not carry them over -- confirm before relying on it.
        dataset.templates = templates
        dataset.name = canonical_name

        if return_as_dict:
            datasets[canonical_name] = dataset
        else:
            datasets.append(dataset)

        # Log the canonical name so subset-less datasets are not shown as "name/None".
        logger.info(f"Loaded dataset {canonical_name} with {len(templates)} templates")
    return datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import json
import logging
from datasets import load_dataset, Dataset
from promptsource.templates import DatasetTemplates, TemplateCollection
# set logging level to INFO
# from src._promptsource import get_T0MixtureDatasets 
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# When no handler exists anywhere on the logger hierarchy, the logging module
# falls back to its last-resort handler, which only emits WARNING and above,
# so logger.info(...) calls would be silently dropped. Attach a stream handler
# once; the hasHandlers() guard keeps re-running this cell (or an already
# configured root logger) from producing duplicate output.
if not logger.hasHandlers():
    logger.addHandler(logging.StreamHandler())

In [4]:
# Load the "test" split of every dataset in the mixture, truncated to its first
# 1000 rows, as a dict keyed by canonical dataset name.
datasets = get_T0MixtureDatasets("test", max_samples=1000, return_as_dict=True)
# Untruncated "test" splits of the same two datasets, loaded directly.
ds_sst = load_dataset("sst", "default", split="test", cache_dir=CACHE_DIR)
ds_imdb = load_dataset("imdb", split="test", cache_dir=CACHE_DIR)
# ds_yelp_review = load_dataset("yelp_review_full", split="train", cache_dir=CACHE_DIR)
# ds_sentiment140 = load_dataset("sentiment140", split="train", cache_dir=CACHE_DIR)

Found cached dataset sst (/share/edc/home/antonis/datasets/huggingface/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
Found cached dataset imdb (/share/edc/home/antonis/datasets/huggingface/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset sst (/share/edc/home/antonis/datasets/huggingface/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
Found cached dataset imdb (/share/edc/home/antonis/datasets/huggingface/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [8]:
# Load the multi_nli training split and collect its promptsource templates.
name = "multi_nli"
subset = None
dataset = load_dataset(name, subset, split="train", cache_dir=CACHE_DIR)
templates = list(DatasetTemplates(name, subset).templates.values())

Found cached dataset multi_nli (/share/edc/home/antonis/datasets/huggingface/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [9]:
# List the attributes stored on the first template, then split its
# answer-choices string into the individual options.
print(vars(templates[0]).keys())
templates[0].answer_choices.split(" ||| ")

dict_keys(['answer_choices', 'id', 'jinja', 'metadata', 'name', 'reference'])


['True', 'Inconclusive', 'False']

In [11]:
import random

# Draw from a dedicated, seeded generator so the sampled template -- and
# therefore the rendered prompt -- is the same on every run, without touching
# the global random state. Change TEMPLATE_SEED to look at a different template.
TEMPLATE_SEED = 0
template = random.Random(TEMPLATE_SEED).choice(templates)
# Render the first example of `dataset` with the sampled template.
prompt, answer = template.apply(dataset[0])

In [12]:
# Display the (prompt, answer) pair that template.apply returned for dataset[0].
prompt, answer

('Conceptually cream skimming has two basic dimensions - product and geography. Are we justified in saying that "Product and geography are what make cream skimming work. "? Yes, no, or maybe?',
 'Maybe')