<a href="https://colab.research.google.com/github/aswin-t/petl-athiruve-hanmaegeo-raulmy/blob/main/notebooks/text_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text-to-Text Formatting of Datasets

## Imports

In [1]:
!pip install datasets --quiet
!pip install huggingface_hub --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from datasets import concatenate_datasets, load_dataset, Dataset, DatasetDict
import pandas as pd

## Helper Classes

In [6]:
class TextToTextFormat:

  def __init__(self, parent_dataset: str, child_datasets: list[str]):
    self.parent_dataset = parent_dataset
    self.child_datasets = child_datasets

  def run(self):
    processed = list(self.load_and_process())
    combined = DatasetDict()
    for k in processed[0].keys():
      try:
        flattened = concatenate_datasets([d[k] for d in processed])
        combined[k] = flattened
      except KeyError:
        pass
    combined.push_to_hub(f"hanmaegeo/{self.parent_dataset}_text_to_text")

  def load_and_process(self):
    return map(
        lambda d: self.process_dataset(d, load_dataset(self.parent_dataset, d)),
        self.child_datasets)

  def process_dataset(self, dataset_label: str, dataset):
    res_dataset = dataset.map(
        lambda e: self.text_to_text_format(dataset_label, e))
    
    for k in res_dataset.keys():
      cols_to_remove = res_dataset[k].column_names
      cols_to_remove = [c for c in cols_to_remove if c not in ['input', 'target']]
      res_dataset[k] = res_dataset[k].remove_columns(cols_to_remove)
    
    return res_dataset

  def label_lookup(self, child_dataset: str):
    lookup = None
    if self.parent_dataset == 'glue':
      if child_dataset in ['rte', 'qnli']:
        lookup = {0: 'entailment', 1: 'not entailment', -1: 'test'}
      elif child_dataset in ['wnli']:
        lookup = {0: 'not entailment', 1: 'entailment', -1: 'test'}
      elif child_dataset in ['cola']:
        lookup = {0: 'unacceptable', 1: 'acceptable', -1: 'test'}
      elif child_dataset in ['mnli_matched', 'mnli_mismatched']:
        lookup = {0: 'entailment', 1: 'neutral', 2: 'contradiction', -1: 'test'}
      elif child_dataset == 'sst2':
        lookup = {0: 'negative', 1: 'positive', -1: 'test'}
      elif child_dataset == 'qqp':
        lookup = {0: 'not duplicate', 1: 'duplicate', -1: 'test'}
      elif child_dataset == 'mrpc':
        lookup = {0: 'not equivalent', 1: 'equivalent', -1: 'test'}
    elif self.parent_dataset == 'super_glue':
      if child_dataset in ['axb', 'axg', 'rte']:
        lookup = {0: 'entailment', 1: 'not entailment', -1: 'test'}
      elif child_dataset in ['cb']:
        lookup = {0: 'entailment', 1: 'contradiction', 2: 'neutral', -1: 'test'}
      elif child_dataset in ['boolq', 'multirc', 'wic', 'wsc']:
        lookup = {0: 'false', 1: 'true', -1: 'test'}
      elif child_dataset in ['copa']:
        lookup = {0: 'choice1', 1: 'choice2', -1: 'test'}

    return lookup

  def text_to_text_format(self, child_dataset: str, entry):
    if self.parent_dataset == 'glue':
      if child_dataset in ['cola', 'sst2']:
        this_input = f"sentence: {entry['sentence']}"
      elif child_dataset in ['mrpc', 'stsb', 'rte', 'wnli']:
        this_input = f"sentence1: {entry['sentence1']} sentence2: {entry['sentence2']}"
      elif child_dataset in ['qqp']:
        this_input = f"question1: {entry['question1']} question2: {entry['question2']}"
      elif child_dataset in ['mnli_matched', 'mnli_mismatched']:
        this_input = f"premise: {entry['premise']} hypothesis: {entry['hypothesis']}"
      elif child_dataset in ['qnli']:
        this_input = f"question: {entry['question']} sentence: {entry['sentence']}"
    elif self.parent_dataset == 'super_glue':
      if child_dataset in ['axb']:
        this_input = f"sentence1: {entry['sentence1']} sentence2: {entry['sentence2']}"
      elif child_dataset in ['axg', 'cb', 'rte']:
        this_input = f"premise: {entry['premise']} hypothesis: {entry['hypothesis']}"
      elif child_dataset in ['boolq']:
        this_input = f"question: {entry['question']} passage: {entry['passage']}"
      elif child_dataset in ['copa']:
        this_input = f"premise: {entry['premise']} choice1: {entry['choice1']} choice2: {entry['choice2']} question: {entry['question']}"
      elif child_dataset in ['multirc']:
        this_input = f"paragraph: {entry['paragraph']} question: {entry['question']} answer: {entry['answer']}"
      elif child_dataset in ['record']:
        this_input = f"passage: {entry['passage']} query: {entry['query']} entities: {str(entry['entities'])} entity_spans: {str(entry['entity_spans'])}"
      elif child_dataset in ['wic']:
        this_input = f"word: {entry['word']} sentence1: {entry['sentence1']} sentence2: {entry['sentence2']} start1: {entry['start1']} start2: {entry['start2']} end1: {entry['end1']} end2: {entry['end2']}"
      elif child_dataset in ['wsc']:
        this_input = f"text: {entry['text']} span1_index: {entry['span1_index']} span2_index: {entry['span2_index']} span1_text: {entry['span1_text']} span2_text: {entry['span2_text']}"

    if child_dataset == 'stsb':
      this_target = str(entry['label'])
    elif child_dataset == 'record':
      this_target = str(entry['answers'])
    else:
      this_target = self.label_lookup(child_dataset)[entry['label']]
      
    return {'input': this_input, 'target': this_target}



## Glue

In [7]:
glue_datasets = ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli_matched',
                 'mnli_mismatched', 'qnli', 'rte', 'wnli']

TextToTextFormat('glue', glue_datasets).run()



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/104743 [00:00<?, ? examples/s]

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/635 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/70 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/426 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/471 [00:00<?, ?B/s]

## Super Glue

In [8]:
super_glue_datasets = ['axb', 'axg', 'boolq', 'cb', 'copa', 'multirc', 'record',
                       'rte', 'wic', 'wsc']

TextToTextFormat('super_glue', super_glue_datasets).run()

Downloading builder script:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/38.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.8k [00:00<?, ?B/s]

Downloading and preparing dataset super_glue/axb to /root/.cache/huggingface/datasets/super_glue/axb/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/34.0k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1104 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/axb/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1104 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/axg to /root/.cache/huggingface/datasets/super_glue/axg/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/356 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/axg/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/boolq to /root/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/4.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/cb to /root/.cache/huggingface/datasets/super_glue/cb/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/75.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/56 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/cb/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/copa to /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/44.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/multirc to /root/.cache/huggingface/datasets/super_glue/multirc/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27243 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4848 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9693 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/multirc/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/27243 [00:00<?, ? examples/s]

Map:   0%|          | 0/4848 [00:00<?, ? examples/s]

Map:   0%|          | 0/9693 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/record to /root/.cache/huggingface/datasets/super_glue/record/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/51.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100730 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/record/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/100730 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/rte to /root/.cache/huggingface/datasets/super_glue/rte/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/rte/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/wic to /root/.cache/huggingface/datasets/super_glue/wic/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/396k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5428 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/638 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/wic/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/5428 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Downloading and preparing dataset super_glue/wsc to /root/.cache/huggingface/datasets/super_glue/wsc/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/554 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/104 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/146 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/wsc/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/554 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]