In [1]:
# !pip install seqio
# !pip install t5

In [2]:
# !pip install -U huggingface_hub
# !pip install -U ipywidgets

In [1]:
import seqio
import functools
import torch
from datasets import Dataset
import t5.data
from t5.data import postprocessors
from t5.data import preprocessors
from t5.data.glue_utils import get_glue_metric
from t5.data.glue_utils import get_glue_postprocess_fn
from t5.data.glue_utils import get_glue_text_preprocessor
from t5.data.glue_utils import get_super_glue_metric
from t5.evaluation import metrics
import tensorflow_datasets as tfds
from t5.models import utils as model_utils
import gin
from absl import logging
from datasets import load_dataset,concatenate_datasets,DatasetDict

In [2]:
tfds.__version__

'4.5.2+nightly'

In [3]:
tfds.__version__

'4.5.2+nightly'

In [4]:
# Shorthand used by every registration in this cell.
TaskRegistry = seqio.TaskRegistry

# Feature spec shared by all tasks below: both features use the default T5
# vocabulary with add_eos=True; only "inputs" is explicitly marked as not
# required.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=seqio.Feature(
        required=False,
        add_eos=True,
        vocabulary=t5.data.get_default_vocabulary()),
    targets=seqio.Feature(
        add_eos=True,
        vocabulary=t5.data.get_default_vocabulary()),
)

# ==================================== C4 ======================================
# Span-corruption objective on C4: the final pretraining task used in
# Raffel et al., 2019.
TaskRegistry.add(
    "c4_v220_span_corruption",
    metric_fns=[],
    output_features=DEFAULT_OUTPUT_FEATURES,
    source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
    preprocessors=[
        # key_map pairs "targets" with the source "text" field and sets
        # "inputs" to None.
        functools.partial(
            preprocessors.rekey, key_map=dict(inputs=None, targets="text")),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.span_corruption,
        seqio.preprocessors.append_eos_after_trim,
    ],
)


# i.i.d. denoising objective on C4: the baseline pretraining task used in
# Raffel et al., 2019.
TaskRegistry.add(
    "c4_v220_iid_denoising",
    metric_fns=[],
    output_features=DEFAULT_OUTPUT_FEATURES,
    source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
    preprocessors=[
        # key_map pairs "targets" with the source "text" field and sets
        # "inputs" to None.
        functools.partial(
            preprocessors.rekey, key_map=dict(inputs=None, targets="text")),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.iid_denoising,
        seqio.preprocessors.append_eos_after_trim,
    ],
)


# Prefix language modeling objective on C4, a pretraining task used in
# Raffel et al., 2019.
TaskRegistry.add(
    "c4_v220_prefix_lm",
    metric_fns=[],
    output_features=DEFAULT_OUTPUT_FEATURES,
    source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
    preprocessors=[
        # key_map pairs "targets" with the source "text" field and sets
        # "inputs" to None.
        functools.partial(
            preprocessors.rekey, key_map=dict(inputs=None, targets="text")),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.prefix_lm,
        seqio.preprocessors.append_eos_after_trim,
    ],
)


# Configurable tasks used for comparisons in Raffel et al., 2019: one
# "unsupervised" task per C4 TFDS config listed below.
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
    TaskRegistry.add(
        # Dots become underscores in the task name, e.g. ".noclean" gives
        # "c4_noclean_v020_unsupervised".
        f"c4{config_suffix.replace('.', '_')}_v020_unsupervised",
        metric_fns=[],
        output_features=DEFAULT_OUTPUT_FEATURES,
        source=seqio.TfdsDataSource(tfds_name=f"c4/en{config_suffix}:2.2.0"),
        preprocessors=[
            functools.partial(
                preprocessors.rekey, key_map=dict(inputs=None, targets="text")),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            preprocessors.unsupervised,
            seqio.preprocessors.append_eos_after_trim,
        ],
    )


# ================================ Wikipedia ===================================
# Unsupervised task over the English Wikipedia dump "20190301.en".
TaskRegistry.add(
    "wikipedia_20190301.en_v003_unsupervised",
    metric_fns=[],
    output_features=DEFAULT_OUTPUT_FEATURES,
    source=seqio.TfdsDataSource(tfds_name="wikipedia/20190301.en:1.0.0"),
    preprocessors=[
        # key_map pairs "targets" with the source "text" field and sets
        # "inputs" to None.
        functools.partial(
            preprocessors.rekey, key_map=dict(inputs=None, targets="text")),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.unsupervised,
        seqio.preprocessors.append_eos_after_trim,
    ],
)


# =================================== GLUE =====================================
# Register one task per TFDS GLUE builder config; the text preprocessor,
# metrics and postprocessor are all looked up from the config.
for b in tfds.text.glue.Glue.builder_configs.values():
    TaskRegistry.add(
        f"glue_{b.name}_v002",
        source=seqio.TfdsDataSource(
            tfds_name=f"glue/{b.name}:2.0.0",
            # "ax" is restricted to its "test" split; every other config
            # passes splits=None.
            splits=None if b.name != "ax" else ["test"]),
        preprocessors=[
            get_glue_text_preprocessor(b),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            seqio.preprocessors.append_eos_after_trim,
        ],
        postprocess_fn=get_glue_postprocess_fn(b),
        metric_fns=get_glue_metric(b.name),
        output_features=DEFAULT_OUTPUT_FEATURES,
    )

# =============================== CNN DailyMail ================================
# Summarization task scored with ROUGE; the summarize preprocessor reads the
# "article" field as the article and "highlights" as the summary.
TaskRegistry.add(
    "cnn_dailymail_v002",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.rouge],
    source=seqio.TfdsDataSource(tfds_name="cnn_dailymail:3.1.0"),
    preprocessors=[
        functools.partial(
            preprocessors.summarize,
            summary_key="highlights",
            article_key="article"),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# ==================================== WMT =====================================
# Each entry is (year, tfds builder config, tfds version).
b_configs = [
    ("14", tfds.translate.wmt14.Wmt14Translate.builder_configs["de-en"], "1.0.0"),
    ("14", tfds.translate.wmt14.Wmt14Translate.builder_configs["fr-en"], "1.0.0"),
    ("16", tfds.translate.wmt16.Wmt16Translate.builder_configs["ro-en"], "1.0.0"),
    ("15", tfds.translate.wmt15.Wmt15Translate.builder_configs["fr-en"], "1.0.0"),
    ("19", tfds.translate.wmt19.Wmt19Translate.builder_configs["de-en"], "1.0.0"),
]

# One BLEU-scored translation task per entry. The source language is
# language_pair[1] and the target is language_pair[0], and the task name
# concatenates them in that order (e.g. "wmt14_ende_v003").
for prefix, b, tfds_version in b_configs:
    TaskRegistry.add(
        f"wmt{prefix}_{b.language_pair[1]}{b.language_pair[0]}_v003",
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[metrics.bleu],
        source=seqio.TfdsDataSource(
            tfds_name=f"wmt{prefix}_translate/{b.name}:{tfds_version}"),
        preprocessors=[
            functools.partial(
                preprocessors.translate,
                target_language=b.language_pair[0],
                source_language=b.language_pair[1],
            ),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            seqio.preprocessors.append_eos_after_trim,
        ],
    )

# Special case for t2t ende: same pipeline as the WMT tasks above, but built
# from the wmt_t2t "de-en" builder config.
b = tfds.translate.wmt_t2t.WmtT2tTranslate.builder_configs["de-en"]
TaskRegistry.add(
    "wmt_t2t_ende_v003",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.bleu],
    source=seqio.TfdsDataSource(tfds_name="wmt_t2t_translate/de-en:1.0.0"),
    preprocessors=[
        functools.partial(
            preprocessors.translate,
            target_language=b.language_pair[0],
            source_language=b.language_pair[1]),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# ================================= SuperGlue ==================================
for b in tfds.text.super_glue.SuperGlue.builder_configs.values():
    # WSC configs are skipped here; a simplified version of WSC is registered
    # further down in this cell.
    if "wsc" in b.name:
        continue

    # Pipeline shared by every remaining config.
    glue_preprocessors = [
        get_glue_text_preprocessor(b),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ]
    if b.name == "axb":
        # axb gets an extra rekey step in front of the shared pipeline. The
        # key_map pairs premise/hypothesis with sentence1/sentence2 and keeps
        # label and idx under their own names.
        glue_preprocessors.insert(
            0,
            functools.partial(
                preprocessors.rekey,
                key_map=dict(
                    premise="sentence1",
                    hypothesis="sentence2",
                    label="label",
                    idx="idx",
                )))

    TaskRegistry.add(
        f"super_glue_{b.name}_v102",
        source=seqio.TfdsDataSource(
            tfds_name=f"super_glue/{b.name}:1.0.2",
            # The two diagnostic configs are restricted to their "test" split.
            splits=["test"] if b.name in ("axb", "axg") else None),
        preprocessors=glue_preprocessors,
        postprocess_fn=get_glue_postprocess_fn(b),
        metric_fns=get_super_glue_metric(b.name),
        output_features=DEFAULT_OUTPUT_FEATURES,
    )

    # Also register a variant of the task with 1 sentinel token added.
    seqio.experimental.add_task_with_sentinels(
        f"super_glue_{b.name}_v102", num_sentinels=1)

# ======================== Definite Pronoun Resolution =========================
# DPR task scored with accuracy.
TaskRegistry.add(
    "dpr_v001_simple",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.accuracy],
    source=seqio.TfdsDataSource(tfds_name="definite_pronoun_resolution:1.1.0"),
    preprocessors=[
        preprocessors.definite_pronoun_resolution_simple,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# Also register a variant of the DPR task with 1 sentinel token added.
seqio.experimental.add_task_with_sentinels(
    "dpr_v001_simple", num_sentinels=1)

# =================================== WSC ======================================
# Simplified WSC, training side: "train" split only, wsc_simple called with
# correct_referent_only=True, and no metrics attached.
TaskRegistry.add(
    "super_glue_wsc_v102_simple_train",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[],
    source=seqio.TfdsDataSource(
        splits=["train"], tfds_name="super_glue/wsc.fixed:1.0.2"),
    preprocessors=[
        functools.partial(preprocessors.wsc_simple, correct_referent_only=True),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# Also register a variant of the training task with 1 sentinel token added.
seqio.experimental.add_task_with_sentinels(
    "super_glue_wsc_v102_simple_train", num_sentinels=1)

# Simplified WSC, evaluation side: "validation" and "test" splits, wsc_simple
# called with correct_referent_only=False, scored with accuracy after the
# wsc_simple postprocessor.
TaskRegistry.add(
    "super_glue_wsc_v102_simple_eval",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.accuracy],
    postprocess_fn=postprocessors.wsc_simple,
    source=seqio.TfdsDataSource(
        splits=["validation", "test"], tfds_name="super_glue/wsc.fixed:1.0.2"),
    preprocessors=[
        functools.partial(
            preprocessors.wsc_simple, correct_referent_only=False),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# Also register a variant of the evaluation task with 1 sentinel token added.
seqio.experimental.add_task_with_sentinels(
    "super_glue_wsc_v102_simple_eval", num_sentinels=1)

# =================================== WNLI =====================================
# WNLI evaluated through the simplified-WSC pipeline: "validation" and "test"
# splits, wnli_simple preprocessing, wsc_simple postprocessing, accuracy metric.
TaskRegistry.add(
    "glue_wnli_v002_simple_eval",
    source=seqio.TfdsDataSource(
        # Pinned to the same TFDS GLUE version (2.0.0) as the "glue_*_v002"
        # tasks registered in the GLUE loop of this cell; this task previously
        # asked for 1.0.0.
        # NOTE(review): assumes the installed TFDS only serves glue 2.0.0, as
        # the GLUE loop implies -- confirm against the installed TFDS.
        tfds_name="glue/wnli:2.0.0", splits=["validation", "test"]),
    preprocessors=[
        preprocessors.wnli_simple,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    postprocess_fn=postprocessors.wsc_simple,
    metric_fns=[metrics.accuracy],
    output_features=DEFAULT_OUTPUT_FEATURES)

# =================================== Squad ====================================
# SQuAD v1.1 with evaluation metrics maximized over all answers.
TaskRegistry.add(
    "squad_v010_allanswers",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.squad],
    postprocess_fn=postprocessors.qa,
    source=seqio.TfdsDataSource(tfds_name="squad/v1.1:3.0.0"),
    preprocessors=[
        preprocessors.squad,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)


# Context-free variant: same as "squad_v010_allanswers" except that the squad
# preprocessor is called with include_context=False.
TaskRegistry.add(
    "squad_v010_context_free",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.squad],
    postprocess_fn=postprocessors.qa,
    source=seqio.TfdsDataSource(tfds_name="squad/v1.1:3.0.0"),
    preprocessors=[
        functools.partial(preprocessors.squad, include_context=False),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# SQuAD cast as span prediction instead of text generation: uses the
# span-specific preprocessor, postprocessor and metric.
TaskRegistry.add(
    "squad_v010_allanswers_span",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.span_squad],
    postprocess_fn=postprocessors.span_qa,
    source=seqio.TfdsDataSource(tfds_name="squad/v1.1:3.0.0"),
    preprocessors=[
        preprocessors.squad_span_space_tokenized,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# Deprecated: use `squad_v010_allanswers` instead. This task has the same
# source and preprocessors but sets no postprocess_fn.
TaskRegistry.add(
    "squad_v010",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.squad],
    source=seqio.TfdsDataSource(tfds_name="squad/v1.1:3.0.0"),
    preprocessors=[
        preprocessors.squad,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
)

# ================================= TriviaQA ===================================
# TriviaQA ("rc" config) with no metrics attached; inputs are truncated by
# trivia_qa_truncate_inputs after the cache placeholder.
TaskRegistry.add(
    "trivia_qa_v010",
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[],
    source=seqio.TfdsDataSource(tfds_name="trivia_qa/rc:1.1.0"),
    preprocessors=[
        preprocessors.trivia_qa,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.trivia_qa_truncate_inputs,
        seqio.preprocessors.append_eos_after_trim,
    ],
)


# =============== PrefixLM objectives (not used in the T5 paper) ===============


# # Vocabulary (shared by encoder and decoder)
# sentencepiece_model_file = "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"

# vocab = seqio.SentencePieceVocabulary(sentencepiece_model_file)

# seqio.TaskRegistry.add(
#     "c4_prefix_lm_objective_encoder_decoder_architecture",
#     source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
#     preprocessors=[
#         functools.partial(
#             preprocessors.rekey, key_map={
#                 "inputs": None,
#                 "targets": "text"
#             }),
#         seqio.preprocessors.tokenize,
#         seqio.CacheDatasetPlaceholder(),
#         preprocessors.targets_for_prefix_lm_objective,
#         preprocessors.pack_prefix_lm_encoder_decoder,
#     ],
#     output_features={
#         "encoder_input_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_target_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_input_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "encoder_segment_ids": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "encoder_positions": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_segment_ids": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_positions": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_loss_weights": seqio.Feature(vocabulary=vocab, add_eos=False),
#         # All but the last stage of the preprocessing uses "targets" as the key,
#         # so this output feature is necessary. It is not marked required because
#         # the final preprocessor drops it.
#         "targets": seqio.Feature(vocabulary=vocab, required=False),
#     },
#     metric_fns=[])


# seqio.TaskRegistry.add(
#     "c4_prefix_lm_objective_decoder_architecture",
#     source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
#     preprocessors=[
#         functools.partial(
#             preprocessors.rekey, key_map={
#                 "inputs": None,
#                 "targets": "text"
#             }),
#         seqio.preprocessors.tokenize,
#         seqio.CacheDatasetPlaceholder(),
#         preprocessors.targets_for_prefix_lm_objective,
#         preprocessors.pack_prefix_lm_decoder_only,
#     ],
#     output_features={
#         "decoder_target_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_input_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_loss_weights": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_causal_attention": seqio.Feature(
#             vocabulary=vocab, add_eos=False),
#         # All but the last stage of the preprocessing uses "targets" as the key,
#         # so this output feature is necessary. It is not marked required because
#         # the final preprocessor drops it.
#         "targets": seqio.Feature(vocabulary=vocab, required=False),
#     },
#     metric_fns=[])


# TaskRegistry.add(
#     "c4_v220_full_lm",
#     source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
#     preprocessors=[
#         functools.partial(
#             preprocessors.rekey, key_map={
#                 "inputs": None,
#                 "targets": "text"
#             }),
#         seqio.preprocessors.tokenize,
#         seqio.CacheDatasetPlaceholder(),
#         preprocessors.full_lm,
#     ],
#     output_features={
#         "targets": seqio.Feature(vocabulary=vocab, add_eos=True)
#     },
#     metric_fns=[])

<seqio.dataset_providers.Task at 0x7f9cacfd5340>

# EDA

In [6]:
# Load shard 0 of 10 of the MultiRC training split for inspection.
# Other task names tried in this cell: glue_mnli_v002, super_glue_cb_v102.
dataset = seqio.get_mixture_or_task("super_glue_multirc_v102").get_dataset(
    split="train",
    sequence_length=dict(inputs=256, targets=128),
    shard_info=seqio.ShardInfo(num_shards=10, index=0),
    use_cached=False,
    num_epochs=1,
    shuffle=True,
    seed=42,
)

iterator = dataset.as_numpy_iterator()

2022-03-19 07:15:06.901175: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".
2022-03-19 07:15:07.665418: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-19 07:15:07.667788: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-19 07:15:07.668081: I tensorflow/stream_executor/cuda/cuda_gpu_executor.

In [7]:
# Print the keys and contents of the first example only; the `break` stops the
# loop after one item, so the rest of `iterator` is left unconsumed.
for i, ex in enumerate(iterator):
    print(ex.keys())
    print(ex)
    break

dict_keys(['idx/paragraph', 'idx/question', 'idx/answer', 'inputs_pretokenized', 'inputs', 'targets_pretokenized', 'targets'])
{'idx/paragraph': 134, 'idx/question': 1523, 'idx/answer': 7935, 'inputs_pretokenized': b"multirc question: Where was Toodles when the duck bit his ear? answer: In the water paragraph: Once upon a time I had a dog named Toodles. He was black and white and had long floppy ears. He also had very short legs, but really big paws. Every Saturday we would go to the park and play Toodles' favorite game. Toodles loved playing fetch. One Saturday, Toodles ran over to the pond because he saw ducks swimming there. He ran all around the pond, barking at the ducks. The ducks ignored him, and kept swimming. Toodles wasn't having it! He jumped into the pond and started swimming toward the ducks, chasing around his new playmates. One of the ducks, braver than the others, poked Toodles with his beak - and then bit him right on one of his floppy ears! Toodles barked and ran out 

In [54]:
# Load shard 0 of 10 of the simplified WSC training task and show one example.
# Other task names tried in this cell: glue_mnli_v002, super_glue_cb_v102.
#
# NOTE(review): this cell previously requested "super_glue_wsc_axg_n", a name
# that no cell in this notebook registers, so it fails on a fresh kernel. The
# recorded output (a "wsc: ..." example with `label` and `idx` keys from the
# train split) is consistent with the registered task used below -- confirm
# this is the task that was intended.
dataset = seqio.get_mixture_or_task("super_glue_wsc_v102_simple_train").get_dataset(
    sequence_length={"inputs": 256, "targets": 128},
    split="train",
    shuffle=True,
    num_epochs=1,
    shard_info=seqio.ShardInfo(index=0, num_shards=10),
    use_cached=False,
    seed=42
)

iterator = dataset.as_numpy_iterator()

# Print only the first example; the rest of `iterator` stays unconsumed.
for i, ex in enumerate(iterator):
    print(ex.keys())
    print(ex)
    break

dict_keys(['inputs_pretokenized', 'inputs', 'targets_pretokenized', 'targets', 'label', 'idx'])
{'inputs_pretokenized': b'wsc: John hired Bill to take care of *him* .', 'inputs': array([    3,   210,     7,    75,    10,  1079, 10626,  3259,    12,
         240,   124,    13,  1429, 10813,  1935,     3,     5,     1],
      dtype=int32), 'targets_pretokenized': b'John', 'targets': array([1079,    1], dtype=int32), 'label': 1, 'idx': 461}


In [7]:
# Drain whatever is left in `iterator` into three parallel columns. The
# pretokenized fields arrive as bytes and are decoded to str.
dictionary = dict(idx=[], inputs_pretokenized=[], targets_pretokenized=[])
for i, ex in enumerate(iterator):
    dictionary["idx"] += [ex["idx"]]
    dictionary["inputs_pretokenized"] += [ex["inputs_pretokenized"].decode("utf-8")]
    dictionary["targets_pretokenized"] += [ex["targets_pretokenized"].decode("utf-8")]


In [8]:
# Build a Hugging Face Dataset from the collected columns. This rebinds
# `dataset`, which earlier cells assign from seqio's get_dataset().
dataset = Dataset.from_dict(dictionary)

In [9]:
dataset.column_names

['idx', 'inputs_pretokenized', 'targets_pretokenized']

In [10]:
dataset['inputs_pretokenized'][:2]

["mnli hypothesis: You won't learn anything by serving overseas.  premise: because actually when you when you do uh service overseas you end up learning something usually that's that's really useful plumbing or farming or or something like that so you're really learning a skill",
 'mnli hypothesis: This is a stringed instrument created by the ancient Aztecs. premise: The geiro, a percussion instrument made of a notched dried gourd, was developed by the Taano Indians.']

In [11]:
dataset['targets_pretokenized'][:2]

['contradiction', 'contradiction']

In [48]:
# Load shard 0 of 10 of the MNLI training split. Both sequence lengths are set
# to 1 here.
# Other task name tried in this cell: super_glue_cb_v102.
dataset = seqio.get_mixture_or_task("glue_mnli_v002").get_dataset(
    split="train",
    sequence_length=dict(inputs=1, targets=1),
    shard_info=seqio.ShardInfo(num_shards=10, index=0),
    use_cached=False,
    num_epochs=1,
    shuffle=True,
    seed=42,
)

iterator = dataset.as_numpy_iterator()

ValueError: No Task or Mixture found with name 'glue_wic_v002'. Available:
 - c4_noclean_v020_unsupervised
 - c4_realnewslike_v020_unsupervised
 - c4_v020_unsupervised
 - c4_v220_iid_denoising
 - c4_v220_prefix_lm
 - c4_v220_span_corruption
 - c4_webtextlike_v020_unsupervised
 - cnn_dailymail_v002
 - dpr_v001_simple
 - dpr_v001_simple_1_sentinel
 - glue_ax_v002
 - glue_cola_v002
 - glue_mnli_matched_v002
 - glue_mnli_mismatched_v002
 - glue_mnli_v002
 - glue_mrpc_v002
 - glue_qnli_v002
 - glue_qqp_v002
 - glue_rte_v002
 - glue_sst2_v002
 - glue_stsb_v002
 - glue_wnli_v002
 - glue_wnli_v002_simple_eval
 - squad_v010
 - squad_v010_allanswers
 - squad_v010_allanswers_span
 - squad_v010_context_free
 - super_glue_axb_v102
 - super_glue_axb_v102_1_sentinel
 - super_glue_axg_v102
 - super_glue_axg_v102_1_sentinel
 - super_glue_boolq_v102
 - super_glue_boolq_v102_1_sentinel
 - super_glue_cb_v102
 - super_glue_cb_v102_1_sentinel
 - super_glue_copa_v102
 - super_glue_copa_v102_1_sentinel
 - super_glue_multirc_v102
 - super_glue_multirc_v102_1_sentinel
 - super_glue_record_v102
 - super_glue_record_v102_1_sentinel
 - super_glue_rte_v102
 - super_glue_rte_v102_1_sentinel
 - super_glue_wic_v102
 - super_glue_wic_v102_1_sentinel
 - super_glue_wsc_v102_simple_1_sentinel_eval
 - super_glue_wsc_v102_simple_1_sentinel_train
 - super_glue_wsc_v102_simple_eval
 - super_glue_wsc_v102_simple_train
 - trivia_qa_v010
 - wikipedia_20190301.en_v003_unsupervised
 - wmt14_ende_v003
 - wmt14_enfr_v003
 - wmt15_enfr_v003
 - wmt16_enro_v003
 - wmt19_ende_v003
 - wmt_t2t_ende_v003

In [13]:
# Drain `iterator` into three parallel columns. The pretokenized fields arrive
# as bytes and are decoded to str.
dictionary = dict(idx=[], inputs_pretokenized=[], targets_pretokenized=[])
for i, ex in enumerate(iterator):
    dictionary["idx"] += [ex["idx"]]
    dictionary["inputs_pretokenized"] += [ex["inputs_pretokenized"].decode("utf-8")]
    dictionary["targets_pretokenized"] += [ex["targets_pretokenized"].decode("utf-8")]

In [14]:
# Build a Hugging Face Dataset from the collected columns. This rebinds
# `dataset`, which earlier cells assign from seqio's get_dataset().
dataset = Dataset.from_dict(dictionary)

In [15]:
dataset['inputs_pretokenized'][:2]

["mnli hypothesis: You won't learn anything by serving overseas.  premise: because actually when you when you do uh service overseas you end up learning something usually that's that's really useful plumbing or farming or or something like that so you're really learning a skill",
 'mnli hypothesis: This is a stringed instrument created by the ancient Aztecs. premise: The geiro, a percussion instrument made of a notched dried gourd, was developed by the Taano Indians.']

In [16]:
dataset['targets_pretokenized'][:2]

['contradiction', 'contradiction']

In [17]:
# Load shard 0 of 10 of the BoolQ training split.
dataset = seqio.get_mixture_or_task("super_glue_boolq_v102").get_dataset(
    split="train",
    sequence_length=dict(inputs=256, targets=128),
    shard_info=seqio.ShardInfo(num_shards=10, index=0),
    use_cached=False,
    num_epochs=1,
    shuffle=True,
    seed=42,
)

iterator = dataset.as_numpy_iterator()

[1mDownloading and preparing dataset 3.93 MiB (download: 3.93 MiB, generated: Unknown size, total: 3.93 MiB) to /root/tensorflow_datasets/super_glue/boolq/1.0.2...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/9427 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteCOOR2N/super_glue-train.tfrecord*...:   0…

Generating validation examples...:   0%|          | 0/3270 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteCOOR2N/super_glue-validation.tfrecord*...…

Generating test examples...:   0%|          | 0/3245 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteCOOR2N/super_glue-test.tfrecord*...:   0%…

[1mDataset super_glue downloaded and prepared to /root/tensorflow_datasets/super_glue/boolq/1.0.2. Subsequent calls will reuse this data.[0m


# Create dataset



In [8]:
def get_date(name,split,dataset_name):
    """Export one split of a registered GLUE / SuperGLUE task as a ``Dataset``.

    Args:
        name: Short task name; it is substituted into the seqio task name
            (``glue_{name}_v002`` or ``super_glue_{name}_v102``).
        split: Split name passed through to ``get_dataset``.
        dataset_name: ``'glue'`` or ``'superglue'``; selects the name template.

    Returns:
        A ``Dataset`` with columns ``idx``, ``inputs`` and ``targets``.
        ``inputs`` is the pretokenized input text with its first
        ``len(name) + 1`` characters removed, and ``targets`` is the
        pretokenized target text.

    Raises:
        ValueError: If ``dataset_name`` is neither ``'glue'`` nor
            ``'superglue'``.
    """
    if dataset_name=='superglue':
        seqio_name=f"super_glue_{name}_v102"
    elif dataset_name=='glue':
        seqio_name=f"glue_{name}_v002"
    else:
        # Raising a bare string is itself a TypeError ("exceptions must derive
        # from BaseException"), so raise a real exception with the message.
        raise ValueError(f"dataset_name: {dataset_name} not config")

    # Number of leading characters dropped from each input: the task name plus
    # one separator character (e.g. "mnli " in "mnli hypothesis: ...").
    start_position=len(name)+1

    # Only the *_pretokenized strings are read below.
    # NOTE(review): presumably that is why both sequence lengths are 1 -- confirm.
    dataset = seqio.get_mixture_or_task(seqio_name).get_dataset(
        sequence_length={"inputs": 1, "targets": 1},
        split=split,
        shuffle=True,
        num_epochs=1,
        shard_info=seqio.ShardInfo(index=0, num_shards=1),
        use_cached=False,
        seed=42
    )

    dictionary = {
        "idx": [],
        "inputs": [],
        "targets": []
    }

    for ex in dataset.as_numpy_iterator():
        if name =='multirc':
            # MultiRC has a three-part index; join it into one string.
            dictionary["idx"].append(f"{ex['idx/paragraph']},{ex['idx/question']},{ex['idx/answer']}")
        elif name == 'record':
            dictionary["idx"].append(ex["idx/query"])
        else:
            dictionary["idx"].append(ex["idx"])

        dictionary["inputs"].append(ex["inputs_pretokenized"].decode("utf-8")[start_position:])
        dictionary["targets"].append(ex["targets_pretokenized"].decode("utf-8"))

    return Dataset.from_dict(dictionary)

## For MNLI

In [68]:
# Empty container; the following cells add one Dataset per MNLI split.
seqio_mnli_dataset=DatasetDict()

In [69]:
# Export every MNLI split of the "glue_mnli_v002" task, printing each split
# name as it is processed.
name = "mnli"
for split in (
    'train',
    'test_matched',
    'test_mismatched',
    'validation_matched',
    'validation_mismatched',
):
    print(split)
    seqio_mnli_dataset[split] = get_date(name, split, 'glue')

train
test_matched
test_mismatched
validation_matched
validation_mismatched


In [70]:
seqio_mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9847
    })
    validation_matched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9832
    })
})

In [71]:
# Add combined "validation" and "test" splits, each being the matched split
# followed by the mismatched split. The four original splits are kept.
seqio_mnli_dataset['validation'] = concatenate_datasets([
    seqio_mnli_dataset['validation_matched'],
    seqio_mnli_dataset['validation_mismatched'],
])
seqio_mnli_dataset['test'] = concatenate_datasets([
    seqio_mnli_dataset['test_matched'],
    seqio_mnli_dataset['test_mismatched'],
])

In [72]:
seqio_mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9847
    })
    validation_matched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 9832
    })
    validation: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 19647
    })
    test: Dataset({
        features: ['idx', 'inputs', 'targets'],
        num_rows: 19643
    })
})

In [None]:
# Upload all MNLI splits to the Hugging Face Hub under a fixed repo id.
seqio_mnli_dataset.push_to_hub("stjokerli/TextToText_mnli_seqio")

## For SuperGLUE

### For 'cb','boolq','rte','copa' ,'wic','multirc','record'

In [10]:
# Empty container; later cells add one DatasetDict of splits per SuperGLUE task.
text_to_text_SuperGlue=DatasetDict()

In [11]:
# Export the train/test/validation splits of each listed SuperGLUE task.
# Only 'multirc' is enabled at the moment; the task names that were toggled
# off in this cell are 'cb', 'boolq', 'rte', 'copa', 'wic' and 'record'.
for name in ('multirc',):
    print(name)
    temp = DatasetDict()
    for split in ('train', 'test', 'validation'):
        print(" ", split)
        temp[split] = get_date(name, split, "superglue")
    text_to_text_SuperGlue[name] = temp

multirc
  train
  test
  validation


## For WSC

In [21]:
def get_data(name,subfix,split):
    """Export one split of a simplified SuperGLUE task as a ``Dataset``.

    Args:
        name: Short task name (called with ``'wsc'`` in this notebook).
        subfix: Task-name suffix (``'train'`` or ``'eval'`` in this notebook);
            the seqio task is ``super_glue_{name}_v102_simple_{subfix}``.
        split: Split name passed through to ``get_dataset``.

    Returns:
        A ``Dataset`` with columns ``idx``, ``inputs`` and ``targets``.
        ``inputs`` is the pretokenized input text without its leading task
        prefix, and ``targets`` is the pretokenized target text.
    """
    seqio_name=f"super_glue_{name}_v102_simple_{subfix}"

    # Only the *_pretokenized strings are read below.
    # NOTE(review): presumably that is why both sequence lengths are 1 -- confirm.
    dataset = seqio.get_mixture_or_task(seqio_name).get_dataset(
        sequence_length={"inputs": 1, "targets": 1},
        split=split,
        shuffle=True,
        num_epochs=1,
        shard_info=seqio.ShardInfo(index=0, num_shards=1),
        use_cached=False,
        seed=42
    )

    # The WSC inputs recorded in this notebook look like "wsc: John hired ...".
    # Dropping len(name) + 1 characters removes "wsc:" but leaves the space
    # that follows it, so the remainder is left-stripped as well.
    start_position=len(name)+1

    dictionary = {
        "idx": [],
        "inputs": [],
        "targets": []
    }
    for ex in dataset.as_numpy_iterator():
        dictionary["idx"].append(ex["idx"])
        inputs_text = ex["inputs_pretokenized"].decode("utf-8")
        dictionary["inputs"].append(inputs_text[start_position:].lstrip())
        dictionary["targets"].append(ex["targets_pretokenized"].decode("utf-8"))

    return Dataset.from_dict(dictionary)

In [22]:
# WSC comes from two separate tasks: the "train" task supplies the train
# split, and the "eval" task supplies validation and test.
temp = DatasetDict({
    'train': get_data('wsc', 'train', 'train'),
    'validation': get_data('wsc', 'eval', 'validation'),
    'test': get_data('wsc', 'eval', 'test'),
})

In [23]:
# File the WSC splits next to the other SuperGLUE tasks.
text_to_text_SuperGlue['wsc']=temp

In [24]:
def get_date(name,split,dataset_name):
    """Export one split of a registered GLUE / SuperGLUE task as a ``Dataset``.

    This redefinition replaces the earlier ``get_date`` in this notebook, so
    it keeps that version's index handling for 'multirc' and 'record'; tasks
    with a plain ``idx`` field (such as 'axg' and 'axb') are unaffected.

    Args:
        name: Short task name; it is substituted into the seqio task name
            (``glue_{name}_v002`` or ``super_glue_{name}_v102``).
        split: Split name passed through to ``get_dataset``.
        dataset_name: ``'glue'`` or ``'superglue'``; selects the name template.

    Returns:
        A ``Dataset`` with columns ``idx``, ``inputs`` and ``targets``.
        ``inputs`` is the pretokenized input text with its first
        ``len(name) + 1`` characters removed, and ``targets`` is the
        pretokenized target text.

    Raises:
        ValueError: If ``dataset_name`` is neither ``'glue'`` nor
            ``'superglue'``.
    """
    if dataset_name=='superglue':
        seqio_name=f"super_glue_{name}_v102"
    elif dataset_name=='glue':
        seqio_name=f"glue_{name}_v002"
    else:
        # Raising a bare string is itself a TypeError ("exceptions must derive
        # from BaseException"), so raise a real exception with the message.
        raise ValueError(f"dataset_name: {dataset_name} not config")

    # Number of leading characters dropped from each input: the task name plus
    # one separator character.
    start_position=len(name)+1

    # Only the *_pretokenized strings are read below.
    # NOTE(review): presumably that is why both sequence lengths are 1 -- confirm.
    dataset = seqio.get_mixture_or_task(seqio_name).get_dataset(
        sequence_length={"inputs": 1, "targets": 1},
        split=split,
        shuffle=True,
        num_epochs=1,
        shard_info=seqio.ShardInfo(index=0, num_shards=1),
        use_cached=False,
        seed=42
    )

    dictionary = {
        "idx": [],
        "inputs": [],
        "targets": []
    }
    for ex in dataset.as_numpy_iterator():
        if name == 'multirc':
            # MultiRC has a three-part index; join it into one string.
            dictionary["idx"].append(f"{ex['idx/paragraph']},{ex['idx/question']},{ex['idx/answer']}")
        elif name == 'record':
            dictionary["idx"].append(ex["idx/query"])
        else:
            dictionary["idx"].append(ex["idx"])

        dictionary["inputs"].append(ex["inputs_pretokenized"].decode("utf-8")[start_position:])
        dictionary["targets"].append(ex["targets_pretokenized"].decode("utf-8"))

    return Dataset.from_dict(dictionary)

In [25]:
# Export the two SuperGLUE diagnostic tasks; only their "test" split is
# requested.
for name in ('axg', 'axb'):
    print(name)
    temp = DatasetDict()
    for split in ('test',):
        print(" ", split)
        temp[split] = get_date(name, split, "superglue")
    text_to_text_SuperGlue[name] = temp

axg
  test
axb
  test


In [26]:
text_to_text_SuperGlue

DatasetDict({
    cb: DatasetDict({
        train: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 250
        })
        test: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 250
        })
        validation: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 56
        })
    })
    boolq: DatasetDict({
        train: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 9427
        })
        test: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 3245
        })
        validation: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 3270
        })
    })
    rte: DatasetDict({
        train: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_rows: 2490
        })
        test: Dataset({
            features: ['idx', 'inputs', 'targets'],
            num_r

# Push to the Hugging Face Hub

In [12]:
# Upload each collected SuperGLUE task to its own dataset repo on the Hub.
for task in text_to_text_SuperGlue:
    text_to_text_SuperGlue[task].push_to_hub(f"stjokerli/TextToText_{task}_seqio")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]