In [4]:
!pip install seqio
!pip install t5

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting t5
  Downloading t5-0.9.3-py3-none-any.whl (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 KB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting mesh-tensorflow[transformer]>=0.1.13
  Downloading mesh_tensorflow-0.1.19-py3-none-any.whl (366 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.4/366.4 KB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting editdistance
  Downloading editdistance-0.6.0-cp38-cp38-manylinux2010_x86_64.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.4/286.4 KB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import seqio
import functools
import torch
from datasets import Dataset
import t5.data
from t5.data import postprocessors
from t5.data import preprocessors
from t5.data.glue_utils import get_glue_metric
from t5.data.glue_utils import get_glue_postprocess_fn
from t5.data.glue_utils import get_glue_text_preprocessor
from t5.data.glue_utils import get_super_glue_metric
from t5.evaluation import metrics
import tensorflow_datasets as tfds
from t5.models import utils as model_utils
import gin
from absl import logging

In [6]:
# Show the installed tensorflow_datasets version.
tfds.__version__

'4.5.2+nightly'

In [7]:
# Short alias used by every registration in this cell.
TaskRegistry = seqio.TaskRegistry

# Feature spec shared by all tasks registered below. The default T5 vocabulary
# is requested once per feature, EOS is appended to both, and "inputs" is
# marked required=False.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(),
        add_eos=True,
        required=False,
    ),
    targets=seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(),
        add_eos=True,
    ),
)

# ==================================== C4 ======================================
# The three C4 2.2.0 pretraining tasks share one pipeline and differ only in the
# objective applied after the cache placeholder:
#   span_corruption - final pretraining task used in Raffel et al., 2019.
#   iid_denoising   - baseline pretraining task used in Raffel et al., 2019.
#   prefix_lm       - prefix language modeling pretraining task used in
#                     Raffel et al., 2019.
for _c4_task_name, _c4_objective in (
    ("c4_v220_span_corruption", preprocessors.span_corruption),
    ("c4_v220_iid_denoising", preprocessors.iid_denoising),
    ("c4_v220_prefix_lm", preprocessors.prefix_lm),
):
    TaskRegistry.add(
        _c4_task_name,
        source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
        preprocessors=[
            # Key the source "text" field as "targets"; "inputs" is keyed to None.
            functools.partial(
                preprocessors.rekey,
                key_map=dict(inputs=None, targets="text"),
            ),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            _c4_objective,
            seqio.preprocessors.append_eos_after_trim,
        ],
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[],
    )


# Configurable tasks used for comparisons in Raffel et al., 2019.
# One unsupervised task is registered per C4 TFDS config suffix; the dot in the
# suffix is replaced by an underscore in the task name.
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
    TaskRegistry.add(
        f"c4{config_suffix.replace('.', '_')}_v020_unsupervised",
        source=seqio.TfdsDataSource(tfds_name=f"c4/en{config_suffix}:2.2.0"),
        preprocessors=[
            functools.partial(
                preprocessors.rekey,
                key_map=dict(inputs=None, targets="text"),
            ),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            preprocessors.unsupervised,
            seqio.preprocessors.append_eos_after_trim,
        ],
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[],
    )


# ================================ Wikipedia ===================================
# Unsupervised task over the 2019-03-01 English Wikipedia dump; same pipeline
# shape as the C4 unsupervised tasks, with no metrics.
TaskRegistry.add(
    "wikipedia_20190301.en_v003_unsupervised",
    source=seqio.TfdsDataSource(tfds_name="wikipedia/20190301.en:1.0.0"),
    preprocessors=[
        functools.partial(
            preprocessors.rekey,
            key_map=dict(inputs=None, targets="text"),
        ),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.unsupervised,
        seqio.preprocessors.append_eos_after_trim,
    ],
    metric_fns=[],
    output_features=DEFAULT_OUTPUT_FEATURES,
)


# =================================== GLUE =====================================
# Register one task per GLUE builder config known to TFDS.
for b in tfds.text.glue.Glue.builder_configs.values():
    TaskRegistry.add(
        f"glue_{b.name}_v002",
        source=seqio.TfdsDataSource(
            tfds_name=f"glue/{b.name}:2.0.0",
            # "ax" is limited to its test split; every other config passes None.
            splits=None if b.name != "ax" else ["test"],
        ),
        preprocessors=[
            get_glue_text_preprocessor(b),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            seqio.preprocessors.append_eos_after_trim,
        ],
        metric_fns=get_glue_metric(b.name),
        output_features=DEFAULT_OUTPUT_FEATURES,
        postprocess_fn=get_glue_postprocess_fn(b),
    )

# =============================== CNN DailyMail ================================
# Summarization task: the "article" field is summarized into "highlights" and
# predictions are scored with ROUGE.
TaskRegistry.add(
    "cnn_dailymail_v002",
    source=seqio.TfdsDataSource(tfds_name="cnn_dailymail:3.1.0"),
    preprocessors=[
        functools.partial(
            preprocessors.summarize,
            summary_key="highlights",
            article_key="article",
        ),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.rouge],
)

# ==================================== WMT =====================================
# Format: year, tfds builder config, tfds version
b_configs = [
    ("14", tfds.translate.wmt14.Wmt14Translate.builder_configs["de-en"], "1.0.0"),
    ("14", tfds.translate.wmt14.Wmt14Translate.builder_configs["fr-en"], "1.0.0"),
    ("16", tfds.translate.wmt16.Wmt16Translate.builder_configs["ro-en"], "1.0.0"),
    ("15", tfds.translate.wmt15.Wmt15Translate.builder_configs["fr-en"], "1.0.0"),
    ("19", tfds.translate.wmt19.Wmt19Translate.builder_configs["de-en"], "1.0.0"),
]

# The task name and the translation direction both use language_pair[1] as the
# source language and language_pair[0] as the target language.
for prefix, b, tfds_version in b_configs:
    TaskRegistry.add(
        f"wmt{prefix}_{b.language_pair[1]}{b.language_pair[0]}_v003",
        source=seqio.TfdsDataSource(
            tfds_name=f"wmt{prefix}_translate/{b.name}:{tfds_version}"),
        preprocessors=[
            functools.partial(
                preprocessors.translate,
                source_language=b.language_pair[1],
                target_language=b.language_pair[0],
            ),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            seqio.preprocessors.append_eos_after_trim,
        ],
        metric_fns=[metrics.bleu],
        output_features=DEFAULT_OUTPUT_FEATURES,
    )

# Special case for t2t ende.
# The de-en builder config supplies the language pair; as in the WMT loop,
# element 1 is the source language and element 0 the target.
b = tfds.translate.wmt_t2t.WmtT2tTranslate.builder_configs["de-en"]
TaskRegistry.add(
    "wmt_t2t_ende_v003",
    source=seqio.TfdsDataSource(tfds_name="wmt_t2t_translate/de-en:1.0.0"),
    preprocessors=[
        functools.partial(
            preprocessors.translate,
            source_language=b.language_pair[1],
            target_language=b.language_pair[0],
        ),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.bleu],
)

# ================================= SuperGlue ==================================
for b in tfds.text.super_glue.SuperGlue.builder_configs.values():
    # We use a simplified version of WSC, defined below
    if "wsc" in b.name:
        continue

    # Pipeline common to every remaining config.
    glue_preprocessors = [
        get_glue_text_preprocessor(b),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ]
    if b.name == "axb":
        # axb gets an extra rekey step placed in front of the shared pipeline.
        glue_preprocessors.insert(
            0,
            functools.partial(
                preprocessors.rekey,
                key_map={
                    "premise": "sentence1",
                    "hypothesis": "sentence2",
                    "label": "label",
                    "idx": "idx",
                }),
        )

    TaskRegistry.add(
        f"super_glue_{b.name}_v102",
        source=seqio.TfdsDataSource(
            tfds_name=f"super_glue/{b.name}:1.0.2",
            # The two diagnostic configs are limited to their test split.
            splits=["test"] if b.name in ["axb", "axg"] else None,
        ),
        preprocessors=glue_preprocessors,
        metric_fns=get_super_glue_metric(b.name),
        output_features=DEFAULT_OUTPUT_FEATURES,
        postprocess_fn=get_glue_postprocess_fn(b),
    )

    # Create SuperGLUE tasks with 1 sentinel token added.
    seqio.experimental.add_task_with_sentinels(
        f"super_glue_{b.name}_v102", num_sentinels=1)

# ======================== Definite Pronoun Resolution =========================
# Definite pronoun resolution task, scored with accuracy.
TaskRegistry.add(
    "dpr_v001_simple",
    source=seqio.TfdsDataSource(tfds_name="definite_pronoun_resolution:1.1.0"),
    preprocessors=[
        preprocessors.definite_pronoun_resolution_simple,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.accuracy],
)

# Create the DPR task variant with 1 sentinel token added.
seqio.experimental.add_task_with_sentinels(
    "dpr_v001_simple",
    num_sentinels=1,
)

# =================================== WSC ======================================
# Two simplified WSC tasks over "super_glue/wsc.fixed":
#   * train: "train" split, correct_referent_only=True, no metrics;
#   * eval: "validation"/"test" splits, correct_referent_only=False, with the
#     wsc_simple postprocessor and accuracy.
# Each task is followed immediately by its 1-sentinel variant.
for _wsc_task, _wsc_splits, _wsc_correct_only, _wsc_kwargs in (
    (
        "super_glue_wsc_v102_simple_train",
        ["train"],
        True,
        dict(metric_fns=[]),
    ),
    (
        "super_glue_wsc_v102_simple_eval",
        ["validation", "test"],
        False,
        dict(
            postprocess_fn=postprocessors.wsc_simple,
            metric_fns=[metrics.accuracy],
        ),
    ),
):
    TaskRegistry.add(
        _wsc_task,
        source=seqio.TfdsDataSource(
            tfds_name="super_glue/wsc.fixed:1.0.2", splits=_wsc_splits),
        preprocessors=[
            functools.partial(
                preprocessors.wsc_simple,
                correct_referent_only=_wsc_correct_only),
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            seqio.preprocessors.append_eos_after_trim,
        ],
        output_features=DEFAULT_OUTPUT_FEATURES,
        **_wsc_kwargs)

    # Create SuperGLUE tasks with 1 sentinel token added.
    seqio.experimental.add_task_with_sentinels(_wsc_task, num_sentinels=1)

# =================================== WNLI =====================================
# Evaluation-only WNLI task ("validation" and "test" splits) that reuses the
# wsc_simple postprocessor and is scored with accuracy.
TaskRegistry.add(
    "glue_wnli_v002_simple_eval",
    source=seqio.TfdsDataSource(
        tfds_name="glue/wnli:1.0.0",
        splits=["validation", "test"],
    ),
    preprocessors=[
        preprocessors.wnli_simple,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[metrics.accuracy],
    postprocess_fn=postprocessors.wsc_simple,
)

# =================================== Squad ====================================
# All four tasks read "squad/v1.1:3.0.0" and share the tokenize/cache/EOS tail;
# they differ in the first preprocessor and in postprocessing/metrics.
for _squad_task, _squad_preprocessor, _squad_kwargs in (
    # Maximized evaluation metrics over all answers.
    (
        "squad_v010_allanswers",
        preprocessors.squad,
        dict(postprocess_fn=postprocessors.qa, metric_fns=[metrics.squad]),
    ),
    # Same postprocessing and metric, but built with include_context=False.
    (
        "squad_v010_context_free",
        functools.partial(preprocessors.squad, include_context=False),
        dict(postprocess_fn=postprocessors.qa, metric_fns=[metrics.squad]),
    ),
    # Squad span prediction task instead of text.
    (
        "squad_v010_allanswers_span",
        preprocessors.squad_span_space_tokenized,
        dict(
            postprocess_fn=postprocessors.span_qa,
            metric_fns=[metrics.span_squad],
        ),
    ),
    # Deprecated: Use `squad_v010_allanswers` instead.
    (
        "squad_v010",
        preprocessors.squad,
        dict(metric_fns=[metrics.squad]),
    ),
):
    TaskRegistry.add(
        _squad_task,
        source=seqio.TfdsDataSource(tfds_name="squad/v1.1:3.0.0"),
        preprocessors=[
            _squad_preprocessor,
            seqio.preprocessors.tokenize,
            seqio.CacheDatasetPlaceholder(),
            seqio.preprocessors.append_eos_after_trim,
        ],
        output_features=DEFAULT_OUTPUT_FEATURES,
        **_squad_kwargs)

# ================================= TriviaQA ===================================
# Uses the "trivia_qa/rc" TFDS config. Inputs are truncated by a dedicated
# preprocessor placed after the cache placeholder; no metrics are registered.
TaskRegistry.add(
    "trivia_qa_v010",
    source=seqio.TfdsDataSource(tfds_name="trivia_qa/rc:1.1.0"),
    preprocessors=[
        preprocessors.trivia_qa,
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        preprocessors.trivia_qa_truncate_inputs,
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[],
)


# =============== PrefixLM objectives (not used in the T5 paper) ===============


# # Vocabulary (shared by encoder and decoder)
# sentencepiece_model_file = "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"

# vocab = seqio.SentencePieceVocabulary(sentencepiece_model_file)

# seqio.TaskRegistry.add(
#     "c4_prefix_lm_objective_encoder_decoder_architecture",
#     source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
#     preprocessors=[
#         functools.partial(
#             preprocessors.rekey, key_map={
#                 "inputs": None,
#                 "targets": "text"
#             }),
#         seqio.preprocessors.tokenize,
#         seqio.CacheDatasetPlaceholder(),
#         preprocessors.targets_for_prefix_lm_objective,
#         preprocessors.pack_prefix_lm_encoder_decoder,
#     ],
#     output_features={
#         "encoder_input_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_target_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_input_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "encoder_segment_ids": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "encoder_positions": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_segment_ids": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_positions": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_loss_weights": seqio.Feature(vocabulary=vocab, add_eos=False),
#         # All but the last stage of the preprocessing uses "targets" as the key,
#         # so this output feature is necessary. It is not marked required because
#         # the final preprocessor drops it.
#         "targets": seqio.Feature(vocabulary=vocab, required=False),
#     },
#     metric_fns=[])


# seqio.TaskRegistry.add(
#     "c4_prefix_lm_objective_decoder_architecture",
#     source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
#     preprocessors=[
#         functools.partial(
#             preprocessors.rekey, key_map={
#                 "inputs": None,
#                 "targets": "text"
#             }),
#         seqio.preprocessors.tokenize,
#         seqio.CacheDatasetPlaceholder(),
#         preprocessors.targets_for_prefix_lm_objective,
#         preprocessors.pack_prefix_lm_decoder_only,
#     ],
#     output_features={
#         "decoder_target_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_input_tokens": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_loss_weights": seqio.Feature(vocabulary=vocab, add_eos=False),
#         "decoder_causal_attention": seqio.Feature(
#             vocabulary=vocab, add_eos=False),
#         # All but the last stage of the preprocessing uses "targets" as the key,
#         # so this output feature is necessary. It is not marked required because
#         # the final preprocessor drops it.
#         "targets": seqio.Feature(vocabulary=vocab, required=False),
#     },
#     metric_fns=[])


# TaskRegistry.add(
#     "c4_v220_full_lm",
#     source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
#     preprocessors=[
#         functools.partial(
#             preprocessors.rekey, key_map={
#                 "inputs": None,
#                 "targets": "text"
#             }),
#         seqio.preprocessors.tokenize,
#         seqio.CacheDatasetPlaceholder(),
#         preprocessors.full_lm,
#     ],
#     output_features={
#         "targets": seqio.Feature(vocabulary=vocab, add_eos=True)
#     },
#     metric_fns=[])

<seqio.dataset_providers.Task at 0x7fcd1f98acd0>

In [8]:
# Task names tried in this cell: glue_mnli_v002, super_glue_cb_v102
# Load shard 0 of 10 from the shuffled MNLI train split (single epoch, no cache).
dataset = seqio.get_mixture_or_task("glue_mnli_v002").get_dataset(
    split="train",
    sequence_length=dict(inputs=256, targets=128),
    num_epochs=1,
    shuffle=True,
    seed=42,
    use_cached=False,
    shard_info=seqio.ShardInfo(index=0, num_shards=10),
)

# Iterate over the examples as numpy values.
iterator = dataset.as_numpy_iterator()

2022-03-01 12:25:59.831226: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 298.29 MiB (download: 298.29 MiB, generated: 100.56 MiB, total: 398.85 MiB) to ~/tensorflow_datasets/glue/mnli/2.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/5 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/392702 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/mnli/2.0.0.incompleteGRCOYK/glue-train.tfrecord*...:   0%|          | 0/3…

Generating validation_matched examples...:   0%|          | 0/9815 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/mnli/2.0.0.incompleteGRCOYK/glue-validation_matched.tfrecord*...:   0%|  …

Generating validation_mismatched examples...:   0%|          | 0/9832 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/mnli/2.0.0.incompleteGRCOYK/glue-validation_mismatched.tfrecord*...:   0%…

Generating test_matched examples...:   0%|          | 0/9796 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/mnli/2.0.0.incompleteGRCOYK/glue-test_matched.tfrecord*...:   0%|        …

Generating test_mismatched examples...:   0%|          | 0/9847 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/mnli/2.0.0.incompleteGRCOYK/glue-test_mismatched.tfrecord*...:   0%|     …

[1mDataset glue downloaded and prepared to ~/tensorflow_datasets/glue/mnli/2.0.0. Subsequent calls will reuse this data.[0m


2022-03-01 12:26:52.897581: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-01 12:26:52.961094: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-01 12:26:52.961414: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-03-01 12:26:52.961773: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [9]:
# Collect the example index and the decoded pre-tokenization text of every
# example yielded by `iterator` into column lists.
dictionary = {
    "idx": [],
    "inputs_pretokenized": [],
    "targets_pretokenized": [],
}
# The loop index was never used, so iterate over the examples directly.
for ex in iterator:
    dictionary["idx"].append(ex["idx"])
    # The text fields arrive as bytes and are decoded as UTF-8.
    dictionary["inputs_pretokenized"].append(
        ex["inputs_pretokenized"].decode("utf-8"))
    dictionary["targets_pretokenized"].append(
        ex["targets_pretokenized"].decode("utf-8"))


In [10]:
# Build a `datasets.Dataset` from the collected columns. This rebinds `dataset`,
# which previously held the seqio dataset that `iterator` was created from.
dataset = Dataset.from_dict(dictionary)

In [11]:
# List the column names of the converted dataset.
dataset.column_names

['idx', 'inputs_pretokenized', 'targets_pretokenized']

In [12]:
# Peek at the first two input strings.
dataset['inputs_pretokenized'][:2]

["mnli hypothesis: You won't learn anything by serving overseas.  premise: because actually when you when you do uh service overseas you end up learning something usually that's that's really useful plumbing or farming or or something like that so you're really learning a skill",
 'mnli hypothesis: This is a stringed instrument created by the ancient Aztecs. premise: The geiro, a percussion instrument made of a notched dried gourd, was developed by the Taano Indians.']

In [13]:
# Peek at the first two target strings.
dataset['targets_pretokenized'][:2]

['contradiction', 'contradiction']

In [None]:
# NOTE: this cell has no execution count in the saved notebook. Running it
# rebinds both `dataset` and `iterator` to shard 0 of 10 of the shuffled
# SuperGLUE BoolQ train split.
dataset = seqio.get_mixture_or_task("super_glue_boolq_v102").get_dataset(
    split="train",
    sequence_length=dict(inputs=256, targets=128),
    num_epochs=1,
    shuffle=True,
    seed=42,
    use_cached=False,
    shard_info=seqio.ShardInfo(index=0, num_shards=10),
)

# Iterate over the examples as numpy values.
iterator = dataset.as_numpy_iterator()

In [19]:
def get_date(name, split, sequence_length=None, shuffle=True, seed=42):
    """Materialise one split of a registered seqio task as a `datasets.Dataset`.

    The function name is kept unchanged because other cells call it.

    Args:
        name: Name of a registered seqio task or mixture.
        split: Split to load (e.g. "train").
        sequence_length: Mapping forwarded to `get_dataset`. Defaults to
            {"inputs": 256, "targets": 128}.
        shuffle: Forwarded to `get_dataset`. Defaults to True.
        seed: Forwarded to `get_dataset`. Defaults to 42.

    Returns:
        A `Dataset` with the columns "idx", "processed_input" and
        "processed_output". The two text columns hold the UTF-8 decoded
        "inputs_pretokenized" and "targets_pretokenized" fields of each example.

    Raises:
        KeyError: If an example lacks "idx" or one of the pretokenized fields.
    """
    if sequence_length is None:
        # Built per call so the default mapping is never shared between calls.
        sequence_length = {"inputs": 256, "targets": 128}

    # A single shard (index 0 of 1) and a single epoch are requested.
    dataset = seqio.get_mixture_or_task(name).get_dataset(
        sequence_length=sequence_length,
        split=split,
        shuffle=shuffle,
        num_epochs=1,
        shard_info=seqio.ShardInfo(index=0, num_shards=1),
        use_cached=False,
        seed=seed,
    )

    dictionary = {
        "idx": [],
        "processed_input": [],
        "processed_output": [],
    }
    # The loop index was never used, so iterate over the examples directly.
    for ex in dataset.as_numpy_iterator():
        dictionary["idx"].append(ex["idx"])
        # The text fields arrive as bytes and are decoded as UTF-8.
        dictionary["processed_input"].append(
            ex["inputs_pretokenized"].decode("utf-8"))
        dictionary["processed_output"].append(
            ex["targets_pretokenized"].decode("utf-8"))

    return Dataset.from_dict(dictionary)

In [20]:
from datasets import load_dataset,concatenate_datasets,DatasetDict

In [21]:
# Empty container that will hold one converted dataset per MNLI split.
seqio_mnli_dataset=DatasetDict()

In [22]:
# Convert every MNLI split of the seqio task into its own dataset entry.
# (Another task name tried here: super_glue_cb_v102)
name = "glue_mnli_v002"
for split in (
    'train',
    'test_matched',
    'test_mismatched',
    'validation_matched',
    'validation_mismatched',
):
    # Print the split name as a progress marker before the load starts.
    print(split)
    seqio_mnli_dataset[split] = get_date(name, split)

train
test_matched
test_mismatched
validation_matched
validation_mismatched


In [23]:
# Display the splits, columns and row counts of the converted MNLI data.
seqio_mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9847
    })
    validation_matched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9832
    })
})

In [24]:
# Inspect the processed input at row index 392701 of the train split.
seqio_mnli_dataset['train']['processed_input'][392701]

'mnli hypothesis: He laughed when he talked about finding the new dog. premise: Thank you, I will. He laughed rather ruefully, as he described how he had discovered a very rare species of fern in an inaccessible place, and in his efforts to obtain it had lost his footing, and slipped ignominiously into a neighbouring pond. '

In [25]:
# Create combined "validation" and "test" splits by concatenating the matched
# and mismatched variants of each (matched rows come first).
for _merged_split in ("validation", "test"):
    seqio_mnli_dataset[_merged_split] = concatenate_datasets([
        seqio_mnli_dataset[f"{_merged_split}_matched"],
        seqio_mnli_dataset[f"{_merged_split}_mismatched"],
    ])

In [30]:
# Upload every split of the converted MNLI data to the dataset hub repository.
seqio_mnli_dataset.push_to_hub("stjokerli/TextToText_mnli_seqio")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

2022-03-01 12:34:14.051296: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Load the pushed dataset back from the hub and display its splits.
load_dataset("stjokerli/TextToText_mnli_seqio")

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]



Downloading and preparing dataset None/None (download: 54.48 MiB, generated: 98.47 MiB, post-processed: Unknown size, total: 152.95 MiB) to /root/.cache/huggingface/datasets/parquet/stjokerli--TextToText_mnli_seqio-87ed5d481a855188/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


  0%|          | 0/7 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/stjokerli--TextToText_mnli_seqio-87ed5d481a855188/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/7 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 19643
    })
    validation_matched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9815
    })
    validation: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 19647
    })
    test_matched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9796
    })
    train: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 392702
    })
    test_mismatched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9847
    })
    validation_mismatched: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 9832
    })
})

In [32]:
# Empty container that will hold one entry per SuperGLUE task.
text_to_text_SuperGlue=DatasetDict()

In [33]:
# For each selected SuperGLUE task, convert its train/test/validation splits
# and store them under the short task name.
for name in ('cb', 'boolq', 'rte', 'copa'):
    seqio_name = "super_glue_{}_v102".format(name)
    print(name)

    temp = DatasetDict()
    for split in ('train', 'test', 'validation'):
        # Indented split name serves as a progress marker under the task name.
        print(" ", split)
        temp[split] = get_date(seqio_name, split)

    text_to_text_SuperGlue[name] = temp

cb
  train


INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/cb/1.0.2
INFO:absl:Load dataset info from /tmp/tmpdy8k0haitfds
INFO:absl:Field info.description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.location from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.splits from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.module_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Sharding at the data source: 0 of 1
INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/cb/1.0.2
INFO:absl:Load d

[1mDownloading and preparing dataset 73.71 KiB (download: 73.71 KiB, generated: Unknown size, total: 73.71 KiB) to ~/tensorflow_datasets/super_glue/cb/1.0.2...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

INFO:absl:Downloading https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip into /root/tensorflow_datasets/downloads/dl.fbaipublicfile.com_glue_superglue_v2_CB6itp-ktvqVy3U_d97UGtqB37ewTcsPwsg74cVthM5cI.zip.tmp.583e95e8a5cf48cbbbde8c50b09d1a1f...


Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/250 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/cb/1.0.2.incompleteF1FWER/super_glue-train.tfrecord*...:   0%|     …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/cb/1.0.2.incompleteF1FWER/super_glue-train.tfrecord*. Number of examples: 250 (shards: [250])


Generating validation examples...:   0%|          | 0/56 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/cb/1.0.2.incompleteF1FWER/super_glue-validation.tfrecord*...:   0%|…

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/cb/1.0.2.incompleteF1FWER/super_glue-validation.tfrecord*. Number of examples: 56 (shards: [56])


Generating test examples...:   0%|          | 0/250 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/cb/1.0.2.incompleteF1FWER/super_glue-test.tfrecord*...:   0%|      …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/cb/1.0.2.incompleteF1FWER/super_glue-test.tfrecord*. Number of examples: 250 (shards: [250])
INFO:absl:Constructing tf.data.Dataset super_glue for split train, from ~/tensorflow_datasets/super_glue/cb/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_cb_v102:train'


[1mDataset super_glue downloaded and prepared to ~/tensorflow_datasets/super_glue/cb/1.0.2. Subsequent calls will reuse this data.[0m


INFO:absl:Sharding at the data source: 0 of 1


  test


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/cb/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/cb/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split test, from ~/tensorflow_datasets/super_glue/cb/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_cb_v102:test'
INFO:absl:Sharding at the data source: 0 of 1


  validation


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/cb/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/cb/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from ~/tensorflow_datasets/super_glue/cb/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_cb_v102:validation'


boolq
  train


INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/boolq/1.0.2
INFO:absl:Load dataset info from /tmp/tmpeqoyaufitfds
INFO:absl:Field info.description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.location from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.splits from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.module_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Sharding at the data source: 0 of 1
INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/boolq/1.0.2
INFO:absl:

[1mDownloading and preparing dataset 3.93 MiB (download: 3.93 MiB, generated: Unknown size, total: 3.93 MiB) to ~/tensorflow_datasets/super_glue/boolq/1.0.2...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

INFO:absl:Downloading https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip into /root/tensorflow_datasets/downloads/dl.fbaipublicf.com_glue_superglue_v2_BoolQCy5tmUeU-X3py3LZlvIApsvmfkGGfswBSVgdYquC030.zip.tmp.35a7ddf9e76344abb2dd7d19f8566931...


Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/9427 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteYPJ0ZU/super_glue-train.tfrecord*...:   0%|  …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteYPJ0ZU/super_glue-train.tfrecord*. Number of examples: 9427 (shards: [9427])


Generating validation examples...:   0%|          | 0/3270 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteYPJ0ZU/super_glue-validation.tfrecord*...:   …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteYPJ0ZU/super_glue-validation.tfrecord*. Number of examples: 3270 (shards: [3270])


Generating test examples...:   0%|          | 0/3245 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteYPJ0ZU/super_glue-test.tfrecord*...:   0%|   …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/boolq/1.0.2.incompleteYPJ0ZU/super_glue-test.tfrecord*. Number of examples: 3245 (shards: [3245])
INFO:absl:Constructing tf.data.Dataset super_glue for split train, from ~/tensorflow_datasets/super_glue/boolq/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_boolq_v102:train'


[1mDataset super_glue downloaded and prepared to ~/tensorflow_datasets/super_glue/boolq/1.0.2. Subsequent calls will reuse this data.[0m


INFO:absl:Sharding at the data source: 0 of 1


  test


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/boolq/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/boolq/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split test, from ~/tensorflow_datasets/super_glue/boolq/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_boolq_v102:test'
INFO:absl:Sharding at the data source: 0 of 1


  validation


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/boolq/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/boolq/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from ~/tensorflow_datasets/super_glue/boolq/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_boolq_v102:validation'


rte
  train


INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/rte/1.0.2
INFO:absl:Load dataset info from /tmp/tmpb5enpnigtfds
INFO:absl:Field info.description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.location from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.splits from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.module_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Sharding at the data source: 0 of 1
INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/rte/1.0.2
INFO:absl:Load

[1mDownloading and preparing dataset 733.32 KiB (download: 733.32 KiB, generated: Unknown size, total: 733.32 KiB) to ~/tensorflow_datasets/super_glue/rte/1.0.2...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

INFO:absl:Downloading https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip into /root/tensorflow_datasets/downloads/dl.fbaipublicfil.com_glue_superglue_v2_RTEuZ5Qum8w_Xht3Ep2kvtVNNXspMwVAoRqGDefP93d2qI.zip.tmp.6a4e40ee46bd4c3a8f6955b90e566548...


Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/2490 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/rte/1.0.2.incompleteCNOVHF/super_glue-train.tfrecord*...:   0%|    …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/rte/1.0.2.incompleteCNOVHF/super_glue-train.tfrecord*. Number of examples: 2490 (shards: [2490])


Generating validation examples...:   0%|          | 0/277 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/rte/1.0.2.incompleteCNOVHF/super_glue-validation.tfrecord*...:   0%…

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/rte/1.0.2.incompleteCNOVHF/super_glue-validation.tfrecord*. Number of examples: 277 (shards: [277])


Generating test examples...:   0%|          | 0/3000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/rte/1.0.2.incompleteCNOVHF/super_glue-test.tfrecord*...:   0%|     …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/rte/1.0.2.incompleteCNOVHF/super_glue-test.tfrecord*. Number of examples: 3000 (shards: [3000])
INFO:absl:Constructing tf.data.Dataset super_glue for split train, from ~/tensorflow_datasets/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_rte_v102:train'


[1mDataset super_glue downloaded and prepared to ~/tensorflow_datasets/super_glue/rte/1.0.2. Subsequent calls will reuse this data.[0m


INFO:absl:Sharding at the data source: 0 of 1


  test


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split test, from ~/tensorflow_datasets/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_rte_v102:test'
INFO:absl:Sharding at the data source: 0 of 1


  validation


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from ~/tensorflow_datasets/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_rte_v102:validation'


copa
  train


INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/copa/1.0.2
INFO:absl:Load dataset info from /tmp/tmp0jd6vl_8tfds
INFO:absl:Field info.description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.location from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.splits from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.module_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Sharding at the data source: 0 of 1
INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: super_glue/copa/1.0.2
INFO:absl:Lo

[1mDownloading and preparing dataset 42.96 KiB (download: 42.96 KiB, generated: Unknown size, total: 42.96 KiB) to ~/tensorflow_datasets/super_glue/copa/1.0.2...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

INFO:absl:Downloading https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip into /root/tensorflow_datasets/downloads/dl.fbaipublicfi.com_glue_superglue_v2_COPAU9LyCyY2AxrKl_bASv72y6Se-TNEliICWt_IgJ3osDI.zip.tmp.f9d61cd3c80642cf9fadd9ba10e8c393...


Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/400 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/copa/1.0.2.incompleteM9II1E/super_glue-train.tfrecord*...:   0%|   …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/copa/1.0.2.incompleteM9II1E/super_glue-train.tfrecord*. Number of examples: 400 (shards: [400])


Generating validation examples...:   0%|          | 0/100 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/copa/1.0.2.incompleteM9II1E/super_glue-validation.tfrecord*...:   0…

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/copa/1.0.2.incompleteM9II1E/super_glue-validation.tfrecord*. Number of examples: 100 (shards: [100])


Generating test examples...:   0%|          | 0/500 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/super_glue/copa/1.0.2.incompleteM9II1E/super_glue-test.tfrecord*...:   0%|    …

INFO:absl:Done writing ~/tensorflow_datasets/super_glue/copa/1.0.2.incompleteM9II1E/super_glue-test.tfrecord*. Number of examples: 500 (shards: [500])
INFO:absl:Constructing tf.data.Dataset super_glue for split train, from ~/tensorflow_datasets/super_glue/copa/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_copa_v102:train'


[1mDataset super_glue downloaded and prepared to ~/tensorflow_datasets/super_glue/copa/1.0.2. Subsequent calls will reuse this data.[0m


INFO:absl:Sharding at the data source: 0 of 1


  test


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/copa/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/copa/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split test, from ~/tensorflow_datasets/super_glue/copa/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_copa_v102:test'
INFO:absl:Sharding at the data source: 0 of 1


  validation


INFO:absl:Load dataset info from ~/tensorflow_datasets/super_glue/copa/1.0.2
INFO:absl:Reusing dataset super_glue (~/tensorflow_datasets/super_glue/copa/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from ~/tensorflow_datasets/super_glue/copa/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'super_glue_copa_v102:validation'


In [34]:
# Display the collection built in the earlier cells as this cell's result.
# Per the recorded output, it is a DatasetDict keyed by task name whose values
# are DatasetDicts of splits with 'idx', 'processed_input' and
# 'processed_output' columns.
text_to_text_SuperGlue

DatasetDict({
    cb: DatasetDict({
        train: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
            num_rows: 250
        })
        test: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
            num_rows: 250
        })
        validation: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
            num_rows: 56
        })
    })
    boolq: DatasetDict({
        train: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
            num_rows: 9427
        })
        test: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
            num_rows: 3245
        })
        validation: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
            num_rows: 3270
        })
    })
    rte: DatasetDict({
        train: Dataset({
            features: ['idx', 'processed_input', 'processed_output'],
     

In [35]:
# Upload every task's dataset to the Hub, one repository per task, named
# "stjokerli/TextToText_<task>_seqio".
# NOTE(review): presumably requires an authenticated Hub session with write
# access to the "stjokerli" namespace -- confirm before re-running.
for task in text_to_text_SuperGlue:
    hub_repo_id = f"stjokerli/TextToText_{task}_seqio"
    text_to_text_SuperGlue[task].push_to_hub(hub_repo_id)



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
# Load the "cb" repository back from the Hub and display it as the cell result;
# presumably a sanity check that the upload round-trips (the recorded output
# shows train/test/validation splits with 250/250/56 rows).
# NOTE(review): `load_dataset` is not among the imports visible at the top of
# the notebook -- confirm an earlier cell runs `from datasets import load_dataset`.
load_dataset("stjokerli/TextToText_cb_seqio")

Downloading:   0%|          | 0.00/922 [00:00<?, ?B/s]



Downloading and preparing dataset None/None (download: 126.38 KiB, generated: 211.48 KiB, post-processed: Unknown size, total: 337.86 KiB) to /root/.cache/huggingface/datasets/parquet/stjokerli--TextToText_cb_seqio-6ec7039b16c98d10/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/54.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/57.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.4k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/stjokerli--TextToText_cb_seqio-6ec7039b16c98d10/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 250
    })
    test: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['idx', 'processed_input', 'processed_output'],
        num_rows: 56
    })
})