# Setup

In [5]:
#@title Run this only once - Sets up TF Eager execution.

import tensorflow as tf

# Enable Eager execution - useful for seeing the generated data.
# The call is guarded so that re-running this cell in a live kernel is a no-op
# rather than a second attempt to switch the execution mode.
if not tf.executing_eagerly():
    tf.enable_eager_execution()

In [6]:
#@title Setting a random seed.

from tensor2tensor.utils import trainer_lib

# Set a seed so that we have deterministic outputs.
# The value is arbitrary; it is kept in a named constant so it is easy to find
# and change.
RANDOM_SEED = 301
# NOTE(review): presumably this seeds every random number generator the
# notebook relies on, including Python's `random` module -- confirm against
# the tensor2tensor version in use.
trainer_lib.set_random_seed(RANDOM_SEED)

W0726 20:55:13.756133 140038208472896 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/utils/trainer_lib.py:780: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.



In [8]:
#@title Run for setting up directories.

import os

# Directory locations (expanduser resolves a leading `~`, if there is one).
DATA_DIR = os.path.expanduser("/mimic/t2t/data")
OUTPUT_DIR = os.path.expanduser("/mimic/t2t/output")
TMP_DIR = os.path.expanduser("/mnt/")

# Make sure every one of them exists, in the same order as they are defined.
for _directory in (DATA_DIR, OUTPUT_DIR, TMP_DIR):
    tf.gfile.MakeDirs(_directory)

# Define the problem

In [9]:
import random
import string

def sample_sentence(min_words=10, max_words=20, min_chars=1, max_chars=10):
    """Return a random sentence of lowercase words separated by single spaces.

    The number of words and the length of each word are drawn uniformly from
    the given inclusive ranges using the module-level `random` generator, so
    the result is reproducible once that generator has been seeded.

    Args:
        min_words: Smallest number of words in the sentence (inclusive).
        max_words: Largest number of words in the sentence (inclusive).
        min_chars: Smallest number of characters in a word (inclusive).
        max_chars: Largest number of characters in a word (inclusive).

    Returns:
        A `str` such as "qk zzuwe b ..." made only of ASCII lowercase letters
        and single spaces.
    """
    num_words = random.randint(min_words, max_words)
    words = []
    for _ in range(num_words):
        num_chars = random.randint(min_chars, max_chars)
        words.append(
            "".join(random.choice(string.ascii_lowercase) for _ in range(num_chars)))
    return " ".join(words)

def target_sentence(input_sentence):
    """Return the words of `input_sentence` ordered from shortest to longest.

    Words are split on whitespace and re-joined with single spaces. The sort
    is stable, so words of equal length keep their original relative order.

    Args:
        input_sentence: A whitespace-separated sentence, or a falsy value
            (`None` or the empty string).

    Returns:
        The reordered sentence as a `str`. A falsy `input_sentence` is
        returned unchanged so that callers passing `None` keep getting `None`.
    """
    if not input_sentence:
        return input_sentence
    return " ".join(sorted(input_sentence.split(), key=len))

In [10]:
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
# We inherit from `Text2TextProblem` which takes care of a lot of details
# regarding reading and writing the data to disk, what vocabulary type one
# should use, its size etc -- so that we need not worry about them; one can,
# of course, override those.
@registry.register_problem
class SortWordsAccordingToLengthRandom(text_problems.Text2TextProblem):
    """Text-to-text problem whose examples are generated on the fly.

    Each example pairs the result of `sample_sentence()` ("inputs") with
    `target_sentence(...)` applied to that same sentence ("targets").
    """

    # Just an arbitrary limit to the number of examples produced by one call
    # to `generate_samples`; this can be set higher.
    MAX_EXAMPLES = 10

    # START: Methods we should override.

    # The methods that need to be overridden from `Text2TextProblem` are:
    # `is_generate_per_split` and
    # `generate_samples`.

    @property
    def is_generate_per_split(self):
        """Whether `generate_samples` is invoked separately for every split."""
        # Set to True, which will have generate_samples be called for each of
        # the dataset_splits.
        return True

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield `MAX_EXAMPLES` dicts with "inputs" and "targets" entries.

        Args:
            data_dir: Unused here.
            tmp_dir: Unused here; the data is generated in-situ instead of
                being downloaded to and read from this location.
            dataset_split: Unused here; every split is produced by the same
                generator. This is of type `problem.DatasetSplit`.

        Yields:
            `{"inputs": ..., "targets": ...}` built from `sample_sentence`
            and `target_sentence`.
        """
        # None of the arguments influence the generated samples.
        del tmp_dir, data_dir, dataset_split

        for _ in range(self.MAX_EXAMPLES):
            sentence_input = sample_sentence()
            sentence_target = target_sentence(sentence_input)
            yield {
                "inputs": sentence_input,
                "targets": sentence_target,
            }

    # END: Methods we should override.

    # START: Overridable methods.

    @property
    def vocab_type(self):
        """Vocabulary type used to encode the text."""
        # We can use different types of vocabularies, `VocabType.CHARACTER`,
        # `VocabType.SUBWORD` and `VocabType.TOKEN`.
        #
        # SUBWORD and CHARACTER are fully invertible -- but SUBWORD provides a
        # good tradeoff between CHARACTER and TOKEN.
        return text_problems.VocabType.SUBWORD

    @property
    def approx_vocab_size(self):
        """Approximate vocab size to generate. Only for VocabType.SUBWORD."""
        return 2**13  # ~8k

    @property
    def dataset_splits(self):
        """Splits to generate and the number of shards written for each."""
        # 8 shards for training and 1 each for evaluation and testing.
        # NOTE(review): an 8/1/1 shard layout reads as an 80%/10%/10% data
        # split only if one pool of samples is divided across all shards;
        # with `is_generate_per_split` returning True each split is generated
        # on its own -- confirm which behaviour is intended.
        return [{
            "split": problem.DatasetSplit.TRAIN,
            "shards": 8,
        }, {
            "split": problem.DatasetSplit.EVAL,
            "shards": 1,
        }, {
            "split": problem.DatasetSplit.TEST,
            "shards": 1,
        }]

    # END: Overridable methods.

# Generate the data

In [12]:
# Instantiate the problem and run its data generation, passing DATA_DIR as the
# data directory and TMP_DIR as the temporary directory.
sort_len_problem = SortWordsAccordingToLengthRandom()
sort_len_problem.generate_data(DATA_DIR, TMP_DIR)

In [None]:
# Short aliases for the estimator mode constants and the eager helpers.
Modes = tf.estimator.ModeKeys
tfe = tf.contrib.eager

# Build the evaluation dataset, wrap it in an eager iterator and pull one
# example out of it by calling next on the iterator.
eval_dataset = sort_len_problem.dataset(Modes.EVAL, DATA_DIR)
eager_iterator = tfe.Iterator(eval_dataset)
example = eager_iterator.next()

input_tensor, target_tensor = example["inputs"], example["targets"]

# What gets printed are the encoded tensors (encoded with the generated
# vocabulary file, which can be inspected in DATA_DIR), not the raw text.
print("Tensor Input: " + str(input_tensor))
print("Tensor Target: " + str(target_tensor))

In [None]:
# We use the encoders to decode the tensors to the actual input text.
# The encoder mapping is fetched once and indexed for both features instead of
# being requested twice with identical arguments.
feature_encoders = sort_len_problem.get_feature_encoders(data_dir=DATA_DIR)
input_encoder = feature_encoders["inputs"]
target_encoder = feature_encoders["targets"]

# `.numpy()` turns each eager tensor into an array that the encoder can decode.
input_decoded = input_encoder.decode(input_tensor.numpy())
target_decoded = target_encoder.decode(target_tensor.numpy())

print("Decoded Input: " + input_decoded)
print("Decoded Target: " + target_decoded)