# Setup

Before we can start using the `tensor2tensor` models, we first have to get our data into a format that `tensor2tensor` can digest. This means defining a custom `Problem` as follows:

In [1]:
#@title Run this only once - Sets up TF Eager execution.

import tensorflow as tf

# Enable Eager execution - useful for seeing the generated data.
# NOTE(review): per the cell title this should be run only once per kernel;
# presumably it has to happen before any other TensorFlow work - confirm.
tf.enable_eager_execution()

In [2]:
#@title Setting a random seed.

from tensor2tensor.utils import trainer_lib

# Set a seed so that we have deterministic outputs.
# The value is handed to tensor2tensor's `trainer_lib.set_random_seed`; which
# random number generators that helper seeds is not visible from this notebook.
RANDOM_SEED = 301
trainer_lib.set_random_seed(RANDOM_SEED)

W0904 18:18:29.019985 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/utils/expert_utils.py:68: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W0904 18:18:31.300210 140425123104576 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0904 18:18:32.220480 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/utils/metrics_hook.py:28: The name tf.train.SessionRunHook is deprecated. Please use tf.estimator.SessionRunHook instead.

W0904 18:18:32.226704 140425123104576 deprecation_

In [5]:
#@title Run for setting up directories.

import os

# Data, output and temporary directories for the full-context experiment.
DATA_DIR = os.path.expanduser(
    "../data/t2t_experiments/transformer/low_resource/full_context/data")
OUTPUT_DIR = os.path.expanduser(
    "../data/t2t_experiments/transformer/low_resource/full_context/output")
TMP_DIR = os.path.expanduser("/mnt/")

# Make sure each of them exists (data first, then output, then temporary).
for _experiment_dir in (DATA_DIR, OUTPUT_DIR, TMP_DIR):
    tf.gfile.MakeDirs(_experiment_dir)

In [6]:
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry

# Define the problem

In [7]:
@registry.register_problem
class MimicDischargeSummaries(text_problems.Text2TextProblem):
    """Text-to-text problem pairing context data with discharge summaries.

    Samples are read from pre-split, line-aligned text files under
    ``../data/preprocessed/low_resource/``: line *i* of the source file is the
    input and line *i* of the target file is the target.
    """

    @property
    def is_generate_per_split(self):
        # Our data already has pre-existing splits so we return True.
        return True

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield one ``{"inputs", "targets"}`` dict per line of the split's files.

        Args:
            data_dir: Data directory of the experiment. Only its path is
                inspected here. If it contains "full_context", the
                full-context source file is read. Otherwise the name of its
                parent directory (e.g. "h-gae" for
                ".../other_contexts/h-gae/data") selects the source file
                under "other_contexts/".
            tmp_dir: Unused.
            dataset_split: A ``problem.DatasetSplit`` value. TRAIN reads the
                "train" files, EVAL the "val" files and anything else the
                "test" files.

        Yields:
            dict: The source line under "inputs" and the target line with the
            same line number under "targets". Lines are yielded exactly as
            read (trailing newlines included). If the target file has fewer
            lines than the source file, the remaining targets are "".
        """
        del tmp_dir  # Samples come from local preprocessed files only.

        if dataset_split == problem.DatasetSplit.TRAIN:
            dataset = "train"
        elif dataset_split == problem.DatasetSplit.EVAL:
            dataset = "val"
        else:
            dataset = "test"

        directory = "../data/preprocessed/low_resource/"
        tgt = directory + "tgt-" + dataset + ".txt"

        if "full_context" in str(data_dir):
            src = directory + "src-" + dataset + ".txt"
        else:
            # The context name is the directory that holds the "data" folder
            # (".../<context>/data"). Taking it from the path components
            # instead of fixed character offsets keeps this working when the
            # experiment root is moved or renamed.
            context = os.path.basename(
                os.path.dirname(os.path.normpath(str(data_dir))))
            src = (directory + "other_contexts/" +
                   "src-" + dataset + "-" + context + ".txt")

        # `with` closes both files even if the generator is abandoned early or
        # an error is raised while the samples are being consumed.
        with open(src, 'r') as f_src, open(tgt, 'r') as f_tgt:
            for context_data in f_src:
                yield {
                    "inputs": context_data,
                    "targets": f_tgt.readline(),
                }

    @property
    def vocab_type(self):
        # SUBWORD and CHARACTER are fully invertible -- but SUBWORD provides a good
        # tradeoff between CHARACTER and TOKEN.
        return text_problems.VocabType.SUBWORD

    @property
    def approx_vocab_size(self):
        # Approximate vocab size to generate. Only for VocabType.SUBWORD.
        return 2**15  # ~32k - this is the default setting

    @property
    def dataset_splits(self):
        # Shard counts per split: 80 for train, 10 for eval and 10 for test.
        return [{
            "split": problem.DatasetSplit.TRAIN,
            "shards": 80
        }, {
            "split": problem.DatasetSplit.EVAL,
            "shards": 10
        }, {
            "split": problem.DatasetSplit.TEST,
            "shards": 10
        }]

# Generate the data

First, we instantiate the problem and run it for the full context data.

In [8]:
# Instantiate the problem defined above and generate its data for the
# full-context setting: DATA_DIR contains "full_context", which is what
# `generate_samples` checks to pick the full-context source file.
# TMP_DIR is passed along, but `generate_samples` itself discards it.
mimic_problem = MimicDischargeSummaries()
mimic_problem.generate_data(DATA_DIR, TMP_DIR)

W0904 18:19:54.778185 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/generator_utils.py:343: The name tf.gfile.Exists is deprecated. Please use tf.io.gfile.exists instead.

W0904 18:19:54.779488 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/generator_utils.py:349: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

W0904 18:20:43.752292 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/generator_utils.py:355: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

W0904 18:20:43.754397 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/text_encoder.py:944: The name tf.gfil

Next, we run it in a loop, once for each individual context type.

In [7]:
context_list = ['h','h-gae','h-gae-d','h-gae-p','h-gae-d-p','h-gae-d-p-m','h-gae-d-p-m-t','h-gae-d-p-m-l']

# The temporary directory is shared by every context, so create it once.
tf.gfile.MakeDirs(TMP_DIR)

for context in context_list:
    # Use loop-local names for the per-context directories. Rebinding the
    # notebook-wide DATA_DIR / OUTPUT_DIR here would silently point every
    # later cell at the last context instead of the full-context data.
    context_data_dir = os.path.expanduser("../data/t2t_experiments/other_contexts/"+context+"/data")
    context_output_dir = os.path.expanduser("../data/t2t_experiments/other_contexts/"+context+"/output")

    # Create them.
    tf.gfile.MakeDirs(context_data_dir)
    tf.gfile.MakeDirs(context_output_dir)

    # `generate_samples` derives the context name from this directory path.
    mimic_problem.generate_data(context_data_dir, TMP_DIR)

W0802 19:05:31.212889 139639908783936 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/generator_utils.py:343: The name tf.gfile.Exists is deprecated. Please use tf.io.gfile.exists instead.

W0802 19:05:31.213634 139639908783936 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/generator_utils.py:349: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

W0802 19:08:39.032947 139639908783936 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/generator_utils.py:355: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

W0802 19:08:39.034780 139639908783936 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/text_encoder.py:944: The name tf.gfil

# View the generated data

In [9]:
# Short aliases for the eager helpers and the estimator mode constants.
tfe = tf.contrib.eager
Modes = tf.estimator.ModeKeys

# Build the evaluation dataset for DATA_DIR, wrap it in an eager iterator and
# pull a single example from it.
eval_dataset = mimic_problem.dataset(Modes.EVAL, DATA_DIR)
eager_iterator = tfe.Iterator(eval_dataset)
example = eager_iterator.next()

input_tensor, target_tensor = example["inputs"], example["targets"]

# The tensors hold ids from the generated vocabulary rather than raw text --
# the vocab file itself can be inspected in DATA_DIR.
print("Tensor Input: " + str(input_tensor))
print("Tensor Target: " + str(target_tensor))

W0904 18:22:27.944880 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/text_problems.py:394: The name tf.VarLenFeature is deprecated. Please use tf.io.VarLenFeature instead.

W0904 18:22:27.945790 140425123104576 deprecation_wrapper.py:119] From /home/aa5118/anaconda3/envs/tf/lib/python3.7/site-packages/tensor2tensor/data_generators/problem.py:705: The name tf.FixedLenFeature is deprecated. Please use tf.io.FixedLenFeature instead.



Tensor Input: tf.Tensor(
[   79    71    52  1899     6    43     6    33    19    57    71   429
   367     3   770     4   364     3  1149     4   365     3   325     4
   366     3   339    60  2054    94     7   949   139  2915    24   176
   136   891  1307     7   582   146   149    60    94     7   136   298
   149     7    94   822   245     7   705   303   741     7   102   591
   124    15   325    54   820     7  2749   268   741    60   630   444
   761   176    55   828   891    94     4   362   277   368     3  1156
  1345  1564     5    21     9  7064  2925     7  1156  1345  3868     5
   105     9 11939 12298 11326     7   674     5  1825   968     7  1433
  4825  6989     5 22037 11326     4   235     3   812    11    92     4
   343     3   227   232     5 11345     5   111     6    81     5    30
     7   191     5  1098     5    18     6    37     5    30     7   225
     5   170     5    48     6    38     5    30     7   230   229     5
   113     5    48     6  

The cell below is not executed in order to protect patient privacy. Executing it will show the decoded context data and discharge summary.

In [11]:
# Look up the encoders for both features; they turn the id tensors back into
# the actual text.
# NOTE: the printed output is decoded patient text -- do not share or commit
# this cell's output.
input_encoder = mimic_problem.get_feature_encoders(data_dir=DATA_DIR)["inputs"]
target_encoder = mimic_problem.get_feature_encoders(data_dir=DATA_DIR)["targets"]

# Convert the eager tensors to NumPy arrays first, then decode them.
input_ids = input_tensor.numpy()
target_ids = target_tensor.numpy()
input_decoded = input_encoder.decode(input_ids)
target_decoded = target_encoder.decode(target_ids)

print("Decoded Input: " + input_decoded)
print("Decoded Target: " + target_decoded)