In [None]:
%tensorflow_version 2.x
!pip3 install --upgrade pip
# Install T5 pinned to the exact commit used for the experiments, so the notebook keeps working with the same settings.
!pip3 install git+https://github.com/google-research/text-to-text-transfer-transformer.git@9499f06c5c34a65c2b7254a1a5f1077327115e9d


import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Base directory for this run: a Google Cloud Storage bucket path.
BASE_DIR = "gs://" #@param { type: "string" }

# Stop right away while BASE_DIR is empty or still the bare "gs://" placeholder.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
# Gates the Colab/TPU/GCS setup and the logging tweaks further down this cell.
ON_CLOUD = True


if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # TPU topology string handed to the model constructor later in the notebook.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary RuntimeError instead of a bare BaseException: it is
    # still caught by anything catching BaseException, but it no longer slips
    # past `except Exception` handlers, and `from err` keeps the resolver's
    # original error visible in the traceback.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API exposed by tensorflow.compat.v1.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Keep TensorFlow's log records from being passed on to ancestor loggers.
  tf_logger = tf.get_logger()
  tf_logger.propagate = False
  # Let INFO-level records through the root logger.
  py_logging.getLogger().setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch TensorFlow's logging verbosity to `level`.

  Meant to be used in a `with` statement. The verbosity that was active on
  entry is put back on exit, including when the body of the `with` raises.

  Args:
    level: verbosity value passed straight to `tf.logging.set_verbosity`.

  Yields:
    None.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # An exception from the `with` body is re-raised at the `yield`; without
    # try/finally the restore below would be skipped and the temporary
    # verbosity would stay in effect for the rest of the session.
    tf.logging.set_verbosity(og_level)

In [2]:
train_path_javadoc = "gs://" #@param { type: "string" }
eval_path_javadoc = "gs://" #@param { type: "string" }
# TSV file to read for each split of the javadoc task.
nq_tsv_path_javadoc = dict(train=train_path_javadoc, validation=eval_path_javadoc)
# Number of examples in each split of the javadoc task.
num_nq_examples_javadoc = {"train": 1398135, "validation": 175084}

In [3]:
train_path_inside = "gs://" #@param { type: "string" }
eval_path_inside = "gs://" #@param { type: "string" }

# TSV file to read for each split of the block/inside-comment task.
nq_tsv_path_inside = dict(train=train_path_inside, validation=eval_path_inside)

# Number of examples in each split of the block/inside-comment task.
num_nq_examples_inside = {"train": 272994, "validation": 34138}

In [4]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary


# Paths of the SentencePiece model and vocab files.
# They must be the same files that were used in the pre-training phase.
vocab_model_path = 'gs://' #@param { type: "string" }
# NOTE(review): vocab_path is not read anywhere else in the visible notebook — confirm it is still needed.
vocab_path = 'gs://' #@param { type: "string" }


# Short aliases for the t5.data registry and task classes.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask


def get_default_vocabulary():
  """Create a SentencePieceVocabulary backed by the file at `vocab_model_path`.

  Every call builds a fresh vocabulary object. The second positional argument
  is fixed at 100 (presumably the number of extra/sentinel ids — confirm
  against the SentencePieceVocabulary signature).
  """
  second_positional_arg = 100
  vocabulary = SentencePieceVocabulary(vocab_model_path, second_positional_arg)
  return vocabulary

# Feature specs passed as `output_features` to both task registrations.
# "inputs" is marked as not required; both features append EOS and each one
# gets its own vocabulary instance.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(
        vocabulary=get_default_vocabulary(),
        add_eos=True,
        required=False,
    ),
    targets=Feature(
        vocabulary=get_default_vocabulary(),
        add_eos=True,
    ),
)

In [None]:
def nq_dataset_javadoc(split, shuffle_files=True):
  """Load the raw javadoc examples of one split as a dataset of dicts.

  Args:
    split: key into `nq_tsv_path_javadoc` selecting the TSV file to read.
    shuffle_files: accepted and discarded; each split is a single file.

  Returns:
    A dataset whose elements are dicts with the keys "input" and "output",
    holding the two tab-separated columns of each line of the file.
  """
  # We only have one file for each split.
  del shuffle_files

  # NOTE(review): `record_defaults` supplies per-column default *values*, so an
  # empty field is read back as the literal text "string" — confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string", "string"],
      field_delim="\t",
      use_quote_delim=True)

  def to_example(*columns):
    # Pair the parsed columns with their feature names.
    return dict(zip(["input", "output"], columns))

  # Load lines from the text file as examples.
  text_lines = tf.data.TextLineDataset(nq_tsv_path_javadoc[split])
  parsed_lines = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_lines.map(to_example)

# Sanity check: show the first five raw examples of the javadoc training split.
print("A few raw train examples...")
raw_javadoc_preview = nq_dataset_javadoc("train").take(5)
for ex in tfds.as_numpy(raw_javadoc_preview):
  print(ex)

In [6]:
def javadoc_preprocessing(ds):
  """Turn raw javadoc examples into the text-to-text 'inputs'/'targets' form.

  Args:
    ds: dataset of dicts with the string features 'input' and 'output'.

  Returns:
    A dataset of dicts where 'inputs' is the task prefix
    'complete javadoc comment:' directly followed by the raw input, and
    'targets' is the raw output.
  """

  def to_inputs_and_targets(ex):
    prefixed_input = 'complete javadoc comment:' + ex['input']
    # Each join receives a single-element list, so `separator` never appears
    # in the resulting strings.
    return {
        'inputs': tf.strings.join([prefixed_input], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  return ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Drop any earlier "javadoc" registration, then register the task again.
t5.data.TaskRegistry.remove('javadoc')
javadoc_task_options = dict(
    dataset_fn=nq_dataset_javadoc,
    splits=["train", "validation"],
    text_preprocessor=[javadoc_preprocessing],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_javadoc,
)
t5.data.TaskRegistry.add("javadoc", **javadoc_task_options)

In [None]:
# Sanity check: run the registered "javadoc" task pipeline and show five examples.
nq_task = t5.data.TaskRegistry.get("javadoc")
ds = nq_task.get_dataset(
    split="train",
    sequence_length=dict(inputs=256, targets=256))
print("A few preprocessed training examples...")
preprocessed_preview = ds.take(5)
for ex in tfds.as_numpy(preprocessed_preview):
  print(ex)


In [None]:
def nq_dataset_inside(split, shuffle_files=False):
  """Load the raw block/inside-comment examples of one split as a dataset of dicts.

  Args:
    split: key into `nq_tsv_path_inside` selecting the TSV file to read.
    shuffle_files: accepted and discarded; each split is a single file.

  Returns:
    A dataset whose elements are dicts with the keys "input" and "output",
    holding the two tab-separated columns of each line of the file.
  """
  # We only have one file for each split.
  del shuffle_files

  # NOTE(review): `record_defaults` supplies per-column default *values*, so an
  # empty field is read back as the literal text "string" — confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string", "string"],
      field_delim="\t",
      use_quote_delim=True)

  def to_example(*columns):
    # Pair the parsed columns with their feature names.
    return dict(zip(["input", "output"], columns))

  # Load lines from the text file as examples.
  text_lines = tf.data.TextLineDataset(nq_tsv_path_inside[split])
  parsed_lines = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_lines.map(to_example)

# Sanity check: show the first five raw examples of the inside-comment validation split.
print("A few raw valid examples...")
raw_inside_preview = nq_dataset_inside("validation").take(5)
for ex in tfds.as_numpy(raw_inside_preview):
  print(ex)


In [11]:
def inside_preprocessing(ds):
  """Turn raw block/inside-comment examples into 'inputs'/'targets' form.

  Args:
    ds: dataset of dicts with the string features 'input' and 'output'.

  Returns:
    A dataset of dicts where 'inputs' is the task prefix
    'complete block/inside comment:' directly followed by the raw input, and
    'targets' is the raw output.
  """

  def to_inputs_and_targets(ex):
    prefixed_input = 'complete block/inside comment:' + ex['input']
    # Each join receives a single-element list, so `separator` never appears
    # in the resulting strings.
    return {
        'inputs': tf.strings.join([prefixed_input], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  return ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [12]:
# Create the second training task: drop any earlier "inside" registration,
# then register it again.
t5.data.TaskRegistry.remove('inside')
inside_task_options = dict(
    dataset_fn=nq_dataset_inside,
    splits=["train", "validation"],
    text_preprocessor=[inside_preprocessing],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_inside,
)
t5.data.TaskRegistry.add("inside", **inside_task_options)

<t5.data.dataset_providers.FunctionTask at 0x7fca8ba68f10>

In [None]:
# Sanity check: run the registered "inside" task pipeline and show five examples.
nq_task = t5.data.TaskRegistry.get("inside")
ds = nq_task.get_dataset(
    split="train",
    sequence_length=dict(inputs=256, targets=256))
print("A few preprocessed training examples...")
preprocessed_preview = ds.take(5)
for ex in tfds.as_numpy(preprocessed_preview):
  print(ex)

#SECOND TASK HAS BEEN CREATED

In [None]:
def _rate_num_input_examples(task):
  if "train" in task.splits:
    return float(task.num_input_examples("train"))
  elif "validation" in task.splits:
    return float(task.num_input_examples("validation"))
  else:
    raise ValueError("Task %s does not have a train or validation split." % (task.name))


# Drop any earlier "all_tasks" mixture, then register it again with both tasks,
# each weighted by the rate function defined above.
t5.data.MixtureRegistry.remove("all_tasks")
mixture_task_names = ["javadoc", "inside"]
t5.data.MixtureRegistry.add(
    "all_tasks",
    mixture_task_names,
    default_rate=_rate_num_input_examples,
    #default_rate=1.0
)

In [16]:
from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular 
MODEL_SIZE = "small" 

MODEL_DIR = 'gs://'#@param { type: "string" }

# Specify the pre-trained dir which must contain the pre-trained models, the operative_config.gin file and the checkpoint file as well
PRETRAINED_DIR='gs://'#@param { type: "string" }
#'gs://bucket_comment_completion/pretrained_model_extra_id/'


# Per model size: (model_parallelism, train_batch_size, keep_checkpoint_max).
settings_by_model_size = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1),
}
model_parallelism, train_batch_size, keep_checkpoint_max = settings_by_model_size[MODEL_SIZE]

#tf.io.gfile.makedirs(MODEL_DIR)

# The per-size checkpoint limit is only passed on when ON_CLOUD is set.
checkpoint_limit = keep_checkpoint_max if ON_CLOUD else None

model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    learning_rate_schedule=slanted_triangular,
    sequence_length=dict(inputs=256, targets=256),
    save_checkpoints_steps=5000,
    keep_checkpoint_max=checkpoint_limit,
    iterations_per_loop=100,
)

In [None]:
# The gin config file must be available in the Colab filesystem at PATH_GIN_FILE
# (upload it first if needed).

PATH_GIN_FILE = '/content/operative_config.gin' #@param { type: "string" }
import gin


# Uncomment the following to fine-tune a pre-trained model.
# NOTE(review): the call below passes MODEL_DIR as pretrained_model_dir, while
# PRETRAINED_DIR is defined earlier and never used — confirm which one is intended.
# with gin.unlock_config():
#     gin.parse_config_file(PATH_GIN_FILE)
#     #RUN FINE-TUNING
#     FINETUNE_STEPS = xxxxxx
#     model.finetune(
#         mixture_or_task_name="all_tasks",
#         pretrained_model_dir=MODEL_DIR,
#         finetune_steps=FINETUNE_STEPS
#     )

# The following is used when there is no pre-trained model (the ablation study).
with gin.unlock_config():
    gin.parse_config_file(PATH_GIN_FILE)
    # Train from scratch on the "all_tasks" mixture (this is not fine-tuning).
    TRAIN_STEPS = 100000
    model.train("all_tasks", TRAIN_STEPS)


In [None]:
# Use a larger batch size for evaluation, which requires less memory.
# NB: if this value is too high, lower the batch size.
eval_batch_size = 1024
model.batch_size = eval_batch_size

eval_options = dict(
    mixture_or_task_name="all_tasks",
    checkpoint_steps=-1,  # evaluate only the last checkpoint
)
model.eval(**eval_options)

In [None]:
# Run this cell to get the scores/confidence of the last checkpoint.

BASE_DIR_VALIDATION = 'gs://' #@param { type: "string" }

# NOTE(review): the '..' components look like placeholders for the real file
# names — as written, all three paths are identical. Confirm before running.
input_file = os.path.join(BASE_DIR_VALIDATION, '..')
target_file = os.path.join(BASE_DIR_VALIDATION, '..')
output_score_file = os.path.join(BASE_DIR_VALIDATION, '..')

vocabulary_predict = get_default_vocabulary()

score_options = dict(
    inputs=input_file,
    targets=target_file,
    scores_file=output_score_file,
    checkpoint_steps=-1,
    vocabulary=vocabulary_predict,
)
model.score(**score_options)