# Set Up

In [1]:
# clear_output is used at the end of this cell to hide the installer logs.
from IPython.display import clear_output

# Install the T5 library quietly (note: no version is pinned here).
!pip install -q t5
# Install tensorflow-text pinned to version 2.12.0.
!pip install tensorflow-text==2.12.0


# Wipe this cell's output once both installs have run.
clear_output()

In [2]:
print("Installing dependencies...")
import functools
import os
import gin
import tensorflow_gcs_config
from google.colab import auth
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds
from contextlib import contextmanager
import logging as py_logging
import t5

Installing dependencies...


In [3]:
# GCS directory that holds the SentencePiece tokenizer files.
TOKENIZER_DIR = "gs://snippet-summarization/tokenizer" #@param { type: "string" }
if not TOKENIZER_DIR or TOKENIZER_DIR == "gs://":
  raise ValueError("You must enter a TOKENIZER_DIR.")

print("Setting up GCS access...")
os.environ['USE_AUTH_EPHEM'] = '0'
from google.colab import auth
auth.authenticate_user()

# Set credentials for GCS reading/writing from Colab and TPU.
TPU_TOPOLOGY = "2x2"
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  TPU_ADDRESS = tpu.get_master()
  print('Running on TPU:', TPU_ADDRESS)
except ValueError as err:
  # Raise a regular RuntimeError (still a BaseException subclass) instead of a
  # bare BaseException, so `except Exception` handlers and notebook tooling
  # treat it as an ordinary error; chain the original cause for debugging.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
tf.config.experimental_connect_to_host(TPU_ADDRESS)
tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API (tf.compat.v1).
tf.disable_v2_behavior()


#LOGGING
# Stop TF log records from propagating to the root logger, and show INFO and
# above on the root Python logger.
tf.get_logger().propagate = False
py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch the TensorFlow logging verbosity inside a `with` block.

  Args:
    level: verbosity value handed to `tf.logging.set_verbosity`.

  Yields:
    None. The verbosity that was active on entry is restored on exit, also
    when the body of the `with` block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Always restore, otherwise an exception inside the block would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Setting up GCS access...


Instructions for updating:
non-resource variables are not supported in the long term


Running on TPU: grpc://10.51.40.210:8470


# Load Vocabulary

In [4]:
# File-name prefix shared by the SentencePiece model and vocabulary files.
VOCAB_PREFIX = 'sp' #@param {type: "string"}
# Derive "<prefix>.model" and "<prefix>.vocab" under the tokenizer directory.
vocab_model_path, vocab_path = (
    os.path.join(TOKENIZER_DIR, f'{VOCAB_PREFIX}.{extension}')
    for extension in ('model', 'vocab')
)
print(vocab_model_path, vocab_path, sep='\n')


gs://snippet-summarization/tokenizer/sp.model
gs://snippet-summarization/tokenizer/sp.vocab


In [5]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

# Second positional argument passed to SentencePieceVocabulary below.
num_special_mask_tokens = 100 #@param {type: "integer"}

def load_vocabulary():
  """Create a SentencePieceVocabulary from the notebook's tokenizer model.

  Reads the module-level `vocab_model_path` and `num_special_mask_tokens`.
  Every call constructs and returns a new vocabulary object.
  """
  vocabulary = SentencePieceVocabulary(
      vocab_model_path,
      num_special_mask_tokens,
  )
  return vocabulary

# Prepare Dataset for T5

In [18]:
# One TSV file per split, stored on GCS.
train_path = 'gs://snippet-summarization/data/snippet-summarization/train.tsv' #@param { type: "string" }
eval_path = 'gs://snippet-summarization/data/snippet-summarization/eval.tsv' #@param { type: "string" }
test_path = 'gs://snippet-summarization/data/snippet-summarization/test.tsv' #@param { type: "string" }

# Split name -> file path; indexed by load_dataset(split).
finetune_datasets_paths = dict(
    train=train_path,
    validation=eval_path,
    test=test_path,
)



In [19]:
def load_dataset(split, shuffle_files=True):
  """Load one split of the tab-separated dataset as a tf.data.Dataset.

  Args:
    split: key of `finetune_datasets_paths` ("train", "validation" or "test").
      An unknown key raises KeyError.
    shuffle_files: accepted so the function can be used as a T5 `dataset_fn`,
      but ignored because every split is a single file.

  Returns:
    A tf.data.Dataset whose elements are dicts with the string tensors
    "input" and "output", one element per line of the TSV file.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(finetune_datasets_paths[split])

  # `record_defaults` supplies the per-column DEFAULT VALUE (its dtype fixes the
  # column type). The empty string keeps both columns typed as strings without
  # substituting the literal text "string" for empty fields.
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=True)
  ds = ds.map(parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Turn each (input, output) tuple into a dict keyed by column name.
  ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
  return ds

### A few examples

In [20]:
print("A few raw validation examples...")
# Only the first five rows are converted to NumPy values for display.
raw_validation_preview = tfds.as_numpy(load_dataset("validation").take(5))
for ex in raw_validation_preview:
  print(ex)

A few raw validation examples...
{'input': b'@ Override <nl> protected void paintText ( @ NotNull Graphics graphics , <nl> @ NotNull AbstractButton component , <nl> @ NotNull Rectangle textRect , <nl> @ NotNull String text ) { <nl> CommonDropDownButton button = ( CommonDropDownButton ) component ; <nl> <start> if ( button . getAction ( ) . getShowExpandArrow ( ) ) { <end> <nl>  <nl> <start> textRect . x -= ARROW_REGION_WIDTH / 2 ; <end> <nl> <start> } <end> <nl> super . paintText ( graphics , button , textRect , text ) ; <nl> }<nl>', 'output': b'offset the text rect to reserve space for the arrow'}
{'input': b'private void configureListener ( XStream xStream ) <nl> { <nl> xStream . alias ( "listener" , Listener . class ) ; <nl> for ( String panelAttribute : LISTENER_ATTRIBUTE ) <nl> { <nl> xStream . aliasAttribute ( Listener . class , panelAttribute , panelAttribute ) ; <nl> } <nl>  <nl> <start> xStream . addImplicitCollection ( Listener . class , "os" , OsModel . class ) ; <end> <nl> 

# Dataset Preprocessing

In [21]:
from tensorflow_datasets.core.utils.type_utils import Shape
def preprocessing(ds):
  """Convert raw {'input', 'output'} examples into T5 text-to-text examples.

  Args:
    ds: tf.data.Dataset of dicts with the string tensors 'input' and 'output'.

  Returns:
    A tf.data.Dataset of dicts with:
      'inputs':  the task prefix followed by the input text, whitespace-stripped.
      'targets': the output text, whitespace-stripped.
  """
  prefix = 'SNIPPET SUMMARIZATION: ' #@param {type : "string"}

  def to_inputs_and_targets(ex):
    # Prepend the task prefix, then trim surrounding whitespace.
    source_text = tf.strings.strip(prefix + ex['input'])
    target_text = tf.strings.strip(ex['output'])
    return {'inputs': source_text, 'targets': target_text}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

### A few examples

In [22]:
print("A few preprocessed train examples...")
# Take five raw training rows first, then run them through preprocessing.
train_head = load_dataset("train").take(5)
sample = tfds.as_numpy(preprocessing(train_head))
for ex in sample:
  print(ex)

A few preprocessed train examples...
{'inputs': b'SNIPPET SUMMARIZATION: private final void writeFinalWovenProxyMethods ( ) { <nl> // add private fields for the Callable<Object> dispatcher <nl> // and InvocationListener. These aren\'t static because we can have <nl> // multiple instances of the same proxy class. These should not be <nl> // serialized, or used in JPA or any other thing we can think of, <nl> // so we annotate them as necessary <nl> generateField(DISPATCHER_FIELD, Type.getDescriptor(Callable.class)); <nl> generateField(LISTENER_FIELD, Type.getDescriptor(InvocationListener.class)); <nl> // a general methodAdapter field that we will use to with GeneratorAdapters <nl> // to create the methods required to implement WovenProxy <nl> GeneratorAdapter methodAdapter; <nl> // add a method for unwrapping the dispatcher <nl> methodAdapter = getMethodGenerator(PUBLIC_GENERATED_METHOD_ACCESS, new Method( <nl> "org_apache_aries_proxy_weaving_WovenProxy_unwrap", DISPATCHER_TYPE, <nl> NO_

# Creating Task and Mixture

In [23]:
# Output features for the task. Each load_vocabulary() call builds its own
# vocabulary object from the same SentencePiece model; both features append EOS.
DEFAULT_OUTPUT_FEATURES = {
    "inputs": Feature(
        vocabulary=load_vocabulary(), add_eos=True, required=False),
    "targets": Feature(
        vocabulary=load_vocabulary(), add_eos=True)
    }

TASK_NAME = "snippet_summarization" #@param{ type : "string"}

# TASK
# Remove any earlier registration under the same name before adding it again
# (presumably so the cell can be re-run — confirm registry behaviour).
t5.data.TaskRegistry.remove(TASK_NAME)
t5.data.TaskRegistry.add(
    TASK_NAME,
    # Function which returns a tf.data.Dataset
    dataset_fn=load_dataset,
    splits=["train","validation","test"],
    # List of functions that preprocess the input tf.data.Dataset
    text_preprocessor=[preprocessing],
    # Accuracy and BLEU are used as evaluation metrics
    metric_fns=[t5.evaluation.metrics.accuracy, t5.evaluation.metrics.bleu],
    # Not required, helps for mixing and auto-caching
    # num_input_examples=num_input_examples,
    output_features = DEFAULT_OUTPUT_FEATURES
)

MIXTURE_NAME = "task" #@param{ type : "string"}

# MIXTURE
# Same remove-then-add pattern as for the task above; the mixture contains only
# the single task registered in this cell.
t5.data.MixtureRegistry.remove(MIXTURE_NAME)
t5.data.MixtureRegistry.add(
    MIXTURE_NAME,
    # List of tasks
    [TASK_NAME],
    default_rate=1.0
)

<seqio.dataset_providers.Mixture at 0x7d0eeac1a590>

### A few examples

# Creating Model

In [24]:
# Learning-rate schedule selector. The schedule-selection cell maps
# "polynomial", "isr" and "slanted" to schedule callables; any other value
# (including "constant") falls back to a fixed learning rate of 0.001.
scheduler = "isr" #@param ["polynomial", "constant", "isr", "slanted"]

In [25]:
# Storage paths
# Passed as `model_dir` when the MtfModel is constructed.
FINETUNE_MODEL_DIR = 'gs://snippet-summarization/models/snippet-summarizer/Best-Performing-Model/Pre-trained'
# Passed as `pretrained_model_dir` to model.finetune().
PRETRAIN_MODEL_DIR = "gs://snippet-summarization/models/pre-trained"
# FLAGS = tf.app.flags.FLAGS
# tf.app.flags.DEFINE_string ('f', '', 'kernel')

In [26]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Learning rate properties
starter_learning_rate = 0.01 #@param {type : "number"}
end_learning_rate = 0.001 #@param {type : "number"}
decay_steps = 10000 #@param {type : "integer"}

# Polynomial decay from the starter rate to the end rate over `decay_steps`
# steps; power=0.5 gives a square-root shaped decay.
learning_rate_fn = PolynomialDecay(
    initial_learning_rate=starter_learning_rate,
    decay_steps=decay_steps,
    end_learning_rate=end_learning_rate,
    power=0.5,
)

In [27]:
from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular, truncated_rsqrt
from t5 import models

# Learning rate schedule fn
# Dispatch on the selected scheduler name. The values are thunks so that only
# the chosen schedule is looked up; unknown names (e.g. "constant") fall back
# to a fixed learning rate of 0.001.
schedule_by_name = {
    'polynomial': lambda: learning_rate_fn,
    'isr': lambda: truncated_rsqrt,
    'slanted': lambda: slanted_triangular,
}
learning_rate_scheduler = schedule_by_name.get(scheduler, lambda: 0.001)()

print(learning_rate_scheduler)

# Model properties
# Each entry is (model_parallelism, train_batch_size, keep_checkpoint_max) for
# the given model size; the tuple for MODEL_SIZE is unpacked below.
MODEL_SIZE = "small"
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 16, 100),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


# Mesh Tensorflow Transformer
# Checkpoints are written under FINETUNE_MODEL_DIR; the TPU address and
# topology come from the set-up cell.
model = t5.models.MtfModel(
    model_dir=FINETUNE_MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    # Sequence lengths for inputs and targets (presumably in tokens — confirm).
    sequence_length={"inputs": 1500, "targets": 256},
    # pick the correct scheduler, according to the model you want to train
    learning_rate_schedule = learning_rate_scheduler,
    save_checkpoints_steps=5000,
    keep_checkpoint_max=keep_checkpoint_max,
    iterations_per_loop=100,
)

<function truncated_rsqrt at 0x7d0f89f51a20>


# Learning Scheduler

In [17]:
# Upload one of the four gin files according to the selected scheduler.
# The file must exist at this local Colab path before the fine-tuning cell runs,
# because that cell passes it to gin.parse_config_file.
LOCAL_GIN_PATH = "/content/operative_config.gin"

# Finetuning the Model

In [None]:
from t5 import models

# Number of fine-tuning steps passed to model.finetune().
TRAIN_STEPS = 500000 #@param {type: "integer"}
with gin.unlock_config():
    # Apply the uploaded gin configuration before training starts.
    gin.parse_config_file(LOCAL_GIN_PATH)
    # Start finetuning
    model.finetune(mixture_or_task_name=MIXTURE_NAME,
                   finetune_steps=TRAIN_STEPS,
                   pretrained_model_dir=PRETRAIN_MODEL_DIR)
    # model.train(mixture_or_task_name=MIXTURE_NAME,
    #             steps=TRAIN_STEPS)

    # Use a batch size of 32 for evaluation. The attribute is `batch_size`;
    # the previous spelling `bach_size` only created an unused attribute, so
    # the evaluation batch size was never actually changed.
    model.batch_size = 32
    # checkpoint_steps=-1 is expected to select the latest checkpoint
    # (see the T5 MtfModel.eval documentation).
    model.eval(
        mixture_or_task_name=MIXTURE_NAME,
        checkpoint_steps=-1,
        split="test"
    )


INFO:root:system_path_file_exists:gs://snippet-summarization/models/pre-trained/operative_config.gin
ERROR:root:Path not found: gs://snippet-summarization/models/pre-trained/operative_config.gin
From /usr/local/lib/python3.10/dist-packages/mesh_tensorflow/transformer/utils.py:2043: TPUConfig.__new__ (from tensorflow_estimator.python.estimator.tpu.tpu_config) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.keras instead.
From /usr/local/lib/python3.10/dist-packages/mesh_tensorflow/transformer/utils.py:2059: RunConfig.__init__ (from tensorflow_estimator.python.estimator.tpu.tpu_config) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.keras instead.
From /usr/local/lib/python3.10/dist-packages/tensorflow_estimator/python/estimator/tpu/tpu_config.py:268: RunConfig.__init__ (from tensorflow_estimator.python.estimator.run_config) is deprecated and will be removed in a future version.
Instructions for updating:
U