In [None]:
from IPython.display import clear_output 
!pip install tensorflow==2.9 t5 tensorflow-text==2.9
#!pip install -q t5 tensorflow-text==2.4.3
#!pip install -q tensorflow-text==2.8.0rc0

clear_output()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.9
  Downloading tensorflow-2.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 205 bytes/s 
[?25hCollecting t5
  Downloading t5-0.9.3-py3-none-any.whl (153 kB)
[K     |████████████████████████████████| 153 kB 72.2 MB/s 
[?25hCollecting tensorflow-text==2.9
  Downloading tensorflow_text-2.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 61.6 MB/s 
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 55.3 MB/s 
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 11

In [None]:
print("Installing dependencies...")
import functools
import os
import gin
import tensorflow_gcs_config
from google.colab import auth
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds
from contextlib import contextmanager
import logging as py_logging
import t5

In [None]:
# Bucket folder that holds the SentencePiece tokenizer files (Colab form field).
TOKENIZER_DIR = "gs://github-actions-generation/tokenizer" #@param { type: "string" }
if not TOKENIZER_DIR or TOKENIZER_DIR == "gs://":
  raise ValueError("You must enter a TOKENIZER_DIR.")

print("Setting up GCS access...")
os.environ['USE_AUTH_EPHEM'] = '0'
# `auth` comes from the notebook's import cell; this opens the Colab login prompt.
auth.authenticate_user()

# Set credentials for GCS reading/writing from Colab and TPU.
TPU_TOPOLOGY = "2x2"
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  TPU_ADDRESS = tpu.get_master()
  print('Running on TPU:', TPU_ADDRESS)
except ValueError as err:
  # Raise a regular Exception subclass (not a bare BaseException) so ordinary
  # `except Exception` handlers and tooling see it, and keep the original cause chained.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
tf.config.experimental_connect_to_host(TPU_ADDRESS)
tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API exposed by tensorflow.compat.v1.
tf.disable_v2_behavior()


# LOGGING: stop TF's logger from propagating to the root logger and set the root
# logger to INFO.
tf.get_logger().propagate = False
py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch TensorFlow's logging verbosity for the body of a `with` block.

  Args:
    level: verbosity value handed to `tf.logging.set_verbosity` while the block runs.

  Yields:
    None. The previous verbosity is restored when the block exits, including when
    the block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Restore even if the body raised, so a failed step does not leave the
    # notebook stuck at the temporary verbosity.
    tf.logging.set_verbosity(og_level)

In [None]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary


# File stem shared by the SentencePiece model and its vocab listing in TOKENIZER_DIR.
VOCAB_PREFIX = 'sp-actions-bpe'
vocab_model_path, vocab_path = (
    os.path.join(TOKENIZER_DIR, VOCAB_PREFIX + suffix)
    for suffix in ('.model', '.vocab')
)
print(vocab_model_path)


# Count passed as the second argument when building the SentencePiece vocabulary.
num_special_mask_tokens = 100

# Short aliases for the t5 registry classes.
TaskRegistry, TfdsTask = t5.data.TaskRegistry, t5.data.TfdsTask


def get_default_vocabulary():
  """Build a fresh SentencePieceVocabulary from the project's tokenizer model.

  Returns:
    A `SentencePieceVocabulary` constructed from `vocab_model_path` and
    `num_special_mask_tokens`.
  """
  vocabulary = SentencePieceVocabulary(vocab_model_path, num_special_mask_tokens)
  return vocabulary

# Feature spec handed to the task registration: both text fields use the project
# vocabulary and get an EOS token appended; only "inputs" is marked as not required.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(
        vocabulary=get_default_vocabulary(), add_eos=True, required=False),
    targets=Feature(
        vocabulary=get_default_vocabulary(), add_eos=True),
)

gs://github-actions-generation/tokenizer/sp-actions-bpe.model


In [None]:
# Location of the pre-training TSV in the project bucket.
path_pretraining_task1 = 'gs://github-actions-generation/datasets/pretrain.tsv'

# Split name -> TSV path; only a "train" split exists for this task.
nq_tsv_path = dict(train=path_pretraining_task1)


def nq_dataset_task1(split, shuffle_files=True):
  """Load one split of the pre-training TSV as a dataset of {"input", "output"} dicts.

  Args:
    split: key into `nq_tsv_path` selecting which TSV file to read.
    shuffle_files: accepted for interface compatibility and ignored, because
      there is only one file per split.

  Returns:
    A `tf.data` dataset whose elements are dicts with string tensors under the
    keys "input" and "output", one element per line of the TSV.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path[split])

  # Split each line on tabs into two string columns. `record_defaults` holds the
  # *values* substituted for empty fields (their dtype fixes the column dtype),
  # so use empty strings: the previous "string" entries would have injected the
  # literal word "string" into any example with an empty column.
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
  return ds

# Sanity check: pull the first five parsed rows of the train split and show them.
print("A few raw train examples...")
first_rows = nq_dataset_task1("train").take(5)
for ex in tfds.as_numpy(first_rows):
    print(ex)


def preprocessing_task1(ds):
  """Convert raw {'input', 'output'} examples into {'inputs', 'targets'} text pairs.

  Args:
    ds: dataset of dicts with string tensors under 'input' and 'output'.

  Returns:
    The mapped dataset, where 'inputs' is the source text prefixed with
    'denoising: ' and 'targets' is the 'output' text.
  """

  def to_inputs_and_targets(ex):
    """Prefix the source text with the task tag and pass the target through."""
    source_text = tf.strings.join(['denoising: ' + ex['input']], separator=' ')
    target_text = tf.strings.join([ex['output']], separator=' ')
    return dict(inputs=source_text, targets=target_text)

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)
    
# Drop any earlier registration of this task name before adding it again, then
# register the pre-training task.
# The `add(...)` call stays the last expression so the cell displays the task.
t5.data.TaskRegistry.remove('masking_task_pretraining')
t5.data.TaskRegistry.add(
    "masking_task_pretraining",
    splits=["train"],
    dataset_fn=nq_dataset_task1,
    text_preprocessor=preprocessing_task1,
    output_features=DEFAULT_OUTPUT_FEATURES,
)

A few raw train examples...
{'input': b"{'version': 1, 'root<extra_id_0> {'level': '<extra_id_1> ', 'handlers': ['file', '<extra_id_2> ']}, 'loggers<extra_id_3> {'apscheduler': {'level<extra_id_4> 'ERROR', 'handlers': ['file', '<extra_id_5> ']}}, 'handlers': {'<extra_id_6> ': {'class': '<extra_id_7> .StreamHandler', 'level': 'INFO',<extra_id_8> atter': 'console_form<extra_id_9> '}, 'file<extra_id_10> {'class': 'logging.<extra_id_11> .TimedR<extra_id_12> atingFileHandler', '<extra_id_13> ': 'file_formatters', 'filename<extra_id_14> '${workdir}/logs/robot.log', 'level':<extra_id_15> DEBUG', 'when': 'midnight', 'interval': 1}},<extra_id_16> atters': {'<extra_id_17> _formatters':<extra_id_18> format': '%(asctime)s - %(threadName)s -<extra_id_19> (levelname)s: %(message)<extra_id_20> ', 'datefmt': '%Y/%<extra_id_21> d<extra_id_22> H:%<extra_id_23> :%S'}, 'file_formatters': {'format': '%(asctime<extra_id_24> s - %(threadName<extra_id_25> s<extra_id_26> %(levelname)s: %(message)s<extra_id_27>

<t5.data.dataset_providers.FunctionTask at 0x7fd80525f210>

In [None]:
def _rate_num_input_examples(task):
  """Return the task's example count as a float, preferring the train split.

  Args:
    task: object exposing `splits`, `num_input_examples(split)` and `name`.

  Returns:
    float(number of examples) of "train" if present, otherwise of "validation".

  Raises:
    ValueError: if the task has neither a "train" nor a "validation" split.
  """
  for split_name in ("train", "validation"):
    if split_name in task.splits:
      return float(task.num_input_examples(split_name))
  raise ValueError("Task %s does not have a train or validation split." % (task.name))

FLAGS = tf.app.flags.FLAGS
# Register a string flag named 'f' (help text 'kernel'); presumably this absorbs the
# `-f <connection file>` argument the notebook kernel is launched with. TODO confirm.
# Guard the definition so re-running this cell does not raise absl's
# DuplicateFlagError for a flag that is already registered.
if 'f' not in FLAGS:
  tf.app.flags.DEFINE_string('f', '', 'kernel')

<absl.flags._flagvalues.FlagHolder at 0x7fd8052571d0>

In [None]:
from mesh_tensorflow.transformer.learning_rate_schedules import learning_rate_schedule_noam
from t5 import models

# Which preset row to use below.
MODEL_SIZE = "small"

# Bucket folder where checkpoints for this run are written.
MODEL_DIR = 'gs://github-actions-generation/models/pre-trained-models/yaml'

# Per-size presets, in the order (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = dict([
    ("small", (1, 128, 50)),
    ("base", (2, 16, 8)),
    ("large", (8, 64, 4)),
    ("3B", (8, 16, 1)),
    ("11B", (8, 16, 1)),
])[MODEL_SIZE]

# Make sure the output folder exists before the model is built.
tf.io.gfile.makedirs(MODEL_DIR)

# Build the Mesh-TensorFlow T5 wrapper from the settings chosen above.
model = models.mtf_model.MtfModel(
    # Where to run and where to write.
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_dir=MODEL_DIR,
    # Size-dependent settings taken from the MODEL_SIZE preset.
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    keep_checkpoint_max=keep_checkpoint_max,
    # Optimisation and input length settings.
    learning_rate_schedule=learning_rate_schedule_noam,
    sequence_length={"inputs": 512, "targets": 512},
    # Checkpoint every 10000 steps; 100 iterations per loop.
    save_checkpoints_steps=10000,
    iterations_per_loop=100,
)

In [None]:
import gin

# Gin config file applied before training, and the step count handed to model.train.
PATH_GIN_FILE = '/content/operative_config.gin'
TRAIN_STEPS = 300000

# Parse the gin file with the config unlocked, then launch training of the
# registered "masking_task_pretraining" task while still inside that context.
with gin.unlock_config():
    gin.parse_config_file(PATH_GIN_FILE)
    model.train("masking_task_pretraining", TRAIN_STEPS)


From /usr/local/lib/python3.7/dist-packages/tensorflow/python/training/training_util.py:397: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
SimdMeshImpl ignoring devices ['', '', '', '', '', '', '', '']
Using default tf glorot_uniform_initializer for variable encoder/block_000/layer_000/SelfAttention/relative_attention_bias  The initialzer will guess the input and output dimensions  based on dimension order.
Using default tf glorot_uniform_initializer for variable decoder/block_000/layer_000/SelfAttention/relative_attention_bias  The initialzer will guess the input and output dimensions  based on dimension order.
From /usr/local/lib/python3.7/dist-packages/tensorflow/python/training/saver.py:1175: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_ma