In [47]:
import datetime
import tensorflow as tf
import numpy as np
import pandas as pd
import transformers
import datasets
from datasets import load_dataset
import tensorflow_datasets as tensorflow_datasets
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
import pickle as pkl
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
# Filesystem locations used by the rest of the notebook (relative to the working directory).
data_dir = './data'                # cache directory passed to `load_dataset`
log_dir = './logs'                 # TensorBoard log directory
save_path = './models'             # where model checkpoints go
tokenizer_path = 'cache/t5-base'   # location the tokenizer is loaded from

In [3]:
# Tokenizer shared by `encode`; loaded from the path configured in `tokenizer_path`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [4]:
# Load the SQuAD train and validation splits, caching them under `data_dir`.
train = load_dataset('squad', split='train', cache_dir=data_dir)
val = load_dataset('squad', split='validation', cache_dir=data_dir)

Reusing dataset squad (./data\squad\plain_text\1.0.0\4fffa6cf76083860f85fa83486ec3028e7e32c342c218ff2a620fc6b2868483a)
Reusing dataset squad (./data\squad\plain_text\1.0.0\4fffa6cf76083860f85fa83486ec3028e7e32c342c218ff2a620fc6b2868483a)


In [5]:
# Training hyperparameters.
WARMUP_STEPS = 1e4      # default `warmup_steps` of the learning-rate schedule
BATCH_SIZE = 4
ENCODER_MAXLEN = 250    # token length the encoder input is truncated/padded to
DECODER_MAX_LEN = 75    # token length the target is truncated/padded to
BUFFER_SIZE = 1000      # shuffle buffer size

# Dataset sizes and the resulting number of batches per pass
# (rounded up so a trailing partial batch is counted).
LEN_TRAIN = len(train)
LEN_VAL = len(val)
TRAIN_STEPS = int(np.ceil(LEN_TRAIN / BATCH_SIZE))
VAL_STEPS = int(np.ceil(LEN_VAL / BATCH_SIZE))

In [8]:
def encode(instance, encoder_maxlen=ENCODER_MAXLEN, decoder_maxlen=DECODER_MAX_LEN):
    """Tokenize one SQuAD example into fixed-length encoder/decoder features.

    Args:
        instance: mapping with 'context', 'question' and 'answers' entries;
            instance['answers']['text'] is an iterable of answer strings.
        encoder_maxlen: length the source text is truncated/padded to.
        decoder_maxlen: length the target text is truncated/padded to.

    Returns:
        dict with keys 'input_ids', 'attention_mask', 'labels' and
        'decoder_attention_mask', each holding the first (only) row of the
        corresponding tokenizer output.
    """
    context = instance['context']
    question = instance['question']
    answers = instance['answers']['text']

    source_text = f'question: {question} context: {context} </s>'
    # All reference answers are joined into a single comma-separated target.
    target_text = ', '.join(answers)
    # Fix: the suffix used to be '<\s>' (backslash), which is not the
    # end-of-sequence marker and would be tokenized as ordinary text in every
    # target. Use the same '</s>' marker as the source text.
    target_text = f'{target_text} </s>'

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoder_inputs = tokenizer(
        source_text,
        truncation=True,
        return_tensors='tf',
        max_length=encoder_maxlen,
        padding='max_length',
    )
    decoder_inputs = tokenizer(
        target_text,
        truncation=True,
        return_tensors='tf',
        max_length=decoder_maxlen,
        padding='max_length',
    )

    # The tokenizer returns batched tensors; take row 0 of each.
    return {
        'input_ids': encoder_inputs['input_ids'][0],
        'attention_mask': encoder_inputs['attention_mask'][0],
        'labels': decoder_inputs['input_ids'][0],
        'decoder_attention_mask': decoder_inputs['attention_mask'][0],
    }

In [9]:
# Run `encode` over every example of both splits.
train_ds = train.map(encode)
val_ds = val.map(encode)

100%|██████████| 87599/87599 [02:35<00:00, 563.60ex/s]
100%|██████████| 10570/10570 [00:19<00:00, 556.05ex/s]


In [12]:
# Persist the encoded splits; they are read back from these paths with `load_from_disk`.
train_ds.save_to_disk('datasets/train_ds')
val_ds.save_to_disk('datasets/val_ds')

In [6]:
# Reload the encoded splits that were written with `save_to_disk`.
train_ds = datasets.load_from_disk('datasets/train_ds')
val_ds = datasets.load_from_disk('datasets/val_ds')

In [7]:
def convert_to_tf_dataset(dataset):
    """Expose an encoded dataset as a `tf.data.Dataset` of int32 feature dicts.

    `set_format` is called on the argument itself and its return value is not
    used, so the dataset that was passed in is reconfigured as a side effect.
    Each feature is declared as a 1-D int32 tensor of unspecified length.
    """
    feature_names = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']
    dataset.set_format(type='tensorflow', columns=feature_names)

    # Same dtype and shape spec for every feature column.
    output_types = {name: tf.int32 for name in feature_names}
    output_shapes = {name: tf.TensorShape([None]) for name in feature_names}

    return tf.data.Dataset.from_generator(lambda: dataset, output_types, output_shapes)

In [8]:
# Wrap both encoded splits as tf.data datasets.
tf_train_ds = convert_to_tf_dataset(train_ds)
tf_val_ds = convert_to_tf_dataset(val_ds)

In [15]:
# Write the tf.data versions of both splits to disk.
tf.data.experimental.save(tf_train_ds, 'datasets/tf_train_ds')
tf.data.experimental.save(tf_val_ds, 'datasets/tf_val_ds')

In [9]:
def ready_ds(ds, batch_size=BATCH_SIZE, buffer_size=BUFFER_SIZE):
    """Return `ds` shuffled, batched and prefetched, in that order.

    Args:
        ds: dataset to prepare.
        batch_size: number of examples per batch.
        buffer_size: size of the shuffle buffer.
    """
    shuffled = ds.shuffle(buffer_size)
    batched = shuffled.batch(batch_size)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

In [10]:
# Shuffled, batched, prefetched datasets that are handed to `model.fit`.
final_train_ds = ready_ds(tf_train_ds)
final_val_ds = ready_ds(tf_val_ds)

In [43]:
class T5Model(TFT5ForConditionalGeneration):
    """T5 conditional-generation model with custom Keras train/test steps.

    Both steps feed the whole feature dict (including 'labels') to the model,
    take element 0 of the output as the loss and element 1 as the logits, and
    report the running mean of the loss under the metric name 'loss' together
    with the compiled metrics.
    """

    def __init__(self,*args,log_dir=None,cache_dir=None,**kwargs):
        # `log_dir` and `cache_dir` are accepted here but not forwarded to the
        # parent constructor.
        super().__init__(*args,**kwargs)
        self.loss_tracker=tf.keras.metrics.Mean(name='loss')

    def _loss_and_logits(self,x,training):
        """Run a forward pass and return (mean loss, logits)."""
        outputs=self(x,training=training)
        return tf.reduce_mean(outputs[0]),outputs[1]

    def train_step(self,data):
        """Run one optimization step on a batch.

        Args:
            data: feature dict for the batch; must contain 'labels'.

        Returns:
            dict mapping metric names to their current values, plus 'lr'
            (the optimizer's current learning rate).
        """
        x=data
        # Flatten the labels to one id per row for the compiled metrics.
        y=tf.reshape(x['labels'],[-1,1])
        with tf.GradientTape() as tape:
            loss,logits=self._loss_and_logits(x,training=True)
        # Taken outside the tape context so the backward pass is not recorded.
        grads=tape.gradient(loss,self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads,self.trainable_variables))
        lr=self.optimizer._decayed_lr(tf.float32)
        # Fix: `Mean.update_state(values, sample_weight)` was being called as
        # `update_state(y, loss)`, which averaged the label ids (weighted by
        # the loss) instead of tracking the loss itself.
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y,logits)
        metrics={m.name:m.result() for m in self.metrics}
        metrics.update({'lr':lr})
        return metrics

    def test_step(self,data):
        """Evaluate one batch and return the current metric values.

        Args:
            data: feature dict for the batch; must contain 'labels'.
        """
        x=data
        y=tf.reshape(x['labels'],[-1,1])
        loss,logits=self._loss_and_logits(x,training=False)
        # Track the loss value itself (see the note in `train_step`).
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y,logits)
        metrics={m.name:m.result() for m in self.metrics}
        return metrics

In [12]:
class CustomLrSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Inverse-square-root learning-rate schedule.

    lr(step) = 1 / sqrt(max(step, warmup_steps)): the rate stays at
    1/sqrt(warmup_steps) until `step` exceeds `warmup_steps` and decays as
    1/sqrt(step) afterwards.
    """

    def __init__(self,warmup_steps=WARMUP_STEPS):
        super().__init__()
        # Keep the value as given so `get_config` can report it.
        self._initial_warmup_steps=warmup_steps
        self.warmup_steps=tf.cast(warmup_steps,tf.float32)

    def __call__(self,step):
        step=tf.cast(step,tf.float32)
        # Both operands are float32, so no further cast is needed.
        return tf.math.rsqrt(tf.maximum(self.warmup_steps,step))

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {'warmup_steps':float(self._initial_warmup_steps)}


In [34]:
# Profile a 100-batch window that starts 10 batches after the first TRAIN_STEPS batches.
start_profile_batch = TRAIN_STEPS + 10
stop_profile_batch = start_profile_batch + 100
profile_range = f'{start_profile_batch},{stop_profile_batch}'

log_path = log_dir
# Fix: this is now the callback object itself rather than a one-element list,
# so `callbacks` below is a flat list of callbacks.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_path,
    histogram_freq=1,
    update_freq=20,
    profile_batch=profile_range,
)

# Fix: `checkpoint_path` was computed but the callback was given the bare
# `save_path` directory. The checkpoint now goes to the intended file.
# `save_weights_only=True` is needed because a subclassed model cannot be
# written as a full model in HDF5 format; restore with `model.load_weights`.
tf.io.gfile.makedirs(save_path)  # no-op when the directory already exists
checkpoint_path = save_path + '/' + 'T5.h5'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)

callbacks = [tensorboard_callback, checkpoint_callback]
# NOTE: SparseTopKCategoricalAccuracy is a top-k metric (k is left at its
# default), although it is reported under the name 'accuracy'.
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy')]

In [35]:
# Adam optimizer driven by the custom schedule (default warm-up length).
learning_rate = CustomLrSchedule()
optimizer = tf.keras.optimizers.Adam(learning_rate)

In [44]:
# Instantiate the custom model class from the 't5-base' checkpoint.
model = T5Model.from_pretrained('t5-base', cache_dir='model_cache/t5-base')

All model checkpoint layers were used when initializing T5Model.

All the layers of T5Model were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5Model for predictions without further training.


In [45]:
# No `loss` argument is passed: the custom train/test steps take the loss from the model output.
model.compile(optimizer=optimizer, metrics=metrics)

In [20]:
%tensorboard --logdir ./logs

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 14956.

In [None]:
# Fix: the training dataset is generator-backed (unknown cardinality) and
# `steps_per_epoch` is set, so Keras keeps drawing from one iterator across
# epochs. Without `.repeat()` the data is exhausted after the first epoch and
# training stops early. Shuffling happens before the repeat, so every pass
# still covers the full dataset.
model.fit(
    final_train_ds.repeat(),
    epochs=5,
    steps_per_epoch=TRAIN_STEPS,
    callbacks=callbacks,
    validation_data=final_val_ds,
    validation_steps=VAL_STEPS,
    initial_epoch=0,
)