In [10]:
import tensorflow as tf
import numpy as np
import pandas as pd
import transformers
import datasets
from datasets import load_dataset
import tensorflow_datasets as tensorflow_datasets
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
import pickle as pkl
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
# Filesystem locations used by the notebook, all relative to the working directory.
# data_dir is the Hugging Face datasets cache; tokenizer_path is a local tokenizer copy.
data_dir, log_dir, save_path = './data', './logs', './models'
tokenizer_path = 'cache/t5-base'

In [3]:
# Load the tokenizer from the local path configured in `tokenizer_path`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [4]:
# Load the SQuAD training and validation splits, caching the files under `data_dir`.
train, val = (
    load_dataset('squad', split=split_name, cache_dir=data_dir)
    for split_name in ('train', 'validation')
)

Reusing dataset squad (./data\squad\plain_text\1.0.0\4fffa6cf76083860f85fa83486ec3028e7e32c342c218ff2a620fc6b2868483a)
Reusing dataset squad (./data\squad\plain_text\1.0.0\4fffa6cf76083860f85fa83486ec3028e7e32c342c218ff2a620fc6b2868483a)


In [5]:
# --- Tunable hyper-parameters ---
WARMUP_STEPS = 1e4        # default warm-up length for the learning-rate schedule
BATCH_SIZE = 4
ENCODER_MAXLEN = 250      # default max token length for the encoder input
DECODER_MAX_LEN = 75      # default max token length for the target sequence
BUFFER_SIZE = 1000        # default shuffle buffer size

# --- Sizes derived from the loaded splits ---
LEN_TRAIN, LEN_VAL = len(train), len(val)
# Number of batches per pass over each split (ceiling division, last batch may be partial).
TRAIN_STEPS = -(-LEN_TRAIN // BATCH_SIZE)
VAL_STEPS = -(-LEN_VAL // BATCH_SIZE)

In [8]:
def encode(instance,encoder_maxlen=ENCODER_MAXLEN,decoder_maxlen=DECODER_MAX_LEN):
    """Tokenize one question-answering record into fixed-length model features.

    The source text is built as ``question: <question> context: <context> </s>``
    and the target text as the record's answer strings joined with ``', '``
    followed by ``</s>``. Both are tokenized with the module-level ``tokenizer``,
    truncated and padded to the requested maximum lengths.

    Args:
        instance: Mapping with ``'context'``, ``'question'`` and
            ``'answers'`` (itself a mapping whose ``'text'`` entry is an
            iterable of answer strings).
        encoder_maxlen: Token length the source sequence is truncated/padded to.
        decoder_maxlen: Token length the target sequence is truncated/padded to.

    Returns:
        dict with keys ``'input_ids'``, ``'attention_mask'``, ``'labels'`` and
        ``'decoder_attention_mask'``; each value is the first (only) row of the
        corresponding tokenizer output.
    """
    context=instance['context']
    question=instance['question']
    answers=instance['answers']['text']

    source_text=f'question: {str(question)} context: {str(context)} </s>'
    # The end-of-sequence marker must be spelled '</s>' (forward slash). The
    # previous backslash spelling was tokenized as ordinary text, so targets
    # never ended with a real EOS token.
    target_text=', '.join(answers)
    target_text=f'{target_text} </s>'

    # padding='max_length' is the supported replacement for the deprecated
    # pad_to_max_length=True flag and yields the same fixed-length output.
    encoder_inputs=tokenizer(source_text,truncation=True,return_tensors='tf',max_length=encoder_maxlen,padding='max_length')
    decoder_inputs=tokenizer(target_text,truncation=True,return_tensors='tf',max_length=decoder_maxlen,padding='max_length')

    # return_tensors='tf' produces a leading batch axis of size 1; drop it.
    return {
        'input_ids':encoder_inputs['input_ids'][0],
        'attention_mask':encoder_inputs['attention_mask'][0],
        'labels':decoder_inputs['input_ids'][0],
        'decoder_attention_mask':decoder_inputs['attention_mask'][0],
    }

In [9]:
# Tokenize every example of both splits with `encode` (training split first).
train_ds, val_ds = (split.map(encode) for split in (train, val))

100%|██████████| 87599/87599 [02:35<00:00, 563.60ex/s]
100%|██████████| 10570/10570 [00:19<00:00, 556.05ex/s]


In [12]:
# Persist the tokenized splits so the mapping step above need not be repeated.
train_ds.save_to_disk(
    'datasets/train_ds'
)
val_ds.save_to_disk(
    'datasets/val_ds'
)

In [13]:
def convert_to_tf_dataset(dataset):
    """Wrap a tokenized dataset in a ``tf.data.Dataset`` of int32 feature dicts.

    Note: ``dataset.set_format`` is applied to the argument itself, so the
    caller's dataset object is switched to TensorFlow output for the four
    model columns as a side effect.

    Args:
        dataset: Dataset exposing ``set_format`` and iteration, containing the
            columns ``input_ids``, ``attention_mask``, ``labels`` and
            ``decoder_attention_mask``.

    Returns:
        A ``tf.data.Dataset`` built with ``from_generator`` whose elements are
        dicts of 1-D ``tf.int32`` tensors of unspecified length.
    """
    feature_names=['input_ids','attention_mask','labels','decoder_attention_mask']
    dataset.set_format(type='tensorflow',columns=feature_names)

    # Every feature shares the same dtype and a rank-1, variable-length shape.
    output_types={name:tf.int32 for name in feature_names}
    output_shapes={name:tf.TensorShape([None]) for name in feature_names}

    return tf.data.Dataset.from_generator(lambda:dataset,output_types,output_shapes)

In [14]:
# Turn both tokenized splits into tf.data pipelines (training split first).
tf_train_ds, tf_val_ds = map(convert_to_tf_dataset, (train_ds, val_ds))

In [15]:
# Write the tf.data versions of both splits to disk.
tf.data.experimental.save(
    tf_train_ds,
    'datasets/tf_train_ds',
)
tf.data.experimental.save(
    tf_val_ds,
    'datasets/tf_val_ds',
)

In [16]:
def ready_ds(ds,batch_size=BATCH_SIZE,buffer_size=BUFFER_SIZE):
    """Shuffle, batch and prefetch a dataset for feeding to a model.

    Args:
        ds: Dataset exposing ``shuffle``, ``batch`` and ``prefetch``.
        batch_size: Number of elements per batch.
        buffer_size: Size of the shuffle buffer.

    Returns:
        The dataset after shuffle -> batch -> prefetch(AUTOTUNE), in that order.
    """
    shuffled=ds.shuffle(buffer_size)
    batched=shuffled.batch(batch_size)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

In [17]:
# Apply the default shuffle/batch/prefetch pipeline to both splits.
final_train_ds, final_val_ds = map(ready_ds, (tf_train_ds, tf_val_ds))

In [18]:
class CustomLrSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Inverse-square-root learning-rate schedule with a flat warm-up plateau.

    The learning rate is ``1 / sqrt(max(step, warmup_steps))``: it stays at
    ``1 / sqrt(warmup_steps)`` until ``step`` exceeds ``warmup_steps`` and then
    decays as ``1 / sqrt(step)``.

    Args:
        warmup_steps: Number of steps during which the rate is held constant.
    """
    def __init__(self,warmup_steps=WARMUP_STEPS):
        super().__init__()
        # Keep the un-cast constructor argument so get_config() can return a
        # plain Python number instead of a tensor.
        self._warmup_steps_arg=warmup_steps
        self.warmup_steps=tf.cast(warmup_steps,tf.float32)

    def __call__(self,step):
        """Return the learning rate (float32 tensor) for optimizer step ``step``."""
        step=tf.cast(step,tf.float32)
        # Both operands are already float32, so the maximum needs no further cast.
        return tf.math.rsqrt(tf.maximum(self.warmup_steps,step))

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized.

        Without this override the base class raises NotImplementedError when an
        optimizer that uses the schedule is serialized.
        """
        return {'warmup_steps':float(self._warmup_steps_arg)}
