# Fine Tuning Flan T5

References:

1. https://www.datacamp.com/tutorial/flan-t5-tutorial
2. https://www.youtube.com/watch?v=r6XY80Z9eSA (ignore the Lightning parts; note that this video also uses the regular `Trainer` rather than `Seq2SeqTrainer`)
3. https://discuss.huggingface.co/t/trainer-vs-seq2seqtrainer/3145 (explains why this notebook uses `Seq2SeqTrainer` instead of the regular `Trainer`)

In [12]:
# Location of the JSON question/answer dataset that is loaded in the next cell.
DATASET_FILE_PATH = 'temp/zillow_qa_dataset.json'

In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import nltk
import evaluate
import numpy as np
from datasets import load_dataset

# Fixed seed so the train/test split (and therefore every metric reported
# further down) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Everything loaded from the JSON file is accessed under the 'train' key.
# Kept under its own name so re-running this cell never splits an
# already-split object.
raw_dataset = load_dataset('json', data_files=DATASET_FILE_PATH)

# Hold out 20% of the rows for evaluation.
dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)

dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question'],
        num_rows: 200
    })
    test: Dataset({
        features: ['answer', 'question'],
        num_rows: 50
    })
})

In [14]:
# One checkpoint name shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_CHECKPOINT = 'google/flan-t5-base'

tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

# The collator receives both the tokenizer and the model; it is handed to the
# trainer as `data_collator` later in the notebook.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

def preprocess_function(examples, max_input_length=512, max_target_length=512):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with 'question' and 'answer' columns, as
            supplied by `dataset.map(..., batched=True)`.
        max_input_length: Questions longer than this many tokens are
            truncated.
        max_target_length: Answers longer than this many tokens are
            truncated.

    Returns:
        The tokenizer output for the questions, with an added 'labels' entry
        holding the token ids of the answers.
    """
    model_inputs = tokenizer(
        examples['question'],
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target` tokenizes the answers as targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['answer'],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the tokenization to every split; batched=True hands preprocess_function
# a batch of rows per call instead of a single row.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Display the resulting splits and their columns.
tokenized_dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Map: 100%|██████████| 200/200 [00:00<00:00, 6772.27 examples/s]

Map: 100%|██████████| 50/50 [00:00<00:00, 3222.47 examples/s]


DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['answer', 'question', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})

In [15]:
# Sentence-tokenizer data needed by nltk.sent_tokenize in compute_metrics.
# Older NLTK releases read the "punkt" resource while newer ones look for
# "punkt_tab", so request both to keep the notebook runnable on either.
# NOTE(review): on an NLTK version that does not know one of these names the
# download is expected to report failure without raising - confirm.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)

# ROUGE scorer used by compute_metrics.
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores for generated predictions against the references.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays handed over by
            the trainer during evaluation.

    Returns:
        The dictionary returned by the ROUGE metric's `compute`.
    """
    preds, labels = eval_preds

    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, and makes
    # batch_decode fail. It can appear in the predictions as well as in the
    # labels, so map it to the pad token in both before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )


In [16]:
# Training hyperparameters; each one is passed to Seq2SeqTrainingArguments
# in the next cell under the argument named in its trailing comment.
L_RATE = 3e-4  # learning_rate
BATCH_SIZE = 8  # per_device_train_batch_size
PER_DEVICE_EVAL_BATCH = 4  # per_device_eval_batch_size
WEIGHT_DECAY = 0.01  # weight_decay
SAVE_TOTAL_LIM = 3  # save_total_limit
NUM_EPOCHS = 3  # num_train_epochs

In [17]:
# Configuration of the fine-tuning run; the numeric values come from the
# hyperparameter constants defined in the previous cell.
# NOTE(review): no generation length limit is set here, so evaluation-time
# generation falls back to the library/model default - confirm that default is
# long enough for the answers in this dataset.
training_args = Seq2SeqTrainingArguments(
    # Where results are written; nothing is pushed to the Hub.
    output_dir="./results",
    push_to_hub=False,
    save_total_limit=SAVE_TOTAL_LIM,
    # Optimisation settings.
    learning_rate=L_RATE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluate once per epoch, using generation for the predictions.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    predict_with_generate=True,
)


# Wire the model, tokenized splits, collator and metric function into the
# seq2seq trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning. As the last expression of the cell, the returned training
# summary is displayed as the cell output.
trainer.train()



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                               

[A[A                                         
[A                                            


 33%|███▎      | 25/75 [02:16<01:21,  1.62s/it]
[A

[A[A

[A[A

{'eval_loss': 0.5464061498641968, 'eval_rouge1': 0.7978820470717023, 'eval_rouge2': 0.7318542568542568, 'eval_rougeL': 0.7924674329501917, 'eval_rougeLsum': 0.7920908593322387, 'eval_runtime': 13.7166, 'eval_samples_per_second': 3.645, 'eval_steps_per_second': 0.948, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                               

[A[A                                         
[A                                            


 33%|███▎      | 25/75 [03:08<01:21,  1.62s/it]
[A

[A[A

[A[A

{'eval_loss': 0.43127501010894775, 'eval_rouge1': 0.8043130815544607, 'eval_rouge2': 0.7341915954415956, 'eval_rougeL': 0.7948535851122056, 'eval_rougeLsum': 0.7945443349753695, 'eval_runtime': 13.9954, 'eval_samples_per_second': 3.573, 'eval_steps_per_second': 0.929, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                               

[A[A                                         
[A                                            


 33%|███▎      | 25/75 [04:02<01:21,  1.62s/it]
[A

[A[A

                                               

[A[A                                         
 33%|███▎      | 25/75 [04:02<01:21,  1.62s/it]
100%|██████████| 75/75 [02:38<00:00,  2.11s/it]

{'eval_loss': 0.39839261770248413, 'eval_rouge1': 0.7878743842364531, 'eval_rouge2': 0.7140102027602027, 'eval_rougeL': 0.7798929939792008, 'eval_rougeLsum': 0.7794474548440067, 'eval_runtime': 13.851, 'eval_samples_per_second': 3.61, 'eval_steps_per_second': 0.939, 'epoch': 3.0}
{'train_runtime': 158.5445, 'train_samples_per_second': 3.784, 'train_steps_per_second': 0.473, 'train_loss': 0.6017985026041667, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=0.6017985026041667, metrics={'train_runtime': 158.5445, 'train_samples_per_second': 3.784, 'train_steps_per_second': 0.473, 'total_flos': 12004652335104.0, 'train_loss': 0.6017985026041667, 'epoch': 3.0})

In [18]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
MODEL_OUTPUT_DIR = '../models/t5'

model.save_pretrained(MODEL_OUTPUT_DIR)
# Last expression of the cell: the value returned by the tokenizer save is displayed.
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)


('../models/t5\\tokenizer_config.json',
 '../models/t5\\special_tokens_map.json',
 '../models/t5\\spiece.model',
 '../models/t5\\added_tokens.json')