In [1]:
import pandas as pd
import pickle
import nltk
import evaluate
import numpy as np

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

In [2]:
# configure options
pd.set_option('max_colwidth', 200)

In [3]:
dataset = load_dataset('csv', data_files='datasets/final_sample_small.csv')
dataset = dataset["train"].train_test_split(test_size=0.2)

In [4]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", max_new_tokens=512)
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [5]:
task_prefix = "answer the question: "

# Define our preprocessing function
def preprocess_function(examples):
    # The "inputs" are the tokenized answer:
    inputs = [task_prefix + doc for doc in examples["Title"]]
    model_inputs = tokenizer(inputs, truncation=True)
    
    # The "labels" are the tokenized outputs:
    labels = tokenizer(text_target=examples['Response'], truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [6]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

In [7]:
model_dir = './t5-flan-py-stackoverflow'

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    push_to_hub=False,
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [8]:
# Train the model
trainer.train()



  0%|          | 0/4500 [00:00<?, ?it/s]

{'loss': 3.0436, 'learning_rate': 0.0002666666666666666, 'epoch': 0.33}
{'loss': 2.8788, 'learning_rate': 0.0002333333333333333, 'epoch': 0.67}
{'loss': 2.8256, 'learning_rate': 0.00019999999999999998, 'epoch': 1.0}


  0%|          | 0/750 [00:00<?, ?it/s]

{'eval_loss': 2.595390558242798, 'eval_rouge1': 0.10496883042212396, 'eval_rouge2': 0.019267080537174096, 'eval_rougeL': 0.0869708647836025, 'eval_rougeLsum': 0.09461379494286501, 'eval_runtime': 934.7972, 'eval_samples_per_second': 3.209, 'eval_steps_per_second': 0.802, 'epoch': 1.0}
{'loss': 2.6621, 'learning_rate': 0.00016666666666666666, 'epoch': 1.33}
{'loss': 2.6405, 'learning_rate': 0.0001333333333333333, 'epoch': 1.67}
{'loss': 2.626, 'learning_rate': 9.999999999999999e-05, 'epoch': 2.0}


  0%|          | 0/750 [00:00<?, ?it/s]

{'eval_loss': 2.517094135284424, 'eval_rouge1': 0.11528531964657868, 'eval_rouge2': 0.021652437832587385, 'eval_rougeL': 0.09296964514257781, 'eval_rougeLsum': 0.10229390838740146, 'eval_runtime': 920.5193, 'eval_samples_per_second': 3.259, 'eval_steps_per_second': 0.815, 'epoch': 2.0}
{'loss': 2.5324, 'learning_rate': 6.666666666666666e-05, 'epoch': 2.33}
{'loss': 2.5157, 'learning_rate': 3.333333333333333e-05, 'epoch': 2.67}
{'loss': 2.5048, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/750 [00:00<?, ?it/s]

{'eval_loss': 2.4948184490203857, 'eval_rouge1': 0.11578223991082395, 'eval_rouge2': 0.02164258992028923, 'eval_rougeL': 0.09304653141685382, 'eval_rougeLsum': 0.10283131075013124, 'eval_runtime': 1336.0293, 'eval_samples_per_second': 2.245, 'eval_steps_per_second': 0.561, 'epoch': 3.0}
{'train_runtime': 34371.7603, 'train_samples_per_second': 1.047, 'train_steps_per_second': 0.131, 'train_loss': 2.6921787109375, 'epoch': 3.0}


TrainOutput(global_step=4500, training_loss=2.6921787109375, metrics={'train_runtime': 34371.7603, 'train_samples_per_second': 1.047, 'train_steps_per_second': 0.131, 'train_loss': 2.6921787109375, 'epoch': 3.0})

In [9]:
trainer.save_model(model_dir)

In [12]:
from huggingface_hub import notebook_login
notebook_login()

trainer.push_to_hub()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

OSError: Looks like you do not have git installed, please install.