In [1]:
import pickle

import evaluate
import nltk
import numpy as np
import pandas as pd

from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AdamW, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

In [2]:
# Display configuration: let text columns show up to 200 characters before
# pandas elides them, so long text stays readable when frames are inspected.
pd.set_option("display.max_colwidth", 200)

In [3]:
# Read the CSV sample; the rows are reached through the "train" split below.
raw_dataset = load_dataset('csv', data_files='datasets/final_sample.csv')

# Hold out 20% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the exact same train/test partition; without
# it every run evaluates on different rows and metrics are not comparable.
# Keeping the raw load under its own name also makes this cell safe to re-run.
dataset = raw_dataset["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
# Single source of truth for the pretrained checkpoint, so the tokenizer and
# the model can never be loaded from mismatching checkpoints.
MODEL_CHECKPOINT = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

# Batches the tokenized examples for the trainer; it is given the model as
# well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

In [5]:
task_prefix = "generate_answer: "


def preprocess_function(examples):
    """Tokenize one batch of examples for sequence-to-sequence training.

    Args:
        examples: A batch (column name -> list of values) that provides the
            "title_body" column (source text) and the "Response" column
            (target text).

    Returns:
        The tokenizer output for the prefixed source texts, with an extra
        "labels" entry holding the token ids of the target texts.
    """
    # Targets: the response text, tokenized as the sequence to be generated.
    target_encoding = tokenizer(text_target=examples['Response'], truncation=True)

    # Sources: every question text gets the task prefix prepended first.
    prefixed_sources = [task_prefix + source for source in examples["title_body"]]
    encoding = tokenizer(prefixed_sources, truncation=True)

    encoding["labels"] = target_encoding["input_ids"]
    return encoding


# batched=True hands preprocess_function lists of rows rather than single rows.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/74552 [00:00<?, ? examples/s]

Map:   0%|          | 0/18639 [00:00<?, ? examples/s]

In [6]:
# Sentence-splitting data needed by nltk.sent_tokenize in compute_metrics.
# Older NLTK releases read the "punkt" resource, while newer releases look up
# "punkt_tab" instead; fetching both keeps the notebook working on either.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource, quiet=True)

# ROUGE scorer used to evaluate the generated text.
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores for a batch of generated predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may also be a tuple,
            in which case its first element is used.

    Returns:
        The result of ``metric.compute`` for the decoded predictions and
        references.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, and cannot be
    # decoded. It can show up in the predictions as well as in the labels,
    # so both are mapped back to the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [7]:
# Directory that receives checkpoints and, later, the saved model.
model_dir = './t5-sm-py-stackoverflow'

# Hyperparameters collected in one mapping so they can be read and tuned at a
# glance before being handed to the arguments object.
training_hyperparameters = {
    # Optimisation
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "num_train_epochs": 2,
    "per_device_train_batch_size": 8,
    # Evaluation: run once per epoch, scoring generated sequences
    "evaluation_strategy": "epoch",
    "per_device_eval_batch_size": 4,
    "predict_with_generate": True,
    # Checkpointing / publishing
    "save_total_limit": 3,
    "push_to_hub": False,
}

training_args = Seq2SeqTrainingArguments(output_dir=model_dir, **training_hyperparameters)

# Wire the model, data and evaluation hooks into a single trainer object.
trainer = Seq2SeqTrainer(
    # What to train and how
    model=model,
    args=training_args,
    # Data: the "test" split produced earlier doubles as the evaluation set
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # Batching and text handling
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Scoring callback invoked during evaluation
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model
# This is the long-running step of the notebook: it runs the training loop on
# the trainer built above. The return value is left as the cell's last
# expression so the notebook displays it.
trainer.train()

In [None]:
# Persist the trained model to model_dir, the same directory used as the
# training output_dir.
trainer.save_model(model_dir)

In [None]:
# Authenticate against the Hugging Face Hub, then upload the trained model.
# notebook_login is imported in the notebook's top import cell.
# NOTE(review): notebook_login() presents an interactive prompt; confirm the
# login has completed before the push below executes, or run the push in a
# separate cell once the token has been entered.
notebook_login()

trainer.push_to_hub()