In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import json

# Import starting model

In [3]:
# Load the tokenizer for the pre-trained checkpoint
checkpoint = "deep-learning-analytics/triviaqa-t5-base"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# Pick the first CUDA device when available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the model weights from the same checkpoint and move them to that device
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Sanity check: run the example question from the Hugging Face model card
text = "What is the capitol of the US"

# Trim surrounding whitespace and drop embedded newlines before encoding
preprocess_text = text.strip().replace("\n","")
tokenized_text = tokenizer.encode(preprocess_text, return_tensors="pt").to(device)

# Generation settings: short answer, 2-beam search that stops early
generation_kwargs = dict(max_length=10, num_beams=2, early_stopping=True)
outs = model.generate(tokenized_text, **generation_kwargs)

# Decode every generated sequence (special tokens are left in the text)
dec = list(map(tokenizer.decode, outs))
print("Predicted Answer: ", dec)

Predicted Answer:  ['<pad> Washington</s>']


# Import new data

In [6]:
# Load the data from each of the 3 categories
DATA_DIR = 'Open-trivia-database-master/en/todo'


def load_trivia_category(file_name, data_dir=DATA_DIR):
    """Read one category file from ``data_dir`` and return its parsed JSON.

    The file is opened explicitly as UTF-8 (the encoding JSON interchange is
    specified to use) so that reading it does not depend on the platform's
    default text encoding.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(f'{data_dir}/{file_name}', 'r', encoding='utf-8') as file:
        return json.load(file)


entertainment = load_trivia_category('entertainment.json')
science = load_trivia_category('science_and_nature.json')
food = load_trivia_category('food_and_drink.json')

# Find number of questions in each category
print("Number of questions in each category:")
print("Entertainment: ", len(entertainment))
print("Science: ", len(science))
print("Food: ", len(food))

Number of questions in each category:
Entertainment:  802
Science:  2340
Food:  980


In [8]:
# Reduce each record to a chosen subset of its fields
def filter_columns(data, columns_to_keep):
    """Project every record in ``data`` onto the keys in ``columns_to_keep``.

    Args:
        data: iterable of dict-like records.
        columns_to_keep: iterable of keys to copy, in the order they should
            appear in each output dict.

    Returns:
        A new list with one new dict per input record.

    Raises:
        KeyError: if a record lacks one of the requested keys.
    """
    filtered = []
    for item in data:
        trimmed = {}
        for column in columns_to_keep:
            trimmed[column] = item[column]
        filtered.append(trimmed)
    return filtered

# Fields retained for every question record
columns_to_keep = ['question', 'answers', 'category_id']

# Pair each category name with its raw records, then trim them all the same way
raw_categories = (
    ('entertainment', entertainment),
    ('science', science),
    ('food', food),
)
trivia_data = {
    name: filter_columns(records, columns_to_keep)
    for name, records in raw_categories
}

# Show the first entertainment record as a spot check
trivia_data['entertainment'][0]

{'question': '_____ in the name of love?',
 'answers': ['Stop'],
 'category_id': 'ENTERTAINMENT'}

In [9]:
# Keep only the "first" answer in answers
def keep_first_answer(data):
    """Collapse each record's 'answers' sequence to its first element, in place.

    Calling the function again on already-collapsed records is a no-op: a value
    that is not a list/tuple is left untouched instead of being indexed (which
    would silently cut a string answer down to its first character).

    Args:
        data: list of dicts, each with an 'answers' key.

    Returns:
        The same ``data`` object, after the in-place update.

    Raises:
        KeyError: if a record has no 'answers' key.
        IndexError: if a record's 'answers' sequence is empty.
    """
    for item in data:
        answers = item['answers']
        # Only real sequences are indexed; anything else is already a single answer
        if isinstance(answers, (list, tuple)):
            item['answers'] = answers[0]
    return data

# Collapse the answers of every category down to a single answer per record
trivia_data = {
    category: keep_first_answer(records)
    for category, records in trivia_data.items()
}

# Each record now holds one answer, so move the value from "answers" to "answer"
for records in trivia_data.values():
    for record in records:
        record['answer'] = record.pop('answers')

# Show the first entertainment record as a spot check
trivia_data['entertainment'][0]

{'question': '_____ in the name of love?',
 'category_id': 'ENTERTAINMENT',
 'answer': 'Stop'}

# Train on new data

In [22]:
import json
from datasets import Dataset

# Combine all categories into one flat list of question/answer pairs
combined_data = [
    {"question": q["question"], "answer": q["answer"]}
    for questions in trivia_data.values()
    for q in questions
    if 'question' in q and 'answer' in q  # Ensure required fields exist
]

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict({"question": [d["question"] for d in combined_data],
                             "answer": [d["answer"] for d in combined_data]})

# Fixed seed for the shuffled splits: without it every run puts different
# examples in train/validation/test, so results cannot be compared across runs
# and the held-out test set is not stable.
SPLIT_SEED = 42

# Split into train (80%) and temp (validation + test, 20%) datasets
temp_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Split the temp dataset in half into validation and test datasets
validation_test_split = temp_split["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

# Combine the splits into a single dictionary
split_dataset = {
    "train": temp_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

In [23]:
# Report how many examples ended up in each split
print("Number of samples in each split:")
for label, split_name in (("Train: ", "train"), ("Validation: ", "validation"), ("Test: ", "test")):
    print(label, len(split_dataset[split_name]))

Number of samples in each split:
Train:  3297
Validation:  412
Test:  413


In [24]:
# Preprocess the data
def preprocess_function(examples, max_input_length=25, max_target_length=10):
    """Tokenize question/answer pairs into model inputs and loss-masked labels.

    Questions and answers are truncated/padded to fixed lengths with the
    notebook-level ``tokenizer``. Padding positions in the labels are replaced
    by -100 so they are excluded from the training loss; left as pad token ids
    they would be scored like real target tokens.

    Args:
        examples: mapping with "question" and "answer" entries, either a batch
            (lists of strings, as passed by ``Dataset.map(..., batched=True)``)
            or a single example (plain strings).
        max_input_length: fixed token length for the encoded questions.
        max_target_length: fixed token length for the encoded answers.

    Returns:
        The tokenizer output for the questions with an added "labels" entry.
    """
    inputs = examples["question"]
    targets = examples["answer"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    # -100 is the default ignore_index of PyTorch's cross-entropy loss
    ignore_index = -100
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # Swap pad ids for the ignore index, leave real tokens untouched
        return [ignore_index if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels["input_ids"]
    if isinstance(targets, str):
        # Single example: input_ids is one flat list of token ids
        model_inputs["labels"] = mask_padding(label_ids)
    else:
        # Batch: input_ids is a list with one token-id list per example
        model_inputs["labels"] = [mask_padding(token_ids) for token_ids in label_ids]
    return model_inputs

# Run the preprocessing over each split in batches (train, validation, test order)
tokenized_train, tokenized_val, tokenized_test = (
    split_dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "validation", "test")
)

Map: 100%|██████████| 3297/3297 [00:00<00:00, 5127.61 examples/s]
Map: 100%|██████████| 412/412 [00:00<00:00, 4732.32 examples/s]
Map: 100%|██████████| 413/413 [00:00<00:00, 955.18 examples/s]


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters, collected in one place before building the arguments object
training_config = dict(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
)
training_args = TrainingArguments(**training_config)

# Wire the model, the arguments and the tokenized train/validation splits together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Start fine-tuning
trainer.train()


  0%|          | 0/550 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  1%|          | 4/550 [00:51<1:58:16, 13.00s/it]

KeyboardInterrupt: 