In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [None]:
class TennisDataset(Dataset):
    """Torch dataset over a DataFrame of ``input_text`` / ``target_text`` pairs.

    Every row is tokenized once in the constructor, padded/truncated to
    ``max_length``. ``__getitem__`` converts one row's encodings to tensors and
    masks the padding positions of the target so they are ignored by the loss.

    Args:
        df: DataFrame with ``input_text`` and ``target_text`` columns.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask``. Its ``pad_token_id`` attribute (when present)
            identifies the padding positions in the labels.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    # Label value ignored by PyTorch's cross-entropy loss (default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_length=256):
        self.data = df
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Source and target share the same padding/truncation settings.
        self.input_encodings = tokenizer(
            df["input_text"].tolist(),
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

        self.target_encodings = tokenizer(
            df["target_text"].tolist(),
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        labels = torch.tensor(self.target_encodings["input_ids"][idx])

        # Targets are padded to max_length; without masking, the model would be
        # trained to predict pad tokens and the loss would be dominated by them.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        return {
            "input_ids": torch.tensor(self.input_encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.input_encodings["attention_mask"][idx]),
            "labels": labels
        }

In [None]:
# Read the tab-separated train and test splits into DataFrames.
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("data/train.tsv", "data/test.tsv")
)

# Quick sanity check on how many rows each split contains.
print("Train samples:", train_df.shape[0])
print("Test samples:", test_df.shape[0])

In [None]:
# Tokenizer and model weights are loaded from the same pretrained checkpoint.
MODEL_CHECKPOINT = "t5-small"

tokenizer = T5TokenizerFast.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Hyper-parameters for the fine-tuning run.
#
# Evaluation during training stays disabled, but the setting is deliberately
# left at the library default ("no") instead of being passed explicitly: the
# keyword was renamed from `evaluation_strategy` to `eval_strategy`, so naming
# either one ties this cell to a particular range of transformers versions.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_t5",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=3e-4,
    weight_decay=0.01,
    save_strategy="no",  # no intermediate checkpoints; the model is saved explicitly after training
    predict_with_generate=True,
    logging_steps=10,
)

In [None]:
# Wrap the loaded DataFrames as tokenized torch datasets. These names were
# previously used without ever being defined, so the cell failed on a fresh
# kernel (Restart & Run All).
train_dataset = TennisDataset(train_df, tokenizer)
test_dataset = TennisDataset(test_df, tokenizer)

# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the trainer built in an earlier cell. The two prints
# simply bracket the call so its start and end are visible in the cell output.
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning complete!")

In [None]:
# Directory that receives both the fine-tuned model and the tokenizer files,
# so the pair can be reloaded together from a single path.
save_dir = "tiny_t5_tennis_report_model_clean"

trainer.save_model(output_dir=save_dir)
tokenizer.save_pretrained(save_directory=save_dir)

print("Model saved to:", save_dir)