In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install -U "transformers>=4.40.0" "datasets>=2.18.0" "accelerate>=0.28.0"
!pip install datasets
!pip install sentencepiece
!pip install accelerate
!pip install evaluate

In [None]:
# Pretrained checkpoint that the tokenizer and the model are loaded from.
MODEL = "google/mt5-large"
# Output directory for training checkpoints; the same directory is scanned at evaluation time.
# NOTE(review): the name says "base" while MODEL is the "large" checkpoint — confirm the intended name.
REPO = "mt5-base-urdu"
# NOTE(review): not referenced by any cell shown in this notebook; the training arguments
# hard-code num_train_epochs=2 — confirm which value is intended.
EPOCHS = 6

# 1. Load Dataset

In [None]:
def filter_function(example):
    """Predicate that keeps only answerable examples.

    Args:
        example: Mapping with an ``is_impossible`` entry.

    Returns:
        ``True`` when ``is_impossible`` is falsy, ``False`` otherwise.
    """
    flagged_impossible = example['is_impossible']
    if flagged_impossible:
        return False
    return True

In [None]:
import json
import os

from datasets import Dataset, DatasetDict

# Location of the SQuAD-format JSON files. Set the SQUAD_DATA_DIR environment variable
# to point at your own copy; the fallback below is the original machine-specific
# directory and is unlikely to exist on another system.
SQUAD_DATA_DIR = os.environ.get(
    "SQUAD_DATA_DIR",
    "/Users/basusmac/Desktop/Github Repositories/NLP Project/SQuAD",
)
train_json_path = os.path.join(SQUAD_DATA_DIR, "train.json")
dev_json_path = os.path.join(SQUAD_DATA_DIR, "dev.json")

def flatten_squad(data_dict):
    """Flatten a SQuAD-style nested dict into one flat record per question.

    Args:
        data_dict: Parsed JSON with a top-level ``"data"`` list of articles. Each
            article holds ``"paragraphs"``; each paragraph holds a ``"context"``
            string and a ``"qas"`` list.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``context``, ``question``,
        ``answer``, ``answer_start`` and ``is_impossible``. Only the first listed
        answer is kept; a question without answers gets ``answer=""`` and
        ``answer_start=-1``.
    """
    records = []
    for entry in data_dict["data"]:
        entry_title = entry.get("title", "")
        for paragraph in entry["paragraphs"]:
            passage = paragraph["context"]
            for qa_item in paragraph["qas"]:
                record = {
                    "id": qa_item.get("id", ""),
                    "title": entry_title,
                    "context": passage,
                    "question": qa_item["question"],
                }
                impossible = qa_item.get("is_impossible", False)

                # SQuAD v2 entries may carry an empty or missing answer list.
                answers = qa_item.get("answers")
                if answers and len(answers) > 0:
                    text, start = answers[0]["text"], answers[0]["answer_start"]
                else:
                    text, start = "", -1

                # The column is deliberately called 'answer' (singular).
                record["answer"] = text
                record["answer_start"] = start
                record["is_impossible"] = impossible
                records.append(record)
    return records

# Parse each split's JSON file and flatten it right after loading.
with open(train_json_path, "r", encoding="utf-8") as train_file:
    train_data = json.load(train_file)
train_rows = flatten_squad(train_data)

with open(dev_json_path, "r", encoding="utf-8") as dev_file:
    dev_data = json.load(dev_file)
dev_rows = flatten_squad(dev_data)

# Wrap the flat records of both splits in a single DatasetDict.
split_rows = {"train": train_rows, "validation": dev_rows}
dataset = DatasetDict(
    {split_name: Dataset.from_list(rows) for split_name, rows in split_rows.items()}
)

print(dataset)

In [None]:
from transformers import MT5Tokenizer
import torch

In [None]:
# Load the tokenizer for the MODEL checkpoint. convert_to_features reads this
# module-level `tokenizer`, so this cell must run before the preprocessing cells.
tokenizer = MT5Tokenizer.from_pretrained(MODEL)

In [None]:
def add_eos_to_examples(example):
    """Attach the prompt and target strings to a single example.

    Despite the name, no EOS token is appended here; the function only builds
    the text fields that are tokenized later.

    Args:
        example: Mapping with ``question``, ``context`` and ``answer`` entries.

    Returns:
        The same ``example`` object, with ``input_text`` and ``target_text`` set.
    """
    prompt_template = 'question: %s  context: %s'
    question, context = example['question'], example['context']
    example['input_text'] = prompt_template % (question, context)

    answer = example['answer']
    example['target_text'] = '%s' % answer
    return example

def convert_to_features(example_batch, tok=None):
    """Tokenize a batch of prompt/target strings into model features.

    Args:
        example_batch: Mapping with ``input_text`` and ``target_text`` lists of
            strings (one entry per example).
        tok: Optional tokenizer to use instead of the module-level ``tokenizer``.
            It must be callable like a Hugging Face tokenizer and expose
            ``pad_token_id``.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` (prompts padded/truncated
        to 512 tokens) and ``labels`` (targets padded/truncated to 30 tokens).
        Padding positions in ``labels`` are set to -100 so that the loss ignores
        them instead of training the model to emit pad tokens.
    """
    tok = tokenizer if tok is None else tok

    input_encodings = tok(example_batch['input_text'], truncation=True, padding="max_length", max_length=512)
    target_encodings = tok(example_batch['target_text'], truncation=True, padding="max_length", max_length=30)

    # Targets are padded to a fixed length, so mask the pad ids out of the loss.
    pad_id = tok.pad_token_id
    labels = [
        [token_id if token_id != pad_id else -100 for token_id in row]
        for row in target_encodings['input_ids']
    ]

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }

    return encodings

In [None]:
# Training split: build the prompt/target strings, then tokenize in batches.
train_dataset = (
    dataset["train"]
    .map(add_eos_to_examples)
    .map(convert_to_features, batched=True)
)

# Validation split: same two steps, but with cache loading switched off.
valid_dataset = (
    dataset["validation"]
    .map(add_eos_to_examples, load_from_cache_file=False)
    .map(convert_to_features, batched=True, load_from_cache_file=False)
)

# Return only the model-input columns, formatted as torch tensors.
columns = ['input_ids', 'attention_mask', 'labels']
for tokenized_split in (train_dataset, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

In [None]:
# Persist both tokenized splits to disk with torch.save.
for split_data, out_file in ((train_dataset, 'train_data.pt'), (valid_dataset, 'valid_data.pt')):
    torch.save(split_data, out_file)

In [None]:
# Sanity check: number of examples in the tokenized train and validation splits.
len(train_dataset), len(valid_dataset)

# 2. Training

In [None]:
import transformers
# Show which transformers version this run is using.
print(transformers.__version__)

In [None]:
from transformers import AutoModelForSeq2SeqLM, MT5Tokenizer

# Re-create the tokenizer and load the seq2seq model, both from the MODEL checkpoint.
tokenizer = MT5Tokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training hyper-parameters. No evaluation or generation option is set here, so the
# library defaults apply for those.
# NOTE(review): num_train_epochs is hard-coded to 2 although the config cell defines
# EPOCHS = 6 — confirm which value is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir=REPO,              # folder to save checkpoints
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,   # effective batch ~16
    learning_rate=5e-5,
    logging_steps=200,                # log every 200 steps
    save_steps=2000,                  # save every 2000 steps (tune if you want)
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Build the batch collator from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Wire the model, training arguments, tokenized splits and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # should be tokenized dataset
    eval_dataset=valid_dataset,    # should be tokenized dataset
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning; checkpoints are saved under the configured output_dir (REPO).
trainer.train()

# 3. Evaluation

In [None]:
from tqdm import tqdm

In [None]:
# If you've already created `dataset` earlier (train+validation), you can skip reloading.
# Just make sure `dataset` still exists in memory.

# Drop the examples flagged as impossible from both splits, in place.
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].filter(filter_function)
dataset

In [None]:
from datasets import Dataset, DatasetDict

def merge_duplicate_ids(dataset):
    """Collapse rows that share an ``id`` into one record with list-valued answers.

    Args:
        dataset: Object whose ``to_dict()`` returns a column-oriented mapping with
            the keys ``id``, ``title``, ``context``, ``question``,
            ``is_impossible``, ``answer`` and ``answer_start``.

    Returns:
        A list of dicts, one per distinct id in order of first appearance. The
        scalar fields come from the first row with that id; ``answer`` and
        ``answer_start`` are lists collecting the values of every row with that id.
    """
    columns = dataset.to_dict()
    merged = {}

    for row, example_id in enumerate(columns['id']):
        entry = merged.get(example_id)
        if entry is None:
            # First time this id is seen: copy the scalar fields and start empty lists.
            entry = {
                'id': example_id,
                'title': columns['title'][row],
                'context': columns['context'][row],
                'question': columns['question'][row],
                'is_impossible': columns['is_impossible'][row],
                'answer': [],
                'answer_start': [],
            }
            merged[example_id] = entry
        entry['answer'].append(columns['answer'][row])
        entry['answer_start'].append(columns['answer_start'][row])

    return list(merged.values())

merged_validation_data = merge_duplicate_ids(dataset['validation'])

# Pivot the list of merged records into columns, using the first record's keys.
first_record = merged_validation_data[0]
merged_columns = {
    column: [record[column] for record in merged_validation_data]
    for column in first_record
}
merged_validation_dataset = Dataset.from_dict(merged_columns)

merged_validation_dataset

In [None]:
def add_eos_to_examples(example):
    """Attach only the prompt string to a single example.

    This re-defines the helper of the same name from the preprocessing section;
    this version does not set ``target_text``.

    Args:
        example: Mapping with ``question`` and ``context`` entries.

    Returns:
        The same ``example`` object, with ``input_text`` set.
    """
    fields = (example['question'], example['context'])
    example['input_text'] = 'question: %s  context: %s' % fields
    return example

In [None]:
# Rebuild `valid_dataset` from the merged validation set, adding the raw `input_text`
# prompt; this rebinds the name that previously held the tokenized validation split.
valid_dataset = merged_validation_dataset.map(add_eos_to_examples, load_from_cache_file=False)

In [None]:
import evaluate

# Load the "squad" metric through the `evaluate` module.
# NOTE(review): a later cell defines a function that is also called `evaluate`, which
# rebinds this module name; re-running this cell afterwards rebinds it back to the
# module. Consider renaming one of the two.
metric = evaluate.load("squad")

In [None]:
from glob import glob
import torch

# Pick the accelerator to run on: CUDA first, then Apple MPS when the installed
# torch exposes it, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Evaluation using device:", device)

def evaluate(model_dir, dataset):
    """Run generation over ``dataset`` with every checkpoint saved in ``model_dir``.

    Checkpoints are visited in increasing training-step order (numeric, so that
    ``checkpoint-2000`` comes before ``checkpoint-10000``).

    Args:
        model_dir: Directory containing ``checkpoint-*`` sub-directories.
        dataset: Iterable of examples, each exposing an ``input_text`` string.

    Returns:
        None.

    NOTE(review): the loop body still ends in the original ``...`` placeholder.
    ``pred`` is decoded, but ``predictions`` and ``references`` are never filled
    and no metric is computed — complete this before relying on the output.
    NOTE(review): this function has the same name as the ``evaluate`` module the
    notebook imports, so defining it rebinds that name.
    """
    def _step_order(path):
        # "checkpoint-<step>" sorts by its integer step; any other suffix goes last, by name.
        suffix = path.rsplit("-", 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), path)
        return (1, 0, path)

    checkpoints = sorted(glob(model_dir + "/checkpoint-*"), key=_step_order)
    for checkpoint in checkpoints:
        # Each checkpoint is loaded with its own tokenizer and moved to the chosen device.
        tokenizer = MT5Tokenizer.from_pretrained(checkpoint)
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

        predictions = []
        references = []
        for data in tqdm(dataset):
            input_ids = tokenizer(data["input_text"], return_tensors="pt").input_ids
            outputs = model.generate(input_ids.to(device), max_new_tokens=30)
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
            ...

In [None]:
# Run generation on the validation prompts for every checkpoint found under REPO.
evaluate(REPO, valid_dataset)