In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install torch>=2.6.0

In [None]:
!pip install -qU transformers datasets optimum

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import pandas as pd

# Decode the JSON as UTF-8: ISO-8859-1 has no Arabic characters, so decoding
# Arabic text with it turns every letter into several unrelated Latin-1
# characters. `encoding_errors="replace"` keeps the load from failing on a
# stray invalid byte (assumes the Kaggle files are UTF-8 JSON -- TODO confirm).
train_data = pd.read_json(
    "/kaggle/input/arabic-squad-v20/asquadv2-train.json",
    encoding="utf-8",
    encoding_errors="replace",
)
validation_data = pd.read_json(
    "/kaggle/input/arabic-squad-v20/asquadv2-val.json",
    encoding="utf-8",
    encoding_errors="replace",
)
test_data = pd.read_json(
    "/kaggle/input/arabic-squad-v20/asquadv2-test.json",
    encoding="utf-8",
    encoding_errors="replace",
)

print("Dataset loaded successfully!")


In [None]:
# Normalize and clean data
def normalize_data(data):
    """Flatten a SQuAD-style frame into one row per answerable question.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``data`` column whose entries are article dicts holding
        ``title`` and ``paragraphs``; each paragraph holds ``context`` and a
        list of ``qas`` records.

    Returns
    -------
    pandas.DataFrame
        One row per question. ``paragraphs.context`` is renamed to
        ``context``; ``answers`` becomes
        ``{'text': [...], 'answer_start': [...]}`` holding only the first
        answer (empty lists when there is none). Rows flagged
        ``is_impossible`` are dropped and the ``is_impossible`` /
        ``plausible_answers`` columns are removed. The row index is NOT
        reset, so it has gaps where rows were filtered out.
    """
    # Hand json_normalize a plain list of dicts so the result does not depend
    # on how a given pandas version treats Series input (and its index).
    articles = list(data['data'])
    flat = pd.json_normalize(
        articles,
        ['paragraphs', 'qas'],
        ['title', ['paragraphs', 'context']],
    )
    flat = flat.rename(columns={'paragraphs.context': 'context'})

    def keep_first_answer(answers):
        # Anything that is not a non-empty list (e.g. [] or a missing value)
        # is treated as "no answer".
        if isinstance(answers, list) and answers:
            first = answers[0]
            return {'text': [first['text']], 'answer_start': [first['answer_start']]}
        return {'text': [], 'answer_start': []}

    flat['answers'] = flat['answers'].apply(keep_first_answer)

    # Files without the SQuAD v2 flag have no unanswerable questions to drop.
    if 'is_impossible' in flat.columns:
        flat = flat[flat['is_impossible'] == False]

    return flat.drop(['is_impossible', 'plausible_answers'], axis=1, errors='ignore')

# Flatten each raw split in turn; the names are rebound to the flattened frames.
train_data, validation_data, test_data = [
    normalize_data(raw_split)
    for raw_split in (train_data, validation_data, test_data)
]

In [None]:
# Extract relevant fields
def extract_relevant_data(dataset):
    """Keep only context, question and the first answer text of each row.

    ``dataset`` must provide ``context``, ``question`` and ``answers``
    columns, where each ``answers`` entry is a mapping with a ``text`` list.
    Rows whose ``text`` list is empty get an empty-string answer.
    """
    def first_text(answer_record):
        texts = answer_record["text"]
        if texts:
            return texts[0]
        return ""

    answer_column = dataset["answers"].apply(first_text)
    columns = {
        "context": dataset["context"],
        "question": dataset["question"],
        "answer": answer_column,
    }
    return pd.DataFrame(columns)

# Reduce every split to its context / question / answer columns.
train_data, validation_data, test_data = [
    extract_relevant_data(split_frame)
    for split_frame in (train_data, validation_data, test_data)
]

# Check lengths
for split_label, split_frame in (
    ("Train", train_data),
    ("Validation", validation_data),
    ("Test", test_data),
):
    print(f"{split_label} data size: {len(split_frame)}")

In [None]:
# Summarise the training frame: columns, non-null counts, dtypes and memory use.
train_data.info()

**Answer Generation**

In [None]:
from datasets import DatasetDict, Dataset, load_dataset
import pandas as pd
def format_for_answer_generation(dataset):
    """Build the seq2seq source/target strings for answer generation.

    ``dataset`` must provide ``context``, ``question`` and ``answer`` columns.
    The returned frame has two columns:
    ``text``     -> "<context>" + context + "<question>" + question
    ``required`` -> "<answer>" + answer
    """
    source_text = "<context>" + dataset["context"]
    source_text = source_text + "<question>"
    source_text = source_text + dataset["question"]

    target_text = "<answer>" + dataset["answer"]

    return pd.DataFrame({"text": source_text, "required": target_text})
# Turn each split into its text/required frame ...
train_ag, val_ag, test_ag = [
    format_for_answer_generation(split_frame)
    for split_frame in (train_data, validation_data, test_data)
]

# ... and wrap every frame in a Hugging Face Dataset.
train_ag_dataset, val_ag_dataset, test_ag_dataset = [
    Dataset.from_pandas(formatted_frame)
    for formatted_frame in (train_ag, val_ag, test_ag)
]


# Bundle the three splits under their conventional names.
datasets_ag = DatasetDict(
    {
        "train": train_ag_dataset,
        "validation": val_ag_dataset,
        "test": test_ag_dataset,
    }
)



In [None]:
# Display the split names, column names and row counts of the DatasetDict.
datasets_ag

In [None]:
# Drop the pandas index column that `Dataset.from_pandas` carries over when a
# frame has a non-default index. Every split is handled (the test split was
# previously skipped), and the membership check makes the cell safe to re-run
# and safe when a split never had the column.
for split_name, split_dataset in list(datasets_ag.items()):
    if '__index_level_0__' in split_dataset.column_names:
        datasets_ag[split_name] = split_dataset.remove_columns(['__index_level_0__'])


In [None]:
# Display the DatasetDict again to check which columns each split now has.
datasets_ag

In [None]:
# Show the first training record (its `text` input and `required` target).
datasets_ag["train"][0]

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, GenerationConfig

# model_name = "UBC-NLP/AraT5-base"
model_name = "UBC-NLP/AraT5v2-base-1024"

MAX_INPUT_LENGTH = 768   # Token budget for "<context>...<question>..."
MAX_TARGET_LENGTH = 30   # Token budget for "<answer>..."

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register the marker tokens BEFORE any text is tokenized. If they are added
# only after `datasets.map(...)` has run, the stored input_ids/labels contain
# the markers split into ordinary sub-word pieces instead of single tokens.
# Tokens that are already in the vocabulary are not added a second time, so
# repeating this registration later is harmless.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<context>", "<answer>", "<question>"]}
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,  # Dynamically pad to the longest in the batch
    label_pad_token_id=-100  # Ensures padded tokens in labels don't affect training
)

def tokenize_function(examples):
    """Tokenize a batch of ``text`` / ``required`` pairs for seq2seq training.

    Parameters
    ----------
    examples : mapping
        Batch with ``text`` (model input strings) and ``required`` (target
        strings) lists, as passed by ``datasets.map(..., batched=True)``.

    Returns
    -------
    The tokenizer output for ``text`` with an extra ``labels`` entry holding
    the token ids of ``required``.

    Sequences are truncated but deliberately NOT padded here. Padding is left
    to ``data_collator``, which pads inputs per batch and pads labels with
    -100. Padding labels here with the pad token id would make the loss count
    every padding position as a real target.
    """
    model_inputs = tokenizer(
        examples["text"],  # Input: context + question
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        examples["required"],  # Output: target answer
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# tokenized_qg_datasets = datasets_qg.map(tokenize_function, batched=True, remove_columns=["text", "required"])
tokenized_ag_datasets = datasets_ag.map(tokenize_function, batched=True, remove_columns=["text", "required"])


In [None]:
# Print, in order: the container type (expected to be a DatasetDict), the
# per-split summary, and the first tokenized training example.
for inspected in (
    type(tokenized_ag_datasets),
    tokenized_ag_datasets,
    tokenized_ag_datasets["train"][0],
):
    print(inspected)


In [None]:
# Function to decode and print input and labels
def print_decoded_example(dataset, split, index):
    """Print the decoded input and label text of one tokenized example.

    ``dataset[split][index]`` must provide ``input_ids`` and ``labels``; both
    are decoded with the notebook-level ``tokenizer``.
    """
    header = f"\nExample from {split} split (index {index}):"
    print(header)

    decoded_input = tokenizer.decode(dataset[split][index]["input_ids"])
    print("Decoded input:", decoded_input)

    decoded_labels = tokenizer.decode(dataset[split][index]["labels"])
    print("Decoded labels:", decoded_labels)

    separator = "-" * 50
    print(separator)

# Print one decoded example each from the train and validation splits
print_decoded_example(tokenized_ag_datasets, "train", 200)  # Training example at index 200
print_decoded_example(tokenized_ag_datasets, "validation", 2)  # Validation example at index 2


In [None]:
# Function to check special tokens
def check_special_tokens(tokenizer):
    """Print the id the tokenizer assigns to each marker token and to ``</s>``.

    ``tokenizer`` only needs a ``convert_tokens_to_ids(token)`` method.
    """
    print("\nSpecial tokens:")
    for token in ("<context>", "<answer>", "<question>", "</s>"):
        token_id = tokenizer.convert_tokens_to_ids(token)
        print(f"{token}: {token_id}")

In [None]:
# Marker tokens used in the input/target strings, registered as additional
# special tokens on the tokenizer.
special_tokens_dict = dict(
    additional_special_tokens=["<context>", "<answer>", "<question>"],
)
tokenizer.add_special_tokens(special_tokens_dict)

# Print the ids now assigned to the markers.
check_special_tokens(tokenizer)

In [None]:
!pip install -qU  wandb

In [None]:
import os

import wandb

# Credentials must never be written into the notebook. The Weights & Biases
# key is read from the WANDB_API_KEY environment variable instead.
# Any key or token that was ever committed in this notebook should be treated
# as leaked and revoked.
wandb_api_key = os.environ.get("WANDB_API_KEY")
if wandb_api_key:
    wandb.login(key=wandb_api_key)
else:
    print("WANDB_API_KEY is not set; skipping Weights & Biases login.")

device = "cuda"

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Load the model
# Instantiates the seq2seq checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Resize model embeddings if needed (only if using a model)
# The embedding matrix is sized to the tokenizer's current length, which
# includes any tokens added to the tokenizer after it was loaded.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# # Check for NaN or inf in the dataset
# import numpy as np

# def check_for_invalid_values(dataset):
#     for split in dataset:
#         print(f"Checking {split} split...")
#         for example in dataset[split]:
#             if np.isnan(example["input_ids"]).any() or np.isinf(example["input_ids"]).any():
#                 print(f"Invalid input_ids found in {split} split!")
#             if np.isnan(example["labels"]).any() or np.isinf(example["labels"]).any():
#                 print(f"Invalid labels found in {split} split!")

# check_for_invalid_values(tokenized_ag_datasets)

In [None]:
# Print the raw ids and the decoded text of the first three training examples
# to verify tokenization.
train_split = tokenized_ag_datasets["train"]
for example_number in (1, 2, 3):
    print(f"\nExample {example_number}:")
    example = train_split[example_number - 1]
    input_ids = example["input_ids"]
    label_ids = example["labels"]
    print("Input IDs:", input_ids)
    print("Decoded Input:", tokenizer.decode(input_ids))
    print("Labels:", label_ids)
    print("Decoded Labels:", tokenizer.decode(label_ids))

In [None]:
!pip install -U accelerate

In [None]:
# def compute_metrics(eval_preds):
#     """
#     Compute F1, Precision, Recall for Hugging Face Trainer.
#     Uses Arabic-aware token-level matching.
#     """
#     preds, labels = eval_preds

#     # Replace -100 in labels with pad token id
#     labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

#     # Optionally: clip preds to tokenizer vocab size (avoids huge token ids)
#     preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

#     try:
#         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     except Exception as e:
#         print("❌ Error during decoding:", e)
#         print("Preds type:", type(preds), "Shape:", preds.shape)
#         print("Labels type:", type(labels), "Shape:", labels.shape)
#         raise e

#     f1s, precisions, recalls = [], [], []

#     for pred, label in zip(decoded_preds, decoded_labels):
#         pred_tokens = set(arabic_tokenize(pred))
#         label_tokens = set(arabic_tokenize(label))

#         tp = len(pred_tokens & label_tokens)
#         precision = tp / len(pred_tokens) if pred_tokens else 0
#         recall = tp / len(label_tokens) if label_tokens else 0
#         f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

#         f1s.append(f1)
#         precisions.append(precision)
#         recalls.append(recall)

#     print(f"🔍 Sample prediction: {decoded_preds[0]}")
#     print(f"✅ Sample label:      {decoded_labels[0]}")
#     print(f"📊 F1: {np.mean(f1s):.4f}, Precision: {np.mean(precisions):.4f}, Recall: {np.mean(recalls):.4f}")

#     return {
#         "f1": np.mean(f1s),
#         "precision": np.mean(precisions),
#         "recall": np.mean(recalls),
#     }



In [None]:
import os
import gc
import torch
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Export the allocator option before the training arguments and the trainer
# are created. It was previously set after the trainer had been built, i.e.
# after the point where the GPU is expected to be initialised.
# NOTE(review): confirm against the PyTorch CUDA memory-management notes that
# the setting is only read when the CUDA allocator is first initialised.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

seed = 42

training_args = Seq2SeqTrainingArguments(
    output_dir="./finetuned-AraT5-QA",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    weight_decay=0.01,
    num_train_epochs=15,
    fp16=False,  # Disable mixed precision for debugging
    generation_num_beams=3,
    gradient_accumulation_steps=4,
    save_strategy="epoch",  # Must match eval_strategy for load_best_model_at_end
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    seed=seed,  # Same seed as the subset shuffling below
)


def _shuffled_subset(split, limit):
    """Return at most `limit` rows of `split`, shuffled with the notebook seed."""
    return split.shuffle(seed=seed).select(range(min(limit, len(split))))


# Shuffle before selecting the subset
train_dataset = _shuffled_subset(tokenized_ag_datasets["train"], 20000)
val_dataset = _shuffled_subset(tokenized_ag_datasets["validation"], 2000)

# Trainer
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Before training: release unreferenced Python objects and cached GPU blocks.
gc.collect()
torch.cuda.empty_cache()

trainer.train()

In [None]:
# Write the trained model and the tokenizer to the same directory so the
# checkpoint can be reloaded from "./arat5_qa" alone.
trainer.save_model("./arat5_qa")
tokenizer.save_pretrained("./arat5_qa")