In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arabic-history-questions-dataset/all_data_flattened.json
/kaggle/input/arabic-history-questions-dataset/all_data.json
/kaggle/input/arabic-history-questions-dataset/train.json
/kaggle/input/arabic-history-questions-dataset/test.json
/kaggle/input/arabic-history-questions-dataset/test_raw.json
/kaggle/input/arabic-history-questions-dataset/val_raw.json
/kaggle/input/arabic-history-questions-dataset/train_raw.json
/kaggle/input/arabic-history-questions-dataset/val.json


In [2]:
!pip install -qU json-repair==0.29.1

In [3]:
# Function to parse and repair JSON
def parse_json(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    try:
        return json_repair.loads(text)  # Repair and parse JSON
    except Exception as e:
        print(f"خطأ في تحليل JSON: {e}")  # Arabic error message
        return None

In [4]:
import json
from json_repair import json_repair

# Load and process the Arabic JSON file
train_path = "/kaggle/input/arabic-history-questions-dataset/train.json"
validation_path = "/kaggle/input/arabic-history-questions-dataset/val.json"

train = parse_json(train_path)
validation = parse_json(validation_path)

In [5]:
train_data = pd.DataFrame(train)
validation_data = pd.DataFrame(validation)

In [6]:
train_data.rename(columns={"paragraph": "context"}, inplace=True)
train_data.head()

Unnamed: 0,context,question,answer
0,ـ مساكن الطبقة الوسطى: كانت مخصصة للفنيين من ا...,ما هو الغرض الرئيسي من الممر الذي يتفرع إلى به...,للاستقبال والترفيه
1,القصر الفرعونى: تميز القصر الفرعونى بالاتساع و...,أين كانت تقع غرفة النوم في القصر الفرعوني؟,في أقصى مكان في القصر
2,توفي إمبراطور قوانغشو في ١٤ نوفمبر ١٩٠٨ ، وفي ...,ما الذي جعل الحكومة التي أنشأها زايفنغ تُعرف ب...,لأن خمسة من أعضائه كانوا من العائلة الإمبراطورية
3,ظهر الدور العلمى للمسجد مع بداية تأسيس النبي ص...,ما هي الأهداف التعليمية للكتاتيب في الدولة الإ...,تعليم القرآن والقراءة والكتابة والحساب
4,كانت الأسرة الغنية تستأجر مرضعات لأطفالهم، ويب...,ما هي المواد الدراسية التي كان يدرسها الابن في...,الهندسة والجغرافيا والأدب


In [7]:
validation_data.rename(columns={"paragraph": "context"}, inplace=True)
validation_data.head()

Unnamed: 0,context,question,answer
0,كانت الحكومة المصرية على عهد محمد على حكومة مط...,ما الفرق بين حكومة محمد علي وعصر المماليك؟,وجود نظام إداري منظم
1,شروط الوزير ومهامه شروط اختيار الوزير: ⯌العدل ...,ما هو الدور الإداري للوزير؟,رئيس الإدارة المركزية
2,علم الميكانيكا: استفاد العلماء المسلمون في علم...,ما هو الغرض من استخدام آلات رفع الماء؟,لعدم إهدار الماء
3,البيعة: عهد على الطاعة من الرعية للراعي، وإنفا...,من الذي نزل بالوحي على الرسول ﷺ؟,الملك جبريل
4,لما تقدمت المدارس العالية والخصوصية التى أنشأه...,من أعضاء مجلس ديوان المدارس؟,كلوت بك ورفاعة رافع الطهطاوي


**Question Generation Without Answers**

In [8]:
from datasets import DatasetDict, Dataset, load_dataset
import pandas as pd
def format_for_answer_generation(dataset):
    return pd.DataFrame({
        "text": "Generate question: "+ dataset["context"],
        "required": dataset["question"]
    })
# Process training data
train_qg = format_for_answer_generation(train_data)
train_qg_dataset = Dataset.from_pandas(train_qg)

# Process validation data
val_qg = format_for_answer_generation(validation_data)  # Assuming val_data exists
val_qg_dataset = Dataset.from_pandas(val_qg)

# # Process test data
# test_qg = format_for_answer_generation(test_data)  
# test_qg_dataset = Dataset.from_pandas(test_qg)


# Create dataset dictionary including train, validation, and test sets
datasets_qg = DatasetDict({
    "train": train_qg_dataset,
    "validation": val_qg_dataset
})


In [9]:
datasets_qg

DatasetDict({
    train: Dataset({
        features: ['text', 'required'],
        num_rows: 4702
    })
    validation: Dataset({
        features: ['text', 'required'],
        num_rows: 550
    })
})

In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, GenerationConfig

# model_name = "UBC-NLP/AraT5-base"
model_name = "UBC-NLP/AraT5v2-base-1024"

tokenizer = AutoTokenizer.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,  # Dynamically pad to the longest in the batch
    label_pad_token_id=-100  # Ensures padded tokens in labels don't affect training
)

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["text"],  # Input: Context
        max_length=512,  # Limit input length
        truncation=True,
        padding="max_length",  # Ensures uniform input size
    )

    labels = tokenizer(
        examples["required"],  # Output: Target
        max_length=30,  # Target length is much shorter
        truncation=True,
        padding="max_length"  # Ensures uniform output size
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_qg_datasets = datasets_qg.map(tokenize_function, batched=True, remove_columns=["text", "required"])
# tokenized_ag_datasets = datasets_ag.map(tokenize_function, batched=True, remove_columns=["text", "required"])


2025-06-19 21:57:43.873792: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750370264.066110      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750370264.123846      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/4702 [00:00<?, ? examples/s]

Map:   0%|          | 0/550 [00:00<?, ? examples/s]

In [11]:
print(type(tokenized_qg_datasets))  # Should be <class 'datasets.dataset_dict.DatasetDict'>
print(tokenized_qg_datasets)  # Prints dataset details
print(tokenized_qg_datasets["train"][0])

<class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4702
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 550
    })
})
{'input_ids': [34113, 550, 15033, 109590, 109535, 109993, 53204, 39495, 23689, 109590, 1157, 48665, 6778, 6380, 36, 6804, 109553, 109598, 4510, 1336, 70121, 3767, 109598, 69678, 238, 5656, 86340, 55726, 3767, 97875, 21610, 40, 53439, 5650, 445, 1685, 109598, 5026, 373, 66530, 21610, 40, 3397, 109598, 4510, 65128, 32623, 498, 109567, 13057, 1195, 26236, 688, 1046, 241, 109624, 174, 70917, 109598, 4510, 23650, 1859, 27402, 87604, 1964, 109598, 91549, 109551, 9818, 21610, 40, 3802, 9, 4683, 109580, 10465, 109598, 4510, 27402, 54314, 109552, 2314, 17913, 36, 70917, 109598, 4510, 29444, 2207, 42933, 444, 238, 42194, 38, 405, 552, 8892, 30774, 136, 444, 42907, 89668, 5185, 3767, 109598, 2163, 64081, 2109, 42

In [12]:
# Function to decode and print input and labels
def print_decoded_example(dataset, split, index):
    print(f"\nExample from {split} split (index {index}):")
    print("Decoded input:", tokenizer.decode(dataset[split][index]["input_ids"]))
    print("Decoded labels:", tokenizer.decode(dataset[split][index]["labels"]))
    print("-" * 50)

# Print examples from train, validation, and test splits
print_decoded_example(tokenized_qg_datasets, "train", 200)  # First example from training set
print_decoded_example(tokenized_qg_datasets, "validation", 2)  # First example from validation set



Example from train split (index 200):
Decoded input: Generate question: وقد تميز فن النحت الرومانى بمميزات عديدة أهمها: ١. الواقعية الشديدة: مثل رأس قيصر المصنوع من حجر البازلت وتمثال بومبى. وتماثيل أغسطس. ٢. الابتكار والتجديد: وخاصةً فى نحت التماثيل الشخصية وكذلك الفنون المعمارية ومنها: معبد البانثييون، وأقواس النصر والأعمدة التذكارية.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [13]:
# # Function to check special tokens
# def check_special_tokens(tokenizer):
#     print("\nSpecial tokens:")
#     print(f"<context>: {tokenizer.convert_tokens_to_ids('<context>')}")
#     print(f"<answer>: {tokenizer.convert_tokens_to_ids('<answer>')}")
#     print(f"<question>: {tokenizer.convert_tokens_to_ids('<question>')}")
#     print(f"</s>: {tokenizer.convert_tokens_to_ids('</s>')}")

In [14]:
# # Define special tokens
# special_tokens_dict = {"additional_special_tokens": ["<context>", "<answer>", "<question>"]}

# # Add them to the tokenizer
# tokenizer.add_special_tokens(special_tokens_dict)

# # Check again
# check_special_tokens(tokenizer)


In [15]:
!pip install -qU  wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
import wandb
# !huggingface-cli login --token {'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'}
wandb.login(key='77eafacbf29d3f89b810de78fe1f766a9b6e6fe8')

device = "cuda"

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnoureenaboarab2003[0m ([33mnoureenaboarab2003-ain-shams-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [18]:
# Resize model embeddings if needed (only if using a model)
# model.resize_token_embeddings(len(tokenizer))

In [19]:
# Check for NaN or inf in the dataset
import numpy as np

def check_for_invalid_values(dataset):
    for split in dataset:
        print(f"Checking {split} split...")
        for example in dataset[split]:
            if np.isnan(example["input_ids"]).any() or np.isinf(example["input_ids"]).any():
                print(f"Invalid input_ids found in {split} split!")
            if np.isnan(example["labels"]).any() or np.isinf(example["labels"]).any():
                print(f"Invalid labels found in {split} split!")

check_for_invalid_values(tokenized_qg_datasets)

Checking train split...
Checking validation split...


In [20]:
# # Check special tokens
# print("Special tokens:")
# print(f"<context>: {tokenizer.convert_tokens_to_ids('<context>')}")
# print(f"<answer>: {tokenizer.convert_tokens_to_ids('<answer>')}")
# print(f"<question>: {tokenizer.convert_tokens_to_ids('<question>')}")
# print(f"</s>: {tokenizer.convert_tokens_to_ids('</s>')}")

# Decode a few examples to verify tokenization
for i in range(3):
    print(f"\nExample {i + 1}:")
    print("Input IDs:", tokenized_qg_datasets["train"][i]["input_ids"])
    print("Decoded Input:", tokenizer.decode(tokenized_qg_datasets["train"][i]["input_ids"]))
    print("Labels:", tokenized_qg_datasets["train"][i]["labels"])
    print("Decoded Labels:", tokenizer.decode(tokenized_qg_datasets["train"][i]["labels"]))


Example 1:
Input IDs: [34113, 550, 15033, 109590, 109535, 109993, 53204, 39495, 23689, 109590, 1157, 48665, 6778, 6380, 36, 6804, 109553, 109598, 4510, 1336, 70121, 3767, 109598, 69678, 238, 5656, 86340, 55726, 3767, 97875, 21610, 40, 53439, 5650, 445, 1685, 109598, 5026, 373, 66530, 21610, 40, 3397, 109598, 4510, 65128, 32623, 498, 109567, 13057, 1195, 26236, 688, 1046, 241, 109624, 174, 70917, 109598, 4510, 23650, 1859, 27402, 87604, 1964, 109598, 91549, 109551, 9818, 21610, 40, 3802, 9, 4683, 109580, 10465, 109598, 4510, 27402, 54314, 109552, 2314, 17913, 36, 70917, 109598, 4510, 29444, 2207, 42933, 444, 238, 42194, 38, 405, 552, 8892, 30774, 136, 444, 42907, 89668, 5185, 3767, 109598, 2163, 64081, 2109, 42194, 29884, 11301, 29884, 57664, 109598, 2163, 1443, 61, 5952, 48665, 2573, 1718, 109598, 370, 281, 109779, 109686, 109536, 86182, 367, 61, 5952, 48665, 26329, 43033, 109598, 661, 57694, 437, 78564, 42850, 1700, 86182, 367, 718, 109544, 10289, 36223, 766, 19, 26576, 15379, 31138,

In [21]:
!pip install -U accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-1.8.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=2.0.0->acc

In [22]:
# from transformers import EvalPrediction
# from sklearn.metrics import f1_score
# import numpy as np
# from torch.optim.lr_scheduler import CyclicLR
# from transformers import AutoModelForSeq2SeqLM

# class SafeCircularLRTrainer(Seq2SeqTrainer):
#     def __init__(self, *args, **kwargs):
#         super().__init__(*args, **kwargs)
        
#     def create_scheduler(self, num_training_steps: int, optimizer=None):
#         if optimizer is None:
#             optimizer = self.optimizer
            
#         # More conservative cyclic LR
#         self.lr_scheduler = CyclicLR(
#             optimizer,
#             base_lr=1e-5,    # Higher base LR
#             max_lr=3e-5,     # Lower max LR
#             step_size_up=num_training_steps//6,  # Slower cycles
#             mode='triangular',
#             cycle_momentum=False
#         )
#         return self.lr_scheduler


In [23]:
import torch
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim import AdamW
import math

# Calculate total training steps correctly
dataset_size = len(tokenized_qg_datasets["train"])
batch_size = 4  # per_device_train_batch_size
grad_accum_steps = 8  # gradient_accumulation_steps
num_epochs = 15

# Effective batch size per step
effective_batch_size = batch_size * grad_accum_steps

# Steps per epoch (use ceiling to ensure we don't underestimate)
steps_per_epoch = math.ceil(dataset_size / effective_batch_size)

# Total steps across all epochs
total_steps = steps_per_epoch * num_epochs

print(f"Dataset size: {dataset_size}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total steps: {total_steps}")

training_args = Seq2SeqTrainingArguments(
    output_dir="./finetuned-AraT5-QG",
    eval_strategy="epoch",
    learning_rate=3e-5,  # This will be the max learning rate for OneCycleLR
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    fp16=True,
    generation_num_beams=3,
    gradient_accumulation_steps=8,
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    # Add these for better control
    logging_steps=50,  # Log every 50 steps
    eval_steps=steps_per_epoch,  # Evaluate every epoch
    save_steps=steps_per_epoch,  # Save every epoch
    dataloader_drop_last=False,  # Don't drop incomplete batches
)

# Create optimizer
optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay
)

# Create OneCycleLR scheduler with correct total_steps
scheduler = OneCycleLR(
    optimizer,
    max_lr=training_args.learning_rate,
    total_steps=total_steps,
    pct_start=0.1,  # Percentage of cycle spent increasing learning rate
    anneal_strategy='cos',  # Cosine annealing
    cycle_momentum=True,
    base_momentum=0.85,
    max_momentum=0.95,
    div_factor=25.0,  # Initial learning rate = max_lr / div_factor
    final_div_factor=1e4  # Final learning rate = initial_lr / final_div_factor
)

# Trainer with custom optimizer and scheduler
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_qg_datasets["train"],
    eval_dataset=tokenized_qg_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)  # Pass custom optimizer and scheduler
)

import gc

# Clean up memory before training
gc.collect()
torch.cuda.empty_cache()

# Start training
try:
    trainer.train()
except Exception as e:
    print(f"Training error: {e}")
    print(f"Current step: {trainer.state.global_step}")
    print(f"Expected total steps: {total_steps}")
    raise

Dataset size: 4702
Steps per epoch: 147
Total steps: 2205


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,33.9462,22.357359
2,27.997,6.793698
3,7.0553,5.750888
4,6.034,3.632491
5,3.4008,0.995703
6,1.7869,0.813469
7,1.0566,0.75149
8,0.9353,0.732574
9,0.8465,0.720284
10,0.8023,0.716298


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


In [27]:
trainer.save_model("./arat5_qg/final_model")
tokenizer.save_pretrained("./arat5_qg/final_model")

('./arat5_qg/final_model/tokenizer_config.json',
 './arat5_qg/final_model/special_tokens_map.json',
 './arat5_qg/final_model/spiece.model',
 './arat5_qg/final_model/added_tokens.json',
 './arat5_qg/final_model/tokenizer.json')