In [1]:
import json
import pandas as pd

# Load JSON file
file_path = "/kaggle/input/stanford-question-answering-dataset/train-v1.1.json"
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract relevant information
records = []
for topic in data["data"]:
    title = topic["title"]
    for paragraph in topic["paragraphs"]:
        context = paragraph["context"]
        for qas in paragraph["qas"]:
            question = qas["question"]
            question_id = qas["id"]
            answers = [ans["text"] for ans in qas["answers"]]  # Collect all possible answers
            records.append([title, context, question, question_id, answers])

# Convert to Pandas DataFrame
df = pd.DataFrame(records, columns=["Title", "Context", "Question", "Question_ID", "Answers"])

# Display DataFrame
print(df.head())


                      Title  \
0  University_of_Notre_Dame   
1  University_of_Notre_Dame   
2  University_of_Notre_Dame   
3  University_of_Notre_Dame   
4  University_of_Notre_Dame   

                                             Context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                            Question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                Question_ID                                    Answers  
0  5733be284776f41900661182               [Saint Bernadette Soubir

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
huggingface_value = user_secrets.get_secret("huggingface")
wandb_value = user_secrets.get_secret("wandb")

In [3]:
from huggingface_hub import login
login(token=huggingface_value)

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
#TinyLlama/TinyLlama_v1.1
#meta-llama/Llama-3.2-3B
# Load the model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,device_map="auto")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
from datasets import Dataset , DatasetDict
max_length = 256
def tokenize_function(examples):
    text = [f"Context: {inp} \nGenerated Question:" for inp in examples["Context"]]
    
    examples["input_ids"] = tokenizer(text, truncation=True, padding="max_length", max_length=max_length, return_tensors = "pt").input_ids
    examples["labels"] = tokenizer(examples["Question"], truncation=True, padding="max_length", max_length=max_length, return_tensors = "pt").input_ids
    
    return examples
    
data = Dataset.from_pandas(df)

tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = data.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['Title', 'Context', 'Question', 'Question_ID', 'Answers'])
# tokenized_dataset = tokenized_dataset.filter(lambda example,index:index%20==0,with_indices=True)
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)



Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha = 32,
    lora_dropout=0.05,
    bias='none',
    task_type = TaskType.SEQ_2_SEQ_LM
)
peft_model = get_peft_model(model,peft_config=lora_config)

In [7]:
# from datasets import Dataset , DatasetDict
# split_dataset = DatasetDict({
#     "train": split_dataset["train"].with_format("torch"),  
#     "test": split_dataset["test"].with_format("torch")
# })

In [8]:
from transformers import TrainingArguments , Trainer
training_args = TrainingArguments(
                    output_dir=" ./llama_3.2_3b_fine-tuned",
                    learning_rate = 1e-5,
                    num_train_epochs = 5,
                    weight_decay = 0.01,
                    # per_device_train_batch_size=2,
                    # per_device_eval_batch_size=2,
                    # gradient_accumulation_steps=4,
                    auto_find_batch_size = True,
                    evaluation_strategy  = 'epoch',
                    save_strategy="epoch",  # Save checkpoints at each epoch
                    save_total_limit=2,  # Keep only the last 2 checkpoints to save space
                    load_best_model_at_end=True,  # Load best model after training
                    fp16=True)

trainer = Trainer(
    model = peft_model,
    # tokenizer = tokenizer,
    args = training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"]
)



In [9]:
import wandb

wandb.login(key=wandb_value)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbahaa_beshoy[0m ([33mbahaa_beshoy-helwan-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [10]:
trainer.train()



Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1489,0.13429
2,0.1431,0.129126
3,0.1378,0.127241
4,0.1382,0.126283
5,0.1384,0.126035


TrainOutput(global_step=43800, training_loss=0.1503094338717526, metrics={'train_runtime': 6822.8482, 'train_samples_per_second': 51.356, 'train_steps_per_second': 6.42, 'total_flos': 2.434643974422528e+16, 'train_loss': 0.1503094338717526, 'epoch': 5.0})

In [11]:
#resume_from_checkpoint=True

In [12]:
peft_model.save_pretrained("./t5-small-lora-adapter")

In [13]:
# from peft import PeftModel

# base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# peft_model = PeftModel.from_pretrained(base_model, "./t5-small-lora-adapter")

In [16]:
sample_context = """Rats are nocturnal, and out in the night the brown ratâ€™s eyes are small and black and
shiny; when a flashlight shines into them in the dark, the eyes of a rat light up like the
eyes of a deer. Though it forages* in darkness, the brown rat has poor eyesight. It makes
up for this with, first of all, an excellent sense of smell. . . . They have an excellent sense
of taste, detecting the most minute amounts of poison, down to one part per million. A
brown rat has strong feet, the two front paws each equipped with four clawlike nails, the
rear paws even longer and stronger. It can run and climb with squirrel-like agility. It is
an excellent swimmer, surviving in rivers and bays, in sewer streams and toilet bowls."""

# Tokenize the input context
input_text = f"Context: {sample_context} \nGenerated Question:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

# Generate questions
outputs = model.generate(input_ids, max_length=256, num_beams=5, early_stopping=True)

# Decode the generated question
generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Question:", generated_question)

Generated Question: What is a brown rat?
