In [1]:
import json
import pandas as pd

# Load JSON file
file_path = "/kaggle/input/stanford-question-answering-dataset/train-v1.1.json"
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract relevant information
records = []
for topic in data["data"]:
    title = topic["title"]
    for paragraph in topic["paragraphs"]:
        context = paragraph["context"]
        for qas in paragraph["qas"]:
            question = qas["question"]
            question_id = qas["id"]
            answers = [ans["text"] for ans in qas["answers"]]  # Collect all possible answers
            records.append([title, context, question, question_id, answers])

# Convert to Pandas DataFrame
df = pd.DataFrame(records, columns=["Title", "Context", "Question", "Question_ID", "Answers"])

# Display DataFrame
print(df.head())


                      Title  \
0  University_of_Notre_Dame   
1  University_of_Notre_Dame   
2  University_of_Notre_Dame   
3  University_of_Notre_Dame   
4  University_of_Notre_Dame   

                                             Context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                            Question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                Question_ID                                    Answers  
0  5733be284776f41900661182               [Saint Bernadette Soubir

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
huggingface_value = user_secrets.get_secret("huggingface")
wandb_value = user_secrets.get_secret("wandb")

In [3]:
from huggingface_hub import login
login(token=huggingface_value)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import torch
#TinyLlama/TinyLlama_v1.1
#meta-llama/Llama-3.2-3B
# Load the model and tokenizer
model_name = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,device_map="auto")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [5]:
from datasets import Dataset , DatasetDict
max_length = 256
def tokenize_function(examples):
    text = [f"Context: {inp} \nQuestions:" for inp in examples["Context"]]
    
    examples["input_ids"] = tokenizer(text, truncation=True, padding="max_length", max_length=max_length, return_tensors = "pt").input_ids
    examples["labels"] = tokenizer(examples["Question"], truncation=True, padding="max_length", max_length=max_length, return_tensors = "pt").input_ids
    
    return examples
    
data = Dataset.from_pandas(df)

tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = data.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['Title', 'Context', 'Question', 'Question_ID', 'Answers'])
tokenized_dataset = tokenized_dataset.filter(lambda example,index:index%20==0,with_indices=True)
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)



Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha = 32,
    lora_dropout=0.05,
    bias='none',
    task_type = TaskType.SEQ_2_SEQ_LM
)
peft_model = get_peft_model(model,peft_config=lora_config)

In [7]:
# from datasets import Dataset , DatasetDict
# split_dataset = DatasetDict({
#     "train": split_dataset["train"].with_format("torch"),  
#     "test": split_dataset["test"].with_format("torch")
# })

In [8]:
from transformers import TrainingArguments , Trainer
training_args = TrainingArguments(
                    output_dir=" ./llama_3.2_3b_fine-tuned",
                    learning_rate = 1e-5,
                    num_train_epochs = 5,
                    weight_decay = 0.01,
                    # per_device_train_batch_size=2,
                    # per_device_eval_batch_size=2,
                    # gradient_accumulation_steps=4,
                    auto_find_batch_size = True,
                    evaluation_strategy  = 'epoch',
                    save_strategy="epoch",  # Save checkpoints at each epoch
                    save_total_limit=2,  # Keep only the last 2 checkpoints to save space
                    load_best_model_at_end=True,  # Load best model after training
                    fp16=True)

trainer = Trainer(
    model = peft_model,
    # tokenizer = tokenizer,
    args = training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"]
)



In [9]:
import wandb

wandb.login(key=wandb_value)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbahaa_beshoy[0m ([33mbahaa_beshoy-helwan-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [10]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,2.3671,0.387965
2,0.3743,0.368633
3,0.3613,0.36601
4,0.3571,0.361277
5,0.3575,0.360566


TrainOutput(global_step=4380, training_loss=0.5943609960547321, metrics={'train_runtime': 4982.773, 'train_samples_per_second': 3.516, 'train_steps_per_second': 0.879, 'total_flos': 7.610142874927104e+16, 'train_loss': 0.5943609960547321, 'epoch': 5.0})

In [11]:
#resume_from_checkpoint=True

In [22]:
from IPython.display import FileLink

FileLink('/kaggle/working/Llama_3.2_3b_fine_tuned_lora_model/adapter_model.safetensors')

In [17]:
from safetensors.torch import load_model, save_model
import torch

# Assume `model` is your trained PyTorch model
model_path = "/kaggle/working/Llama_3.2_3b_fine_tuned_lora_model/adapter_model.safetensors"

save_model(peft_model, model_path)


In [27]:

# Generate a correct download link
from IPython.display import FileLink

FileLink("/wandb/debug-internal.log")