In [7]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from IPython.display import clear_output
!pip install transformers accelerate peft bitsandbytes datasets GPUtil trl
clear_output()

In [2]:
!pip install scikit-learn
clear_output()

### Dataset preparation

In [3]:
import json

file_path = "/kaggle/input/interview-questions/interview_questions.jsonl"

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [4]:
import random
from datasets import Dataset, DatasetDict

def format_prompt(example):
    """Format in Llama instruction format"""
    prompt = f"""<s>[INST] {example['instruction']} [/INST] {example['output']}</s>"""
    return {"text": prompt}

formatted_data = [format_prompt(item) for item in data]

random.shuffle(formatted_data)
split_idx = int(len(formatted_data) * 0.9)
train_data = formatted_data[:split_idx]
val_data = formatted_data[split_idx:]

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
    
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [9]:
from transformers import AutoTokenizer

model_path = "/kaggle/input/llama3-2/pytorch/default/1"

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast = True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    result = tokenizer(
        examples['text'],
        truncation=True,
        max_length=512,
        padding='max_length'
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_datasets = dataset_dict.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset_dict['train'].column_names
    )
    
print(f"Train examples: {len(tokenized_datasets['train'])}")
print(f"Validation examples: {len(tokenized_datasets['validation'])}")

Map:   0%|          | 0/811 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Train examples: 811
Validation examples: 91


### LoRa Fine-Tuning

In [22]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from trl import SFTTrainer

OUTPUT_DIR = "kaggle/working/lora_llama3_3b"

MAX_SEQ_LEN = 2048
BATCH_SIZE = 2
GRAD_ACCUM = 8
EPOCHS = 3
LR = 2e-4

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.bfloat16
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 9,175,040 || all params: 3,221,924,864 || trainable%: 0.2848


In [19]:
!pip install -U trl transformers peft accelerate
clear_output()

In [23]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    tf32=True,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    report_to="none",
    max_grad_norm=1.0,
    run_name="lora_llama3.2_h100"
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    args=training_args,
    data_collator = data_collator
)

trainer.train()

Truncating train dataset:   0%|          | 0/811 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

Step,Training Loss
10,3.3096
20,2.1033
30,1.7571
40,1.6953
50,1.5717
60,1.453
70,1.3673
80,1.3745
90,1.3258
100,1.2685


TrainOutput(global_step=153, training_loss=1.5011901699639614, metrics={'train_runtime': 278.6596, 'train_samples_per_second': 8.731, 'train_steps_per_second': 0.549, 'total_flos': 2.113638997111603e+16, 'train_loss': 1.5011901699639614, 'entropy': 0.5288935886187986, 'num_tokens': 1245696.0, 'mean_token_accuracy': 0.7602718445387754, 'epoch': 3.0})