## Setup Environment

In [2]:
# Install all necessary libraries
!pip install -q transformers datasets peft accelerate bitsandbytes evaluate sentencepiece --prefer-binary

# Clear GPU memory if Colab reused a session
import torch
torch.cuda.empty_cache()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import Libraries

In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType

## Load Base Model & Tokenizer

In [4]:
# TinyLlama is a small open model (Llama2 style)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load model in 8-bit precision to save VRAM
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = BitsAndBytesConfig(load_in_8bit=True),   # quantized weights
    device_map="auto"      # automatically uses GPU
)

# Ensure pad token exists (TinyLlama uses EOS as padding)
tokenizer.pad_token = tokenizer.eos_token

print(model)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear8bitLt(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

## Load and Explore Dataset

In [5]:
# Load PubMed QA (medical question answering dataset)
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")
print(dataset)

# Each record has: question, context, long_answer, final_decision
# Let's format it for LLM input/output
def format_example(example):
  prompt = f"Question: {example['question']}\nContext: {".".join(example['context']['contexts'])}\nAnswer:"
  answer = example['long_answer']
  return {"prompt": prompt, "answer": answer}

dataset = dataset.map(format_example)
print(dataset)

README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'prompt', 'answer'],
        num_rows: 1000
    })
})


## Prepare Train/Test Splits

In [6]:
# Select smaller subset for demo
train_dataset = dataset["train"].select(range(0, 800))
test_dataset  = dataset["train"].select(range(800, 1000))

## Tokenize Dataset

In [7]:
# Function to tokenize prompt+answer pairs
def tokenize_function(example):
  full_text = [
        " ".join(p) + " " + " ".join(a)
        for p, a in zip(example["prompt"], example["answer"])
    ]
  tokenized = tokenizer(
        full_text,               # Creates one full_text per example
        truncation=True,         # cut long text
        padding="max_length",    # pad shorter text
        max_length=256           # 256 tokens max
  )
  # For causal language modeling, labels = input_ids
  tokenized["labels"] = tokenized["input_ids"].copy()
  return tokenized

tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_test  = test_dataset.map(tokenize_function, batched=True, remove_columns=test_dataset.column_names)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

## Setup LoRA (Low-Rank Adaptation)

In [8]:
# LoRA adds small trainable matrices instead of retraining full model

peft_config = LoraConfig(
    r=8,                        # rank dimension
    lora_alpha=32,              # scaling factor
    target_modules=["q_proj", "v_proj"],  # apply LoRA on attention projections
    lora_dropout=0.05,          # small dropout for regularization
    bias="none",                # don't fine-tune bias terms
    task_type=TaskType.CAUSAL_LM
)
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Shows how many params will train

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


## Training Setup

In [23]:
# Data collator batches variable-length sequences efficiently
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Define training configuration
training_args = TrainingArguments(
    output_dir="./tinyllama-medical-lora",   # where to save model checkpoints

    # Training
    num_train_epochs=2,                      # 1–3 is typical for LoRA; 2 is a good start
    per_device_train_batch_size=2,           # small batch size = lower VRAM usage
    gradient_accumulation_steps=4,           # effectively makes batch size = 2 * 4 = 8
    learning_rate=2e-4,                      # recommended LoRA LR
    max_grad_norm=0.3,                       # gradient clipping — prevents exploding gradients
    weight_decay=0.01,                       # regularization for stability (slightly higher than 0.001)

    # Evaluation
    eval_strategy="steps",             # correct parameter name (not eval_strategy)
    eval_steps=100,                          # evaluate every 100 steps
    per_device_eval_batch_size=2,            # batch size for evaluation
    save_strategy="steps",                   # save checkpoints every N steps
    save_steps=200,                          # save model every 200 steps
    load_best_model_at_end=True,             # automatically reload best model

    # Optimization
    optim="paged_adamw_32bit",               # optimized AdamW for low memory
    lr_scheduler_type="cosine",              # cosine decay learning rate
    gradient_checkpointing=True,             # saves memory by not storing intermediate activations

    # Precision & Performance
    fp16=True,                               # use float16 if Colab GPU supports it (T4, A100, etc.)
    bf16=False,                              # only use bf16 if GPU supports it (A100+)
    group_by_length=True,                    # batches sequences of similar lengths for efficiency

    # Logging
    logging_steps=25,                        # log metrics every 25 steps
    report_to="tensorboard",                 # use TensorBoard for tracking (disable wandb)

    # Misc
    max_steps=-1,                            # -1 = ignore, use full dataset for num_train_epochs
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator
)

## Fine-Tune

In [24]:
# Start LoRA fine-tuning
trainer.train()



Step,Training Loss,Validation Loss
100,0.9006,0.893563
200,0.8439,0.88465


TrainOutput(global_step=200, training_loss=0.8684183216094971, metrics={'train_runtime': 585.4313, 'train_samples_per_second': 2.733, 'train_steps_per_second': 0.342, 'total_flos': 2545185875558400.0, 'train_loss': 0.8684183216094971, 'epoch': 2.0})

## Evaluate / Test Generation

In [25]:
def generate_answer(question, context):
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

sample = test_dataset[2]
print("Question:", sample["question"])
print("Answer:", generate_answer(sample["question"], sample["context"]))


Question: Can nurse-led preoperative education reduce anxiety and postoperative complications of patients undergoing cardiac surgery?




Answer: Question: Can nurse-led preoperative education reduce anxiety and postoperative complications of patients undergoing cardiac surgery?
Context: {'contexts': ['The effect of preoperative education on anxiety and postoperative outcomes of cardiac surgery patients remains unclear.AIM: The aim of the study was to estimate the effectiveness of a nurse-led preoperative education on anxiety and postoperative outcomes.', 'A randomised controlled study was designed. All the patients who were admitted for elective cardiac surgery in a general hospital in Athens with knowledge of the Greek language were eligible to take part in the study. Patients in the intervention group received preoperative education by specially trained nurses. The control group received the standard information by the ward personnel. Measurements of anxiety were conducted on admission-A, before surgery-B and before discharge-C by the state-trait anxiety inventory.', 'The sample consisted of 395 patients (intervention