# Parameter-Efficient Fine-Tuning of LLaMA 3.2B for Medical Reasoning

# Install relevent pakages

In [1]:
%%capture
!pip install unsloth vllm
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install rouge-score

# Import all relevent pakages

In [2]:
# Modules for fine-tuned
import os
import warnings
warnings.filterwarnings("ignore")

from unsloth import FastLanguageModel
import torch
import numpy as np
from trl import SFTTrainer  # Trainer for supervised fine-tuning(SFT)

# Hugging Face Modules 

from transformers import TrainingArguments    # Define the training hyperparameters
from huggingface_hub import login   # lets you login to API
from datasets import load_dataset    # Lets you load fine-tuning datasets
from unsloth import is_bfloat16_supported    # Checks if the hardware supports bfloat16 precision

# Import weights and biases

import wandb

# Import kaggle secrets

from kaggle_secrets import UserSecretsClient

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-06-25 16:02:17.069475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750867337.266937      75 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750867337.328088      75 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-25 16:02:37 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 06-25 16:02:37 [__init__.py:239] Automatically detected platform cuda.


# Create API keys and login to Hugging Face and Weights and Biases

In [3]:
# Initialize Hugging Face & WnB tokens

user_secrets = UserSecretsClient() # from kaggle_secrets import UserSecretsClient
hugging_face_token = user_secrets.get_secret("HF_TOKEN")
wnb_token = user_secrets.get_secret("wnb")

# Login to Hugging Face

login(hugging_face_token)

# Login to WnB

wandb.login(key=wnb_token)  #import wandb

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myasirbhutto[0m ([33myasirbhutto-khawaja-fareed-university-of-engineering-and[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Loading Model and Tokenizer

#### what are we doing in this step ?

In this step,we load llama 3.2 (3B) model and its tokenizer using FastLanguageModel.from_pretrained(). we also configure key parameters for efficient interference and fine-tuning.


#### key parameters

max_seq_length = 2048  # define the maximum sequence length a model can handle (i.e, number of tokens per input)

dtype = None           # Default data type (usually auto-detected)

load_in_4bit = True    # Enables 4-bit quantization -a memory-saving optimization


#### Intuition behind 4-bit quantization 

Imagine compressing a **high-resolution image** to a smaller size **-it takes up less space but still looks good enough**.similarly, **4-bit quantization reduces the precision of model weights**, making the model **smaller and faster while keeping most of its accurate**.Instead of storing precise **32-bit or 16-bit numbers**, we compute them into **4-bit values**. This allows **large language models to run effieciently on consumer GPUs** without needing massive amount of memory.

In [4]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.6.5: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

# Download the fine-tuning dataset and format it for fine-tuning

we will use the Medical 01 Reasoning SFT found here on Hugging face.

In [5]:
# Download the dataset using Hugging Face - fucntion imported using from dataset load_dataset

dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")
dataset

README.md: 0.00B [00:00, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

Dataset({
    features: ['Question', 'Complex_CoT', 'Response'],
    num_rows: 19704
})

In [6]:
# show an entry from dataset
dataset[1]

{'Question': 'A 33-year-old woman is brought to the emergency department 15 minutes after being stabbed in the chest with a screwdriver. Given her vital signs of pulse 110/min, respirations 22/min, and blood pressure 90/65 mm Hg, along with the presence of a 5-cm deep stab wound at the upper border of the 8th rib in the left midaxillary line, which anatomical structure in her chest is most likely to be injured?',
 'Complex_CoT': "Okay, let's figure out what's going on here. A woman comes in with a stab wound from a screwdriver. It's in her chest, upper border of the 8th rib, left side, kind of around the midaxillary line. First thought, that's pretty close to where the lung sits, right?\n\nLet's talk about location first. This spot is along the left side of her body. Above the 8th rib, like that, is where a lot of important stuff lives, like the bottom part of the left lung, possibly the diaphragm too, especially considering how deep the screwdriver went.\n\nThe wound is 5 cm deep. Tha

# Dataset Preparation

In [7]:
import numpy as np
from datasets import DatasetDict

# Split dataset manually
validation_size = 100
train_size = len(dataset) - validation_size
indices = np.random.permutation(len(dataset))
train_indices, val_indices = indices[:train_size], indices[train_size:]

# Create train/val datasets
train_dataset = dataset.select(train_indices)
val_dataset = dataset.select(val_indices)

# Combine into DatasetDict
dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})


**Next step is to structure the fine-tuning dataset according to train prompts style-why?**

* Each question is paired with chain-of-thought reasongin and the final response.
* Ensure every training example follows consistent patter
* Prevent the model from continouing beyond the expected length by adding the EOS (end of sequence) token.

In [8]:
# We need to format the dataset to fit our prompt training style

EOS_TOKEN = tokenizer.eos_token
EOS_TOKEN

'<|eot_id|>'

In [9]:
def formatting_prompts_func(examples):
    inputs = examples["Question"]
    cots = examples["Complex_CoT"]
    outputs = examples["Response"]
    texts = []

    
    for input, cot, output in zip(inputs, cots, outputs):
        convo = [
            {"role": "user", "content": input},
            {"role": "assistant", "content": f"<think>{cot}</think>\n<response>{output}</response>"}
        ]

        # Generate the formatted chat-style prompt
        
        text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        texts.append(text)
    return {"text": texts}


dataset_dict = dataset_dict.map(formatting_prompts_func, batched=True, num_proc=2)

Map (num_proc=2):   0%|          | 0/19604 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

# Fine-Tuning Setup with LoRA

In [10]:
model_lora = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules=["q_proj", 
                    "k_proj", 
                    "v_proj", 
                    "o_proj", 
                    "gate_proj", 
                    "up_proj", 
                    "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=False,
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.6.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Compute ROUGE-L Scores Before Fine-Tuning

Now,we Initialize SFTTrainer, a supervised fine-tuning trainer from trl,to fine-tune model efficeintly on a dataser

In [11]:
from rouge_score import rouge_scorer

def compute_rouge_l(dataset, model, tokenizer, num_samples=10):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = []
    model.eval()

    
    for example in dataset.select(range(min(num_samples, len(dataset)))):
        question = example["Question"]
        ground_truth = example["Response"]
        convo = [{"role": "user", "content": question}]
        inputs = tokenizer([tokenizer.apply_chat_template(convo, tokenize=False)], return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=1200, use_cache=True)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        score = scorer.score(ground_truth, prediction)['rougeL'].fmeasure
        scores.append(score)
    return np.mean(scores)


pre_finetune_rouge = compute_rouge_l(dataset_dict["validation"], model_lora, tokenizer)
run = wandb.init(
    project='Fine-tune-Llama-3.2-3B-Instruct-on-Medical-COT-Dataset',
    job_type="training",
    anonymous="allow"
)
wandb.log({"pre_finetune_rouge_l": pre_finetune_rouge})


# Initialize Fine-Tuning Trainer

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
torch_compile=False,

trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2, # Number of processes to use for dataset tokenization

    # Define training arguments
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        warmup_steps=100,
        max_steps=250,
        learning_rate=2e-4,
        fp16=False, # Will be True for T4
        bf16=False, # Will be False for T4
        logging_steps=10,
        optim="adamw_8bit", # AdamW optimizer with 8-bit quantization
        weight_decay=0.01,
        lr_scheduler_type="linear", # Linear learning rate schedule
        seed=3407,
        output_dir="outputs", # Directory to save checkpoints and logs

        # Reporting and Saving
        report_to="wandb", # Integrates with Weights & Biases
        run_name="llama3-3B-lr2e-4-steps1000", # Name for your W&B run
        save_strategy="steps",
        save_steps=50, # Save checkpoint every 50 steps
        save_total_limit=1, # Keep only the last checkpoint
        eval_strategy="steps",
        eval_steps=50, # Evaluate every 50 steps

        # Hugging Face Hub integration
        hub_model_id="Data4y/LLaMA3.2B-MedCOT", # Your desired Hugging Face Hub model ID
        logging_first_step=True, # Log metrics for the very first step
    ),
)

print("\nSFTTrainer initialized successfully.")

Unsloth: Tokenizing ["text"]:   0%|          | 0/19604 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/100 [00:00<?, ? examples/s]


SFTTrainer initialized successfully.


# Model Training

In [13]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=4):   0%|          | 0/19604 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 19,604 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,1.511,1.464307
100,1.4443,1.405845
150,1.4236,1.384464
200,1.4104,1.374464
250,1.4208,1.369599


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


# Compute ROUGE-L Scores After Fine-Tuning

In [15]:
post_finetune_rouge = compute_rouge_l(dataset_dict["validation"], model_lora, tokenizer)
wandb.log({"post_finetune_rouge_l": post_finetune_rouge})

# Save Fine-Tuned Model and Tokenizer

In [16]:
model_lora.save_pretrained("lora_adapters")
tokenizer.save_pretrained("lora_adapters")

('lora_adapters/tokenizer_config.json',
 'lora_adapters/special_tokens_map.json',
 'lora_adapters/tokenizer.json')

# Upload to Hugging Face

In [17]:

# Load custom token
user_secrets = UserSecretsClient()
upload_token = user_secrets.get_secret("HF_TOKEN")

# Log in with that token
login(token=upload_token)

model_lora.push_to_hub("Data4y/medcot-llama3.2-3b-model", token=upload_token)
tokenizer.push_to_hub("Data4y/medcot-llama3.2-3b-tokenizer", token=upload_token)



README.md:   0%|          | 0.00/611 [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Data4y/medcot-llama3.2-3b-model


Uploading...:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

# Run Model Inference

In [18]:
def generate_response(question, model, tokenizer):
    
    FastLanguageModel.for_inference(model)
    convo = [{"role": "user", "content": question}]
    inputs = tokenizer([tokenizer.apply_chat_template(convo, tokenize=False)], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=1200, use_cache=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("<response>")[1].split("</response>")[0] if "<response>" in response else response

question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no 
leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, 
what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

response = generate_response(question, model_lora, tokenizer)
print(f"Response: {response}")

Response: Based on the findings you've described, cystometry would most likely reveal that the woman has a normal residual volume and weak detrusor contractions. The symptoms of involuntary urine leakage during activities like sneezing or coughing suggest stress incontinence, particularly of the type where the urethra is hypermobile. Since she does not experience leakage at night, her bladder capacity is likely normal. However, the weak detrusor contractions during the cystometry would explain why she experiences incontinence during activities that increase abdominal pressure. This pattern aligns well with the diagnosis of stress urinary incontinence due to urethral hypermobility.


# Instructions for Loading and Using the Fine-Tuned Model

In [19]:
wandb.finish()

0,1
eval/loss,█▄▂▁▁
eval/runtime,█▁▁▂▂
eval/samples_per_second,▁██▇▇
eval/steps_per_second,▁▇█▇▆
post_finetune_rouge_l,▁
pre_finetune_rouge_l,▁
train/epoch,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇████
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇█████
train/grad_norm,██▆▃▃▁▂▂▂▂▂▃▂▂▂▂▂▃▃▂▂▃▂▃▂▂
train/learning_rate,▁▂▂▃▄▄▅▆▇▇██▇▇▆▆▅▅▄▄▃▃▂▂▂▁

0,1
eval/loss,1.3696
eval/runtime,50.1181
eval/samples_per_second,1.995
eval/steps_per_second,0.259
post_finetune_rouge_l,0.16228
pre_finetune_rouge_l,0.2549
total_flos,1.1808948297488794e+17
train/epoch,0.408
train/global_step,250.0
train/grad_norm,0.16293
