In [None]:
!uv pip install unsloth

In [None]:
from unsloth import FastLanguageModel

# Fetch the Llama 3.2 1B Instruct checkpoint in 4-bit form, together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True,
    max_seq_length=2048,
    model_name="unsloth/Llama-3.2-1B-Instruct",
)

In [None]:
# Test the base model on a medical question
import torch

# Notebook-level device string; the comparison helper further down also reads it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reuse EOS as the padding token for this tokenizer.
tokenizer.pad_token = tokenizer.eos_token

model.eval()

prompt = """
Q. A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus.
Which of the following is the best treatment for this patient?

Options:
A: Ampicillin
B: Ceftriaxone
C: Ciprofloxacin
D: Doxycycline
E: Nitrofurantoin

Please choose the correct answer.
"""

# This is an Instruct (chat) checkpoint, so the question is wrapped in the
# tokenizer's chat template as a single user turn. Feeding the raw text would
# make this baseline incomparable with the chat-templated comparison that is
# run after fine-tuning.
formatted_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    tokenize=False,
    add_generation_prompt=True,
)

# The rendered template already contains its own special tokens, so the
# tokenizer must not prepend them a second time.
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt",
    add_special_tokens=False,
).to(device)
input_len = inputs["input_ids"].shape[-1]

output = model.generate(**inputs, max_new_tokens=128)

# Decode only the newly generated tokens, not the echoed prompt.
response = tokenizer.decode(output[0][input_len:], skip_special_tokens=True)

print("Test Prompt:\n", prompt.strip())
print("\nTest Response:\n", response.strip())

In [None]:
# Load the dataset
from datasets import load_dataset

# Only the training split is pulled from the Hub.
dataset = load_dataset(path="Neelectric/MedQA-USMLE", split="train")

In [None]:
# Bare expression: shows the loaded dataset object as the cell output.
dataset

In [None]:
# Peek at the first training record (indexed once, then printed field by field).
first_example = dataset[0]
print("Question:\n", first_example["question"])
print("\nOptions:\n", first_example["options"])
print("\nAnswer:\n", first_example["answer"])

In [None]:
# Format text according to the model
from unsloth.chat_templates import get_chat_template

# Reuse the EOS token for padding before the chat template is attached.
tokenizer.pad_token = tokenizer.eos_token

# Attach Unsloth's "llama-3.2" chat template; the returned tokenizer replaces the old one.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")

def format_prompts(examples):
    """Render a batch of rows as chat-formatted training strings.

    Args:
        examples: Batched columns. The "question", "options" and "answer"
            columns are read; every entry of "options" must provide
            ``.items()`` yielding (label, choice) pairs.

    Returns:
        A dict with the single key "text", holding one chat-templated string
        per input row (user question + options, then the assistant answer).
    """
    def build_conversation(question, options, answer):
        # One user turn (question + lettered options) and one assistant turn.
        option_lines = [f"{label}. {choice}" for label, choice in options.items()]
        options_text = "\n".join(option_lines)
        user_turn = f"{question}\n\nOptions:\n{options_text}\n\nPlease choose the correct answer."
        assistant_turn = f"The correct answer is {answer}."
        return [
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ]

    rows = zip(examples["question"], examples["options"], examples["answer"])
    conversations = [build_conversation(*row) for row in rows]

    # Training text holds complete dialogues, so no generation prompt is appended.
    texts = []
    for conversation in conversations:
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(rendered)

    return {"text": texts}

# Add the chat-formatted "text" column, processing rows in batches.
dataset = dataset.map(function=format_prompts, batched=True)

In [None]:
# The formatting pass already ran in the previous cell; mapping again would
# redo the whole dataset for an identical result. Verify the column instead.
assert "text" in dataset.column_names, "run the formatting cell before this one"

In [None]:
# Show the dataset again now that the formatting step has added the "text" column.
dataset

In [None]:
# Same first record as before, now including the chat-formatted training text.
formatted_example = dataset[0]
print("Question:\n", formatted_example["question"])
print("\nOptions:\n", formatted_example["options"])
print("\nAnswer:\n", formatted_example["answer"])
print("\nFormatted Text:\n", formatted_example["text"])

In [None]:
# LoRA configuration

# Wrap the loaded model with LoRA adapters; `model` now refers to the PEFT-wrapped version.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=42,
)

In [None]:
# Setting up SFTTrainer

from trl import SFTTrainer
from transformers import TrainingArguments

# Optimisation settings: 2 sequences per device x 4 accumulation steps, 200 steps in total.
training_args = TrainingArguments(
    output_dir="outputs",
    report_to="none",
    seed=700,
    max_steps=200,
    warmup_steps=5,
    logging_steps=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
)

# Supervised fine-tuning on the pre-rendered "text" column, one example per sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Wrap the trainer so that, as the helper's name indicates, only the assistant
# responses are trained on. The two markers are the role headers that open a
# user turn and an assistant turn in the rendered chat text.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
# Memory stats

import torch

# Snapshot GPU 0 before training; the final stats cell subtracts this baseline.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning loop; the returned stats feed the timing report further down.
trainer_stats = trainer.train()

In [None]:
# Save LoRA fine tuned model and tokenizer

# Single target folder; the inference cell loads the adapter back from this same path.
output_dir = "medqa_finetuned_model"

# Persist the PEFT-wrapped model and the tokenizer side by side.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Final memory and time stats

# Peak reserved memory so far, and how much of it was added on top of the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a share of the card's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Inference

In [None]:
base_model_name = "unsloth/Llama-3.2-1B-Instruct"
lora_path = "medqa_finetuned_model"
max_seq_length = 2048

# Both copies are loaded with identical settings, so the adapter is the only difference.
load_kwargs = dict(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True,
)

# --- Reference model: base weights only ---
base_model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Same tokenizer preparation as for training: EOS as padding, then the llama-3.2 chat template.
tokenizer.pad_token = tokenizer.eos_token
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")

base_model.eval()

# --- Fine-tuned model: a second base copy with the saved LoRA adapter loaded on top ---
lora_model, _ = FastLanguageModel.from_pretrained(**load_kwargs)

lora_model.load_adapter(lora_path)
lora_model.eval()

In [None]:
def compare_model_outputs(base_model, lora_model, tokenizer, prompt, max_new_tokens=128):
    """Print the base and fine-tuned model answers to the same prompt.

    Args:
        base_model: Model used for the "Base Model Response" section.
        lora_model: Model used for the "Fine-Tuned (LoRA) Model Response" section.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
            and ``decode``; shared by both models.
        prompt: User question; surrounding whitespace is stripped before use.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Returns:
        None. The prompt and both responses are written to stdout.
    """
    def generate_response(model, prompt):
        """Return the stripped text ``model`` generates for ``prompt`` sent as one user turn."""
        formatted_prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt.strip()}],
            tokenize=False,
            add_generation_prompt=True,
        )

        # The rendered template already contains its own special tokens, so
        # tokenizing with the default settings would prepend them a second
        # time. Inputs are placed on the model's own device instead of relying
        # on a notebook-level `device` variable set in an earlier cell.
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(model.device)
        input_len = inputs["input_ids"].shape[-1]

        output = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(output[0][input_len:], skip_special_tokens=True)

        return response.strip()

    print("Prompt:\n")
    print(prompt.strip())

    print("\nBase Model Response:\n")
    print(generate_response(base_model, prompt))

    print("\nFine-Tuned (LoRA) Model Response:\n")
    print(generate_response(lora_model, prompt))

In [None]:
# Test question
# Same clinical vignette as the one used for the base-model check near the top
# of the notebook, so the before/after answers can be compared.
# NOTE(review): options here are written "A: ..." while format_prompts renders
# training examples as "A. ..." - consider aligning the two formats.

prompt = """
Q. A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus.
Which of the following is the best treatment for this patient?

Options:
A: Ampicillin
B: Ceftriaxone
C: Ciprofloxacin
D: Doxycycline
E: Nitrofurantoin

Please choose the correct answer.
"""

In [None]:
# Print the base and fine-tuned answers to the test question, one after the other.
compare_model_outputs(base_model, lora_model, tokenizer, prompt)