**INSTALL UNSLOTH & DEPENDENCIES**

In [1]:
!pip install -q unsloth
!pip install -q datasets accelerate bitsandbytes trl


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m432.3/432.3 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m129.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.4/566.4 kB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

**IMPORTANT LIBRARIES**

In [2]:

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


LOAD BASED MODEL(4-BIT QLORA)

In [3]:
model_name = "unsloth/llama-3-8b-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,
    load_in_4bit = True,
    dtype = None,
)


==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

ADD LORA ADAPTER(PEFT)

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                     # LoRA rank
    lora_alpha = 32,
    lora_dropout = 0.05,
    target_modules = [
        "q_proj", "k_proj", "v_proj",
        "o_proj", "gate_proj", "up_proj", "down_proj"
    ],
    use_gradient_checkpointing = "unsloth",
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.2.1 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


**MEDICAL Q&A DATASET**

In [5]:

dataset = load_dataset("pubmed_qa", "pqa_labeled", split="train[:2000]")

README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

**FORMAT DATASET**

In [6]:
def formatting_func(example):
    return f"""### Question:
{example['question']}

### Answer:
{example['long_answer']}"""

dataset = dataset.map(lambda x: {
    "text": formatting_func(x)
})

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

**SETUP QLORA TRAINING**

In [14]:

training_args = TrainingArguments(
    output_dir = "./medical_llama3",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    num_train_epochs = 2,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    save_strategy = "epoch",
    report_to = "none"
)

**START FINE_TUNING**

In [15]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    args = training_args,
)

trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 2 | Total steps = 250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
10,2.2824
20,1.9716
30,2.0785
40,2.0668
50,2.078
60,2.0806
70,1.9573
80,2.0192
90,2.0363
100,1.9576


TrainOutput(global_step=250, training_loss=1.8265573272705078, metrics={'train_runtime': 890.054, 'train_samples_per_second': 2.247, 'train_steps_per_second': 0.281, 'total_flos': 8047828790722560.0, 'train_loss': 1.8265573272705078, 'epoch': 2.0})

**MONITOR GPU MEMORY(GOOGLE_COLAB)**

In [10]:

torch.cuda.memory_summary()



**SAVE FINE_TUNE** **ADAPTER**

In [11]:
model.save_pretrained("medical_lora_adapter")
tokenizer.save_pretrained("medical_lora_adapter")


('medical_lora_adapter/tokenizer_config.json',
 'medical_lora_adapter/special_tokens_map.json',
 'medical_lora_adapter/tokenizer.json')

**Test MODEL ON NEW MEDICAL** **QUEST**

In [13]:
FastLanguageModel.for_inference(model)

prompt = """### Question:
What are the symptoms of fever?

### Answer:
"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens = 200,
    do_sample = True,
    temperature = 0.7,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Question:
What are the symptoms of fever?

### Answer:
Fever is a rise in body temperature. The body temperature of a healthy person is around 36.4 degrees Celsius. Fever is considered to be above 37.8 degrees Celsius. Fever is not a disease. It is a symptom of some disease. Fever is caused by a change in the body's thermostat. This thermostat is called hypothalamus. It is located in the brain. Fever can be caused by infections, injuries, and even stress. When we have fever, our body is trying to fight the infection. Infections like malaria, flu, and typhoid cause fever. Fever is also caused by injuries. Injuries like fractures, burns, and sunburns can cause fever. Even stress can cause fever. Fever can be caused by mental stress. Mental stress can cause fever. Stress can also cause other symptoms like headache, body ache, and nausea. Fever can be caused by a rise in the body temperature. When the body temperature rises, the body temperature rises.
