## LoRA training support

In [None]:
# Import the PyTorch library
import torch

# Query the CUDA compute capability of the current GPU as a (major, minor) pair.
# The major number is what the install cell checks to choose an Unsloth build.
major_version, minor_version = torch.cuda.get_device_capability()

In [None]:
# Show the detected compute capability (a major version of 8 or more selects the Ampere/Hopper install path)
print("Major : ", major_version, "Minor :", minor_version)

Major :  7 Minor : 5


In [None]:
# Import the PyTorch library
import torch

# Get the major and minor version of the current CUDA device (GPU)
major_version, minor_version = torch.cuda.get_device_capability()

# Apply the following if the GPU has Ampere or Hopper architecture (RTX 30xx, RTX 40xx, A100, H100, L40, etc.)
!pip install uv
if major_version >= 8:
    # Install the Unsloth library for Ampere and Hopper architecture from GitHub
    !uv pip install "unsloth[colab_ampere] @ git+https://github.com/unslothai/unsloth.git" -q

# Apply the following for older GPUs (V100, Tesla T4, RTX 20xx, etc.)
else:
    # Install the Unsloth library for older GPUs from GitHub
    !uv pip install "unsloth[colab_new] @ git+https://github.com/unslothai/unsloth.git" -q

# Placeholder statement (does nothing)
pass

# Install the Hugging Face Transformers library from GitHub, which allows native 4-bit loading
!uv pip install "git+https://github.com/huggingface/transformers.git" -q

!uv pip install trl datasets -q


Collecting uv
  Downloading uv-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.8/18.8 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.8.4


## Import Unsloth and load the base model

In [None]:
from unsloth import FastLanguageModel
from google.colab import userdata

# Hugging Face access token, read from the Colab secret named HF_TOKEN
hf_token = userdata.get('HF_TOKEN')

# Load the base model and its tokenizer through Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="BioMistral/BioMistral-7B",
    max_seq_length=2048,
    # None leaves the dtype choice to Unsloth (float16 on the T4 used here)
    dtype=None,
    # 4-bit quantization to reduce memory usage; set to False to disable
    load_in_4bit=True,
    # Hugging Face token used for the download
    token=hf_token,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.1: Fast Mistral patching. Transformers: 4.55.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


pytorch_model.bin:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

BioMistral/BioMistral-7B does not have a padding token! Will use pad_token = <unk>.


## Add LoRA adapters so that only a small fraction of the parameters is trained (about 0.19% with the configuration below)

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank (common choices: 8, 16, 32, 64, 128); a smaller rank means fewer trainable parameters
    r=16,
    # Only the query and key projections receive adapters here.
    # To adapt every attention and MLP projection instead, use:
    # target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    target_modules=["q_proj", "k_proj"],
    # LoRA alpha: determines the strength of the applied LoRA update
    lora_alpha=16,
    # LoRA dropout rate (0 is the recommended setting)
    lora_dropout=0,
    # No bias terms are trained
    bias="none",
    # "unsloth" selects Unsloth's gradient checkpointing to improve memory efficiency
    use_gradient_checkpointing="unsloth",
    # Seed value for random number generation
    random_state=3407,
    # Maximum sequence length (same value used when the model was loaded)
    max_seq_length=2048,
    # DoRA is disabled; plain LoRA is used
    use_dora=False,
)

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.8.1 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


In [None]:
# Report how many parameters are trainable after adding the LoRA adapters
model.print_trainable_parameters()


trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879


## Load Dataset

In [None]:
from datasets import load_dataset
import json

# Load the "train" split of the "en" configuration of the medical reasoning SFT dataset
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train")

# Define the function to convert into ChatML-style format
def convert_to_chatml(example):
    """Turn one dataset row into a two-turn chat record.

    Args:
        example: Mapping with "Question", "Complex_CoT" and "Response" keys.

    Returns:
        A dict with a single "messages" key: a user turn carrying the question,
        then an assistant turn carrying the reasoning (wrapped in <think> tags)
        followed by the final response.
    """
    user_turn = {
        "role": "user",
        "content": f"Below is a medical scenario. Provide a detailed reasoning and answer.\n\n### Clinical Scenario:\n{example['Question']}",
    }
    assistant_turn = {
        "role": "assistant",
        "content": f"### Reasoning Process:\n<think>{example['Complex_CoT']}</think>\n\n### Medical Response:\n{example['Response']}",
    }
    return {"messages": [user_turn, assistant_turn]}

# Convert every row to the chat format, then drop the original columns
# so that only "messages" remains
formatted_dataset = dataset.map(convert_to_chatml).remove_columns(
    ["Question", "Complex_CoT", "Response"]
)

# Write the result as JSON Lines (one record per line)
formatted_dataset.to_json("biomistral_chat_format.jsonl", orient="records", lines=True)


Map:   0%|          | 0/19704 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

61491071

In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

# Load dataset from JSONL file
new_df = load_dataset("json", data_files="biomistral_chat_format.jsonl", split="train")

# Apply the "mistral" chat template to the existing tokenizer (the name is rebound to the returned tokenizer)
tokenizer = get_chat_template(tokenizer, chat_template="mistral")

# Batched formatting function: it must return a dictionary of columns
def formatting_prompts_func(examples):
    """Render a batch of chat conversations to plain training text.

    Uses the module-level `tokenizer` and its chat template.

    Args:
        examples: Batch mapping whose "messages" entry is a list of conversations.

    Returns:
        A dict with one "text" key holding the rendered string for each
        conversation, in the same order as the input.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        )
    return {"text": rendered}

# Apply formatting in batches; the "messages" column is removed, and the function supplies the "text" column
formatted_df = new_df.map(formatting_prompts_func, batched=True, remove_columns=["messages"])

Map:   0%|          | 0/19704 [00:00<?, ? examples/s]

In [None]:
# Inspect the first formatted training example
test = formatted_df["text"][0]
print(test)

<s>[INST] Below is a medical scenario. Provide a detailed reasoning and answer.

### Clinical Scenario:
Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings? [/INST]### Reasoning Process:
<think>Okay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?

But wait, there's more. The right lower leg is swollen and tender, which is like waving a big flag for deep vein thrombosis, especially after a long flight or sitting around a lot.

So, now I'm thinking, how could a clot in the leg end up causing issues like weakness or stroke symptoms?

Oh, right! There's this thing called a paradoxical embolism. It can happen if there's some kind of short circuit in the heart - like a hole that

## Training Model

In [None]:
# SFTTrainer comes from trl, TrainingArguments from transformers
from trl import SFTTrainer
from transformers import TrainingArguments

# Probe bf16 support once: bf16 is used when available, otherwise fp16
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Batch size per device, and how many steps of gradients to accumulate per update
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # A single pass over the training data
    num_train_epochs=1,
    # Learning-rate schedule: warm-up steps, then cosine decay from the peak rate
    warmup_steps=10,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Exactly one of the two 16-bit precision modes is enabled
    fp16=not bf16_supported,
    bf16=bf16_supported,
    # Log the training loss at every step
    logging_steps=1,
    # 8-bit AdamW optimizer with weight decay
    optim="adamw_8bit",
    weight_decay=0.01,
    # Random seed
    seed=3407,
    # Output directory
    output_dir="outputs",
    # No experiment tracker; use "tensorboard" if you want TB instead
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # Train on the "text" column of the formatted dataset
    train_dataset=formatted_df,
    dataset_text_field="text",
    # Maximum sequence length for tokenization
    max_seq_length=2048,
    args=training_args,
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/19704 [00:00<?, ? examples/s]

In [None]:
import os
# Set the environment variable that disables Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Run the fine-tuning loop; as the last expression of the cell, its return value is displayed
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 19,704 | Num Epochs = 1 | Total steps = 1,232
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 13,631,488 of 7,255,363,584 (0.19% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.3727
2,2.9948
3,2.9211
4,3.2898
5,2.6349
6,2.291
7,2.4987
8,1.469
9,1.4334
10,0.8805


In [None]:
from transformers import pipeline

# Build a text-generation pipeline around the fine-tuned model and tokenizer.
# No `device` argument is passed: the 4-bit model was already placed on the GPU
# when it was loaded, and the pipeline rejects an explicit device for a model
# dispatched that way. It uses the model's existing placement instead.
# NOTE(review): confirm on the target transformers version that the pipeline
# picks up the GPU the model already lives on.
pipe = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
)

# Example test question, rendered with the tokenizer's chat template
prompt = tokenizer.apply_chat_template([
    {"role": "user", "content": "Below is a medical scenario. Provide a detailed reasoning and answer.\n\n### Clinical Scenario:\nA patient presents with persistent cough, night sweats, and weight loss. What is the most likely diagnosis?"}
], tokenize=False, add_generation_prompt=True)

# Generate answer. The rendered prompt already starts with the BOS token "<s>",
# so special tokens are not added a second time when the prompt is tokenized.
output = pipe(
    prompt,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    add_special_tokens=False,
)
print(output[0]['generated_text'])


## Save the Unsloth model (LoRA adapters)

In [None]:
# Save the LoRA adapter weights to the directory that the reload cell reads from.
# The previous call passed this path in the tokenizer position of
# unsloth_save_model and gave no save directory, so nothing was written here.
model.save_pretrained("outputs/peft_model")

# Save the tokenizer next to the adapter so the directory can be reloaded on its own
tokenizer.save_pretrained("outputs/peft_model")

In [None]:
# Save the trainer's model into the "outputs" directory
trainer.save_model("outputs")
# Print the model's representation so the LoRA layers can be inspected by eye
print("Model has LoRA:", model)

In [None]:
# load later
from transformers import AutoTokenizer
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "outputs/peft_model",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True  # or False if you prefer
)


In [None]:
    from huggingface_hub import login
    # Interactive Hugging Face login: the token is typed at the prompt rather than stored in the notebook
    login() # You will be prompted to enter your Hugging Face token

In [None]:
from unsloth import FastLanguageModel
from google.colab import userdata

# Load the model to export. "your_model_name_or_path" is a placeholder:
# replace it with a Hub id or a local path before running.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "your_model_name_or_path", # e.g., "unsloth/llama-3-8b-bnb-4bit"
    max_seq_length = 2048, # or your max sequence length
    dtype = None, # or your desired dtype
    load_in_4bit = True, # or False if not using 4-bit
)

# Convert to GGUF and upload to the Hub. "your_username/your_repo_name" is a placeholder.
# The token is read from the Colab secret HF_TOKEN so that no credential is
# ever written into the notebook itself.
model.push_to_hub_gguf(
  "your_username/your_repo_name",
  tokenizer,
  quantization_method = "q4_k_m", # or other GGUF quantization methods
  token = userdata.get('HF_TOKEN'),
  )

In [None]:
# Save the model and tokenizer under "output_model" and convert the result to
# GGUF with the "q4_k_m" quantization method. Nothing is pushed to the
# Hugging Face Hub.
# This replaces a call to colab_quantize_to_gguf, which is not defined or
# imported anywhere in this notebook.
# NOTE(review): save_pretrained_gguf is expected to merge the LoRA adapters into
# the base weights before conversion - confirm against the installed Unsloth version.
model.save_pretrained_gguf("output_model", tokenizer, quantization_method="q4_k_m")
# Convert "output_model" to GGUF format. Use the quantization method "q4_k_m"
