In [None]:
%%capture
# Dependency setup; %%capture silences the installer output for this cell.
# The first and last installs use --no-deps so pip does not resolve or replace
# their transitive dependencies. Only xformers and trl are version-pinned here;
# the remaining packages float to whatever is current at install time.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. `max_seq_length` is reused later when the trainer is built,
# so it stays a module-level name.
max_seq_length = 2048
# NOTE(review): dtype=None presumably lets Unsloth pick the precision itself — confirm.
dtype = None
# The checkpoint requested below is a pre-quantized bnb-4bit variant.
load_in_4bit = True

loader_options = dict(
    model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Returns the base model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.9: Fast Mistral patching. Transformers: 4.48.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# LoRA adapter configuration: adapters are attached to every attention
# projection (q/k/v/o) and every MLP projection (gate/up/down).
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_settings = dict(
    r = 16,
    target_modules = attention_projections + mlp_projections,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Rebinds `model` to the PEFT-wrapped model; re-running this cell would wrap
# the already-wrapped model again.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.2.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from unsloth.chat_templates import get_chat_template

# The dataset stores turns ShareGPT-style ("from"/"value", with speakers
# "human"/"gpt"); this mapping tells the template how to read those keys.
sharegpt_to_chatml = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}

# Switch the tokenizer to the ChatML template. With map_eos_token enabled the
# recorded output reports <|im_end|> being mapped to the EOS token.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = sharegpt_to_chatml,
    map_eos_token = True,
)

def formatting_prompts_func(examples):
    """Render every conversation in a batch into one chat-templated string.

    Args:
        examples: Batch mapping whose "conversations" entry is an iterable of
            conversations (each one is passed as-is to the tokenizer's
            ``apply_chat_template``).

    Returns:
        A dict with a single key, "text", holding the rendered strings in the
        same order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False -> keep plain text; no generation prompt is appended
        # because these are complete training examples.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}

from datasets import load_dataset
# Fetch the raw train split, then add a "text" column holding each
# conversation rendered with the chat template (applied in batches).
medquad_raw = load_dataset("bpingua/medquad_sharegpt_cleaned", split = "train")
dataset = medquad_raw.map(formatting_prompts_func, batched = True)

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


README.md:   0%|          | 0.00/353 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.84M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

In [None]:
# Inspect one record's raw turns: a list of {"from", "value"} dicts.
dataset[5]["conversations"]

[{'from': 'human', 'value': 'What is (are) Glaucoma ?'},
 {'from': 'gpt',
  'value': 'The optic nerve is a bundle of more than 1 million nerve fibers. It connects the retina to the brain.'}]

In [None]:
# Show the same record's "text" column, i.e. the conversation after the chat template was applied.
print(dataset[5]["text"])

<|im_start|>user
What is (are) Glaucoma ?<|im_end|>
<|im_start|>assistant
The optic nerve is a bundle of more than 1 million nerve fibers. It connects the retina to the brain.<|im_end|>



In [None]:
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['conversations', 'text'],
    num_rows: 16407
})

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 where the GPU supports it,
# otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters. Per-device batch 2 x 4 accumulation steps
# gives an effective batch of 8 on a single GPU.
training_args = TrainingArguments(
    output_dir = "outputs",
    num_train_epochs=1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,
    seed = 3407,
    report_to = "wandb",
)

# Supervised fine-tuning on the pre-rendered "text" column, without packing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
)

Map (num_proc=2):   0%|          | 0/16412 [00:00<?, ? examples/s]

In [None]:

# Baseline GPU memory snapshot taken before training; `start_gpu_memory` and
# `max_memory` are read again by the post-training report.
bytes_per_gib = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()

start_gpu_memory = round(reserved_bytes / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = Tesla T4. Max memory = 14.741 GB.
5.785 GB of memory reserved.


In [None]:
# Run fine-tuning; the returned stats are kept because their `metrics` are reported in a later cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,412 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 10
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.425
2,1.7225
3,1.655
4,1.406
5,1.4216
6,1.3057
7,1.2535
8,1.2188
9,1.1696
10,1.0894


In [None]:

# Post-training report: wall-clock time plus peak reserved GPU memory, both in
# absolute terms and relative to the pre-training baseline (`start_gpu_memory`)
# and the card's total memory (`max_memory`).
bytes_per_gib = 1024 ** 3

used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for line in report_lines:
    print(line)

141.3359 seconds used for training.
2.36 minutes used for training.
Peak reserved memory = 5.785 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 39.244 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [None]:
from unsloth.chat_templates import get_chat_template

# Re-apply the ChatML template so this cell yields a correctly configured
# tokenizer for generation (same settings as used for the training data).
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"},
    map_eos_token = True,
)

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

messages = [
    {"from": "human", "value": "What is (are) Glaucoma ?"},
]
# add_generation_prompt=True appends the assistant header so the model
# continues with an answer instead of another user turn.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

# The pad token equals the EOS token here, so generate() cannot infer the
# attention mask on its own and warns about unreliable results. The prompt is a
# single, unpadded sequence, so every position is a real token: mask is all ones.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    max_new_tokens = 64,
    use_cache = True,
)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|im_start|>user\nWhat is (are) Glaucoma ?<|im_end|>\n<|im_start|>assistant\nGlaucoma is a group of eye diseases that damage the optic nerve and result in vision loss and blindness. Glaucoma occurs when the normal fluid pressure inside the eyes slowly rises. However, with open-angle glaucoma—the most common form—there are no symptoms in the']

In [None]:
# Ask PyTorch to release cached GPU memory that it is not currently using.
torch.cuda.empty_cache()

In [None]:
import gc
# Force a full garbage-collection pass; the value displayed by the cell is gc.collect()'s return value.
gc.collect()

0

In [None]:
import os

# Export switches: set either to False to skip that step.
SAVE_MERGED_LOCALLY = True
PUSH_MERGED_TO_HUB = True

MERGED_MODEL_DIR = "mistral-7b-instruct-v0.3-bnb-4bit-Medquad"
HUB_REPO_ID = "bpingua/mistral-7b-instruct-v0.3-bnb-4bit-Medquad"

# Never paste an access token into the notebook: read it from the environment.
# Falls back to the empty string, which is what this cell passed before.
hf_token = os.environ.get("HF_TOKEN", "")

# NOTE(review): the recorded output mentions a 4bit merge, which does not match
# save_method="merged_16bit" — the output may come from an earlier run; confirm.
if SAVE_MERGED_LOCALLY:
    model.save_pretrained_merged(MERGED_MODEL_DIR, tokenizer, save_method = "merged_16bit",)
if PUSH_MERGED_TO_HUB:
    model.push_to_hub_merged(HUB_REPO_ID, tokenizer, save_method = "merged_16bit", token = hf_token)

Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...




Done.
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 10 minutes for Llama-7b... Done.
Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...
Done.
Unsloth: Saving 4bit Bitsandbytes model. Please wait...


  0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Saved merged_4bit model to https://huggingface.co/bpingua/mistral-7b-instruct-v0.3-bnb-4bit-Medquad
