In [None]:
# Install the released Unsloth package first (a plain pip install also resolves its dependencies).
!pip install unsloth

# Then replace it with the latest Unsloth from GitHub. --no-deps leaves the
# dependencies installed by the step above untouched.
# NOTE(review): neither install is version-pinned, so reruns may pick up different code.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

* Supports Llama 3.2 Vision 11B and 90B.
* Supports 16bit LoRA via `load_in_4bit=False`, or 4bit QLoRA. Both are accelerated and use much less memory.

In [None]:
from unsloth import FastVisionModel #LLM unsloth
import torch


# Pre-quantized 4bit vision checkpoints that are supported.
# NOTE: this list is for reference only - nothing in this cell reads
# `fourbit_models`; the checkpoint actually loaded is named in the
# `from_pretrained` call below.
fourbit_models = [
    # Llama
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    # Pixtral
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",

    # Qwen
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    # Llava
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]
# Additional models at https://huggingface.co/unsloth

# Load the base model. The second return value is bound to `tokenizer`; later
# cells call it with both an image and text.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mllama patching. Transformers: 4.51.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

Add LoRA adapters for parameter-efficient finetuning - this lets us train only about 1% of all parameters.

Unsloth also supports finetuning ONLY the vision part of the model, ONLY the language part, or both. You can also choose whether to finetune the attention layers, the MLP layers, or both.

In [None]:
# Attach LoRA adapters to the loaded model. The four `finetune_*` flags select
# which parts of the model are finetuned.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell
# without reloading the base model wraps an already-wrapped model.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    # LoRA hyperparameters
    r = 8,           # larger -> the higher the accuracy, but might overfit
    lora_alpha = 8,  # recommended alpha == r
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # fixed seed for the adapter setup
    use_rslora = False,  # rank stabilized LoRA
    loftq_config = None,
    # target_modules = "all-linear", # optional
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


<a name="Data"></a>
### Data Prep

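Upload your data folder to the mounted Google Drive. The loader below expects `training_data/images/` (`.png`, `.jpg` or `.jpeg` files) and `training_data/captions/`, where each caption is a `.txt` file with the same base name as its image.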



In [None]:
#mount your Google Drive
from google.colab import drive
import os
drive.mount('/content/drive')



#path to data folder
base_folder = "/content/drive/MyDrive/training_data"



#debug information: report whether the folder is reachable and what it holds
print(f"Checking base folder: {base_folder}")
folder_found = os.path.exists(base_folder)
print(f"Folder exists: {folder_found}")
if folder_found:
    folder_listing = os.listdir(base_folder)
else:
    folder_listing = 'Not found'
print(f"Contents of base folder: {folder_listing}")



#function for separate image and caption folders
def load_image_caption_pairs(base_folder):
    """Pair each image in ``<base_folder>/images`` with its caption file.

    An image ``<name>.<ext>`` (``.png``, ``.jpg`` or ``.jpeg``, matched
    case-insensitively) is paired with ``<base_folder>/captions/<name>.txt``.
    Images without a caption file are skipped.

    Args:
        base_folder: Directory containing the ``images`` and ``captions``
            sub-folders.

    Returns:
        A list of ``{"image_path": str, "caption": str}`` dicts, ordered by
        image file name so the dataset order is the same on every run.
        The list is empty when either sub-folder is missing.
    """
    images_folder = os.path.join(base_folder, "images")
    captions_folder = os.path.join(base_folder, "captions")

    print(f"Images folder: {images_folder}, exists: {os.path.exists(images_folder)}")
    print(f"Captions folder: {captions_folder}, exists: {os.path.exists(captions_folder)}")

    image_caption_pairs = []
    if not (os.path.exists(images_folder) and os.path.exists(captions_folder)):
        return image_caption_pairs

    # List the directory once. os.listdir() returns entries in arbitrary
    # order, so sort them to keep the dataset order reproducible.
    image_extensions = ('.png', '.jpg', '.jpeg')
    image_files = sorted(
        name for name in os.listdir(images_folder)
        if name.lower().endswith(image_extensions)
    )
    print(f"Found {len(image_files)} image files")

    for img_file in image_files:
        # The caption shares the image's base name but lives in the captions folder.
        caption_file = os.path.splitext(img_file)[0] + '.txt'
        caption_path = os.path.join(captions_folder, caption_file)
        if not os.path.exists(caption_path):
            continue

        # Explicit encoding so non-ASCII captions do not depend on the
        # platform's default locale.
        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read().strip()

        image_caption_pairs.append({
            "image_path": os.path.join(images_folder, img_file),
            "caption": caption
        })

        # Preview the first 3 pairs as a sanity check.
        if len(image_caption_pairs) <= 3:
            print(f"Paired: {img_file} with {caption_file}")
            print(f"Caption (preview): {caption[:50]}...")

    print(f"Found {len(image_caption_pairs)} matching image-caption pairs")

    return image_caption_pairs



#load the data
# `Dataset` and `Image` are used below but were not imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel (Restart & Run All).
from datasets import Dataset
from PIL import Image

data = load_image_caption_pairs(base_folder)
print(f"Dataset size: {len(data)}")



#instructions
instruction = "You are an expert analyzer of disaster images for damage assessment. Describe accurately what you see in this image."



#image loading function
def process_images(examples):
    """Add an "image" column with every file in "image_path" opened as RGB.

    Used with `dataset.map(..., batched=True)`, so `examples` maps each
    column name to a list of values.
    """
    examples["image"] = [Image.open(path).convert("RGB") for path in examples["image_path"]]
    return examples



def convert_to_conversation(sample):
    """Wrap one sample as a two-turn chat: instruction + image, then the caption."""
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text", "text" : instruction},
            {"type" : "image", "image" : sample["image"]} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text", "text" : sample["caption"]} ]
        },
    ]
    return { "messages" : conversation }



#convert to Hugging Face Dataset
if len(data) > 0:
    dataset = Dataset.from_list(data)

    #apply the transformation to the dataset
    dataset = dataset.map(process_images, batched=True)
    print(f"Dataset after processing: {dataset}")

    #verify that a sample can be accessed
    print("First item in dataset:")
    print(f"Image path: {dataset[0]['image_path']}")
    print(f"Caption: {dataset[0]['caption']}")

    #convert only after verifying dataset structure
    converted_dataset = [convert_to_conversation(sample) for sample in dataset]
    print(f"Converted dataset size: {len(converted_dataset)}")

    print("First item in converted dataset:")
    print(converted_dataset[0])
else:
    # `dataset` / `converted_dataset` stay undefined here, so the training
    # cell cannot run until the folder structure is fixed.
    print("No image-caption pairs found! Check your folder structure and file naming.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking base folder: /content/drive/MyDrive/training_data
Folder exists: True
Contents of base folder: ['captions', 'images']
Images folder: /content/drive/MyDrive/training_data/images, exists: True
Captions folder: /content/drive/MyDrive/training_data/captions, exists: True
Found 90 image files
Paired: image_00001_crop_25.jpg with image_00001_crop_25.txt
Caption (preview): Major damage, roof missing and tarps covering the ...
Paired: image_00045_crop_0.jpg with image_00045_crop_0.txt
Caption (preview): Major damage, residential house flooded and surrou...
Paired: image_00055_crop_0.jpg with image_00055_crop_0.txt
Caption (preview): Affected damage, side of warehouse building ripped...
Found 90 matching image-caption pairs
Dataset size: 90


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Dataset after processing: Dataset({
    features: ['image_path', 'caption', 'image'],
    num_rows: 90
})
First item in dataset:
Image path: /content/drive/MyDrive/training_data/images/image_00001_crop_25.jpg
Caption: Major damage, roof missing and tarps covering the roof
Converted dataset size: 90
First item in converted dataset:
{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': 'You are an expert radiographer. Describe accurately what you see in this image.'}, {'type': 'image', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=132x114 at 0x7BC848A55450>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'Major damage, roof missing and tarps covering the roof'}]}]}


<a name="Train"></a>
### Train the model
We use Hugging Face TRL's `SFTTrainer` to train the model - see the [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer).

We also use the new `UnslothVisionDataCollator`, which helps with our vision finetuning setup.

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

#enable training with unsloth
FastVisionModel.for_training(model) # Enable for training!


#training parameters
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = converted_dataset, # list of {"messages": [...]} dicts built in the data prep cell
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # effective batch size = 1 x 4 = 4
        warmup_steps = 5,
        # A positive max_steps takes precedence over num_train_epochs, so
        # setting both was misleading: the run always stopped after 30 steps.
        max_steps = 30,
        # num_train_epochs = 3, # use this INSTEAD of max_steps for full-epoch training
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(), # exactly one of fp16 / bf16 is enabled
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # Weights and Biases otherwise

        # Vision finetuning settings: keep all sample columns and skip TRL's
        # own dataset preparation, leaving batching to the data collator.
        # NOTE(review): taken from the Unsloth vision recipe - confirm against its docs.
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

In [None]:
#@title Show current memory stats
# Record the GPU capacity and the peak reserved memory so far; the final-stats
# cell reads `start_gpu_memory` and `max_memory` to report training overhead.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
for stats_line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(stats_line)

GPU = Tesla T4. Max memory = 14.748 GB.
9.068 GB of memory reserved.


In [None]:
#run training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 90 | Num Epochs = 3 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 33,587,200/11,000,000,000 (0.31% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.5486
2,2.0362
3,2.9489
4,2.5717
5,1.8996
6,1.4107
7,1.0518
8,1.0833
9,1.073
10,0.8251


In [None]:
#@title Show final memory and time stats
# Requires `start_gpu_memory` and `max_memory` from the "Show current memory
# stats" cell and `trainer_stats` from the training cell, so run those first.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

final_report = (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in final_report:
    print(report_line)

NameError: name 'start_gpu_memory' is not defined

In [None]:
# Local saving: write the finetuned model and the tokenizer to ./lora_model
# (presumably only the LoRA adapter weights, given the directory name - confirm).
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")


# Hub saving (optional): uncomment and set your own repo name.
# Do not commit a real token to the notebook - read it from a secret or env var.
# model.push_to_hub("your_name/lora_model", token = "...")
# tokenizer.push_to_hub("your_name/lora_model", token = "...")

[]

In [None]:
# Optional: flip to True to reload the saved adapter from ./lora_model instead
# of reusing the model that is still in memory from training.
if False:
    from unsloth import FastVisionModel
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name = "lora_model", #fine tuned model
        load_in_4bit = True, #set to False for 16bit LoRA
    )

# Switch the model to inference mode. This call used to live only inside the
# disabled `if False:` branch above, so generation ran on a model that was
# still in the training mode set by `FastVisionModel.for_training(model)`.
FastVisionModel.for_inference(model) #inference


#pass in test image
# NOTE(review): dataset[0] is a training sample, so this only sanity-checks the
# pipeline - use a held-out image to judge quality.
image = dataset[0]["image"]
# NOTE(review): this prompt differs from the `instruction` used to build the
# training conversations - confirm the mismatch is intentional.
instruction = "You are an expert label tool that can identify damage as a result natural disasters. Describe accurately what you see in this image."


#instructions + tokenize for processing
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False, # the chat template output is passed as-is
    return_tensors = "pt",
).to("cuda")


#run inference
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True) # stream only the generated reply
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 500,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Major Damage, likely from flooding<|eot_id|>
