In [None]:
%%capture
# Install Unsloth and the libraries it needs; %%capture hides the pip output of this cell.
import os
# Colab is detected by checking whether any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps stops pip from resolving/replacing the dependencies of these packages
    # (presumably to keep the preinstalled Colab stack intact — confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    # NOTE(review): qwen_vl_utils is only installed on the Colab branch.
    !pip install qwen_vl_utils

### Get Images

In [None]:
import gdown
import zipfile

# Training-image archive on Google Drive:
# https://drive.google.com/file/d/1YgDh3Vy3bqmZYP5XK2gpkCHsITm2xLRg/view?usp=sharing
file_id = "1YgDh3Vy3bqmZYP5XK2gpkCHsITm2xLRg"
output = "train_images_170625.zip"

# Download the file. Some gdown releases report a failed download by returning
# None instead of raising, so check the result before trying to unzip it.
downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", output, quiet=False)
if downloaded is None:
    raise RuntimeError(
        f"Download of {output} failed; check the Drive file id and its sharing permissions."
    )

# Unpack into ./train_images
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall("train_images")

# Remove the zip file
os.remove(output)

Downloading...
From (original): https://drive.google.com/uc?id=1YgDh3Vy3bqmZYP5XK2gpkCHsITm2xLRg
From (redirected): https://drive.google.com/uc?id=1YgDh3Vy3bqmZYP5XK2gpkCHsITm2xLRg&confirm=t&uuid=75fa55fd-6989-4ba1-8a08-d433ae666cbd
To: /content/train_images_170625.zip
100%|██████████| 815M/815M [00:06<00:00, 126MB/s]


### Get CoT Outputs

In [None]:
import requests
import pandas as pd

# DeepSeek-R1 generated chain-of-thought (CoT) captions of CT and MR images.
url = "https://raw.githubusercontent.com/canatess/RAD-ACE/refs/heads/main/cot_generated_deepseek.csv"

# Fail fast: without raise_for_status() an HTTP error page would be written to
# df.csv and then parsed as if it were the dataset; the timeout prevents the
# cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep a local copy of the CSV
with open("df.csv", "wb") as f:
    f.write(response.content)


df = pd.read_csv("df.csv")

# Last expression of the cell: show the first rows
df.head()

Unnamed: 0,filename,modality,cot_output,question
0,['images/pmc_1503_0.jpg'],Computed Tomography,"<think>\nAlright, let's break down what I'm se...",Describe the structures and notable features i...
1,['images/pmc_1504_0.jpg'],Computed Tomography,"<think>\nOkay, so I'm looking at a chest CT sc...",Analyze the image in a comprehensive and detai...
2,['images/pmc_1505_0.jpg'],Computed Tomography,"<think>\nOkay, so I'm trying to figure out wha...",Analyze the image in a comprehensive and detai...
3,['images/pmc_1506_0.jpg'],Magnetic Resonance Imaging,"<think>\nOkay, I need to explain the visual co...",What details stand out in this image?
4,['images/pmc_1507_0.jpg'],Computed Tomography,"<think>\nAlright, I need to analyze the given ...",Provide a step-by-step interpretation of the i...


In [None]:
# Print column names, non-null counts and dtypes of the CoT table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16800 entries, 0 to 16799
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    16800 non-null  object
 1   modality    16800 non-null  object
 2   cot_output  16800 non-null  object
 3   question    16800 non-null  object
dtypes: object(4)
memory usage: 525.1+ KB


In [None]:
# Convert DataFrame to list of dictionaries: one dict per row, keyed by column
# name ("filename", "modality", "cot_output", "question").
dataset = df.to_dict(orient="records")

### Preprocess Data

In [None]:
import ast
import os
from PIL import Image
from concurrent.futures import ThreadPoolExecutor

def process_image(path, image_dir="train_images/train_images", max_size=(296, 296)):
    """Load one image as a downsized RGB PIL image.

    Only the basename of ``path`` is used; the file is looked up inside
    ``image_dir``.

    Args:
        path: Image path as listed in the dataset; its directory part is ignored.
        image_dir: Directory that contains the extracted image files.
        max_size: ``(width, height)`` bounding box handed to ``Image.thumbnail``.

    Returns:
        The RGB image, or ``None`` if anything goes wrong while locating,
        opening, converting or resizing the file.
    """
    try:
        full_path = os.path.join(image_dir, os.path.basename(path))
        # convert() yields an independent in-memory image, so the underlying
        # file can be closed as soon as the conversion is done.
        with Image.open(full_path) as source:
            image = source.convert("RGB")
        image.thumbnail(max_size)  # Resizes in place
        return image
    except Exception:
        # Deliberate best effort: callers treat None as "skip this image".
        return None

def convert_to_conversation(sample):
    question = sample["question"]
    cot_output = sample["cot_output"]

    try:
        image_paths = ast.literal_eval(sample["filename"])
    except (SyntaxError, ValueError):
        return None

    with ThreadPoolExecutor() as executor:
        images = list(filter(None, executor.map(process_image, image_paths)))

    if not images:
        return None

    conversation = [
        {"role": "user", "content": [{"type": "text", "text": question}, {"type": "image", "image": images[0]}]},
        {"role": "assistant", "content": [{"type": "text", "text": cot_output}]},
    ]
    return {"messages": conversation}

In [None]:
# Example formatted data for Qwen model: convert the first row and display the
# resulting {"messages": [...]} structure as the cell output.
conversation = convert_to_conversation(dataset[0])
conversation

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'Describe the structures and notable features in the image.'},
    {'type': 'image',
     'image': <PIL.Image.Image image mode=RGB size=296x274>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': '<think>\nAlright, let\'s break down what I\'m seeing here. The image provided is a CT scan focusing on the cervical spine, specifically looking at three different planes: axial, sagittal, and coronal. That gives me a comprehensive view of the area from various angles, which is crucial for accurately diagnosing any abnormalities.\n\nLooking at the axial slices first, I notice there\'s a fracture at the base of the odontoid process. The term "mildly comminuted" suggests that the fracture isn\'t too fragmented; it\'s just a small break. Now, moving on to the sagittal view, this gives me a better sense of alignment and any potential displacement. Here, I see that there\'s less than 2 mm of anterolist

In [None]:
# Converting the full dataset takes roughly 1-2 minutes
from tqdm import tqdm

# Run every row through convert_to_conversation and keep only the rows that
# produced a sample (None marks a row that had to be skipped).
converted_dataset = [
    item
    for item in map(convert_to_conversation, tqdm(dataset, desc="Converting all"))
    if item is not None
]

Converting all: 100%|██████████| 16800/16800 [01:18<00:00, 214.37it/s]


In [None]:
# Report how many rows survived the conversion (rows returning None were dropped).
print("Length of conversation formatted dataset : ",len(converted_dataset))

Length of conversation formatted dataset :  12233


### Download Base Qwen2.5-VL-3B Model

In [None]:
from unsloth import FastVisionModel
# Load the Qwen2.5-VL-3B-Instruct checkpoint together with its tokenizer/processor.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-3B-Instruct",
    # No 4-bit quantisation; the load log reports a switch to 16-bit LoRA in this case.
    load_in_4bit = False,
    # NOTE(review): presumably selects Unsloth's own gradient-checkpointing variant — confirm in the Unsloth docs.
    use_gradient_checkpointing = "unsloth",
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.2: Fast Qwen2 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors.index.json:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.51G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

### Setting Up LoRA for Efficient Fine-Tuning

In [None]:
# Wrap the base model with LoRA adapters. All four component switches below are
# enabled, i.e. vision and language layers, attention and MLP modules.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,

    r = 16,           # LoRA rank. The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0, # No dropout on the adapter path
    bias = "none",
    random_state = 3407,  # Same seed value as the trainer config uses
    use_rslora = False,  # Rank-stabilized LoRA is available but switched off
    loftq_config = None, # LoftQ is available but not used
)

### Training Parameters and HyperParameters

In [None]:
from unsloth import is_bf16_supported, FastVisionModel
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Switch the model into Unsloth's training mode before building the trainer
FastVisionModel.for_training(model)

# Collator that batches the image + text samples
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Pick the mixed-precision dtype once: bf16 when supported, otherwise fp16
bf16_available = is_bf16_supported()

sft_config = SFTConfig(
    # Batching and schedule
    per_device_train_batch_size=2,        # Samples per GPU per step
    gradient_accumulation_steps=4,        # Steps accumulated before each optimizer update
    warmup_steps=50,                      # Warmup steps for the learning rate
    num_train_epochs=2,                   # Passes over the dataset

    # Optimisation
    learning_rate=2e-4,                   # Peak learning rate
    fp16=not bf16_available,              # fp16 only as the fallback
    bf16=bf16_available,                  # bf16 whenever the hardware supports it
    logging_steps=10,                     # Emit a loss value every 10 steps
    optim="adamw_8bit",                   # 8-bit AdamW to save memory

    weight_decay=0.01,                    # Regularization
    lr_scheduler_type="cosine",           # Cosine decay of the learning rate

    # Bookkeeping
    seed=3407,                            # Reproducibility
    output_dir="outputs",                 # Checkpoints and logs go here
    report_to="none",                     # No W&B / HF tracking

    # Settings needed because the dataset is already in multimodal chat format
    remove_unused_columns=False,          # Keep all columns for image+text training
    dataset_text_field="",                # Not used for vision; left empty
    dataset_kwargs={"skip_prepare_dataset": True},  # Dataset is already prepared
    dataset_num_proc=4,                   # Worker processes for dataset handling
    max_seq_length=2048,                  # Maximum context length
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=converted_dataset,
    args=sft_config,
)


Unsloth: Model does not have a default image size - using 512


### Training

In [None]:
# Begin training; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,233 | Num Epochs = 2 | Total steps = 3,060
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,084,928/3,795,707,904 (1.08% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0218
20,1.8243
30,1.5446
40,1.4278
50,1.33
60,1.3152
70,1.2459
80,1.2415
90,1.1958
100,1.199


### Save Model

In [None]:
# Save model weights in a folder. The zip listing in the following cell shows
# the folder holds the adapter files (adapter_config.json,
# adapter_model.safetensors) plus the tokenizer/processor files.
model.save_pretrained("qwen3b")
tokenizer.save_pretrained("qwen3b")

[]

In [None]:
# Download zipped model into local computer
# Zip the saved folder recursively, then trigger a browser download.
!zip -r qwen3b.zip qwen3b

# google.colab is only importable inside Colab, so this cell is Colab-only.
from google.colab import files
files.download("qwen3b.zip")

  adding: qwen3b/ (stored 0%)
  adding: qwen3b/tokenizer_config.json (deflated 88%)
  adding: qwen3b/video_preprocessor_config.json (deflated 71%)
  adding: qwen3b/special_tokens_map.json (deflated 69%)
  adding: qwen3b/adapter_config.json (deflated 56%)
  adding: qwen3b/added_tokens.json (deflated 67%)
  adding: qwen3b/tokenizer.json (deflated 81%)
  adding: qwen3b/chat_template.jinja (deflated 65%)
  adding: qwen3b/preprocessor_config.json (deflated 50%)
  adding: qwen3b/adapter_model.safetensors (deflated 27%)
  adding: qwen3b/README.md (deflated 66%)
  adding: qwen3b/merges.txt (deflated 57%)
  adding: qwen3b/vocab.json (deflated 61%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Get Test Images

In [None]:
# Download the test-image archive. Some gdown releases report a failed download
# by returning None instead of raising, so check the result before unzipping.
test_archive = gdown.download("https://drive.google.com/uc?id=18nb9I1ff6YChu7X4H_OPeax4DmefePdJ", "images.zip", quiet=False)
if test_archive is None:
    raise RuntimeError(
        "Download of images.zip failed; check the Drive file id and its sharing permissions."
    )

with zipfile.ZipFile("images.zip", "r") as zip_ref:
    zip_ref.extractall("test_images")

# Directory that the inference loop scans for image files
image_dir = "test_images/photos"

Downloading...
From: https://drive.google.com/uc?id=18nb9I1ff6YChu7X4H_OPeax4DmefePdJ
To: /content/images.zip
100%|██████████| 1.00M/1.00M [00:00<00:00, 151MB/s]


### Generate Reports On Test Data and Save CSV

In [None]:
# Run through all and download the reports generated

def run_inference_on_image(model, tokenizer, image_path, instruction="Describe this image as if you are an radiologist", max_new_tokens=2048):
    """Generate a free-text report for a single image.

    Args:
        model: Model whose ``generate`` method produces the answer tokens.
        tokenizer: Object used to apply the chat template, encode the
            image + prompt, and decode the generated tokens.
        image_path: Path of the image file to describe.
        instruction: Text prompt sent together with the image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer as a string, without the prompt and with
        surrounding whitespace stripped.
    """
    # Same preprocessing as the training images: RGB, at most 296x296.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    image.thumbnail((296, 296))

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    # The returned sequence starts with the prompt tokens. Drop them by length
    # instead of splitting the decoded text on the word "assistant", which
    # would cut off any answer that itself contains that word.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return decoded.strip()

In [None]:
# The model was last put into training mode (FastVisionModel.for_training);
# switch it to inference mode once before generating.
FastVisionModel.for_inference(model)

results = []

# Get image filenames; sorted so the CSV row order is the same on every run
image_files = sorted(
    f for f in os.listdir(image_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))
)

for fname in tqdm(image_files, desc="Running inference on images"):
    img_path = os.path.join(image_dir, fname)
    output = run_inference_on_image(model, tokenizer, img_path)
    results.append({"image_name": fname, "model_output": output})

# Save to CSV; explicit columns keep the header even when no image was found.
# NOTE(review): this rebinds `df`, which earlier held the CoT table.
df = pd.DataFrame(results, columns=["image_name", "model_output"])
df.to_csv("qwen3b_inference_results.csv", index=False)

# Download CSV
from google.colab import files
files.download("qwen3b_inference_results.csv")

Running inference on images: 100%|██████████| 25/25 [22:41<00:00, 54.47s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>