In [None]:
%%capture
# %%capture silences the (long) pip output of this cell.
# First install the released package, which also pulls in its dependencies.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The released build is removed and replaced by the GitHub build; --no-deps
# leaves the dependencies installed by the first command untouched.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

### Load Dataset

In [None]:
# Directory prefix prepended to every dataset-relative path in this notebook
# (paths are built by plain string concatenation, so keep the trailing slash).
BASE_PATH="bone-fracture-dataset/Bone_Fracture_Dataset/"

# Maps each original image directory to the directory that receives the
# copies with bounding boxes drawn on them. The values follow the
# "new_<split>" naming used when the annotated images are written.
folders={
    "dataset/test/images/":"dataset/new_test/images/",
    "dataset/train/images/":"dataset/new_train/images/",
}

In [None]:
# Step 1: Upload kaggle.json
from google.colab import files
files.upload()

# Step 2: Configure Kaggle
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 3: Download Dataset
!kaggle datasets download -d tuantm04/bone-fracture-dataset

# Step 4: Unzip Dataset
!unzip bone-fracture-dataset.zip -d bone-fracture-dataset

# Step 5: Verify Files
!ls bone-fracture-dataset

KeyboardInterrupt: 

In [None]:
def denormalize_coordinates(norm_x, norm_y, norm_width, norm_height, image_width, image_height):
    """Convert a normalized centre/size box into pixel corner coordinates.

    Args:
        norm_x: Box centre x as a fraction of the image width.
        norm_y: Box centre y as a fraction of the image height.
        norm_width: Box width as a fraction of the image width.
        norm_height: Box height as a fraction of the image height.
        image_width: Image width in pixels.
        image_height: Image height in pixels.

    Returns:
        A tuple ``(x1, y1, x2, y2)`` of ints: the top-left and bottom-right
        corners of the box in pixels. Values are not clamped to the image.
    """
    # Keep everything in floating point until the very end. Truncating the
    # centre and the size separately and then truncating the corners again
    # compounds the error and can shift a corner by a pixel or more.
    center_x = norm_x * image_width
    center_y = norm_y * image_height
    half_width = norm_width * image_width / 2
    half_height = norm_height * image_height / 2

    # Round once, to the nearest pixel; int() guarantees plain Python ints.
    x1 = int(round(center_x - half_width))
    y1 = int(round(center_y - half_height))
    x2 = int(round(center_x + half_width))
    y2 = int(round(center_y + half_height))

    return x1, y1, x2, y2


In [None]:
import cv2
import pandas as pd
import os

def create_bounding_box(csv_file, path, new_path):
    """Draw every labelled box onto each listed image and save the result.

    Args:
        csv_file: Header-less CSV whose column 0 is an image file name and
            column 1 the matching label file name.
        path: Source directory containing ``images/`` and ``labels/``. Paths
            are built by string concatenation, so it must end with a slash.
        new_path: Destination directory; annotated images are written to
            ``new_path + "images/"`` under their original file names.

    Rows whose image cannot be read, or whose label file is missing, are
    reported on stdout and skipped (nothing is written for them).
    """
    output_dir = new_path + "images/"
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(csv_file, header=None)

    color = (0, 255, 0)  # Green color in BGR
    thickness = 2

    for _, row in df.iterrows():
        # Read the image
        image = cv2.imread(path + "images/" + row[0])
        if image is None:
            print(f"Image {row[0]} not found, skipping.")
            continue

        # Keep the image size in dedicated names. Unpacking the box corners
        # into `width`/`height` would clobber the image size and scale every
        # subsequent label of the same image with the wrong dimensions.
        image_height, image_width = image.shape[:2]

        # Read the label file
        label_file_path = path + "labels/" + row[1]
        if not os.path.exists(label_file_path):
            print(f"Label file {row[1]} not found, skipping.")
            continue

        with open(label_file_path) as f:
            for label in f:
                # split() with no argument tolerates tabs, repeated spaces
                # and the trailing newline.
                fields = label.split()
                if not fields:
                    continue  # blank line
                if len(fields) < 5:
                    print(f"Malformed label line in {row[1]}: {label.strip()!r}, skipping.")
                    continue

                # fields[0] is not used; fields[1:5] are the normalized
                # centre x, centre y, width and height of the box.
                x1, y1, x2, y2 = denormalize_coordinates(
                    float(fields[1]),
                    float(fields[2]),
                    float(fields[3]),
                    float(fields[4]),
                    image_width,
                    image_height,
                )
                # Draw rectangle on the image
                image = cv2.rectangle(
                    image, (int(x1), int(y1)), (int(x2), int(y2)), color, thickness
                )

        # Save the modified image to the new path
        cv2.imwrite(output_dir + row[0], image)


# Annotate the training split: boxes are drawn on dataset/train/ images and
# the copies are written under dataset/new_train/.
create_bounding_box(
    f"{BASE_PATH}train.csv",
    f"{BASE_PATH}dataset/train/",
    f"{BASE_PATH}dataset/new_train/",
)
# Same for the test split, written under dataset/new_test/.
create_bounding_box(
    f"{BASE_PATH}test.csv",
    f"{BASE_PATH}dataset/test/",
    f"{BASE_PATH}dataset/new_test/",
)

In [None]:
import json
import pandas as pd
# Build train.jsonl: one JSON record per row of train.csv, pairing the image
# path (relative to BASE_PATH) with its caption.
df=pd.read_csv(BASE_PATH+"train.csv",header=None)

# Open the output once, in write mode. This keeps the cell idempotent:
# re-running it rebuilds the file instead of appending a second copy of every
# record, and the file is not re-opened for each row.
# NOTE(review): every record gets the same caption and points at the original
# images under dataset/train/images/, not the annotated copies — confirm this
# is intended.
with open(BASE_PATH + "train.jsonl", "w") as f:
    for index, row in df.iterrows():
        data = {
            "image": "dataset/train/images/"+row[0],
            "data": "The x-ray is for DOG, and e-ray represt the break in bone"
        }
        json.dump(data, f)
        f.write('\n')

In [None]:
import json

def load_jsonl(filepath, max_lines=2000, base_path=None):
  """
  Loads a JSONL file into a list of dictionaries.

  Args:
    filepath: Path to the JSONL file, relative to `base_path` (the two are
      joined by plain string concatenation).
    max_lines: Maximum number of records to return; None loads the whole
      file. Defaults to 2000.
    base_path: Prefix prepended to `filepath`. Defaults to the notebook-level
      BASE_PATH, which is only looked up when this argument is None.

  Returns:
    A list of dictionaries, one per non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  if base_path is None:
    base_path = BASE_PATH

  examples = []
  with open(base_path + filepath, "r") as f:
    for line in f:
      if max_lines is not None and len(examples) >= max_lines:
        break
      line = line.strip()
      if not line:
        # Blank lines (e.g. a trailing newline at end of file) carry no record.
        continue
      examples.append(json.loads(line))
  return examples


# Read the training records from BASE_PATH + "train.jsonl"; load_jsonl stops
# after its 2000-line limit, so at most 2000 records are kept.
dataset=load_jsonl("train.jsonl")

In [None]:
import json
from PIL import Image
import os
# Prompt text placed in the user turn of every conversation built by
# convert_to_conversation (read there as a module-level global).
instruction = "You are an expert radiographer. Describe accurately what you see in this image."

def get_image_from_local(image_file_name):
    """Open an image stored under BASE_PATH.

    Args:
        image_file_name: Image path relative to BASE_PATH.

    Returns:
        The object returned by ``Image.open`` for that path.

    Raises:
        FileNotFoundError: If no file exists at the resolved path.
    """
    resolved_path = os.path.join(BASE_PATH, image_file_name)

    # Happy path first; fall through to the explicit error otherwise.
    if os.path.exists(resolved_path):
        return Image.open(resolved_path)

    raise FileNotFoundError(f"Image file {resolved_path} does not exist.")
def convert_to_conversation(sample):
    """Turn one dataset record into a two-turn chat example.

    Args:
        sample: Mapping with an "image" entry (path handed to
            get_image_from_local) and a "data" entry (the target text).

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn
        holds the global instruction text followed by the RGB image, and the
        assistant turn holds the target text.
    """
    prompt_part = {"type" : "text", "text" : instruction}
    # The image is loaded and converted before the target text is read,
    # mirroring the order in which the pieces are needed.
    rgb_image = get_image_from_local(sample["image"]).convert("RGB")
    image_part = {"type" : "image", "image" : rgb_image}

    user_turn = {"role" : "user", "content" : [prompt_part, image_part]}
    assistant_turn = {
        "role" : "assistant",
        "content" : [{"type" : "text", "text" : sample["data"]}],
    }

    return {"messages" : [user_turn, assistant_turn]}
# Build the full chat-formatted training list up front; every record's image
# is opened and converted here, so the list holds all images in memory.
converted_dataset = list(map(convert_to_conversation, dataset))

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing below reads `fourbit_models`; the checkpoint that
# is actually loaded is the name passed to from_pretrained.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the vision model together with its tokenizer object.
# NOTE(review): `tokenizer` is later called with both an image and text, so it
# presumably is a multimodal processor rather than a text-only tokenizer.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.1: Fast Mllama vision patching. Transformers: 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model for LoRA fine-tuning. `model` is rebound to the wrapped
# model, so this cell expects the freshly loaded model as its input. With all
# four finetune_* flags set to True, vision layers, language layers, attention
# modules and MLP modules are all selected for tuning.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # Fixed seed (same value as the trainer's seed)
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

In [None]:
# Baseline check before training is run: generate a description for the first
# training image with the untrained adapters.
FastVisionModel.for_inference(model) # Enable for inference!

# Reuse the RGB image stored in the first converted sample (user turn, second
# content item).
image = converted_dataset[0]["messages"][0]["content"][1]["image"]
# Rebinds the global `instruction` to the same prompt text used for training.
instruction = "You are an expert radiographer. Describe accurately what you see in this image."

# {"type": "image"} is only a placeholder in the chat template; the actual
# pixels are supplied separately in the tokenizer call below.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# add_special_tokens = False — presumably because the chat template has already
# inserted the special tokens into input_text (TODO confirm).
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream the generated text to the cell output; skip_prompt = True leaves the
# prompt itself out of what is printed.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# Generation is capped at 128 new tokens; the returned ids are discarded
# because the streamer has already printed the text.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

This radiograph appears to be a frontal view of the wrist, showcasing several distinct features:

**Bones and Structures**:

The radius, ulna, scaphoid, lunate, triquetrum, and pisiform bones are all clearly visible in the radiograph. 

Additionally, the proximal carpal bones, such as the capitates and hamate, appear normal.

The distal radius and ulna bones are also well-defined in this image.

**Notable Observations**:

The bone density of the hand and wrist is normal in this patient. 

A minor linear density is also present on the proximal and dist


In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Switch the model back to training mode (an earlier cell put it in inference
# mode) and build the supervised fine-tuning trainer over the in-memory list of
# converted conversations.
FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # Short run: training stops after 30 optimizer steps.
        max_steps = 30,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in
# trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,4.7561
2,4.6466
3,4.7219
4,4.7059
5,4.5461
6,4.0434
7,3.4529
8,2.8123
9,2.1848
10,1.5733


In [None]:
# Post-training check: same image and prompt as the pre-training inference cell,
# so the two outputs can be compared directly.
# NOTE(review): the image comes from the training set (converted_dataset is
# also the trainer's train_dataset), so this does not measure generalisation.
FastVisionModel.for_inference(model) # Enable for inference!

# First converted sample, user turn, second content item = the RGB image.
image = converted_dataset[0]["messages"][0]["content"][1]["image"]
instruction = "You are an expert radiographer. Describe accurately what you see in this image."

# {"type": "image"} is only a placeholder in the chat template; the actual
# pixels are supplied separately in the tokenizer call below.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream the generated text to the cell output, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# Generation is capped at 128 new tokens; the returned ids are discarded
# because the streamer has already printed the text.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)




The x-ray is for DOG, and e-ray represt the break in bone<|eot_id|>


In [None]:
import os
from getpass import getpass

# Never hardcode the Hugging Face token in the notebook, and never overwrite
# HF_TOKEN with a placeholder: an empty value would clobber a real token
# already present in the environment. Use the environment value when set,
# otherwise prompt for it without echoing it to the output.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Save the fine-tuned model and the tokenizer locally under ./lora_model.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Upload both to the Hub repository.
model.push_to_hub("a2aniket/lora_model", token = hf_token)
tokenizer.push_to_hub("a2aniket/lora_model", token = hf_token)


README.md:   0%|          | 0.00/615 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Saved model to https://huggingface.co/a2aniket/lora_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]