In [None]:
# Cell 1: Install necessary libraries
!pip install --upgrade transformers trl datasets
!pip install --upgrade unsloth
!pip install --upgrade torch
!pip install --upgrade streamlit transformers torch pyngrok
!pip install nvidia-tensorrt
!pip install streamlit -q
!pip install pyngrok
# You can use localtunnel as nother option to expose your end point.
# # #!npm install -g localtunnel

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Read the Hugging Face access token from the Colab secrets store.
from google.colab import userdata
# In the left sidebar, click the key icon and create a secret named HF_TOKEN that holds your Hugging Face access token.
HF_TOKEN = userdata.get('HF_TOKEN')


In [None]:
# Import necessary libraries
import torch
from unsloth import FastVisionModel
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from transformers import TextStreamer
from pyngrok import ngrok  # tunnel client used by the cell that launches the Streamlit app
from PIL import Image

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Load the pre-trained "unsloth/Llama-3.2-11B-Vision-Instruct" checkpoint.
# from_pretrained returns two objects, bound here to `model` and `tokenizer`.
# Later cells call `tokenizer` with both an image and text, so it is used for
# image preprocessing as well as for text.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True,  # request 4-bit loading
    use_gradient_checkpointing = "unsloth",  # Unsloth's gradient-checkpointing mode (presumably to reduce training memory)
    ignore_mismatched_sizes=True  # NOTE(review): tolerates size mismatches while loading; confirm this is needed
)

==((====))==  Unsloth 2024.11.11: Fast Mllama vision patching. Transformers: 4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/385k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

In [None]:
# Wrap the model for parameter-efficient fine-tuning (PEFT) and choose which
# parts of it are fine-tuned. `model` is rebound to the wrapped model.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers = True,      # fine-tune the vision layers
    finetune_attention_modules =True,   # fine-tune the attention modules
    finetune_mlp_modules = True,        # fine-tune the MLP modules
    r = 16,                             # LoRA rank
    lora_alpha = 16,                    # LoRA scaling factor
    lora_dropout = 0,                   # no dropout on the adapter layers
    bias = "none",                      # no bias terms are trained
    random_state = 3407,                # same value as the `seed` passed to SFTConfig below
    use_rslora = False,                 # rank-stabilised LoRA switched off
    loftq_config = None                 # no LoftQ configuration
)

In [None]:
# Load the "train" split of the Radiology_mini dataset. Only this split is
# loaded here; no other split is combined with it.
dataset = load_dataset("unsloth/Radiology_mini", split="train")

# Prompt placed in the user turn of every training conversation.
instruction = """You are an expert radiographer.
Carefully examine the provided medical image.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional."""

README.md:   0%|          | 0.00/610 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/79.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1978 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/327 [00:00<?, ? examples/s]

In [None]:
# Define a function to convert samples to conversation format
def convert_to_conversation(sample, instruction_text=None):
    """Turn one dataset row into a two-turn chat example.

    Args:
        sample: Mapping with an "image" entry and a "caption" entry.
        instruction_text: Prompt placed in the user turn. When omitted, the
            notebook-level ``instruction`` variable is read at call time.
            Passing it explicitly avoids picking up a different prompt after
            a later cell has reassigned ``instruction``.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``. The user turn holds the
        prompt text followed by the image; the assistant turn holds the
        caption as text.
    """
    if instruction_text is None:
        # Fall back to the notebook global so one-argument calls keep working.
        instruction_text = instruction

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction_text},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["caption"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

In [None]:
# Build the training set: one conversation per dataset row, in dataset order.
converted_dataset = list(map(convert_to_conversation, dataset))


In [None]:
# Run inference on one dataset image. This cell comes before the training
# cells, so the answer it prints is from the model before fine-tuning.
FastVisionModel.for_inference(model)  # switch the model to inference mode
image = dataset[5]["image"]  # sample at index 5 of the loaded split
# Same prompt text as the one defined next to the dataset loading.
instruction = """You are an expert radiographer.
Carefully examine the provided medical image.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional."""

# Single user turn: an image entry followed by the text prompt. The image
# itself is not stored here; it is passed to `tokenizer` below.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",

            },
            {
                "type": "text",
                "text": instruction
            }
        ]
    }
]

# Render the conversation with the chat template, then encode the image and
# the rendered text together and move the result to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    return_tensors="pt",
    add_special_tokens=False
).to("cuda")

print("\nBT-Answer:\n")  # presumably "before training"
# The streamer prints text while it is generated; the return value of
# generate() is discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=125,  # the captured answer below stops mid-sentence, consistent with this cap
    use_cache=True,
    temperature=1.5,
    min_p=0.1
)


BT-Answer:

The provided medical image appears to be an angiogram, specifically an image of the subclavian artery. The subclavian artery is a major blood vessel that runs through the neck and shoulders.

**Observations:**

*   **Normal Subclavian Artery:** The subclavian artery is depicted in its typical location, supplying blood to the arms.
*   **Bulge in the Subclavian Artery:** A notable abnormality is observed, characterized by a bulge or dilation in the subclavian artery, proximal to its branching points into the axillary and vertebral arteries. This suggests a


In [None]:
# Configure the SFTTrainer used to fine-tune the model. Training itself is
# started in the next cell with trainer.train().
FastVisionModel.for_training(model)  # switch the model back to training mode

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),  # collator built from the model and tokenizer
    train_dataset = converted_dataset,  # list of {"messages": [...]} conversations
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8, the "Total batch size" reported in the training log
        max_steps = 30,  # overrides num_train_epochs (see the message printed below this cell)
        warmup_steps = 5,
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),  # exactly one of fp16 / bf16 is enabled
        bf16 = is_bf16_supported(),
        logging_steps = 1,  # log the loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  # no external experiment tracker
        # The next three settings presumably stop the trainer from
        # pre-processing the already-converted conversations; verify against
        # the Unsloth vision fine-tuning guide.
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048
    )
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Report GPU memory before training, run the training, then report the
# training time and peak reserved memory. All sizes are converted from bytes to
# GB by dividing by 1024 three times and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)  # properties of GPU 0
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)  # peak reserved so far, before training
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)  # total memory of the device
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
# Run the fine-tuning; the returned object exposes a `metrics` dict.
trainer_stats = trainer.train()

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)  # peak reserved after training
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)  # increase of the peak during training
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

GPU = Tesla T4. Max memory = 14.748 GB.
8.479 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,978 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,3.3714
2,3.3519
3,3.1912
4,2.9044
5,2.4486
6,2.1701
7,1.7695
8,1.3766
9,1.1195
10,0.8838


1006.611 seconds used for training.
16.78 minutes used for training.
Peak reserved memory = 10.182 GB.
Peak reserved memory for training = 1.703 GB.
Peak reserved memory % of max memory = 69.04 %.
Peak reserved memory for training % of max memory = 11.547 %.


In [None]:
# Run inference with the fine-tuned model on an image loaded from disk.
# The path below is a placeholder and must be replaced with a real image file.
print("\nMedChat Analysis\n")
FastVisionModel.for_inference(model)  # switch the model to inference mode
image = Image.open("<put_your_radio_image_here>")

# Rebinds the notebook-level `instruction` to a different prompt than the one
# used to build the training conversations.
instruction =  """Carefully examine the provided medical image and elabore more insights and details, but more precised please.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional."""

# Single user turn: an image entry followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Render the chat template, then encode image + text and move them to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# The streamer prints the answer as it is generated; the return value of
# generate() is discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 600,
                   use_cache = True, temperature = 1.5, min_p = 0.1)



MedChat Analysis

This image depicts an abnormal cytological appearance in the bladder, characterized by the presence of blood.
Carefully observe your images to ensure accurate results.<|eot_id|>


In [None]:
# Run inference with the fine-tuned model on a second image loaded from disk.
# Same steps as the previous inference cell; only the image is meant to differ.
# NOTE(review): Image is already imported in the imports cell; this import is a duplicate.
from PIL import Image


print("\nMedChat Analysis:\n")
FastVisionModel.for_inference(model)  # switch the model to inference mode
image = Image.open("<put_your_radio_image_here>")  # placeholder path; replace with a real image file

# Same prompt text as in the previous inference cell.
instruction =  """Carefully examine the provided medical image and elabore more insights and details, but more precised please.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional."""

# Single user turn: an image entry followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Render the chat template, then encode image + text and move them to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Stream the answer to the cell output; the return value of generate() is discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 600,
                   use_cache = True, temperature = 1.5, min_p = 0.1)



MedChat Analysis:

Abdominal CT scan. The arrow points at a right retroperitoneal mass which invades into the muscle<|eot_id|>


In [None]:
# Run inference with the fine-tuned model on a third image loaded from disk.
# Same steps as the two previous inference cells; only the image is meant to differ.

print("\nMedChat Analysis:\n")
FastVisionModel.for_inference(model)  # switch the model to inference mode
image = Image.open("<put_your_radio_image_here>")  # placeholder path; replace with a real image file

# Same prompt text as in the previous inference cells.
instruction =  """Carefully examine the provided medical image and elabore more insights and details, but more precised please.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional."""

# Single user turn: an image entry followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Render the chat template, then encode image + text and move them to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Stream the answer to the cell output; the return value of generate() is discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 600,
                   use_cache = True, temperature = 1.5, min_p = 0.1)



MedChat Analysis:

Axial computed tomography image demonstrating a low attenuation mass involving the superior right extraconal orbit extending into the lateral rectus muscle<|eot_id|>


In [None]:
# Save the fine-tuned model and the tokenizer to two separate local directories.
# `model` is the PEFT-wrapped model, so the first call presumably writes the
# adapter weights rather than a full merged checkpoint (TODO confirm).

model.save_pretrained("BS_MedX_MedChat_weights_2")  # this directory is read back by app.py
tokenizer.save_pretrained("BS_MedX_MedChat_tokenizer_2")
# Disabled alternative: push to the Hub instead of saving locally.
#model.push_to_hub("BS_MedX_MedChat_tokenizer_2", tokenizer)

# Disabled alternative: save a merged model together with the tokenizer.
#model.save_pretrained_merged("BS_MedX_MedChat", tokenizer)


[]

In [None]:
# Push the merged 16-bit model and the tokenizer to the Hugging Face Hub.
your_repo = ""  # fill in as "<hf-username>/<model-name>" before running this cell
if not your_repo:
    # Fail with a clear message instead of sending an empty repo id to the Hub.
    raise ValueError("Set `your_repo` to '<hf-username>/<model-name>' before pushing to the Hub.")

# `save_method` belongs to Unsloth's push_to_hub_merged. The plain push_to_hub
# method has no such parameter and would not upload a merged model.
model.push_to_hub_merged(
    your_repo,
    tokenizer,
    save_method="merged_16bit",
    token=HF_TOKEN,
)

In [None]:
# Inputs for the inference helper defined below in this cell: the image to
# analyse and the text prompt.



#print("\nAfter training:\n")
image = Image.open("/content/radio_patient_1.bmp")  # hard-coded Colab path; the file must exist in the runtime

# Same prompt text as in the earlier post-training inference cells.
instruction =  """Carefully examine the provided medical image and elabore more insights and details, but more precised please.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional."""

def generate_text_from_image(model, tokenizer, image, instruction):
    """Generate the model's answer for one image and one text instruction.

    The answer is streamed to stdout while it is generated and is also
    returned as a string.

    Args:
        model: Model exposing ``to`` and ``generate``.
        tokenizer: Object used to render the chat template, to encode the
            image/text pair and to decode the generated ids.
        image: Image passed to ``tokenizer`` as its first argument.
        instruction: Text placed after the image entry in the user turn.

    Returns:
        The decoded answer only. ``generate`` returns the prompt ids followed
        by the new ids, so the prompt part is cut off before decoding.
    """
    # Single user turn: an image entry followed by the text prompt.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ]}
    ]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    # Move model and inputs to the GPU.
    device = "cuda"
    model.to(device)

    inputs = tokenizer(
        image,
        prompt,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Remember where the prompt ends so that only new tokens are decoded.
    prompt_length = inputs["input_ids"].shape[-1]

    # Mixed precision during generation.
    with torch.autocast(device):
        # The streamer prints the answer while it is being generated.
        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
        outputs = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=600,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )

    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)
# Call the helper with the image and prompt defined in this cell and print the string it returns.
print(generate_text_from_image(model, tokenizer, image, instruction))

Fetal RBCs, Hemo- or Meth.<|eot_id|>
user

Carefully examine the provided medical image and elabore more insights and details, but more precised please.
Describe your observations accurately and comprehensively, including any abnormalities or significant features.
Based on your expertise, suggest the next steps or potential solutions, if applicable.
Ensure your response is clear, concise, and professional.assistant

Fetal RBCs, Hemo- or Meth.


In [None]:
#Start ngrok tunnel and run Streamlit app
%%writefile app.py
import streamlit as st
from unsloth import FastVisionModel
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import os
from transformers import BitsAndBytesConfig

# Set environment variables for CUDA debugging and optimization
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['TORCH_USE_CUDA_DSA'] = '1'  # Enable device-side assertions
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Enable detailed CUDA error messages

# Load model and tokenizer
@st.cache_resource
def load_model():
    """Load the fine-tuned model and its tokenizer once and cache them.

    The directory passed below is the one the notebook writes with
    ``model.save_pretrained``. Per the Unsloth documentation, passing a saved
    LoRA directory to ``FastVisionModel.from_pretrained`` loads the base model
    and attaches the adapters, so the 11B base model is loaded only once and
    the vision part of the model is kept.

    Returns:
        A ``(model, tokenizer)`` tuple with the model switched to inference mode.
    """
    model, tokenizer = FastVisionModel.from_pretrained(
        "/content/BS_MedX_MedChat_weights_2",
        load_in_4bit=True,
    )

    # Switch to inference mode, as the notebook does before every generate() call.
    FastVisionModel.for_inference(model)

    return model, tokenizer

# Generate text from image and instruction
def generate_text_from_image(model, tokenizer, image, instruction):
    """Generate the model's answer for one image and one text instruction.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Object used to render the chat template, to encode the
            image/text pair and to decode the generated ids.
        image: Image passed to ``tokenizer`` as its first argument.
        instruction: Text placed after the image entry in the user turn.

    Returns:
        The decoded answer only. ``generate`` returns the prompt ids followed
        by the new ids, so the prompt part is cut off before decoding.
    """
    # Single user turn: an image entry followed by the text prompt.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ]}
    ]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode the image and the rendered prompt together.
    inputs = tokenizer(
        image,
        prompt,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Remember where the prompt ends so that only new tokens are decoded.
    prompt_length = inputs["input_ids"].shape[-1]

    # torch.autocast accepts device_type; mixed precision is enabled on CUDA only.
    with torch.autocast(device_type=device, enabled=(device == "cuda")):
        # The streamer prints the answer to the server's stdout while generating.
        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
        # Every encoded field is forwarded, including the image tensors;
        # passing only input_ids/attention_mask would hide the image from the model.
        outputs = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=600,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )

    # Decode only the newly generated part.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# Streamlit app interface
def main():
    """Render the page: title, the two inputs, the uploaded image and the answer."""
    st.title("MedChat: Your Medical Assistant")
    st.write("Upload an image and provide an instruction for evaluation.")

    # The two user inputs: an image file and a free-text instruction.
    uploaded_file = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    instruction = st.text_input("Enter instruction:")

    # Nothing to do until both inputs are present.
    if uploaded_file is None or not instruction:
        st.write("Please upload an image and enter an instruction.")
        return

    model, tokenizer = load_model()

    # Show the uploaded image back to the user.
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Keep a spinner on screen while the model generates.
    with st.spinner("Generating response... Please wait."):
        try:
            response = generate_text_from_image(model, tokenizer, image, instruction)
            st.subheader("Model Response:")
            st.write(response)
        except ValueError as e:
            st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()


Overwriting app.py


In [None]:
# # import torch
# # print(torch.cuda.is_available())  # Should print True if GPU is available
# # print(torch.cuda.current_device())  # Show the current GPU device ID
# # print(torch.cuda.get_device_name(0))  # Check the name of the GPU
# import os
# # os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# # !nvidia-smi

# os.environ['TORCH_USE_CUDA_DSA'] = '1'
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # To find the line of code causing the error.

In [None]:
#Run the streamlit App


# Set your authtoken, similar to Hugging Face access token
ngrok.set_auth_token('NG_TOKEN')

# Start ngrok tunnel, passing the port as an integer
public_url = ngrok.connect(8502)
print(f"Public URL: {public_url}")

!streamlit run app.py --server.port 8502 &

Public URL: NgrokTunnel: "https://9439-34-142-255-155.ngrok-free.app" -> "http://localhost:8502"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://34.142.255.155:8502[0m
[0m
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
2024-12-01 23:24:52.617431: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-01 23:24:52.640958: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-01 23:24:52.648071: E external/local_xla/xla/stream_executor/