<a href="https://colab.research.google.com/github/amir-asari/VLM_Bootcamp/blob/main/Qwen2_5_VL_Image_Classification_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Qwen2.5-VL Image Classification with Hugging Face Transformers
This notebook is a consolidated walkthrough, split into four cells (setup, model loading, inference function, examples), intended to be run top to bottom in a Google Colab environment with a GPU.

 ----------------------------------------------------------------
 1. Setup and Installation
 ----------------------------------------------------------------

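Install the necessary libraries. We install the latest `transformers` from source
to ensure Qwen2.5-VL compatibility, plus `flash-attn` for performance. Note that the model-loading cell below does not pass `attn_implementation="flash_attention_2"`, so FlashAttention is installed but not explicitly enabled.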

In [6]:
# Dependency installation. Each `!` line is a shell command executed by the notebook.
# transformers is installed from the GitHub repository (unpinned), together with accelerate.
!pip install -q git+https://github.com/huggingface/transformers.git accelerate
# NOTE(review): flash-attn is installed here, but the model-loading cell in this notebook
# does not pass any attention-implementation argument — confirm it is actually needed.
!pip install -q flash-attn
# NOTE(review): qwen-vl-utils is installed but not imported in the cells visible here.
!pip install -q qwen-vl-utils pillow

import torch
import warnings
from PIL import Image
import requests
from io import BytesIO
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Suppress minor warnings for a clean Colab output
# (no category is given, so this filter applies to every warning for the rest of the session).
warnings.filterwarnings('ignore')

# Marker so the reader can tell the (long) install/import step has finished.
print("Installation and Imports complete.")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.7/528.7 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstallation and Imports complete.


----------------------------------------------------------------
2. Model and Processor Loading
----------------------------------------------------------------

In [10]:
# Checkpoint to use: the 7B instruct variant, chosen as a balance of
# performance and capability.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Select device and precision together: bfloat16 on a CUDA GPU, float32 on CPU.
device, dtype = (
    ("cuda", torch.bfloat16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Report the chosen configuration before the slow load step starts.
print(
    "--- Environment Setup ---",
    f"Device: {device}",
    f"Loading Model: {MODEL_ID}",
    "-" * 25,
    sep="\n",
)

try:
    # The tokenizer and the multimodal processor come from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    # Load the weights in the selected precision with automatic device
    # placement, then switch the model to evaluation mode.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype=dtype, device_map="auto"
    )
    model = model.eval()
    print("Model loaded successfully.")
except Exception as load_error:
    # Any failure leaves `model` as None; the inference function checks for
    # that sentinel instead of crashing.
    print(f"Error loading model or libraries: {load_error}")
    print("Please ensure you have a T4 GPU enabled in Colab runtime settings.")
    model = None

--- Environment Setup ---
Device: cuda
Loading Model: Qwen/Qwen2.5-VL-7B-Instruct
-------------------------


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]



Model loaded successfully.


----------------------------------------------------------------
3. Inference Function Definition
----------------------------------------------------------------

In [11]:
def classify_image(image_url: str, question: str, max_new_tokens: int = 50) -> str:
    """
    Download an image, ask the Qwen2.5-VL model a question about it, and
    return only the model's answer.

    Relies on the notebook-level globals `model`, `processor` and `tokenizer`
    created by the model-loading cell.

    Args:
        image_url: HTTP(S) URL of the image to classify.
        question: Text prompt sent to the model together with the image.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        The generated answer with surrounding whitespace stripped. Failures
        are not raised; a descriptive error string is returned instead when
        the model is unavailable, the image cannot be fetched/decoded, or
        inference fails.
    """
    if model is None:
        return "Model failed to load. Cannot run inference."

    print("\n--- Running Inference ---")
    print(f"Question: {question}")

    # 1. Download and load the image.
    try:
        # A timeout keeps an unresponsive host from hanging the cell forever.
        # `stream=True` was dropped: the whole body is read via `.content`
        # immediately, so streaming gave no benefit.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        print(f"Image loaded: {image_url}")
    except Exception as e:
        return f"Error loading image from URL: {e}"

    # 2. Build a single-turn conversation: one image followed by the question.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]

    try:
        # 3. Apply the chat template and tokenize. The tensors are moved to
        #    the device the model actually lives on, since the model is
        #    loaded with device_map="auto".
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # 4. Greedy decoding for a deterministic answer.
        output_ids = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=max_new_tokens,
        )

        # 5. `generate` returns prompt + continuation. Drop the prompt tokens
        #    before decoding; the previous approach of splitting the decoded
        #    text on "<|im_start|>assistant\n" could never match because
        #    skip_special_tokens=True removes that marker, so the prompt
        #    leaked into the returned text.
        prompt_length = inputs["input_ids"].shape[1]
        answer_ids = output_ids[0][prompt_length:]
        return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    except Exception as e:
        return f"Error during model inference: {e}"

----------------------------------------------------------------
4. Execution Examples
----------------------------------------------------------------

In [None]:
# Example 1 — plain classification: ask what the pictured animal is.
image_url_1, question_1 = (
    "https://www.activewild.com/wp-content/uploads/2021/12/Malayan-Tapir.jpg",
    "What kind of animal is this in the image? Be precise and only state the breed.",
)

# Example 2 — classification plus a follow-up detail about the image content.
image_url_2, question_2 = (
    "https://placehold.co/600x400/FF5733/FFFFFF?text=Pie+Chart+Showing+60%25+Sales",
    "Based on this image, what type of chart is displayed, and what is the primary color used for the largest segment?",
)

# Run the first example and print its answer followed by a separator rule.
result_1 = classify_image(image_url_1, question_1)
print(f"\n[Image 1 Classification Result]\n{result_1}", "=" * 50, sep="\n")

# Run the second example the same way.
result_2 = classify_image(image_url_2, question_2)
print(f"\n[Image 2 Classification Result]\n{result_2}", "=" * 50, sep="\n")



--- Running Inference ---
Question: What kind of animal is this in the image? Be precise and only state the breed.
Image loaded: https://www.activewild.com/wp-content/uploads/2021/12/Malayan-Tapir.jpg
