<a href="https://colab.research.google.com/github/aurotripathy/RL-Experiments/blob/main/Copy_of_qwen_vl_expt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Before running, switch the Colab runtime to an A100 GPU (80 GB); the model below is loaded in bfloat16 onto a single CUDA device.

In [None]:
%pip install transformers
%pip install flash-attn --no-build-isolation

In [None]:
# Show the GPU attached to this runtime so the accelerator can be confirmed before the model is loaded.
!nvidia-smi

In [None]:
def _field(*json_types):
    """Return a schema fragment that accepts any of the given JSON types (order preserved)."""
    return {"type": list(json_types)}


# Type combinations reused across the schema; element order is kept exactly as serialized.
_NUMBER_OR_NULL = ("number", "null")
_STRING_OR_NULL = ("string", "null")
_NUMBER_NULL_OR_STRING = ("number", "null", "string")

# Serving size: quantity and unit are mandatory, the free-text description is optional.
_serving_size_schema = {
    "type": "object",
    "properties": {
        "quantity": _field(*_NUMBER_OR_NULL),
        "unit": _field(*_STRING_OR_NULL),
        "description": _field(*_STRING_OR_NULL),
    },
    "required": ["quantity", "unit"],
}

# Macro-nutrient fields in label order, mapped to the JSON types each may take.
_macro_types = {
    "total_fat_g": _NUMBER_OR_NULL,
    "total_fat_dv_percent": _NUMBER_OR_NULL,
    "saturated_fat_g": _NUMBER_OR_NULL,
    "saturated_fat_dv_percent": _NUMBER_OR_NULL,
    "trans_fat_g": _NUMBER_NULL_OR_STRING,
    "cholesterol_mg": _NUMBER_NULL_OR_STRING,
    "cholesterol_dv_percent": _NUMBER_OR_NULL,
    "sodium_mg": _NUMBER_OR_NULL,
    "sodium_dv_percent": _NUMBER_OR_NULL,
    "total_carbohydrate_g": _NUMBER_OR_NULL,
    "total_carbohydrate_dv_percent": _NUMBER_OR_NULL,
    "dietary_fiber_g": _NUMBER_NULL_OR_STRING,
    "dietary_fiber_dv_percent": _NUMBER_OR_NULL,
    "total_sugars_g": _NUMBER_OR_NULL,
    "added_sugars_g": _NUMBER_OR_NULL,
    "added_sugars_dv_percent": _NUMBER_OR_NULL,
    "protein_g": _NUMBER_OR_NULL,
}

_macros_schema = {
    "type": "object",
    "properties": {name: _field(*kinds) for name, kinds in _macro_types.items()},
    "required": [
        "total_fat_g",
        "saturated_fat_g",
        "trans_fat_g",
        "sodium_mg",
        "total_carbohydrate_g",
        "protein_g",
    ],
}

# Micronutrients are expressed as percent daily value; none of them is mandatory.
_micronutrients_schema = {
    "type": "object",
    "properties": {
        name: _field(*_NUMBER_OR_NULL)
        for name in ("vitamin_d", "calcium", "iron", "potassium")
    },
}

# JSON Schema (draft-07) describing the nutrition-facts object the model is asked to emit.
jason_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "NutritionFacts",
    "type": "object",
    "properties": {
        "product_name": _field(*_STRING_OR_NULL),
        "brand": _field(*_STRING_OR_NULL),
        "source_url": {"type": "string", "format": "uri"},
        "servings_per_container": _field("number", "string", "null"),
        "serving_size": _serving_size_schema,
        "calories": _field(*_NUMBER_OR_NULL),
        "macros": _macros_schema,
        "micronutrients_dv_percent": _micronutrients_schema,
        "notes": _field(*_STRING_OR_NULL),
    },
    "required": ["source_url", "serving_size", "macros"],
}


In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
import torch

# Hugging Face checkpoint id of the vision-language model used throughout the notebook.
model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load options: bfloat16 weights placed on the CUDA device, with FlashAttention-2 enabled
# (recommended for better acceleration and memory saving, especially with multi-image
# and video inputs). For the default attention path, drop "attn_implementation" and
# pass dtype="auto" instead.
load_options = {
    "dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "cuda",
}

model = Qwen3VLMoeForConditionalGeneration.from_pretrained(model_name, **load_options)

In [None]:
import json

# Prompt: ask for JSON output and embed the pretty-printed schema verbatim in the instruction.
# NOTE(review): the schema lists "source_url" as required, but the prompt never gives the
# model the URL - confirm whether it should be supplied here or filled in afterwards.
instruction = f"Extract all the nutrition information in JSON format according to the following schema:\n{json.dumps(jason_schema, indent=2)}"

# Load the processor from the same checkpoint id as the model so the two cannot drift apart.
processor = AutoProcessor.from_pretrained(model_name)

# Alternative sample image:
# image_link = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image_link = "https://s7d1.scene7.com/is/image/hersheyprodcloud/0_34000_00246_7_701_24600_073_Item_Back_B?fmt=webp-alpha&hei=908&qlt=75"

# Single-turn conversation: one image followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image_link,
            },
            {"type": "text", "text": instruction},
        ],
    }
]

# Render the chat template and tokenize; move the tensors to the device the model lives on
# rather than a hard-coded device string.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

# Inference: generate up to 1024 new tokens.
generated_ids = model.generate(**inputs, max_new_tokens=1024)

# generate() returns prompt + completion; strip the prompt tokens from each sequence.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])  # Only one conversation was submitted, so show the first decoded output.

In [None]:
from IPython.display import Image, display
import requests

# The image is expected to be the first content item of the first message.
image_url = messages[0]['content'][0]['image']

# Download the image. The timeout prevents the cell from hanging on a stalled connection,
# and raise_for_status() stops an HTTP error page from being saved as if it were an image.
response = requests.get(image_url, timeout=30)
response.raise_for_status()

# NOTE(review): the file is always named .jpeg regardless of the format the server returns.
with open("temp_image.jpeg", "wb") as f:
    f.write(response.content)

# Display the downloaded image inline.
display(Image(filename="temp_image.jpeg"))