In [1]:
from PIL import Image
import requests
import torch
import os
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

  from .autonotebook import tqdm as notebook_tqdm


### Finetuned Model

In [54]:
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image

# Load the fine-tuned checkpoint and its matching processor.
# device_map="auto" delegates weight placement to the library, so the inputs
# below follow `model.device` instead of assuming a CUDA device exists.
model = Qwen2VLForConditionalGeneration.from_pretrained("aneesh-sathe/QwenVL2_VQA_LoRA", device_map="auto")
processor = AutoProcessor.from_pretrained("aneesh-sathe/QwenVL2_VQA_LoRA")

IMAGE_DIR = "test_images"
MAX_NEW_TOKENS = 1024  # upper bound on the number of tokens generated per image

# The prompt is the same for every image, so it is built once, outside the loop.
# Layout: a system turn, one user turn holding a single image placeholder
# followed by the instruction, then the opening of the assistant turn.
text_prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# os.listdir returns entries in arbitrary order; sorting makes the run
# reproducible and lets outputs from different models be compared image by image.
for image_name in sorted(os.listdir(IMAGE_DIR)):
    image_path = os.path.join(IMAGE_DIR, image_name)
    if not os.path.isfile(image_path):
        continue  # skip sub-directories and other non-file entries

    # Preprocess the inputs. The context manager closes the underlying file
    # as soon as the processor has turned the image into tensors.
    with Image.open(image_path) as image:
        inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # Each generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated continuation is decoded.
    generated_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(image_name)  # identify which image the description belongs to
    print(output_text)
    print("----------------------------------")

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.08s/it]


['This image is a diagram from an early 20th-century book on urban planning, specifically illustrating a "group of slumless, smokeless cities." The diagram is designed to show how various services and facilities are interconnected within a central area, providing a structured and orderly living environment. Here is a detailed explanation of the components and their functions:\n\n1. **Central Area (Core)**:\n   - This is the central hub of the system, surrounded by five outer rings.\n   - It includes:\n     - **Reservoir and Waterfall**: Essential for water supply.\n     - **Agricultural College**: Providing educational and agricultural opportunities.\n     - **College for Blind**: Offering education to those with visual impairments.\n     - **New Forests**: A park or recreational area.\n     - **Reservoir & Waterfall**: Additional water source.\n\n2. **Outer Rings**:\n   - **Garden City**: A residential area with good housing.\n     - **Convalescent Homes**: For sick or injured individ

### Base Model

In [53]:
# Load the base (not fine-tuned) checkpoint and its processor for comparison.
# device_map="auto" delegates weight placement to the library, so the inputs
# below follow `model.device` instead of assuming a CUDA device exists.
model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="auto")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

IMAGE_DIR = "test_images"
MAX_NEW_TOKENS = 1024  # upper bound on the number of tokens generated per image

# The prompt is the same for every image, so it is built once, outside the loop.
# Layout: a system turn, one user turn holding a single image placeholder
# followed by the instruction, then the opening of the assistant turn.
text_prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# os.listdir returns entries in arbitrary order; sorting makes the run
# reproducible and lets outputs from different models be compared image by image.
for image_name in sorted(os.listdir(IMAGE_DIR)):
    image_path = os.path.join(IMAGE_DIR, image_name)
    if not os.path.isfile(image_path):
        continue  # skip sub-directories and other non-file entries

    # Preprocess the inputs. The context manager closes the underlying file
    # as soon as the processor has turned the image into tensors.
    with Image.open(image_path) as image:
        inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # Each generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated continuation is decoded.
    generated_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(image_name)  # identify which image the description belongs to
    print(output_text)
    print("----------------------------------")

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.45s/it]


['This image is a detailed plan of a planned urban area, likely from the late 19th or early 20th century. The plan is labeled as "Group of Slumless Smokeless Cities" and is numbered "No. 7." It is a circular map with various sections and labels indicating different components of the planned city.\n\n### Key Elements of the Plan:\n\n1. **Central Area:**\n   - **Population:** 158,000\n   - **Allotments:** 12,000\n   - **Railroad:** Inter-Municipal\n   - **Reservoir & Waterfall:** Central\n   - **Central Park:** Large\n   - **Municipal Railway:** To the Sea\n\n2. **Central Park:**\n   - **Population:** 12,000\n   - **Allotments:** 12,000\n   - **Reservoir & Waterfall:** Central\n   - **Municipal Railway:** To the Sea\n\n3. **Central Park:**\n   - **Population:** 12,000\n   - **Allotments:** 12,000\n   - **Reservoir & Waterfall:** Central\n   - **Municipal Railway:** To the Sea\n\n4. **Central Park:**\n   - **Population:** 12,000\n   - **Allotments:** 12,000\n   - **Reservoir & Waterfall:*