In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
# Load the checkpoint in bfloat16 and place the whole model on the CUDA device.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, device_map="cuda", torch_dtype=torch.bfloat16
)

# Build the processor from the same identifier as the model so the two cannot
# drift apart when `model_id` is changed.
processor = AutoProcessor.from_pretrained(model_id)



In [None]:
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm

# Dataset layout: each variant directory in `prefix` contains one sub-folder
# per entry of `folder_paths_raw`, holding the PNG clock images to evaluate.
folder_paths_raw = ['Test']
prefix = ['Data/standard/','Data/distorted/','Data/thin_hands/']
folder_paths = [p + f for p in prefix for f in folder_paths_raw]

def get_png_files(path):
    """Return the names of the ``.png`` files directly inside ``path``, sorted.

    ``os.listdir`` gives no ordering guarantee, so the names are sorted to keep
    the row order of the results sheet stable from one run to the next.
    """
    return sorted(f for f in os.listdir(path) if f.endswith('.png'))

# Short model name (the part after the last '/') used in column and file names.
model_raw_name = model_id.split('/')[-1]
answer_column = f'answer-{model_raw_name}'

prompt = '''What time is shown on the clock in the given image?'''

# Iterate variants and sub-folders explicitly. Zipping `folder_paths` with
# `prefix` only lines up while `folder_paths_raw` has exactly one entry.
for prefix_tmp in prefix:
    for folder_name in folder_paths_raw:
        folder_path = prefix_tmp + folder_name
        output_path = f'{prefix_tmp}{folder_name}_{model_raw_name}_Results.xlsx'
        # Resume support: a results sheet that already exists is left untouched.
        if os.path.exists(output_path):
            continue

        # Collect one record per image and build the DataFrame once at the end.
        records = []
        for file_name in tqdm(get_png_files(folder_path), desc=f"Processing {folder_path} PNG files"):
            image_path = os.path.join(folder_path, file_name)
            # Close the file handle as soon as the RGB copy has been made.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": image,
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

            # Greedy decoding: sampling is off, so the temperature is unset.
            generated_ids = model.generate(**inputs, max_new_tokens=500, do_sample=False, temperature=None)
            # Drop the prompt tokens so only the newly generated answer is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )

            records.append({
                # Ground truth comes from the file name: extension removed, '_' -> ':'.
                'answer': file_name.replace('.png','').replace('_',':'),
                # batch_decode returns a list; the batch holds a single prompt,
                # so store its decoded string rather than the list itself.
                answer_column: output_text[0],
            })

        df = pd.DataFrame(records, columns=['answer', answer_column])
        df.to_excel(output_path, index=False, engine='openpyxl')

In [None]:
from datasets import load_dataset

# Fetch the clock-face dataset; `ds` keeps everything `load_dataset` returned.
ds = load_dataset(
    "oliverj990/clock-faces-v1-times",
)
# Only the "train" entry is evaluated; the inference loop further down reads
# the 'image' and 'time' fields of each of its rows.
dataset = ds["train"]

In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# NOTE: this repeats the model-loading cell at the top of the notebook;
# presumably so this section can be run on its own from a fresh kernel.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
# Load the checkpoint in bfloat16 and place the whole model on the CUDA device.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, device_map="cuda", torch_dtype=torch.bfloat16
)

# Build the processor from the same identifier as the model so the two cannot
# drift apart when `model_id` is changed.
processor = AutoProcessor.from_pretrained(model_id)



In [None]:
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm


# Short model name (the part after the last '/') used in column and file names.
model_raw_name = model_id.split('/')[-1]
answer_column = f'answer-{model_raw_name}'

prompt = '''What time is shown on the clock in the given image?'''

output_path = f'Results/real_clocks/{model_raw_name}_Results.xlsx'

if os.path.exists(output_path):
    # Results already exist: reuse them. Checking once up front (rather than on
    # every iteration) also ensures the sheet is never overwritten by an empty
    # DataFrame after all rows were skipped.
    df = pd.read_excel(output_path, engine='openpyxl')
else:
    # Create the output directory before the long inference run so the final
    # write cannot fail on a missing folder.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Collect one record per dataset row and build the DataFrame once at the end.
    records = []
    for row in tqdm(dataset, total=len(dataset)):
        image = row['image'].convert("RGB")
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        # Greedy decoding: sampling is off, so the temperature is unset.
        generated_ids = model.generate(**inputs, max_new_tokens=500, do_sample=False, temperature=None)
        # Drop the prompt tokens so only the newly generated answer is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        records.append({
            # Ground truth is taken from the row's 'time' field.
            'answer': row['time'],
            # batch_decode returns a list; the batch holds a single prompt,
            # so store its decoded string rather than the list itself.
            answer_column: output_text[0],
        })

    df = pd.DataFrame(records, columns=['answer', answer_column])
    df.to_excel(output_path, index=False, engine='openpyxl')