In [1]:
import gc
import os
import glob
import torch
from tqdm import tqdm
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

In [40]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The model and its processor are both read from the same local checkpoint
# directory. trust_remote_code=True runs Python code shipped with that
# checkpoint, so only point this at weights you trust.
model = AutoModelForCausalLM.from_pretrained("./weights/model_best", trust_remote_code=True)
model = model.eval().to(device)  # eval mode first, then move to the chosen device
processor = AutoProcessor.from_pretrained("./weights/model_best", trust_remote_code=True)

You are using a model of type davit to instantiate a model of type florence2. This is not supported for all configurations of models and can yield errors.


In [46]:
def infer(image, prompt="What is the printed values?", max_new_tokens=128):
    """Generate a text answer for a single image with the loaded model.

    Relies on the notebook-level ``model``, ``processor`` and ``device``
    objects being defined before this function is called.

    Args:
        image: A PIL image. It is converted to RGB when its mode differs.
        prompt: Text prompt passed to the processor together with the image.
            NOTE(review): the default wording is kept verbatim on purpose --
            presumably it matches the prompt used for fine-tuning; confirm
            before "fixing" the grammar.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The ``"response"`` entry of the processor's post-processed output for
        the first (and only) generated sequence.
    """
    # Hand cached allocator blocks from earlier calls back to the driver.
    torch.cuda.empty_cache()

    if image.mode != "RGB":
        image = image.convert("RGB")

    with torch.inference_mode():
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=max_new_tokens,
            num_beams=3,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        parsed_answer = processor.post_process_generation(
            generated_text,
            task="response",
            image_size=(image.width, image.height),
        )["response"]

    # Tensors created under inference_mode carry no autograd state, so no
    # detach() is needed; dropping the references is enough for empty_cache()
    # to release their memory before returning.
    del inputs, generated_ids
    torch.cuda.empty_cache()

    return parsed_answer

In [None]:
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes
# the "first five" preview below the same on every run and machine.
images_path = sorted(glob.glob("./dataset/energy-meter/val/*.jpg"))

# Preview predictions on a small sample of the validation images.
for image_path in images_path[:5]:
    # resize() returns a new, fully loaded image, so the file handle opened by
    # Image.open can be closed as soon as the resized copy exists.
    # NOTE(review): (256, 128) ignores the source aspect ratio -- presumably
    # it matches the size used during training; confirm.
    with Image.open(image_path) as source_image:
        image = source_image.resize((256, 128))
    label = infer(image)
    display(image)
    print(label, end="\n\n")