In [18]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW,AutoModelForCausalLM, AutoProcessor
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from PIL import ImageOps,Image
import cv2
import re
import os
import numpy as np

In [8]:
# Chat-template fragments that run_example stitches together into a prompt.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

# Hugging Face checkpoint identifier shared by the model and its processor.
model_id = "microsoft/Phi-3.5-vision-instruct"

# Half-precision weights; `device_map="auto"` leaves device placement to the loader.
# NOTE(review): trust_remote_code=True lets the checkpoint's own Python code run
# locally -- confirm the repository revision is trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto"
)

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# The tokenizer is configured to pad on the left-hand side.
processor.tokenizer.padding_side = 'left'

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


In [26]:
def _pad_to_near_square(img, target_aspect_ratio=1.2):
    """Pad ``img`` symmetrically so width/height moves to ``target_aspect_ratio``.

    Args:
        img: A PIL image (anything exposing ``.size`` that ``ImageOps.expand``
            accepts).
        target_aspect_ratio: Desired width / height ratio.

    Returns:
        The padded image, or ``img`` itself when its ratio already equals the
        target. Borders use ``ImageOps.expand``'s default fill.
    """
    width, height = img.size
    aspect_ratio = width / height

    if aspect_ratio < target_aspect_ratio:
        # Too narrow: add equal borders on the left and right.
        new_width = int(target_aspect_ratio * height)
        padding = (new_width - width) // 2
        return ImageOps.expand(img, (padding, 0, padding, 0))

    if aspect_ratio > target_aspect_ratio:
        # Too wide: add equal borders on the top and bottom.
        new_height = int(width / target_aspect_ratio)
        padding = (new_height - height) // 2
        return ImageOps.expand(img, (0, padding, 0, padding))

    return img


@torch.inference_mode()
def run_example(image, text_input=None, target_aspect_ratio=1.2):
    """Run one image + instruction through the model and return the decoded reply.

    Relies on the module-level ``model``, ``processor``, ``user_prompt``,
    ``assistant_prompt`` and ``prompt_suffix``.

    Args:
        image: PIL image to query; it is padded towards ``target_aspect_ratio``
            before being handed to the processor.
        text_input: Instruction placed after the ``<|image_1|>`` placeholder.
            ``None`` is treated as an empty instruction.
        target_aspect_ratio: Width / height ratio used for padding.

    Returns:
        The first decoded sequence, containing only newly generated tokens
        (the prompt tokens are sliced off), with special tokens skipped.
    """
    image = _pad_to_near_square(image, target_aspect_ratio=target_aspect_ratio)

    # Without this, the default None would be rendered as the literal text "None".
    instruction = "" if text_input is None else text_input
    prompt = f"{user_prompt}<|image_1|>\n{instruction}{prompt_suffix}{assistant_prompt}"

    inputs = processor(prompt, image, return_tensors="pt").to(model.device)

    # The inference_mode decorator already disables gradient tracking, so no
    # additional no_grad context is needed around generation.
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # Drop the echoed prompt tokens so only the model's continuation is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return response

In [29]:
import cv2

def split_image_into_quarters(image_path):
    """Read an image with OpenCV and cut it into a 3x3 grid of tiles.

    Despite the name, nine tiles are produced, not four. Row and column
    boundaries sit at integer thirds of the height and width; the last row and
    last column absorb any remainder pixels.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        A list of nine array slices in row-major order: top-left, top-center,
        top-right, middle-left, ..., bottom-right. Pixel data is exactly what
        ``cv2.imread`` returned (OpenCV loads colour images in BGR order), so
        callers that need RGB must convert.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``.
    """
    grid = cv2.imread(image_path)
    if grid is None:
        raise ValueError(f"Image at path {image_path} could not be loaded.")

    rows, cols, _ = grid.shape
    row_step = rows // 3
    col_step = cols // 3

    # Three horizontal bands and three vertical bands; the final band in each
    # direction is open-ended so it runs to the image edge.
    row_bands = (
        slice(None, row_step),
        slice(row_step, 2 * row_step),
        slice(2 * row_step, None),
    )
    col_bands = (
        slice(None, col_step),
        slice(col_step, 2 * col_step),
        slice(2 * col_step, None),
    )

    # Outer loop over rows, inner over columns -> row-major tile order.
    return [grid[band_r, band_c] for band_r in row_bands for band_c in col_bands]

In [30]:
# Path to the test annotations file
annotations_file = "/home/student_resource 3/dataset/test.csv"
df = pd.read_csv(annotations_file)

# Only the first 100 rows are processed in this run.
data = df.iloc[0:100]

# One {'index', 'prediction'} record per processed row.
results = []

for idx, row in tqdm(data.iterrows(), total=len(data), desc="Inference Progress"):
    # Resolve the id before the try block so the error handler below always
    # reports the current row's id (never an undefined or stale one).
    entity_id = row.get('index', 0)

    try:
        image_id = os.path.basename(row['image_link'])
        entity_name = row.get('entity_name', None)

        if entity_name is None:
            print(f"Missing entity_name at row {idx}, skipping...")
            continue

        image_path = os.path.join("/home/images/test", image_id)

        # convert() returns an independent image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Construct the text input for the Phi model
        text_input = f"Extract the value of {entity_name} from the image. Ensure the extracted value is followed by its corresponding unit, if no unit is present give the next best one. Format the result as '<|value|> <|unit|>'."

        # Run inference using the Phi model
        final_output = run_example(image, text_input)

        # No digit in the full-image answer: retry on each tile of the 3x3 grid
        # and keep the first tile answer that contains a digit.
        if not re.search(r'\d', final_output):
            for tile in split_image_into_quarters(image_path):
                # Tiles come from cv2.imread (BGR); PIL expects RGB.
                tile_rgb = cv2.cvtColor(tile, cv2.COLOR_BGR2RGB)
                final_output = run_example(Image.fromarray(tile_rgb), text_input)
                if re.search(r'\d', final_output):
                    break
            else:
                # Runs only when no tile produced a digit; a successful tile
                # answer is kept instead of being overwritten.
                final_output = ""

        # Append results
        results.append({
            'index': entity_id,
            'prediction': final_output
        })

    except Exception as e:
        print(f"Error at row {idx}: {e}")

        # Append results even if an error occurs
        results.append({
            'index': entity_id,
            'prediction': ""
        })

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('/home/inference0to10k22.csv', index=False)
print("Inference results for partial dataset saved to CSV.")

Inference Progress: 100%|██████████| 100/100 [02:52<00:00,  1.73s/it]

Inference results for partial dataset saved to CSV.



