# Python scripts: testing and development

In [3]:
import os
import glob
import torch
import open_clip # open_clip_torch
from PIL import Image
import faiss
from openai import AzureOpenAI
import numpy as np
from base64 import b64encode



  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# -------------------------------
# 1️⃣ SETUP
# -------------------------------

# Azure OpenAI client. No endpoint or key is passed here, so they are presumably
# resolved from the environment (AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY)
# -- NOTE(review): confirm these are set before running.
client = AzureOpenAI(
    api_version="2024-02-01"
)

# Load the open_clip model, its image preprocessing transform and the matching tokenizer.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-B-32')

model = model.to(device)
# open_clip returns the model in train mode; switch to eval mode because the
# notebook only runs inference (encode_image / encode_text).
model.eval()


In [5]:
# -------------------------------
# 2️⃣ IMAGE EMBEDDING & INDEX
# -------------------------------

image_folder = "images/"
# Sort the paths: glob order is filesystem-dependent, and row i of the index
# must map to the same file on every run.
image_paths = sorted(glob.glob(os.path.join(image_folder, "*.png")))

# Fail early with a clear message instead of an opaque np.vstack error below.
if not image_paths:
    raise ValueError(f"No .png images found in {image_folder!r}")

embedding_rows = []  # one (1, dim) array per image
image_ids = []       # image_ids[i] is the file path for row i of the index

for img_path in image_paths:
    # Use a context manager so the file handle is released after preprocessing.
    with Image.open(img_path) as pil_image:
        image = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        img_emb = model.encode_image(image)
        # L2-normalise so that inner product equals cosine similarity.
        img_emb /= img_emb.norm(dim=-1, keepdim=True)
        img_emb = img_emb.cpu().numpy()
    embedding_rows.append(img_emb)
    image_ids.append(img_path)

# Stack into a single (n_images, dim) float32 matrix, the dtype FAISS expects.
image_embeddings = np.vstack(embedding_rows).astype('float32')

# Exact inner-product index over the normalised image embeddings.
index = faiss.IndexFlatIP(image_embeddings.shape[1])
index.add(image_embeddings)


In [20]:

# Natural-language query: embedded below for image retrieval and also sent to the LLM.
user_question = "how many feed forward layers in the given image"

In [21]:
# -------------------------------
# 3️⃣ TEXT EMBEDDING
# -------------------------------


# Tokenise the question as a batch of one and place it on the model's device.
text_tokens = tokenizer([user_question]).to(device)

with torch.no_grad():
    text_features = model.encode_text(text_tokens)
    # Scale to unit length so it is comparable with the normalised image embeddings.
    feature_norm = text_features.norm(dim=-1, keepdim=True)
    text_features = text_features / feature_norm
    # Move to host memory as a float32 array for the FAISS search.
    text_emb = text_features.cpu().numpy().astype('float32')



In [22]:
# -------------------------------
# 4️⃣ SEARCH
# -------------------------------

# Number of nearest images to retrieve for the query.
k = 1
# D holds the inner-product scores, I the matching row ids (both shaped (1, k)).
D, I = index.search(text_emb, k)
# FAISS pads with -1 when fewer than k results exist; skip those ids so they
# do not silently resolve to image_ids[-1] (the last image).
top_image_paths = [image_ids[i] for i in I[0] if i >= 0]

print("Top matches:", top_image_paths)

Top matches: ['images/09a9f4be-8057-43d7-bdf2-a20404ab2165.png']


In [23]:
# -------------------------------
# 5️⃣ AZURE GPT-4o-mini
# -------------------------------

# Text summary of the retrieval step. Note: it is not included in the request below.
context = f"The user asked: '{user_question}'. Relevant images: {', '.join(top_image_paths)}."

# Only the best-ranked image is forwarded to the model.
img_path = top_image_paths[0]

# Read the image bytes and inline them as a base64 data URL.
with open(img_path, "rb") as image_file:
    image_b64 = b64encode(image_file.read()).decode("utf-8")
data_url = f"data:image/png;base64,{image_b64}"

# System instruction restricting the answer to what is visible in the image.
system_message = {
    "role": "system",
    "content": "You are an expert vision assistant. Answer the query from the given image ONLY, if no answer will found then say you don't know.",
}

# User turn: the question text followed by the image itself.
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": f"user query: {user_question}"},
        {"type": "image_url", "image_url": {"url": data_url}},
    ],
}

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[system_message, user_message],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
)

print("\n💡 LLM Response:")
print(response.choices[0].message.content)



💡 LLM Response:
I don't know.
