In [2]:
!mkdir -p oxford_pets
%cd oxford_pets

!wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
!wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz

!tar -xzf images.tar.gz
!tar -xzf annotations.tar.gz

%cd ..


/content/oxford_pets
--2025-11-22 14:20:40--  https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
Resolving www.robots.ox.ac.uk (www.robots.ox.ac.uk)... 129.67.94.2
Connecting to www.robots.ox.ac.uk (www.robots.ox.ac.uk)|129.67.94.2|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://thor.robots.ox.ac.uk/pets/images.tar.gz [following]
--2025-11-22 14:20:41--  https://thor.robots.ox.ac.uk/pets/images.tar.gz
Resolving thor.robots.ox.ac.uk (thor.robots.ox.ac.uk)... 129.67.95.98
Connecting to thor.robots.ox.ac.uk (thor.robots.ox.ac.uk)|129.67.95.98|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 791918971 (755M) [application/octet-stream]
Saving to: ‘images.tar.gz’


2025-11-22 14:21:22 (18.7 MB/s) - ‘images.tar.gz’ saved [791918971/791918971]

--2025-11-22 14:21:22--  https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz
Resolving www.robots.ox.ac.uk (www.robots.ox.ac.uk)... 129.67.94.2
Connect

In [None]:
import os
from PIL import Image
from collections import Counter
import torch
from tqdm import tqdm

RAW_IMG_DIR = "oxford_pets/images"


def _breed_from_filename(fname):
    """Return the lower-cased breed name: everything before the last underscore."""
    return "_".join(fname.split("_")[:-1]).lower()


# List the directory once and reuse the result; sorting keeps the item order
# (and therefore the debug samples printed during evaluation) deterministic.
jpg_names = sorted(
    fname for fname in os.listdir(RAW_IMG_DIR) if fname.lower().endswith(".jpg")
)

# Class ids are the positions in this alphabetically sorted list of breeds.
classnames = sorted({_breed_from_filename(fname) for fname in jpg_names})
print("Num classes:", len(classnames))
print("Sample classes:", classnames[:10])

# A dict lookup replaces classnames.index(...), which rescanned the list per file.
class_to_idx = {name: idx for idx, name in enumerate(classnames)}

# items: list of (image path, integer class id) pairs.
items = [
    (os.path.join(RAW_IMG_DIR, fname), class_to_idx[_breed_from_filename(fname)])
    for fname in jpg_names
]

print("Total images:", len(items))
print("Sample item:", items[0][0], " label=", items[0][1], classnames[items[0][1]])


from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# A single checkpoint id feeds the tokenizer, the image processor and the model.
model_name = "openai/clip-vit-base-patch32"

tokenizer = CLIPTokenizer.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name)
clip_model = clip_model.to(device)

@torch.no_grad()
def encode_images(pil_images):
    """Embed images with the global CLIP model and L2-normalise each row.

    Args:
        pil_images: images handed straight to the global ``processor``
            (the caller in this notebook passes a list of PIL images).

    Returns:
        A CPU tensor of image features, each scaled to unit norm along the
        last dimension.
    """
    batch = processor(images=pil_images, return_tensors="pt")
    batch = batch.to(device)
    embeddings = clip_model.get_image_features(**batch)
    lengths = embeddings.norm(dim=-1, keepdim=True)
    return (embeddings / lengths).cpu()

@torch.no_grad()
def encode_text(prompts):
    """Embed text prompts with the global CLIP model and L2-normalise each row.

    Args:
        prompts: list of strings to tokenize and encode in a single batch.

    Returns:
        A CPU tensor of text features, each scaled to unit norm along the
        last dimension.
    """
    # truncation=True matches encode_texts_clips later in this notebook: a prompt
    # longer than the tokenizer's maximum length is clipped instead of being
    # passed to the text encoder at full length.
    tokens = tokenizer(
        prompts, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    feats = clip_model.get_text_features(**tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu()

# Baseline prompt: the bare class name (underscores shown as spaces) in a fixed template.
tmpl = "a photo of a {}"
prompts = [tmpl.format(c.replace("_", " ")) for c in classnames]

print("\nExample prompts:")
# Slice rather than index with range(5), so a dataset with fewer than five
# classes prints what it has instead of raising IndexError.
for i, prompt in enumerate(prompts[:5]):
    print(i, prompt)

# One embedding row per class, in classnames order.
text_emb = encode_text(prompts)
print("Text embedding shape:", text_emb.shape)

def zero_shot_eval(items, text_emb, debug=3):
    """Classify each image by its most similar text embedding and return accuracy.

    Args:
        items: sequence of ``(image_path, label)`` pairs; ``label`` indexes the
            global ``classnames`` list.
        text_emb: tensor of text embeddings, one row per class
            (presumably (num_classes, dim) in classnames order - confirm at the call site).
        debug: for the first ``debug`` images, print the true class and the
            three highest-scoring predictions.

    Returns:
        Fraction of images whose arg-max prediction equals the label.

    Raises:
        ValueError: if ``items`` is empty (previously a ZeroDivisionError).
    """
    # Loop-invariant: transpose/convert the text matrix once, not once per image.
    text_matrix_t = text_emb.T.numpy()

    correct = 0
    total = 0

    for idx, (imgpath, label) in enumerate(tqdm(items)):
        # The context manager closes the file handle as soon as the pixels are
        # converted, instead of leaving thousands of handles to the GC.
        with Image.open(imgpath) as raw_img:
            img = raw_img.convert("RGB")
        img_emb = encode_images([img])[0].numpy()
        sims = img_emb @ text_matrix_t
        pred = int(sims.argmax())

        if pred == label:
            correct += 1
        total += 1

        if idx < debug:
            print("\nDEBUG:", os.path.basename(imgpath))
            print(" TRUE:", label, classnames[label])
            top3 = sims.argsort()[-3:][::-1]
            for r in top3:
                print("  PRED", r, classnames[r], float(sims[r]))

    if total == 0:
        raise ValueError("zero_shot_eval needs at least one (image_path, label) item.")
    return correct / total

# Full-dataset baseline run with the plain class-name prompts.
acc = zero_shot_eval(items, text_emb)
print(f"\nZero-shot accuracy: {acc}")


Num classes: 37
Sample classes: ['abyssinian', 'american_bulldog', 'american_pit_bull_terrier', 'basset_hound', 'beagle', 'bengal', 'birman', 'bombay', 'boxer', 'british_shorthair']
Total images: 7390
Sample item: oxford_pets/images/Abyssinian_1.jpg  label= 0 abyssinian

Example prompts:
0 a photo of a abyssinian
1 a photo of a american bulldog
2 a photo of a american pit bull terrier
3 a photo of a basset hound
4 a photo of a beagle
Text embedding shape: torch.Size([37, 512])


  0%|          | 1/7390 [00:00<33:10,  3.71it/s]


DEBUG: Abyssinian_1.jpg
 TRUE: 0 abyssinian
  PRED 0 abyssinian 0.3889784812927246
  PRED 33 sphynx 0.30687782168388367
  PRED 11 egyptian_mau 0.29848116636276245


  0%|          | 2/7390 [00:00<30:37,  4.02it/s]


DEBUG: Abyssinian_10.jpg
 TRUE: 0 abyssinian
  PRED 0 abyssinian 0.2773655354976654
  PRED 11 egyptian_mau 0.2773251533508301
  PRED 33 sphynx 0.26471394300460815


  0%|          | 3/7390 [00:00<30:44,  4.00it/s]


DEBUG: Abyssinian_100.jpg
 TRUE: 0 abyssinian
  PRED 0 abyssinian 0.2988705635070801
  PRED 9 british_shorthair 0.2708992063999176
  PRED 5 bengal 0.24696201086044312


100%|██████████| 7390/7390 [33:17<00:00,  3.70it/s]


Zero-shot accuracy: 0.8220568335588633





In [8]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.109.1
    Uninstalling openai-1.109.1:
      Successfully uninstalled openai-1.109.1
Successfully installed openai-0.28.0


In [2]:
import os
from getpass import getpass

# SECURITY: never paste an API key into a notebook. The key that used to be
# hardcoded here is exposed to everyone with access to this file or its history
# and must be revoked in the OpenAI dashboard.
# Use a key already present in the environment; otherwise prompt for one without
# echoing it or storing it in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [None]:
import os, json, time, math
from pathlib import Path

# Fail fast when no key is available; an unset variable and an empty one are treated alike.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or ""
if OPENAI_API_KEY == "":
    raise RuntimeError("Set OPENAI_API_KEY in your environment or paste it into OPENAI_API_KEY variable in this cell.")

# Chat-completion settings; these become the default arguments of chat_with_gpt.
LLM_NAME = "gpt-5.1"
TEMPERATURE = 0.0
MAX_TOKENS = 1024

# Directory holding the cached model replies (attribute list, per-class sentences).
CACHE_DIR = Path("vdt_gpt_cache")
CACHE_DIR.mkdir(exist_ok=True)

# Drop duplicate class names while keeping first-seen order.
classnames = [*dict.fromkeys(classnames)]
print("Using", len(classnames), "classnames (first 8):", classnames[:8])

import openai
openai.api_key = OPENAI_API_KEY

def chat_with_gpt(system_prompt, user_prompt, model=LLM_NAME, temperature=TEMPERATURE, max_tokens=MAX_TOKENS, max_attempts=4):
    """Send one system+user exchange to the chat API and return the reply text.

    Failed requests are retried with exponential back-off (1 s, 2 s, 4 s, ...).

    Args:
        system_prompt: content of the ``system`` message.
        user_prompt: content of the ``user`` message.
        model: model name passed to ``openai.ChatCompletion.create``.
        temperature: sampling temperature passed through unchanged.
        max_tokens: completion token limit passed through unchanged.
        max_attempts: number of requests to make before giving up
            (default 4, the previously hard-coded value).

    Returns:
        The first choice's message content, stripped of surrounding whitespace.

    Raises:
        RuntimeError: if every attempt fails; the last underlying exception is
            attached as ``__cause__`` so the real reason is not lost.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            resp = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role":"system","content":system_prompt},
                    {"role":"user","content":user_prompt}
                ],
                temperature=temperature,
                max_tokens=max_tokens
            )
            return resp["choices"][0]["message"]["content"].strip()
        except Exception as e:
            last_error = e
            print("OpenAI request failed (attempt", attempt+1, "):", str(e))
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays the error.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
    raise RuntimeError("OpenAI calls failed repeatedly.") from last_error

import ast  # local to this step: literal_eval parses the model reply without executing it

# Step 1: obtain (or load from cache) the list of visual attribute names.
ATTR_CACHE = CACHE_DIR / "attributes.json"
if ATTR_CACHE.exists():
    attributes = json.loads(ATTR_CACHE.read_text())
    print("Loaded attributes from cache:", attributes)
else:
    system = "You are ChatGPT. Return only a Python list of strings (no prose)."
    user = f"""
We will create visually descriptive attributes for the Oxford-IIIT Pets dataset to build visual descriptors (VDT).
Using only the following 37 breed names (exactly): {classnames}

Return exactly 20 short attribute names (one- or few-word each) that are visually observable in photos and useful to distinguish these pet breeds.
Examples of attribute categories: coat color, coat pattern, fur length, ear shape, tail shape, face flatness, body size, etc.

Return the output as a Python list literal like:
["attribute1", "attribute2", ...]
Do not add any explanation text.
"""
    raw = chat_with_gpt(system, user)
    try:
        # The reply is untrusted text from a remote service: ast.literal_eval
        # accepts only literals, whereas eval() would run arbitrary code.
        parsed = ast.literal_eval(raw)
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if not isinstance(parsed, (list, tuple)):
            raise TypeError("model reply is not a list literal")
        attributes = [str(a).strip() for a in parsed][:20]
    except Exception:
        # Best-effort fallback: treat each non-empty line as one attribute name.
        attributes = [line.strip("- ").strip() for line in raw.splitlines() if line.strip()]
        attributes = attributes[:20]
    print("Attributes:", attributes)
    with open(ATTR_CACHE, "w") as f:
        json.dump(attributes, f, indent=2)

import ast  # local to this step: literal_eval parses the model reply without executing it


def _parse_sentence_list(raw):
    """Turn a model reply into a list of stripped strings without executing it.

    Tries JSON first, then a Python literal. A reply that decodes to a string is
    decoded once more as JSON. If neither parser yields a list or tuple, every
    non-empty line of the reply (leading/trailing "- " removed) becomes one entry.
    """
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(raw)
            if isinstance(parsed, str):
                parsed = json.loads(parsed)
        except Exception:
            continue
        # Only a real sequence is accepted; a dict or number falls through.
        if isinstance(parsed, (list, tuple)):
            return [str(s).strip() for s in parsed]
    return [line.strip("- ").strip() for line in raw.splitlines() if line.strip()]


# Step 2: obtain (or load from cache) one sentence per attribute for every class.
VDT_CACHE = CACHE_DIR / "vdt_sentences.json"
vdt = {}
if VDT_CACHE.exists():
    vdt = json.loads(VDT_CACHE.read_text())
    print("Loaded per-class VDT from cache with", len(vdt), "classes.")
else:
    # The system prompt used to request a "Python dictionary" while the user
    # prompt asks for a list; both now ask for the same shape.
    system = "You are ChatGPT. Return only a valid JSON array of strings (no extra text)."
    for cls in classnames:
        cls_safe = cls.replace("_", " ")
        # Per-class cache file lets an interrupted run resume without repeating requests.
        cache_file = CACHE_DIR / f"vdt_{cls}.json"
        if cache_file.exists():
            v = json.loads(cache_file.read_text())
            vdt[cls] = v
            print("Loaded cached", cls)
            continue

        user = f"""
Using the attribute list: {attributes}

For the class "{cls_safe}", produce EXACTLY {len(attributes)} short sentences (one sentence per attribute), describing how that attribute appears in this breed in photographs.
- Each sentence should be 6-20 words.
- Focus only on visual, image-observable features (color, patterns, ear/tail shape, fur length, body size, face, posture).
- Return a JSON array (Python list) of strings, with each sentence corresponding to the attribute at the same index in the attribute list.
Example expected output:
["Short dense coat with spotted pattern.", "Fur color is orange with black rosettes.", ...]

Return only the JSON list of strings. No commentary. If an attribute is not applicable, write a short sentence like "Not visually distinctive."
"""
        raw = chat_with_gpt(system, user)

        # The reply is untrusted text: it is parsed, never eval()'d.
        arr = _parse_sentence_list(raw)

        # Force exactly one sentence per attribute: pad short replies, trim long ones.
        if len(arr) < len(attributes):
            arr += ["Not visually distinctive."] * (len(attributes)-len(arr))
        else:
            arr = arr[:len(attributes)]
        vdt[cls] = arr
        cache_file.write_text(json.dumps(arr, indent=2))
        print("Saved VDT for", cls)
        time.sleep(0.5)
    VDT_CACHE.write_text(json.dumps(vdt, indent=2))

print("VDT generation done. Classes collected:", len(vdt))

from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The same checkpoint id is used for the tokenizer, the image processor and the model.
model_name = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizer.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name)
clip_model = clip_model.to(device)

import torch
@torch.no_grad()
def encode_texts_clips(prompts, batch_size=64):
    """Embed prompts with the global CLIP model in mini-batches.

    Args:
        prompts: list of strings to encode.
        batch_size: number of prompts tokenized and encoded per forward pass
            (default 64, the previously hard-coded value).

    Returns:
        A CPU tensor with one unit-norm feature row per prompt, in input order.
    """
    all_emb = []
    for start in range(0, len(prompts), batch_size):
        batch = prompts[start:start + batch_size]
        # truncation=True clips prompts that exceed the tokenizer's maximum length.
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        feats = clip_model.get_text_features(**tokens)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        # Move each batch to the CPU before concatenating.
        all_emb.append(feats.cpu())
    return torch.cat(all_emb, dim=0)

# Each prompt is the baseline phrase followed by one descriptive sentence.
PROMPT_TEMPLATE = "a photo of a {classname}. {sentence}"

# idx_to_pair[k] records which (class, sentence index) produced all_prompts[k].
idx_to_pair = [
    (cls, j)
    for cls in classnames
    for j in range(len(vdt[cls]))
]
all_prompts = [
    PROMPT_TEMPLATE.format(classname=cls.replace("_", " "), sentence=vdt[cls][j])
    for cls, j in idx_to_pair
]

print("Total prompts:", len(all_prompts), "-> should be len(classes) * len(attributes) =", len(classnames)*len(attributes))

text_embeddings = encode_texts_clips(all_prompts)
print("Encoded text embeddings shape:", text_embeddings.shape)

import numpy as np
D = text_embeddings.shape[1]

# Running sum of prompt embeddings per class (None until the first one arrives)
# and the number of prompts that contributed to each sum.
class_embs = dict.fromkeys(classnames)
counts = dict.fromkeys(classnames, 0)
for i in range(len(idx_to_pair)):
    cls, j = idx_to_pair[i]
    vec = text_embeddings[i].numpy()
    if class_embs[cls] is not None:
        class_embs[cls] += vec
    else:
        # Copy so the accumulator does not alias the tensor's memory.
        class_embs[cls] = vec.copy()
    counts[cls] += 1

# Mean over the class's prompts, then rescale to unit length (skipped for a zero vector).
for cls, summed in class_embs.items():
    mean_vec = summed / max(1, counts[cls])
    norm = np.linalg.norm(mean_vec)
    class_embs[cls] = mean_vec / norm if norm > 0 else mean_vec

import torch
# Rows follow classnames order so a row index doubles as the class label.
class_rows = [torch.from_numpy(class_embs[c]) for c in classnames]
text_emb_vdt = torch.stack(class_rows, dim=0)
print("Aggregated class embedding matrix shape:", text_emb_vdt.shape)

from PIL import Image
@torch.no_grad()
def encode_images_batch(pil_images):
    """Embed images with the global CLIP model and L2-normalise each row.

    Args:
        pil_images: images handed straight to the global ``processor``
            (the caller in this notebook passes a list of PIL images).

    Returns:
        A CPU tensor of image features, each scaled to unit norm along the
        last dimension.
    """
    batch = processor(images=pil_images, return_tensors="pt")
    batch = batch.to(device)
    embeddings = clip_model.get_image_features(**batch)
    lengths = embeddings.norm(dim=-1, keepdim=True)
    return (embeddings / lengths).cpu()

def zero_shot_eval_with_emb(items, text_emb_matrix, max_images=None, debug=3):
    """Classify images against a class-embedding matrix and return accuracy.

    Args:
        items: sequence of ``(image_path, label)`` pairs; ``label`` indexes the
            global ``classnames`` list.
        text_emb_matrix: tensor with one embedding row per class
            (presumably (num_classes, dim) in classnames order - confirm at the call site).
        max_images: if truthy, evaluate only the first ``max_images`` items;
            ``None`` or 0 evaluates everything.
        debug: for the first ``debug`` images, print the true class and the
            three highest-scoring predictions.

    Returns:
        Fraction of evaluated images whose arg-max prediction equals the label.

    Raises:
        ValueError: if there is nothing to evaluate (previously a ZeroDivisionError).
    """
    # Loop-invariant: convert/transpose the class matrix once, not once per image.
    class_matrix_t = text_emb_matrix.numpy().T
    subset = items[:max_images] if max_images else items

    correct = 0
    total = 0
    for idx, (imgpath, label) in enumerate(subset):
        # The context manager closes the file handle once the pixels are converted.
        with Image.open(imgpath) as raw_img:
            img = raw_img.convert("RGB")
        img_emb = encode_images_batch([img])[0].numpy()
        sims = img_emb @ class_matrix_t
        pred = int(sims.argmax())
        if pred == label:
            correct += 1
        total += 1
        if idx < debug:
            top3 = sims.argsort()[-3:][::-1]
            print("DBG:", Path(imgpath).name, "true:", classnames[label])
            for r in top3:
                print("   pred", r, classnames[r], float(sims[r]))

    if total == 0:
        raise ValueError("zero_shot_eval_with_emb needs at least one (image_path, label) item.")
    return correct / total

# Build the plain class-name embeddings only if an earlier run has not already defined them.
if "baseline_text_emb" not in globals():
    baseline_prompts = [f"a photo of a {c.replace('_',' ')}" for c in classnames]
    baseline_text_emb = encode_texts_clips(baseline_prompts)

# Both evaluations use the same first 500 items so the two numbers are comparable.
print("Evaluating baseline (simple prompt) on 500 images (quick) ...")
acc_baseline = zero_shot_eval_with_emb(items, baseline_text_emb, max_images=500, debug=2)
print("Baseline (500) accuracy:", acc_baseline)

print("Evaluating VDT-GPT aggregated embeddings on same subset ...")
acc_vdt = zero_shot_eval_with_emb(items, text_emb_vdt, max_images=500, debug=2)
print("VDT-GPT (500) accuracy:", acc_vdt)

# Persist the descriptors together with both accuracies.
results_payload = {
    "attributes": attributes,
    "vdt": vdt,
    "acc_baseline_500": acc_baseline,
    "acc_vdt_500": acc_vdt,
}
with open(CACHE_DIR / "vdt_results.json", "w") as f:
    json.dump(results_payload, f, indent=2)

print("Saved VDT outputs and results to", CACHE_DIR)


Using 37 classnames (first 8): ['abyssinian', 'american_bulldog', 'american_pit_bull_terrier', 'basset_hound', 'beagle', 'bengal', 'birman', 'bombay']
OpenAI request failed (attempt 1 ): You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
OpenAI request failed (attempt 2 ): You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
OpenAI request failed (attempt 3 ): You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
OpenAI request failed (attempt 4 ): You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: h

RuntimeError: OpenAI calls failed repeatedly.

In [None]:
import json
import numpy as np
import torch
from tqdm import tqdm
from PIL import Image

# Pre-generated descriptors: a JSON object with an "attributes" list and a
# "vdt" mapping from class name to its sentences.
VDT_PATH = "vdt_attributes_sentences.json"

with open(VDT_PATH, "r") as f:
    vdt_data = json.load(f)

attributes, vdt = vdt_data["attributes"], vdt_data["vdt"]

print("Loaded", len(attributes), "attributes")
print("Loaded VDT entries:", len(vdt))

# Each prompt is the baseline phrase followed by one descriptive sentence.
PROMPT_TEMPLATE = "a photo of a {classname}. {sentence}"

# idx_to_pair[k] records which (class, sentence index) produced all_prompts[k].
idx_to_pair = [
    (cls, j)
    for cls in classnames
    for j in range(len(vdt[cls]))
]
all_prompts = [
    PROMPT_TEMPLATE.format(classname=cls.replace("_", " "), sentence=vdt[cls][j])
    for cls, j in idx_to_pair
]

print("Total prompts:", len(all_prompts))


@torch.no_grad()
def encode_texts_clips(prompts, batch_size=64):
    """Embed prompts with the global CLIP model in mini-batches.

    Args:
        prompts: list of strings to encode.
        batch_size: number of prompts tokenized and encoded per forward pass
            (default 64, the previously hard-coded value).

    Returns:
        A CPU tensor with one unit-norm feature row per prompt, in input order.
    """
    all_embs = []
    for start in range(0, len(prompts), batch_size):
        batch = prompts[start:start + batch_size]
        # truncation=True clips prompts that exceed the tokenizer's maximum length.
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        feats = clip_model.get_text_features(**tokens)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        # Move each batch to the CPU before concatenating.
        all_embs.append(feats.cpu())
    return torch.cat(all_embs, dim=0)

print("Encoding VDT prompts...")
text_emb_vdt_all = encode_texts_clips(all_prompts)
print("VDT text embedding shape:", text_emb_vdt_all.shape)

# Group the prompt embeddings by the class that produced them.
class_embs = {cls: [] for cls in classnames}

for emb, (cls, attr_idx) in zip(text_emb_vdt_all, idx_to_pair):
    class_embs[cls].append(emb.numpy())

# One row per class: mean of its prompt embeddings, rescaled to unit length.
class_mean_rows = []
for cls in classnames:
    arr = np.stack(class_embs[cls], axis=0).mean(axis=0)
    arr = arr / np.linalg.norm(arr)
    class_mean_rows.append(arr)

# Stack into a single ndarray before handing it to torch: torch.tensor() on a
# list of ndarrays goes through a slow path and emits a UserWarning.
# Rows follow classnames order so a row index doubles as the class label.
final_class_embs = torch.from_numpy(np.stack(class_mean_rows, axis=0))
print("Final class embedding matrix:", final_class_embs.shape)

@torch.no_grad()
def encode_image(img):
    """Embed a single image with the global CLIP model.

    Args:
        img: one image, wrapped in a one-element list for the global ``processor``.

    Returns:
        The image's feature vector as a NumPy array, scaled to unit norm.
    """
    batch = processor(images=[img], return_tensors="pt")
    batch = batch.to(device)
    # The batch holds exactly one image; take its row.
    embedding = clip_model.get_image_features(**batch)[0]
    unit = embedding / embedding.norm()
    return unit.cpu().numpy()

def zero_shot_eval_vdt(items, class_emb_matrix, debug=3):
    """Classify each image against the class-embedding matrix and return accuracy.

    Args:
        items: sequence of ``(image_path, label)`` pairs; ``label`` indexes the
            global ``classnames`` list.
        class_emb_matrix: tensor with one embedding row per class
            (presumably (num_classes, dim) in classnames order - confirm at the call site).
        debug: for the first ``debug`` images, print the true class and the
            three highest-scoring predictions.

    Returns:
        Fraction of images whose arg-max prediction equals the label.

    Raises:
        ValueError: if ``items`` is empty (previously a ZeroDivisionError).
    """
    # Loop-invariant: convert/transpose the class matrix once, not once per image.
    class_matrix_t = class_emb_matrix.numpy().T

    correct = 0
    total = 0
    for idx, (imgpath, label) in enumerate(tqdm(items)):
        # The context manager closes the file handle once the pixels are converted.
        with Image.open(imgpath) as raw_img:
            img = raw_img.convert("RGB")
        img_emb = encode_image(img)
        sims = img_emb @ class_matrix_t

        pred = int(np.argmax(sims))
        if pred == label:
            correct += 1
        total += 1

        if idx < debug:
            print("\nDEBUG:", imgpath)
            top3 = sims.argsort()[-3:][::-1]
            print(" TRUE:", classnames[label])
            for k in top3:
                print(" PRED:", classnames[k], " score=", float(sims[k]))

    if total == 0:
        raise ValueError("zero_shot_eval_vdt needs at least one (image_path, label) item.")
    return correct / total

# Full-dataset run with the descriptor-averaged class embeddings.
print("Running VDT-enhanced zero-shot evaluation...")
vdt_acc = zero_shot_eval_vdt(items, final_class_embs)

banner = "======================================="
print("\n" + banner)
print(f"VDT Zero-shot accuracy: {vdt_acc}")
print(banner)


Loaded 20 attributes
Loaded VDT entries: 37
Total prompts: 740
Encoding VDT prompts...


  final_class_embs = torch.tensor(final_class_embs)  # (37, 512)


VDT text embedding shape: torch.Size([740, 512])
Final class embedding matrix: torch.Size([37, 512])
Running VDT-enhanced zero-shot evaluation...


  0%|          | 1/7390 [00:00<35:50,  3.44it/s]


DEBUG: oxford_pets/images/Abyssinian_1.jpg
 TRUE: abyssinian
 PRED: abyssinian  score= 0.3985186815261841
 PRED: bengal  score= 0.3210342526435852
 PRED: sphynx  score= 0.3201451599597931


  0%|          | 2/7390 [00:00<33:43,  3.65it/s]


DEBUG: oxford_pets/images/Abyssinian_10.jpg
 TRUE: abyssinian
 PRED: egyptian_mau  score= 0.28052714467048645
 PRED: abyssinian  score= 0.27945488691329956
 PRED: siamese  score= 0.27397793531417847


  0%|          | 3/7390 [00:00<31:55,  3.86it/s]


DEBUG: oxford_pets/images/Abyssinian_100.jpg
 TRUE: abyssinian
 PRED: abyssinian  score= 0.3075641393661499
 PRED: british_shorthair  score= 0.28348308801651
 PRED: bengal  score= 0.2733459770679474


100%|██████████| 7390/7390 [33:32<00:00,  3.67it/s]


VDT Zero-shot accuracy: 0.8525033829499323



