# Determininig Model Performance on the Oxford Pet Dataset
This notebook has been created with the intent to showcase our work in terms of determining out-of-the-box model performances on a commonly used dataset. We start by importing the necessary libraries and then we move on to code utility functions for the task and data management.

In [None]:
import os
import random
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from torchvision import transforms
from transformers import CLIPModel, CLIPProcessor, CLIPImageProcessor

In [None]:
# Extract categories from the file directory
categories = set()
directory = '../oxford_pets/images'
file_list = list(os.walk(directory))[0][2]

for filename in file_list:
    index = filename.rfind('_')
    categories.add(filename[:index])

print(categories)

{'chihuahua', 'samoyed', 'yorkshire_terrier', 'leonberger', 'basset_hound', 'pomeranian', 'Egyptian_Mau', 'Maine_Coon', 'english_setter', 'Ragdoll', 'Abyssinian', 'newfoundland', 'boxer', 'Russian_Blue', 'Persian', 'american_bulldog', 'Siamese', 'pug', 'staffordshire_bull_terrier', 'saint_bernard', 'great_pyrenees', 'beagle', 'Birman', 'scottish_terrier', 'keeshond', 'wheaten_terrier', 'Sphynx', 'german_shorthaired', 'Bombay', 'havanese', 'miniature_pinscher', 'shiba_inu', 'english_cocker_spaniel', 'British_Shorthair', 'Bengal', 'american_pit_bull_terrier', 'japanese_chin'}


In [2]:
len(categories)

37

In [None]:

categories = set()
directory = '../oxford_pets/images'
file_list = [f for f in os.listdir(directory) if f.endswith('.jpg')]

# Extract categories
for filename in file_list:
    index = filename.rfind('_')
    categories.add(filename[:index])

# Create query and gallery directories if they don't exist
gallery_dir = os.path.join(directory, 'gallery')
query_dir = os.path.join(directory, 'query')
os.makedirs(gallery_dir, exist_ok=True)
os.makedirs(query_dir, exist_ok=True)

for cat in categories:
    filtered = [f for f in file_list if f.startswith(cat + '_')]
    if not filtered:
        continue
    random.seed(1)
    sampled_query = random.choice(filtered)
    filtered.remove(sampled_query)

    # Move sampled query image
    src_query = os.path.join(directory, sampled_query)
    dst_query = os.path.join(query_dir, sampled_query)
    if os.path.exists(src_query):
        os.rename(src_query, dst_query)
    else:
        print(f"File not found: {src_query}")

    # Move gallery images
    cat_gallery_dir = os.path.join(gallery_dir, cat)
    os.makedirs(cat_gallery_dir, exist_ok=True)
    for file in filtered:
        src_gallery = os.path.join(directory, file)
        dst_gallery = os.path.join(cat_gallery_dir, file)
        if os.path.exists(src_gallery):
            os.rename(src_gallery, dst_gallery)
        else:
            print(f"File not found: {src_gallery}")

# ResNet
The first model we experimented with was ResNet due to its less convoluted structure compared to other architectures used afterwards.

In [None]:
# 1. Paths to gallery and query directories
gallery_dir = '../oxford_pets/images/gallery'
query_dir = '../oxford_pets/images/query'

# 2. Dataset and DataLoader
class ImageOnlyDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_paths = []
        # Recursively collect all image files
        for root, _, files in os.walk(image_dir):
            for fname in files:
                if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.image_paths.append(os.path.join(root, fname))
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, img_path

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

gallery_dataset = ImageOnlyDataset(gallery_dir, transform=transform)
query_dataset = ImageOnlyDataset(query_dir, transform=transform)
gallery_loader = DataLoader(gallery_dataset, batch_size=32, shuffle=False)
query_loader = DataLoader(query_dataset, batch_size=32, shuffle=False)

# 3. Load Model (ResNet18, as in your notebook)
model = models.resnet18(weights="IMAGENET1K_V1")
# Remove the final classification layer to get embeddings
model.fc = nn.Identity()

# Move to device
if torch.cuda.is_available():
    device = 'cuda'
elif torch.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
model = model.to(device)

# 4. Extract Embeddings
gallery_embeddings = []
query_embeddings = []
gallery_paths = []
query_paths = []

model.eval()
with torch.no_grad():
    for images, paths in gallery_loader:
        images = images.to(device)
        emb = model(images)
        gallery_embeddings.append(emb.cpu().numpy())
        gallery_paths.extend(paths)
    for images, paths in query_loader:
        images = images.to(device)
        emb = model(images)
        query_embeddings.append(emb.cpu().numpy())
        query_paths.extend(paths)

gallery_embeddings = np.vstack(gallery_embeddings)
query_embeddings = np.vstack(query_embeddings)

# 5. Compute Cosine Similarity and Top-k Retrievals
similarity_matrix = cosine_similarity(query_embeddings, gallery_embeddings)
top_k = 10 
topk_indices = np.argsort(-similarity_matrix, axis=1)[:, :top_k]

def get_breed(filename):
    base = os.path.basename(filename)
    return base[:base.rfind('_')]

In [2]:
# 6. Compute Metrics
recall_correct = 0
accuracy_correct = 0
precision_sum = 0

for i, indices in enumerate(topk_indices):
    query_breed = get_breed(query_paths[i])
    retrieved_breeds = [get_breed(gallery_paths[idx]) for idx in indices]
    # Recall@k: at least one correct in top-k
    if query_breed in retrieved_breeds:
        recall_correct += 1
    # Accuracy@1: top-1 is correct
    if query_breed == retrieved_breeds[0]:
        accuracy_correct += 1
    # Precision@k: fraction of correct in top-k
    precision_sum += retrieved_breeds.count(query_breed) / top_k

recall_at_k = recall_correct / len(query_paths)
accuracy_at_1 = accuracy_correct / len(query_paths)
precision_at_k = precision_sum / len(query_paths)

print(f"Recall@{top_k}: {recall_at_k:.4f}")
print(f"Accuracy@1: {accuracy_at_1:.4f}")
print(f"Precision@{top_k}: {precision_at_k:.4f}")

Recall@10: 1.0000
Accuracy@1: 0.9189
Precision@10: 0.8568


# CLIP
In our tests, CLIP proved to be the best model due to its solid performance across all metrics considered.

In [None]:

transform = transforms.Compose([
    transforms.Resize(336, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(336),
    transforms.ToTensor(),
    transforms.Normalize([0.4815, 0.4578, 0.4082], [0.2686, 0.2613, 0.2758])
])

In [None]:
# 1. Paths to gallery and query directories
gallery_dir = "/home/disi/oxford_pets/images/gallery"
query_dir = '/home/disi/oxford_pets/images/query'

# 2. Dataset and DataLoader for CLIP
class CLIPImageDataset(Dataset):
    def __init__(self, image_dir, processor):
        """
        image_dir: directory con immagini
        processor: istanza di CLIPProcessor da Hugging Face
        """
        self.image_dir = image_dir
        self.image_paths = []
        for root, _, files in os.walk(image_dir):
            for fname in files:
                if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.image_paths.append(os.path.join(root, fname))
        
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        
        # CLIPImageProcessor returns a tensor directly
        pixel_values = self.processor(image, return_tensors="pt")["pixel_values"].squeeze(0)
        return pixel_values, img_path

# 3. Load CLIP model and processor
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336").vision_model.to(device)  # Only vision part

print('after loading processor')
# 4. DataLoaders
def collate_fn(batch):
    images, paths = zip(*batch)
    images = torch.stack(images, dim=0)
    return images, list(paths)

gallery_dataset = CLIPImageDataset(gallery_dir, processor=processor)
query_dataset = CLIPImageDataset(query_dir, processor=processor)
gallery_loader = DataLoader(gallery_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
query_loader = DataLoader(query_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

print(f"Number of gallery images: {len(gallery_dataset)}")

# 5. Extract Embeddings
with torch.no_grad():
    gallery_embeddings = []
    query_embeddings = []
    gallery_paths = []
    query_paths = []

    for pixel_values, paths in gallery_loader:
        pixel_values = pixel_values.to(device)
        outputs = model(pixel_values=pixel_values)
        emb = outputs.pooler_output
        gallery_embeddings.append(emb.cpu().numpy())
        gallery_paths.extend(paths)

    for pixel_values, paths in query_loader:
        pixel_values = pixel_values.to(device)
        outputs = model(pixel_values=pixel_values)
        emb = outputs.pooler_output
        query_embeddings.append(emb.cpu().numpy())
        query_paths.extend(paths)

gallery_embeddings = np.vstack(gallery_embeddings)
query_embeddings = np.vstack(query_embeddings)

print('after embeddings')
# 6. Compute Cosine Similarity and Top-k Retrievals
similarity_matrix = cosine_similarity(query_embeddings, gallery_embeddings)
top_k = 10  # Change as needed
topk_indices = np.argsort(-similarity_matrix, axis=1)[:, :top_k]

def get_breed(filename):
    base = os.path.basename(filename)
    return base[:base.rfind('_')]

print('after similarity')
# 7. Compute Metrics
recall_correct = 0
accuracy_correct = 0
precision_sum = 0

for i, indices in enumerate(topk_indices):
    query_breed = get_breed(query_paths[i])
    retrieved_breeds = [get_breed(gallery_paths[idx]) for idx in indices]
    # Recall@k: at least one correct in top-k
    if query_breed in retrieved_breeds:
        recall_correct += 1
    # Accuracy@1: top-1 is correct
    if query_breed == retrieved_breeds[0]:
        accuracy_correct += 1
    # Precision@k: fraction of correct in top-k
    precision_sum += retrieved_breeds.count(query_breed) / top_k

recall_at_k = recall_correct / len(query_paths)
accuracy_at_1 = accuracy_correct / len(query_paths)
precision_at_k = precision_sum / len(query_paths)

print(f"CLIP Recall@{top_k}: {recall_at_k:.4f}")
print(f"CLIP Accuracy@1: {accuracy_at_1:.4f}")
print(f"CLIP Precision@{top_k}: {precision_at_k:.4f}")

after loading processor
Number of gallery images: 7349
after embeddings
after similarity
CLIP Recall@10: 1.0000
CLIP Accuracy@1: 0.9730
CLIP Precision@10: 0.9027


# EfficientNet
We also considered EfficientNet to make sure to take into account most of the models suggested by the literature available on the topic. However, the performances of such model were mixed similar to what observed with ResNet.

In [None]:
# ----- 1. Paths to gallery and query directories -----
gallery_dir = "/home/disi/oxford_pets/images/gallery"
query_dir = '/home/disi/oxford_pets/images/query'


# ----- 2. Dataset -----
class ImageOnlyDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_paths = []
        for root, _, files in os.walk(image_dir):
            for fname in files:
                if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.image_paths.append(os.path.join(root, fname))
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, img_path

# ----- 3. Image preprocessing -----
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# ----- 4. Load data -----
gallery_dataset = ImageOnlyDataset(gallery_dir, transform=transform)
query_dataset = ImageOnlyDataset(query_dir, transform=transform)

gallery_loader = DataLoader(gallery_dataset, batch_size=32, shuffle=False)
query_loader = DataLoader(query_dataset, batch_size=32, shuffle=False)

# ----- 5. Load EfficientNetB0 and remove classifier -----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.efficientnet_b0(pretrained=True)
model.classifier = nn.Identity()
model.eval().to(device)

# ----- 6. Extract embeddings -----
def extract_embeddings(loader):
    embeddings = []
    paths = []
    with torch.no_grad():
        for images, batch_paths in loader:
            images = images.to(device)
            features = model(images)
            embeddings.append(features.cpu().numpy())
            paths.extend(batch_paths)
    return np.vstack(embeddings), paths

gallery_embeddings, gallery_paths = extract_embeddings(gallery_loader)
query_embeddings, query_paths = extract_embeddings(query_loader)

# ----- 7. Similarity and Ranking -----
similarity_matrix = cosine_similarity(query_embeddings, gallery_embeddings)
top_k = 10
topk_indices = np.argsort(-similarity_matrix, axis=1)[:, :top_k]

# ----- 8. Evaluation -----
def get_label(filepath):
    base = os.path.basename(filepath)
    return base[:base.rfind('_')]  # assumes filenames like 'label_xxx.jpg'

recall_correct = 0
accuracy_correct = 0
precision_sum = 0

for i, indices in enumerate(topk_indices):
    query_label = get_label(query_paths[i])
    retrieved_labels = [get_label(gallery_paths[idx]) for idx in indices]

    if query_label in retrieved_labels:
        recall_correct += 1
    if query_label == retrieved_labels[0]:
        accuracy_correct += 1
    precision_sum += retrieved_labels.count(query_label) / top_k

recall_at_k = recall_correct / len(query_paths)
accuracy_at_1 = accuracy_correct / len(query_paths)
precision_at_k = precision_sum / len(query_paths)

# ----- 9. Print results -----
print(f"EfficientNet-B0 Recall@{top_k}: {recall_at_k:.4f}")
print(f"EfficientNet-B0 Accuracy@1: {accuracy_at_1:.4f}")
print(f"EfficientNet-B0 Precision@{top_k}: {precision_at_k:.4f}")



Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /home/disi/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|██████████| 20.5M/20.5M [00:00<00:00, 208MB/s]


EfficientNet-B0 Recall@10: 0.9730
EfficientNet-B0 Accuracy@1: 0.8919
EfficientNet-B0 Precision@10: 0.8784
