In [None]:
import torch
from transformers import  CLIPModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPProcessor
from torch.utils.data import DataLoader
import json
from torch.utils.data import Dataset
from PIL import Image
import os
from tqdm import tqdm
import time

In [11]:
# Image directories, given relative to the notebook's working directory.
# The gallery is the pool that is searched; the queries are the probe images.
gallery_dir = 'test-2/gallery'
query_dir = 'test-2/query'

# Presumably the training split — not referenced by the retrieval cells
# visible here; confirm before removing.
train_dir = 'train'

In [12]:
class CLIPImageDataset(Dataset):
    """Dataset serving CLIP-preprocessed images from a single flat directory.

    Only files located directly inside ``image_dir`` whose name ends in
    ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are indexed;
    sub-directories are not traversed.
    """

    # File-name suffixes (compared in lower case) that count as images.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, image_dir, processor):
        """
        Args:
            image_dir: directory containing the images.
            processor: a Hugging Face ``CLIPProcessor`` instance; it is called
                as ``processor(images=..., return_tensors="pt")``.
        """
        self.image_dir = image_dir
        # os.listdir() yields entries in arbitrary, platform-dependent order.
        # Sorting makes the index -> file mapping reproducible across runs.
        self.image_paths = sorted(
            os.path.join(image_dir, fname)
            for fname in os.listdir(image_dir)
            if fname.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        self.processor = processor

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load and preprocess one image.

        Returns:
            A ``(pixel_values, img_path)`` tuple: the processor's
            ``pixel_values`` tensor with its leading batch dimension removed,
            and the path of the source file so results can be traced back.
        """
        img_path = self.image_paths[idx]

        # The context manager closes the underlying file deterministically;
        # convert() returns a separate, fully loaded image that outlives it.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Let the CLIP processor handle resizing/normalisation.
        inputs = self.processor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)  # drop batch dim

        return pixel_values, img_path

In [None]:
# Report whether a CUDA device is usable (prints True or False) and, if so,
# a short summary of device 0's memory, expressed in units of 1e9 bytes.
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)

if cuda_ready:
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_gb:.1f} GB")

    allocated_gb = torch.cuda.memory_allocated() / 1e9
    print(f"GPU Memory allocated: {allocated_gb:.2f} GB")

    # The "cached" label reports torch.cuda.memory_reserved().
    reserved_gb = torch.cuda.memory_reserved() / 1e9
    print(f"GPU Memory cached: {reserved_gb:.2f} GB")

CUDA available: False


In [None]:
# Show the installed PyTorch version and the CUDA version reported by
# torch.version.cuda, one per line.
print(torch.__version__, torch.version.cuda, sep="\n")

2.7.0+cu126
12.6


In [None]:
# One checkpoint name is shared by the preprocessor and the encoder so the
# two cannot drift apart.
clip_checkpoint = "openai/clip-vit-large-patch14-336"

# Instantiate the CLIP processor.
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Only the vision part of CLIP is kept and moved to the target device.
model = CLIPModel.from_pretrained(clip_checkpoint).vision_model.to(device)

# Datasets over the gallery and query folders, both using the same processor.
gallery_dataset = CLIPImageDataset(gallery_dir, processor=processor)
query_dataset = CLIPImageDataset(query_dir, processor=processor)

# Fixed iteration order (no shuffling); batch size 8 (was 32 previously).
loader_options = {"batch_size": 8, "shuffle": False}
gallery_loader = DataLoader(gallery_dataset, **loader_options)
query_loader = DataLoader(query_dataset, **loader_options)

In [None]:
def _embed_images(loader, encoder, target_device, log_every=10):
    """Run ``encoder`` over every batch of ``loader`` and collect embeddings.

    Args:
        loader: iterable yielding ``(pixel_values, paths)`` batches.
        encoder: callable invoked as ``encoder(pixel_values=...)`` whose
            result exposes a ``pooler_output`` tensor.
        target_device: device each batch is moved to before the forward pass.
        log_every: print the wall time of a batch every ``log_every`` batches.

    Returns:
        ``(batches, paths)``: a list with one numpy array per batch, and a
        flat list of the image paths in the same order.
    """
    batches = []
    all_paths = []
    # Loop-invariant: query CUDA availability once instead of once per batch.
    cuda_available = torch.cuda.is_available()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch_idx, (pixel_values, paths) in enumerate(tqdm(loader)):
            start_time = time.time()

            pixel_values = pixel_values.to(target_device)
            outputs = encoder(pixel_values=pixel_values)
            emb = outputs.pooler_output
            batches.append(emb.cpu().numpy())
            all_paths.extend(paths)

            # Print timing every `log_every` batches.
            if batch_idx % log_every == 0:
                print(f"Batch {batch_idx}: {time.time() - start_time:.2f}s")

            # Release GPU memory held by this batch before the next one.
            del pixel_values, outputs, emb
            if cuda_available:
                torch.cuda.empty_cache()

    return batches, all_paths


# The embeddings are deliberately left as lists of per-batch arrays: the
# following cell stacks them into single matrices.
print("Extracting gallery embeddings...")
gallery_embeddings, gallery_paths = _embed_images(gallery_loader, model, device)

print("Extracting query embeddings...")
query_embeddings, query_paths = _embed_images(query_loader, model, device)

In [None]:
# Merge the per-batch embedding arrays into one matrix per split.
query_embeddings = np.vstack(query_embeddings)      # shape: (N_query, D)
gallery_embeddings = np.vstack(gallery_embeddings)  # shape: (N_gallery, D)

# Cosine similarity of every query (rows) against every gallery image (columns).
similarity_matrix = cosine_similarity(query_embeddings, gallery_embeddings)

# Best-matching gallery index for each query.
retrieved_indices = similarity_matrix.argmax(axis=1)

# Ranked top-k: argsort is ascending, so keep the last `top_k` columns and
# flip them to get most-similar-first order.
top_k = 10
top_k_indices = np.flip(np.argsort(similarity_matrix, axis=1)[:, -top_k:], axis=1)

# Map each query file name to the file names of its top-k gallery matches
# (directories are stripped from both sides).
results = {
    os.path.basename(query_paths[row]): [
        os.path.basename(gallery_paths[col]) for col in ranked_cols
    ]
    for row, ranked_cols in enumerate(top_k_indices)
}

# Persist the mapping as JSON.
output_file = "retrieval_results.json"
with open(output_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {output_file}")

# Preview the first few entries to sanity-check the format.
print("\nFirst 3 results:")
for query, retrieved in list(results.items())[:3]:
    print(f"Query: {query}")
    print(f"Top-3 Retrieved: {retrieved[:3]}")
    print("-" * 50)