In [None]:
import torch
import clip
from PIL import Image
import faiss
import numpy as np
import os


In [None]:
# Load the CLIP ViT-B/32 model together with its image preprocessing
# transform, running on the GPU when one is available and on the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model, preprocess = clip.load("ViT-B/32", device=device)
print("device", device)

device cpu


In [None]:
# Encode dataset images
# Layout read by this cell: dataset/<article_id>/<image files>.
# Every image contributes one embedding row, and ids[i] is the name of the
# article folder that row i came from (an article may own several rows).
embeddings = []
ids = []

dataset_path = "dataset/"
# sorted() keeps the row order reproducible between runs; os.listdir returns
# directory entries in arbitrary order.
for id_article in sorted(os.listdir(dataset_path)):
    folder = os.path.join(dataset_path, id_article)
    if not os.path.isdir(folder):
        continue
    for file in sorted(os.listdir(folder)):
        image_path = os.path.join(folder, file)
        try:
            # The context manager closes the file handle as soon as the pixels
            # have been decoded and preprocessed.
            with Image.open(image_path) as pil_image:
                image = preprocess(pil_image).unsqueeze(0).to(device)
        except OSError as err:
            # Unreadable or non-image entries (.DS_Store, truncated files,
            # nested folders) are reported and skipped instead of aborting
            # the whole encoding run.
            print(f"Skipping {image_path}: {err}")
            continue
        with torch.no_grad():
            emb = model.encode_image(image).cpu().numpy()
        embeddings.append(emb)
        ids.append(id_article)

if not embeddings:
    # np.vstack([]) would fail with an opaque "need at least one array" error.
    raise RuntimeError(f"No readable images found under {dataset_path!r}")

# Stack the per-image (1, dim) arrays into one (n_images, dim) float32 matrix,
# the dtype FAISS works with.
embeddings = np.vstack(embeddings).astype("float32")


In [None]:
# Build a flat L2 FAISS index sized to the embedding dimensionality and load
# every dataset embedding into it (row i of the index matches ids[i]).
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)


In [None]:
# Query with new image - Get top 3 unique most similar articles
TOP_N = 3          # number of distinct articles to report
INITIAL_K = 50     # neighbours requested on the first search pass

# Close the query file once it has been preprocessed.
with Image.open("test/image.jpg") as query_file:
    query_img = preprocess(query_file).unsqueeze(0).to(device)
with torch.no_grad():
    # Cast explicitly: the dataset embeddings were cast to float32 before being
    # indexed, and the query must use the same dtype (CLIP can emit float16
    # when running on CUDA).
    q_emb = model.encode_image(query_img).cpu().numpy().astype("float32")

# Guard against hidden notebook state: the index rows must line up with ids.
if index.ntotal != len(ids):
    raise RuntimeError(
        f"Index holds {index.ntotal} vectors but ids has {len(ids)} entries; "
        "re-run the encoding and index-building cells"
    )
if index.ntotal == 0:
    raise RuntimeError("The index is empty: encode the dataset images before querying")

# Several images can belong to the same article, so the nearest hits may
# collapse to fewer than TOP_N articles. Start with INITIAL_K neighbours and
# widen the search until TOP_N distinct articles are found or the whole index
# has been scanned.
search_k = min(INITIAL_K, len(ids))
while True:
    D, I = index.search(q_emb, search_k)

    # Get unique article IDs with their best scores
    unique_results = {}
    for rank, (distance, dataset_row) in enumerate(zip(D[0], I[0])):
        if dataset_row < 0:
            # FAISS pads the result with -1 when it has fewer neighbours than requested.
            continue
        article_id = ids[dataset_row]
        # Keep only the best (lowest distance) for each article ID
        if article_id not in unique_results or distance < unique_results[article_id]['distance']:
            unique_results[article_id] = {
                'distance': distance,
                'index': rank,  # position of this hit in the ranked result list
            }

    if len(unique_results) >= TOP_N or search_k >= len(ids):
        break
    search_k = min(search_k * 2, len(ids))

# Sort by distance (lower is better) and take the TOP_N unique results
sorted_results = sorted(unique_results.items(), key=lambda x: x[1]['distance'])[:TOP_N]

print(f"Top {TOP_N} most similar UNIQUE articles:")
print("=" * 50)
for i, (article_id, result_info) in enumerate(sorted_results):
    distance = result_info['distance']
    # Convert L2 distance to similarity score (lower distance = higher similarity)
    # Using negative distance as similarity score (higher is better)
    similarity_score = -distance
    print(f"{i+1}. Article ID: {article_id}")
    print(f"   Similarity Score: {similarity_score:.4f}")
    print(f"   Distance: {distance:.4f}")
    print()


Top 3 most similar UNIQUE articles:
1. Article ID: R124
   Similarity Score: -53.9524
   Distance: 53.9524

2. Article ID: R025
   Similarity Score: -57.0158
   Distance: 57.0158

3. Article ID: R011
   Similarity Score: -57.0158
   Distance: 57.0158

