# Import libraries

In [2]:
import numpy as np
import pandas as pd
import os
import torch
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("max_colwidth", None)
from tqdm import tqdm
import clip

  from .autonotebook import tqdm as notebook_tqdm


# Load images metadata to a df

In [3]:
image_dir = "/workspace/Dataset/conclip/ccneg_images/cc3m_subset_images_extracted_final"

image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}

image_data = []
for root, _, files in os.walk(image_dir):
    for file in files:
        if os.path.splitext(file)[1].lower() in image_extensions:
            image_path = os.path.join(root, file)
            image_data.append({'image_name': file, 'image_path': image_path})

images_metadata_df = pd.DataFrame(image_data)
print(images_metadata_df.shape)
images_metadata_df.head()

(0, 0)


In [None]:
import os
import torch
import clip
from PIL import Image
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

# Set up device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP model
model, preprocess = clip.load("ViT-B/32", device=device)

# Define path to image dataset
image_dir = "/path/to/your/images"  # e.g., COCO or VOC2007
image_files = [os.path.join(image_dir, fname) for fname in os.listdir(image_dir)
               if fname.lower().endswith(('.jpg', '.jpeg', '.png'))]

# Load and preprocess images
image_embeddings = []
image_paths = []
print("Encoding images...")
for img_path in tqdm(image_files):
    try:
        image = preprocess(Image.open(img_path).convert("RGB")).unsqueeze(0).to(device)
        with torch.no_grad():
            image_feat = model.encode_image(image)
        image_embeddings.append(image_feat.cpu())
        image_paths.append(img_path)
    except Exception as e:
        print(f"Error with {img_path}: {e}")

# Stack image features
image_embeddings = torch.cat(image_embeddings, dim=0)
image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)  # normalize

# Define a negated text query
query = "a dog without a leash"  # replace with your own
text_tokens = clip.tokenize([query]).to(device)
with torch.no_grad():
    text_embedding = model.encode_text(text_tokens).cpu()
    text_embedding /= text_embedding.norm(dim=-1, keepdim=True)

# Compute cosine similarity
similarities = (100.0 * image_embeddings @ text_embedding.T).squeeze(1)

# Rank and display top-k
top_k = 5
top_indices = similarities.topk(top_k).indices
print(f"\nTop {top_k} results for query: '{query}'\n")

# Display top results
for rank, idx in enumerate(top_indices):
    path = image_paths[idx]
    score = similarities[idx].item()
    print(f"Rank {rank + 1}: {path} (Score: {score:.2f})")
    img = Image.open(path)
    plt.imshow(img)
    plt.axis("off")
    plt.title(f"Rank {rank + 1} | Score: {score:.2f}")
    plt.show()
