In [1]:
!pip install -q ftfy regex tqdm
!pip install -q git+https://github.com/openai/CLIP.git
!pip install -q faiss-cpu pillow requests


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import clip
import torch
import faiss
import pickle
import requests
import numpy as np
import pandas as pd

from PIL import Image
from tqdm import tqdm
from io import BytesIO


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cuda


In [4]:
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()


100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 213MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [5]:
csv_path = "photos_url.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,photo_image_url
0,https://images.unsplash.com/uploads/1413387620...
1,https://images.unsplash.com/reserve/jEs6K0y1Sb...
2,https://images.unsplash.com/uploads/1412192004...
3,https://images.unsplash.com/reserve/ijl3tATFRp...
4,https://images.unsplash.com/reserve/6vaWXsQuSW...


In [6]:
csv_path = "photos_url.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,photo_image_url
0,https://images.unsplash.com/uploads/1413387620...
1,https://images.unsplash.com/reserve/jEs6K0y1Sb...
2,https://images.unsplash.com/uploads/1412192004...
3,https://images.unsplash.com/reserve/ijl3tATFRp...
4,https://images.unsplash.com/reserve/6vaWXsQuSW...


In [7]:
MAX_IMAGES = 5000

df_sample = df.sample(n=MAX_IMAGES, random_state=42)
image_urls = df_sample["photo_image_url"].dropna().tolist()

print("Using images:", len(image_urls))


Using images: 5000


In [8]:
IMAGE_DIR = "images"
os.makedirs(IMAGE_DIR, exist_ok=True)

image_paths = []

for idx, url in tqdm(enumerate(image_urls), total=len(image_urls)):
    img_path = os.path.join(IMAGE_DIR, f"{idx}.jpg")

    try:
        response = requests.get(url, timeout=10)
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img.save(img_path)
    except Exception:
        # fallback image
        img = Image.new("RGB", (224, 224), (0, 0, 0))
        img.save(img_path)

    image_paths.append(img_path)


100%|██████████| 5000/5000 [1:07:44<00:00,  1.23it/s]


In [9]:
image_embeddings = []

with torch.no_grad():
    for path in tqdm(image_paths):
        image = preprocess(Image.open(path).convert("RGB")).unsqueeze(0).to(device)
        embedding = model.encode_image(image)
        embedding = embedding / embedding.norm(dim=-1, keepdim=True)
        image_embeddings.append(embedding.cpu().numpy())

image_embeddings = np.vstack(image_embeddings).astype("float32")

print("Embeddings shape:", image_embeddings.shape)


100%|██████████| 5000/5000 [25:18<00:00,  3.29it/s]

Embeddings shape: (5000, 512)





In [10]:
embedding_dim = image_embeddings.shape[1]

index = faiss.IndexFlatIP(embedding_dim)
index.add(image_embeddings)

print("FAISS index size:", index.ntotal)


FAISS index size: 5000


In [11]:
def text_search(query, top_k=5):
    text = clip.tokenize([query]).to(device)

    with torch.no_grad():
        text_embedding = model.encode_text(text)

    text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)
    text_embedding = text_embedding.cpu().numpy().astype("float32")

    scores, indices = index.search(text_embedding, top_k)

    return [(image_urls[i], float(scores[0][j])) for j, i in enumerate(indices[0])]


In [12]:
text_search("a person")


[('https://images.unsplash.com/photo-1566916350208-da3747267c55',
  0.2602486312389374),
 ('https://images.unsplash.com/photo-1576258405325-6ae2a1e8dbcd',
  0.2552069425582886),
 ('https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1698420085881-1b840c138a47',
  0.2538154721260071),
 ('https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1695635230516-e69891d27488',
  0.2538154721260071),
 ('https://images.unsplash.com/photo-1579127214712-62db05d8598c',
  0.2538154721260071)]

In [13]:
faiss.write_index(index, "image_index.faiss")


In [14]:
with open("image_urls.pkl", "wb") as f:
    pickle.dump(image_urls, f)

print("Saved image_urls.pkl")


Saved image_urls.pkl


In [15]:
from google.colab import files

files.download("image_index.faiss")
files.download("image_urls.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
text_search("a dog")

[('https://images.unsplash.com/photo-1553882809-a4f57e59501d',
  0.2848641872406006),
 ('https://images.unsplash.com/photo-1563889958749-625da26ed355',
  0.2834163308143616),
 ('https://images.unsplash.com/photo-1582487597916-a73a2cc1f068',
  0.2733270525932312),
 ('https://images.unsplash.com/photo-1565090116739-359c8ae0a83b',
  0.2730518579483032),
 ('https://images.unsplash.com/photo-1548113501-1c163f75e85c',
  0.2723201513290405)]