## Setting Up

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random

In [None]:
# Utility functions
data_dir = "/mnt/DATA/duyen/highres/LSC23/LSC23_highres_images"

def image_to_path(image_id: str) -> str:
    """Build the on-disk path of an image from its ID.

    The first eight characters of ``image_id`` are read as a ``%Y%m%d`` date
    (e.g. ``20160808_111247_000.jpeg``). The path uses one directory per
    month and one sub-directory per day:

        {data_dir}/%Y%m/%d/{image_id}

    Nothing is appended to ``image_id``, so it must already carry its file
    extension. The path is built by plain string formatting; the file is not
    checked for existence.
    """
    return f"{data_dir}/{image_id[:6]}/{image_id[6:8]}/{image_id}"

def show_images(images, shuffle=True):
    """Display up to nine images in a 3x3 grid.

    Args:
        images: Sequence (list, NumPy array, ...) of image IDs; each one is
            resolved to a file path with ``image_to_path``.
        shuffle: If True, show a random sample of the images; otherwise show
            the first ones in the order given (e.g. a ranked result list).

    When fewer than nine images are given, all of them are shown and the
    remaining grid cells are left blank.
    """
    grid_rows, grid_cols = 3, 3
    images = list(images)
    n_shown = min(len(images), grid_rows * grid_cols)
    if shuffle:
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the sample size at the number of images available.
        images = random.sample(images, n_shown)
    else:
        images = images[:n_shown]
    paths = [image_to_path(image) for image in images]

    fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(10, 10))
    for ax in axes.flat:
        # Hide the frame of every cell, including those left without an image.
        ax.axis('off')
    # zip stops at the shorter sequence, so a short result list cannot
    # index past its end.
    for ax, path in zip(axes.flat, paths):
        ax.imshow(mpimg.imread(path))
    plt.show()
    

## Concept-based Retrieval

In [None]:
# Read the metadata table and preview its last rows
df = pd.read_csv(filepath_or_buffer="metadata.csv")
df.tail(n=5)

In [None]:
# Filter based on location: keep the rows whose "city" value contains the
# location (case-insensitive); rows without a city never match
location = "Dublin"
location_mask = df["city"].str.contains(location, case=False, na=False)
filtered_images = df[location_mask]
show_images(filtered_images["ImageID"].values)

# Object detection example using the DETR model

In [None]:
"""
Code modified from https://huggingface.co/facebook/detr-resnet-50
"""
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image

# load an image
ImageId = "20160808_111247_000.jpeg"
image = Image.open(ImageId)

# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained(
    "facebook/detr-resnet-50", revision="no_timm"
)
model = DetrForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50", revision="no_timm"
)

inputs = processor(images=image, return_tensors="pt")
# Inference only: run the forward pass without autograd so no computation
# graph is built or kept alive by `outputs`.
with torch.no_grad():
    outputs = model(**inputs)

# convert outputs (bounding boxes and class logits) to COCO API
# let's only keep detections with score > 0.9
# PIL reports size as (width, height); reversing it gives (height, width)
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(
    outputs, target_sizes=target_sizes, threshold=0.9
)[0]

# One line per kept detection: class name, confidence and bounding box
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

In [None]:
# The csv file already ships with the tags for each image; preview that column
df.loc[:, "Tags"]

In [None]:
# Filter based on tags: keep images whose tag text mentions at least one of
# the wanted tags (case-insensitive); rows without tags never match
tags = ["building", "road"]
tag_pattern = "|".join(tags)  # regex alternation, i.e. "building|road"
tag_mask = df["Tags"].str.contains(tag_pattern, case=False, na=False)
filtered_images = df[tag_mask]
show_images(filtered_images["ImageID"].values)

In [None]:
# Also other metadata: the OCR column, restricted to rows that have a value
df.loc[:, ["OCR"]].dropna()

In [None]:
# Filter based on OCR: keep images whose recognised text contains any of the
# search strings (case-insensitive)
ocr = ["DCU"]
ocr_mask = df["OCR"].str.contains("|".join(ocr), case=False, na=False)
filtered_images = df[ocr_mask]

# Then drop every hit that carries one of the excluded tags
# (presumably to skip text read off displays; confirm the intent)
exclude_tags = ["screen", "laptop", "monitor"]
excluded_mask = filtered_images["Tags"].str.contains(
    "|".join(exclude_tags), case=False, na=False
)
filtered_images = filtered_images[~excluded_mask]

show_images(filtered_images["ImageID"].values)

# Captioning

In [None]:
# Preview the Caption column, restricted to rows that have a caption
df.loc[:, ["Caption"]].dropna()

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the TfidfVectorizer on the captions.
# Keep the caption-only frame: row i of X corresponds to row i of `captioned`,
# NOT to row i of `df`, as soon as any caption is missing.
captioned = df.dropna(subset=["Caption"])
X = vectorizer.fit_transform(captioned["Caption"])

# Search for images based on the caption
query = "I am ordering a coffee at a cafe"
query_vector = vectorizer.transform([query])
# One score per captioned image: dot product of its TF-IDF row with the query
results = (X @ query_vector.T).toarray().ravel()
# Positions of the nine highest scores, best first
best = results.argsort()[-9:][::-1]
# Look the positions up in the same frame X was built from
show_images(captioned.iloc[best]["ImageID"].values, shuffle=False)

# Embedding-based Retrieval

In [None]:
import torch
import clip
from PIL import Image

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP model together with its image preprocessing function
model, preprocess = clip.load("ViT-L/14", device=device)

## Compare image and text similarity

In [None]:
# One image (as a batch of size 1) and two candidate descriptions
image = preprocess(Image.open("20160808_111247_000.jpeg"))
image = image.unsqueeze(0).to(device)
text = clip.tokenize(
    ["I am ordering a coffee at a cafe", "I am hiking in the mountains"]
)
text = text.to(device)

# Encode the image and the text
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

# Calculate the similarity: scale every embedding to unit length, then turn
# the scaled dot products into a distribution over the two sentences
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
similarity = similarity.cpu().numpy()

print(similarity.round(2))
print("The first sentence is more similar to the image than the second one")

## Retrieval 

In [None]:
from tqdm.auto import tqdm
# Let's encode all the images first
image_features = {}  # image_id -> unit-length embedding, shape (1, dim)
failed_images = {}   # image_id -> repr of the error that made us skip it
print("This might take a while...")
all_images = df["ImageID"].values
# all_images = random.sample(list(all_images), 1000)
for image_id in tqdm(all_images):
    try:
        # Close the file as soon as it has been turned into a tensor
        with Image.open(image_to_path(image_id)) as img:
            image = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            feature = model.encode_image(image)
            feature = feature / feature.norm(dim=-1, keepdim=True)
            image_features[image_id] = feature.cpu().numpy()
    except Exception as e:
        # Best effort: one unreadable image must not abort the whole run,
        # but remember what went wrong instead of hiding it.
        failed_images[image_id] = repr(e)
        continue

if failed_images:
    first_id, first_error = next(iter(failed_images.items()))
    print(
        f"Skipped {len(failed_images)} of {len(all_images)} images; "
        f"first failure: {first_id}: {first_error}"
    )

In [None]:
# Persist the image_id -> feature mapping so a later session can reload it
# instead of encoding every image again
torch.save(obj=image_features, f="image_features.pt")

In [None]:
import numpy as np
# Alternative: reuse the features saved to "image_features.pt" instead of the
# precomputed ones loaded below
# image_features = torch.load("image_features.pt")
# all_images = list(image_features.keys())
# image_features = np.concatenate(list(image_features.values()), axis=0)

# NOTE(review): the directory name suggests these features come from a
# ViT-L-14-336 model, while the queries in this notebook are encoded with
# clip.load("ViT-L/14") - confirm the two embedding spaces are compatible.
features_dir = "/mnt/DATA/duyen/highres/LSC23/ViT-L-14-336_openai_nonorm"
image_features = np.load(f"{features_dir}/features.npy")
# Scale every row to unit length so a dot product is a cosine similarity
image_features = image_features / np.linalg.norm(image_features, axis=1, keepdims=True)
all_images = pd.read_csv(f"{features_dir}/photo_ids.csv")["photo_id"].tolist()
# Keep only the last path component of each photo id
all_images = [image.split("/")[-1] for image in all_images]

# Row i of image_features must describe all_images[i]; a length mismatch
# would make every retrieval below return the wrong images.
assert len(all_images) == image_features.shape[0], (
    f"{len(all_images)} photo ids but {image_features.shape[0]} feature rows"
)

In [None]:
text = "I am ordering a sandwich at a cafe at the till"

# Tokenise and encode the query, scale it to unit length, move it to NumPy
text = clip.tokenize([text]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
text_features = text_features.cpu().numpy()

# Calculate the similarity: one score per row of image_features
similarity = (100.0 * image_features @ text_features.T).ravel()

# Show the images with the highest similarity (top nine, best first)
idx = np.argsort(similarity)[-9:][::-1]
show_images(np.asarray(all_images)[idx], shuffle=False)

# Query by example

In [None]:
example_image = "20160808_111247_000.jpeg"

# Encode the example image the same way as a text query: batch of one,
# unit-length embedding, moved to NumPy
image = preprocess(Image.open(example_image)).unsqueeze(0).to(device)
with torch.no_grad():
    example_features = model.encode_image(image)
    example_features = example_features / example_features.norm(dim=-1, keepdim=True)
example_features = example_features.cpu().numpy()

# Calculate the similarity
similarity = (100.0 * image_features @ example_features.T).squeeze()
# From here on `similarity` holds the positions of the nine best matches,
# best first, rather than the scores themselves
similarity = np.argsort(similarity)[-9:][::-1]
show_images(np.asarray(all_images)[similarity], shuffle=False)