In [1]:
import torch
import clip

In [2]:
# Load CLIP before importing other libraries to avoid memory issues
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
import pandas as pd
import os
import numpy as np
import faiss
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from pathlib import Path
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aneeshponduru/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aneeshponduru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load data and preprocess
df_raw = pd.read_parquet("train-00000-of-00001-60e50ff5fbbd1bb5.parquet")
df_unique = df_raw.drop_duplicates(subset=["video_id"]).copy()
df_unique["image_path"] = df_unique["video_id"].apply(lambda vid: f"images/{vid}.jpg")
df_unique = df_unique[df_unique["image_path"].apply(lambda p: Path(p).exists())].reset_index(drop=True)

In [5]:
# Initialize image embeddings and valid video IDs
clip_embeddings = []
valid_video_ids = []

# for each image, preprocess and encode
for _, row in df_unique.iterrows():
    try:
        # Preprocess the image
        image = preprocess(Image.open(row["image_path"])).unsqueeze(0).to(device)
        with torch.no_grad():
            # Encode the image
            embedding = model.encode_image(image).cpu().numpy()[0]

        # Store the embedding and video ID
        clip_embeddings.append(embedding)
        valid_video_ids.append(row["video_id"])

        # Clear memory
        del image
        torch.cuda.empty_cache()
    except Exception as e:
        # Handle errors by skipping the video
        print(f"Skipping {row['video_id']}: {e}")

# Create DataFrame for valid videos
clip_embeddings = np.array(clip_embeddings)
df_clip = df_unique[df_unique["video_id"].isin(valid_video_ids)].reset_index(drop=True)

In [6]:
# Build FAISS index and normalize embeddings
clip_embeddings = clip_embeddings / np.linalg.norm(clip_embeddings, axis=1, keepdims=True)
index = faiss.IndexFlatIP(clip_embeddings.shape[1])
index.add(clip_embeddings)

# Use captions to create BM25 index
df_clip["tokens"] = df_clip["caption"].fillna("").apply(
    lambda x: [word for word in word_tokenize(x.lower()) if word.isalnum() and word not in stop_words]
)

# Initialize BM25 with tokenized captions
bm25 = BM25Okapi(df_clip["tokens"].tolist())

In [7]:
# Define queries
queries = [
    "a snowy mountain with a clear blue sky",
    "a crowded city street at night with neon signs",
    "a peaceful beach with palm trees and waves crashing",
    "a person jogging through a park in the morning",
    "a chef chopping vegetables in a kitchen",
    "a surfer riding a big wave",
    "a red sports car in front of a building",
    "a golden retriever sitting on a grassy field",
    "a firefighter rescuing a person from a burning building at night",
    "a person near luxury cars in cloudy weather speaking to someone"
]

In [8]:
# Encode queries with CLIP
with torch.no_grad():
    text_tokens = clip.tokenize(queries).to(device)
    query_clip_embs = model.encode_text(text_tokens).cpu().numpy()

# Use queries to get BM25 scores
query_bm25_tokens = [word_tokenize(q.lower()) for q in queries]
bm25_scores_all = [bm25.get_scores(tokens) for tokens in query_bm25_tokens]

In [9]:
# Initialize list for results
results = []

# for each query, compute scores and rank
for i, query in enumerate(queries):
    # Get the query embedding
    query_emb = query_clip_embs[i]

    # Normalize the query embedding
    query_emb = query_emb / np.linalg.norm(query_emb)

    # Reshape to 2D for FAISS
    query_emb_2d = np.expand_dims(query_emb, axis=0)

    # FAISS cosine-like search
    faiss_scores_all, _ = index.search(query_emb_2d, len(df_clip))
    clip_scores = faiss_scores_all[0]

    # BM25 scoring
    bm25_scores = np.array(bm25_scores_all[i])
    norm_bm25 = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))

    # Hybrid scoring
    hybrid_scores = 0.7 * clip_scores + 0.3 * norm_bm25
    top_indices = np.argsort(hybrid_scores)[::-1][:5]

    for rank, idx in enumerate(top_indices):
        results.append({
            "query": query,
            "rank": rank + 1,
            "video_id": df_clip.iloc[idx]["video_id"],
            "caption": df_clip.iloc[idx]["caption"],
            "url": df_clip.iloc[idx]["url"],
            "start time": df_clip.iloc[idx]["start time"],
            "end time": df_clip.iloc[idx]["end time"],
            "clip_similarity": clip_scores[idx],
            "bm25_score": bm25_scores[idx],
            "normalized bm25": norm_bm25[idx],
            "hybrid_score": hybrid_scores[idx]
        })

In [None]:
# Save results as a DataFrame
df_results = pd.DataFrame(results)

# Use query list to get the same query order as the list of queries
query_order = {q: i for i, q in enumerate(queries)}
df_results["query_order"] = df_results["query"].map(query_order)

# Sort by custom order, then hybrid score
df_results = df_results.sort_values(["query_order", "hybrid_score"], ascending=[True, False])
df_results = df_results.drop(columns=["query_order"])

# Save to CSV for manual annotation
df_results = pd.DataFrame(results)
if not os.path.exists("a_grade_hybrid_results.csv"):
    df_results.to_csv("a_grade_hybrid_results.csv", index=False)
    print("Results saved to a_grade_hybrid_results.csv")
else:
    print("File already exists. No save performed.")

Results saved to a_grade_hybrid_results-2.csv


In [14]:
# Reload the results after manual annotation
df = pd.read_csv("a_grade_hybrid_results.csv")

# Convert the relevance column to numeric, handling errors and filling NaN with 0
df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(int)

# Function to calculate DCG
def dcg(scores):
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores))

# Dictionary to store NDCG scores
ndcg_scores = {}

# Calculate NDCG for each query
for query, group in df.groupby("query"):
    # Sort by predicted rank 
    top_k = group.sort_values("rank").head(5)
    
    # Calculate DCG for the ranking
    relevance = top_k["relevance"].tolist()
    actual_dcg = dcg(relevance)
    
    # Ideal DCG (perfect ranking of same scores)
    ideal_relevance = sorted(relevance, reverse=True)
    ideal_dcg = dcg(ideal_relevance)
    
    # Calculate NDCG
    ndcg = actual_dcg / ideal_dcg if ideal_dcg > 0 else 0
    ndcg_scores[query] = ndcg

# Print per-query and average NDCG@5
print("NDCG@5 per query:")
for query, score in ndcg_scores.items():
    print(f"- {query[:50]}...: {score:.3f}")

mean_ndcg = np.mean(list(ndcg_scores.values()))
print(f"\nMean NDCG@5 across all queries: {mean_ndcg:.3f}")

NDCG@5 per query:
- a chef chopping vegetables in a kitchen...: 0.891
- a crowded city street at night with neon signs...: 0.968
- a firefighter rescuing a person from a burning bui...: 1.000
- a golden retriever sitting on a grassy field...: 0.624
- a peaceful beach with palm trees and waves crashin...: 0.967
- a person jogging through a park in the morning...: 0.631
- a person near luxury cars in cloudy weather speaki...: 1.000
- a red sports car in front of a building...: 0.567
- a snowy mountain with a clear blue sky...: 0.969
- a surfer riding a big wave...: 0.000

Mean NDCG@5 across all queries: 0.762
