In [None]:
import pandas as pd
import requests
import time
import pandas as pd
import torch
import clip
from PIL import Image
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def video_exists_youtube(url):
    """Return True when the page at ``url`` looks like a playable video.

    The page is fetched with a browser-like User-Agent and a 10 second
    timeout. The video is reported as missing (False) when:

    * the HTTP status code is anything other than 200,
    * the lower-cased page body contains one of the known
      "unavailable / private / removed" phrases, or
    * any exception is raised while fetching (the error is printed).

    After every request that returns, the function sleeps for one second
    to throttle consecutive calls.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    # Phrases searched for in the lower-cased HTML of the page.
    dead_video_markers = (
        "video is not available",
        "video unavailable",
        "this video is private",
        "this video has been removed",
        "this video is no longer available",
    )

    try:
        reply = requests.get(url, headers=request_headers, timeout=10)
        time.sleep(1)  # Be polite to the server

        # Guard clause: only a plain 200 is worth inspecting further.
        if reply.status_code != 200:
            return False

        page = reply.text.lower()
        return not any(marker in page for marker in dead_video_markers)

    except Exception as e:
        print(f"Error checking video {url}: {e}")
        return False


In [None]:
# Load MSR-VTT and keep a single row per video_id
df_raw = pd.read_parquet("train-00000-of-00001-60e50ff5fbbd1bb5.parquet")
df_unique = df_raw.drop_duplicates(subset=["video_id"])

# Check availability of the first 550 URLs.
# Rows are unique per video_id, not per url, so the same URL may occur more
# than once in this slice. Each probe is a network request followed by a
# 1 s sleep, so probe every distinct URL only once; dict.fromkeys removes
# repeats while keeping first-seen order.
urls = df_unique["url"].tolist()[:550]
valid_urls = [url for url in dict.fromkeys(urls) if video_exists_youtube(url)]

# Filter to only valid videos (every row whose URL passed the check)
df_valid = df_unique[df_unique["url"].isin(valid_urls)]

# Sample 400 from the valid set. Fail early with a readable message instead
# of the generic pandas error when too few videos are reachable.
sample_size = 400
if len(df_valid) < sample_size:
    raise ValueError(
        f"Only {len(df_valid)} reachable videos found; "
        f"{sample_size} are required for sampling."
    )
# NOTE: despite its name, sampled_video_ids is a DataFrame of full rows.
sampled_video_ids = df_valid.sample(n=sample_size, random_state=1)

In [4]:
# Sanity check: the sample above was drawn with n=400, so expect 400 rows.
sampled_video_ids.shape

(400, 10)

In [5]:
# Load the previously saved sample of videos from CSV.
df_original_sample = pd.read_csv("sampled_data_100_videos.csv")

# Stack the earlier sample and the newly drawn rows into one frame with a
# fresh 0..n-1 index. Per the recorded output the result has 500 rows, so
# the name `sample_1k` does not reflect the actual row count.
# NOTE(review): nothing here removes a video_id that appears in both frames;
# confirm the two samples cannot overlap.
sample_1k = pd.concat([df_original_sample, sampled_video_ids], ignore_index=True)

In [6]:
# Sanity check on the combined frame (rows from the CSV plus the new sample).
sample_1k.shape

(500, 10)

In [None]:
# Load and prepare metadata: one row per video_id, restricted to videos that
# have an image on disk at images/<video_id>.jpg
df_raw = pd.read_parquet("train-00000-of-00001-60e50ff5fbbd1bb5.parquet")
df_unique = df_raw.drop_duplicates(subset=["video_id"]).copy()
df_unique["image_path"] = df_unique["video_id"].apply(lambda vid: f"images/{vid}.jpg")
df_unique = df_unique[df_unique["image_path"].apply(lambda p: Path(p).exists())].reset_index(drop=True)

# Load CLIP model (GPU when available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Initialize image embeddings and valid video IDs (kept in the same order)
clip_embeddings = []
valid_video_ids = []

# For each image, preprocess and encode
for _, row in df_unique.iterrows():
    try:
        # Open through a context manager so the file handle is released as
        # soon as the image has been turned into a tensor, instead of
        # leaving one open handle per image.
        with Image.open(row["image_path"]) as img:
            image = preprocess(img).unsqueeze(0).to(device)

        with torch.no_grad():
            # Encode the image; [0] drops the batch dimension added above
            emb = model.encode_image(image).cpu().numpy()[0]

        # Store the embedding and video ID
        clip_embeddings.append(emb)
        valid_video_ids.append(row["video_id"])

    except Exception as e:
        # Best effort: an unreadable image only drops that one video
        print(f"Skipping {row['video_id']}: {e}")

# Create DataFrame for valid videos. df_unique has one row per video_id and
# valid_video_ids was filled in df_unique order, so row i of df_clip
# corresponds to clip_embeddings[i].
df_clip = df_unique[df_unique["video_id"].isin(valid_video_ids)].reset_index(drop=True)
clip_embeddings = np.array(clip_embeddings)
assert len(df_clip) == len(clip_embeddings), "metadata rows and embeddings are misaligned"

# Define text queries
queries = [
    "a snowy mountain with a clear blue sky",
    "a crowded city street at night with neon signs",
    "a peaceful beach with palm trees and waves crashing",
    "a person jogging through a park in the morning",
    "a chef chopping vegetables in a kitchen",
    "a surfer riding a big wave",
    "a red sports car in front of a building",
    "a golden retriever sitting on a grassy field",
    "a firefighter rescuing a person from a burning building at night",
    "a person near luxury cars in cloudy weather speaking to someone"
]

# Encode text queries
with torch.no_grad():
    tokenized = clip.tokenize(queries).to(device)
    query_embeddings = model.encode_text(tokenized).cpu().numpy()

# Compute cosine similarity between query and image embeddings
# (one row per query, one column per image)
similarities = cosine_similarity(query_embeddings, clip_embeddings)

# Initialize results list
results = []

# For each query, find the top 5 most similar videos
for i, query in enumerate(queries):
    # Get the similarity scores for the current query
    scores = similarities[i]

    # Store the similarity scores in the dataframe (overwritten per query)
    df_clip["similarity"] = scores

    # Sort by similarity and get the top 5
    top5 = df_clip.sort_values("similarity", ascending=False).head(5)

    # For each of the top 5, create a result entry; rank is 1-based
    for rank, (_, row) in enumerate(top5.iterrows(), start=1):
        results.append({
            "query": query,
            "rank": rank,
            "video_id": row["video_id"],
            "caption": row["caption"],
            "url": row["url"],
            "start": row["start time"],
            "end": row["end time"],
            "similarity": row["similarity"]
        })

# Save to CSV for manual annotation.
# An existing file is never overwritten (presumably so that annotations
# already added to it are not lost).
# Path is used for the existence check because `os` is not imported in this
# notebook, and the frame written is df_results (the name defined here).
df_results = pd.DataFrame(results)
results_path = Path("clip_a_minus_results.csv")
if not results_path.exists():
    df_results.to_csv(results_path, index=False)
    print("Results saved to clip_a_minus_results.csv")
else:
    print("File already exists. No save performed.")

In [None]:
# Read the results file back in once the relevance column has been filled in
# by hand
df = pd.read_csv("clip_a_minus_results.csv")

# Coerce relevance to integers: anything that does not parse as a number
# (including empty cells) becomes NaN first and is then counted as 0
relevance_numeric = pd.to_numeric(df["relevance"], errors="coerce")
df["relevance"] = relevance_numeric.fillna(0).astype(int)

def dcg(scores):
    """Discounted cumulative gain of ``scores`` taken in the given order.

    The item at 0-based position ``idx`` contributes
    ``score / log2(idx + 2)``, so the first item is undiscounted.
    An empty sequence yields 0.
    """
    total = 0
    # Start counting at 2 so that `position` is already idx + 2
    for position, gain in enumerate(scores, start=2):
        total += gain / np.log2(position)
    return total

# NDCG@5 for every query, keyed by the query text
ndcg_scores = {}

for query, group in df.groupby("query"):
    # Relevance labels of the first five results, in predicted rank order
    relevance = group.sort_values("rank")["relevance"].head(5).tolist()

    # DCG of the ranking as produced
    actual_dcg = dcg(relevance)

    # Ideal DCG: the same five labels arranged from best to worst
    ideal_dcg = dcg(sorted(relevance, reverse=True))

    # A query with no relevant result at all scores 0 rather than dividing by 0
    if ideal_dcg > 0:
        ndcg_scores[query] = actual_dcg / ideal_dcg
    else:
        ndcg_scores[query] = 0

# Report each query's score, then the mean over all queries
print("NDCG@5 per query:")
for query, score in ndcg_scores.items():
    print(f"- {query[:50]}...: {score:.3f}")

mean_ndcg = np.mean(list(ndcg_scores.values()))
print(f"\nMean NDCG@5 across all queries: {mean_ndcg:.3f}")

NDCG@5 per query:
- a chef chopping vegetables in a kitchen...: 0.907
- a crowded city street at night with neon signs...: 1.000
- a firefighter rescuing a person from a burning bui...: 1.000
- a golden retriever sitting on a grassy field...: 0.950
- a peaceful beach with palm trees and waves crashin...: 0.776
- a person jogging through a park in the morning...: 0.983
- a person near luxury cars in cloudy weather speaki...: 0.896
- a red sports car in front of a building...: 0.956
- a snowy mountain with a clear blue sky...: 0.969
- a surfer riding a big wave...: 1.000

Mean NDCG@5 across all queries: 0.944
