In [None]:

import pandas as pd
import requests
import time
import pandas as pd
import torch
import clip
from PIL import Image
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Check whether a YouTube video exists.
# A video counts as existing when its page loads and does not report it as unavailable, private, or removed.
def video_exists_youtube(url, timeout=10, delay=1):
    """Return True if the page at ``url`` loads and shows no "unavailable" notice.

    The page is fetched with a browser-like User-Agent and its lower-cased HTML
    is searched for a fixed set of phrases indicating a missing, private, or
    removed video.

    Args:
        url: Address of the video page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        delay: Seconds to sleep after a completed request, to be polite to the
            server when this function is called in a loop.

    Returns:
        True when the response status is 200 and none of the unavailable
        phrases occur in the HTML. False otherwise, including when the request
        (or anything else inside the check) raises; in that case the error is
        printed rather than propagated.
    """
    # Headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    # Phrases that mark a video as gone; all lower-case because they are
    # matched against the lower-cased HTML below
    unavailable_phrases = (
        "video is not available",
        "video unavailable",
        "this video is private",
        "this video has been removed",
        "this video is no longer available",
    )

    try:
        # Get the video page
        response = requests.get(url, headers=headers, timeout=timeout)
        time.sleep(delay)  # Be polite to the server

        # Anything other than 200 is treated as "does not exist"
        if response.status_code != 200:
            return False

        # A 200 page can still be an "unavailable" placeholder, so inspect the HTML
        html = response.text.lower()
        return not any(phrase in html for phrase in unavailable_phrases)

    # Deliberate best-effort: any failure is reported and counted as "missing"
    except Exception as e:
        print(f"Error checking video {url}: {e}")
        return False


In [None]:
# Text-to-frame retrieval with CLIP: embed one image per sampled video, embed a
# fixed list of text queries, rank frames per query by cosine similarity, and
# write the top matches to a CSV for manual relevance annotation.

# Get data for the 100 sampled videos
df_unique = pd.read_csv("sampled_data_100_videos.csv")

# Add image path for each video
df_unique["image_path"] = df_unique["video_id"].apply(lambda vid: f"images/{vid}.jpg")

# Filter out rows where the image file does not exist
df_unique = df_unique[df_unique["image_path"].apply(lambda p: Path(p).exists())].reset_index(drop=True)

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Initialize lists to store image embeddings, valid video IDs, and the row
# labels they came from (used to keep the dataframe aligned with the embeddings)
image_embeddings = []
valid_video_ids = []
valid_row_labels = []

# For each image, preprocess and encode it
for row_label, row in df_unique.iterrows():
    try:
        # Open inside a context manager so the file handle is released as soon
        # as the pixels have been turned into a tensor
        with Image.open(row["image_path"]) as img:
            image = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = model.encode_image(image).cpu().numpy()[0]

        # Record the embedding only after every step succeeded
        image_embeddings.append(image_embedding)
        valid_video_ids.append(row["video_id"])
        valid_row_labels.append(row_label)
    # A bad image should not abort the whole run; report it and move on
    except Exception as e:
        print(f"Skipping {row['video_id']}: {e}")

# Fail early with a clear message instead of an opaque shape error further down
if not image_embeddings:
    raise RuntimeError("No images could be embedded; check the contents of the images/ directory.")

# Restrict to only those that succeeded. Selecting by row label keeps row i of
# df_clip paired with image_embeddings[i] even if video_id values repeat.
df_clip = df_unique.loc[valid_row_labels].reset_index(drop=True)
image_embeddings = np.array(image_embeddings)

# Initialize a list of queries
queries = [
    "a snowy mountain with a clear blue sky",
    "a crowded city street at night with neon signs",
    "a peaceful beach with palm trees and waves crashing",
    "a person jogging through a park in the morning",
    "a chef chopping vegetables in a kitchen",
    "a surfer riding a big wave",
    "a red sports car in front of a building",
    "a golden retriever sitting on a grassy field",
    "a firefighter rescuing a person from a burning building at night",
    "a person near luxury cars in cloudy weather speaking to someone"
]

# Encode the text queries using CLIP text encoder
with torch.no_grad():
    text_tokens = clip.tokenize(queries).to(device)
    text_embeddings = model.encode_text(text_tokens).cpu().numpy()

# Compute cosine similarity between each query and all video frames
# (rows follow `queries`, columns follow the rows of `df_clip`)
similarities = cosine_similarity(text_embeddings, image_embeddings)

# Set a threshold for similarity
threshold = 0.15
results = []

# for each query, find the top 5 most similar video frames
for i, query in enumerate(queries):
    # Get the similarity scores for the current query
    sim_scores = similarities[i]

    # Store the similarity scores in the dataframe
    df_clip["similarity"] = sim_scores

    # Filter out rows with similarity below the threshold and sort by similarity, taking the top 5
    top_matches = df_clip[df_clip["similarity"] > threshold].sort_values("similarity", ascending=False).head(5)

    # For each top match, append to results
    for rank, (_, row) in enumerate(top_matches.iterrows(), start=1):
        results.append({
            "query": query,
            "rank": rank,
            "video_id": row["video_id"],
            "caption": row["caption"],
            "url": row["url"],
            "start": row["start time"],
            "end": row["end time"],
            "similarity": row["similarity"]
        })

# Save to CSV for manual annotation. An existing file is never overwritten,
# because it may already contain hand-entered relevance labels.
df_results = pd.DataFrame(results)
results_path = Path("clip_bplus_results.csv")
if not results_path.exists():
    df_results.to_csv(results_path, index=False)
    print("Results saved to clip_bplus_results.csv")
else:
    print("File already exists. No save performed.")

In [None]:
# Read back the results file once it has been annotated by hand
df = pd.read_csv("clip_bplus_results.csv")

# Coerce relevance labels to integers: anything that does not parse as a
# number (including blank cells) becomes 0
relevance_numeric = pd.to_numeric(df["relevance"], errors="coerce")
df["relevance"] = relevance_numeric.fillna(0).astype(int)

# Function to calculate DCG
def dcg(scores):
    """Discounted cumulative gain of ``scores`` taken in the given order.

    The item at 0-based position ``i`` contributes ``score / log2(i + 2)``,
    so the first item is undiscounted. Returns 0 for an empty sequence.
    """
    total = 0
    # Counting from 2 makes the discount for the first item log2(2) == 1
    for discount_base, gain in enumerate(scores, start=2):
        total += gain / np.log2(discount_base)
    return total

# NDCG@5 for every query, keyed by the query text
ndcg_scores = {}

# Score each query's ranked list against the best ordering of its own labels
for query, rows in df.groupby("query"):
    # Relevance labels of the first five results, in predicted-rank order
    relevance = rows.sort_values("rank").head(5)["relevance"].tolist()

    # DCG of the ranking as produced by the model
    actual_dcg = dcg(relevance)

    # DCG of the same labels arranged from most to least relevant
    ideal_dcg = dcg(sorted(relevance, reverse=True))

    # Normalise; a query whose results are all irrelevant scores 0
    if ideal_dcg > 0:
        ndcg_scores[query] = actual_dcg / ideal_dcg
    else:
        ndcg_scores[query] = 0

# Report each query's score, then the mean over all queries
print("NDCG@5 per query:")
for query, score in ndcg_scores.items():
    print(f"- {query[:50]}...: {score:.3f}")

mean_ndcg = np.mean(list(ndcg_scores.values()))
print(f"\nMean NDCG@5 across all queries: {mean_ndcg:.3f}")

NDCG@5 per query:
- a chef chopping vegetables in a kitchen...: 1.000
- a crowded city street at night with neon signs...: 0.968
- a firefighter rescuing a person from a burning bui...: 0.967
- a golden retriever sitting on a grassy field...: 0.877
- a peaceful beach with palm trees and waves crashin...: 1.000
- a person jogging through a park in the morning...: 1.000
- a person near luxury cars in cloudy weather speaki...: 1.000
- a red sports car in front of a building...: 1.000
- a snowy mountain with a clear blue sky...: 0.500
- a surfer riding a big wave...: 1.000

Mean NDCG@5 across all queries: 0.931
