In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
import requests
import time
import os

# Fetch NLTK's 'punkt_tab' resource into the local nltk_data directory; the call
# returns True and leaves the data alone when the package is already up to date.
# Presumably this is the tokenizer data word_tokenize relies on in later cells —
# TODO confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aneeshponduru/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Read the MSR-VTT train split from its parquet shard. df_raw is the source table
# that the following cells filter and sample from.
df_raw = pd.read_parquet(path="train-00000-of-00001-60e50ff5fbbd1bb5.parquet")

# Preview the first five rows.
df_raw.head(n=5)

Unnamed: 0,video_id,caption,sen_id,category,url,start time,end time,split,id,__index_level_0__
0,video0,a car is shown,77300,9,https://www.youtube.com/watch?v=9lZi22qLlEo,137.72,149.44,train,0,0
1,video1,in a kitchen a woman adds different ingredient...,110460,16,https://www.youtube.com/watch?v=w4JM08PDEng,184.33,206.89,train,1,1
2,video10,a man holds two dogs,47320,6,https://www.youtube.com/watch?v=CcJwo2eyfI0,33.33,46.53,train,10,2
3,video100,a basset hound sits outside a door,18360,12,https://www.youtube.com/watch?v=6S-47swQBBU,1146.06,1156.44,train,100,3
4,video1000,a woman is wearing a costume,49000,7,https://www.youtube.com/watch?v=ALrHNDBK-jw,738.93,749.93,train,1000,4


In [None]:
# Check whether a video still exists on YouTube
# (exists = the watch page loads and is not flagged as unavailable/private/removed)
def video_exists_youtube(url, timeout=10, delay=1):
    """Return True when the YouTube watch page at `url` looks playable.

    This is a heuristic: the page is fetched with a browser-like User-Agent and
    its lower-cased HTML is searched for a fixed set of "unavailable" phrases.

    Args:
        url: Watch-page URL to probe.
        timeout: Seconds to wait for the HTTP response (default 10).
        delay: Seconds to pause after every attempt, successful or not, so
            that consecutive calls do not hammer the server (default 1).

    Returns:
        False when the request fails, the status code is not 200, or any of
        the unavailable phrases occurs in the page; True otherwise.
    """
    # Headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    # Lower-case markers that this check treats as "video cannot be watched"
    unavailable_phrases = (
        "video is not available",
        "video unavailable",
        "this video is private",
        "this video has been removed",
        "this video is no longer available",
    )

    try:
        response = requests.get(url, headers=headers, timeout=timeout)

        # Anything other than a plain 200 is treated as "not available"
        if response.status_code != 200:
            return False

        html = response.text.lower()
        return not any(phrase in html for phrase in unavailable_phrases)

    # Deliberately broad: this is a best-effort probe, so any failure
    # (timeout, connection error, malformed URL, ...) is reported and counted
    # as "not available" instead of aborting the whole scan.
    except Exception as e:
        print(f"Error checking video {url}: {e}")
        return False

    finally:
        # Be polite to the server on every path, including failed requests
        # (previously a failed request skipped the pause entirely).
        time.sleep(delay)


In [None]:
# Build the evaluation pool: 90 randomly sampled videos whose YouTube link is
# still live, plus 10 handpicked videos (1 relevant video per query) = 100 videos.

# One row per video: keep the first row encountered for each video_id
df_unique = df_raw.drop_duplicates(subset=["video_id"])

# The 10 handpicked relevant videos
relevant_video_ids = [
    "video5870",
    "video4900",
    "video5524",
    "video5830",
    "video3855",
    "video3604",
    "video5942",
    "video3525",
    "video1634",
    "video453"
]

# Every availability check is a network request, so only the first 230 candidate
# URLs are probed and at most 200 live ones are kept
# (this is for B-grade implementation. More videos will be added later)
urls = df_unique["url"].tolist()[:230]
valid_urls = [url for url in urls if video_exists_youtube(url)][:200]

# Random pool = live videos that are NOT handpicked. Excluding the handpicked
# IDs here guarantees the two groups never overlap, so the final concat cannot
# contain the same video twice.
is_live = df_unique["url"].isin(valid_urls)
is_handpicked = df_unique["video_id"].isin(relevant_video_ids)
df_valid = df_unique[is_live & ~is_handpicked]

if len(df_valid) < 90:
    raise ValueError(
        f"Only {len(df_valid)} live videos available; need at least 90 to sample from."
    )

# Reproducible random draw of 90 video IDs
random_video_ids = df_valid.sample(n=90, random_state=1)["video_id"].to_list()

# Get full rows for the 90 random videos
df_sampled_90 = df_unique[df_unique["video_id"].isin(random_video_ids)]

# Get full rows for the 10 handpicked relevant videos
df_manual_10 = df_unique[is_handpicked]

# Surface handpicked IDs that are absent from the dataset instead of silently
# ending up with fewer than 100 videos
missing_relevant_ids = sorted(set(relevant_video_ids) - set(df_manual_10["video_id"]))
if missing_relevant_ids:
    print(f"Warning: handpicked video IDs not found in dataset: {missing_relevant_ids}")

# Combine into one dataframe of unique video IDs with their caption.
# NOTE: despite its name, sampled_video_ids is a DataFrame with the columns
# ["video_id", "caption"]; later cells read its "video_id" column.
sampled_video_ids = pd.concat([df_sampled_90, df_manual_10], ignore_index=True)[["video_id", "caption"]]

checkingyt
start
done


In [None]:
# Pull every raw row (i.e. all captions) that belongs to one of the sampled videos
selected_ids = sampled_video_ids["video_id"].to_list()
df_sample = df_raw.loc[df_raw["video_id"].isin(selected_ids)]

# Collapse to a single metadata record per video: groupby(...).first() keeps the
# first non-null value of each column within a video's rows
df_metadata = df_sample.groupby("video_id", as_index=False).first()

In [None]:
# BM25 operates on token lists: lower-case every caption and split it with NLTK,
# storing the result next to the caption in a "tokens" column
df_metadata["tokens"] = df_metadata["caption"].map(
    lambda caption: word_tokenize(caption.lower())
)

# Build the BM25 index; document i of the index is row i of df_metadata
bm25 = BM25Okapi(list(df_metadata["tokens"]))

In [None]:
# Free-text evaluation queries. The retrieval cell tokenizes each one, scores it
# against the caption index with BM25 and keeps its top 5 videos; those rows are
# then written to CSV for manual relevance annotation.
queries = [
    "a snowy mountain with a clear blue sky",
    "a crowded city street at night with neon signs",
    "a peaceful beach with palm trees and waves crashing",
    "a person jogging through a park in the morning",
    "a chef chopping vegetables in a kitchen",
    "a surfer riding a big wave",
    "a red sports car in front of a building",
    "a golden retriever sitting on a grassy field",
    "a firefighter rescuing a person from a burning building at night",
    "a person near luxury cars in cloudy weather speaking to someone"
]

In [None]:
# One record per (query, retrieved video) pair is collected here
results = []

for query in queries:
    # Same preprocessing as the captions: lower-case, then NLTK tokenization
    query_tokens = word_tokenize(query.lower())

    # BM25 score of the query against every caption in the index
    scores = bm25.get_scores(query_tokens)

    # Positions of the 5 highest scores, best first
    top_indices = np.argsort(scores)[::-1][:5]

    # Record the details of each hit; ranks are 1-based
    for rank, idx in enumerate(top_indices, start=1):
        row = df_metadata.iloc[idx]
        results.append({
            "query": query,
            "rank": rank,
            "video_id": row["video_id"],
            "caption": row["caption"],
            "score": scores[idx],
            "link": row["url"],
            "start": row["start time"],
            "end": row["end time"]
        })

In [None]:
# Export the retrieval results for manual annotation. An existing file is never
# overwritten (presumably so that relevance labels already entered by hand are
# not lost — confirm).
results_df = pd.DataFrame(results)
output_csv = "bm25_b_level_results.csv"
if os.path.exists(output_csv):
    print("File already exists. No save performed.")
else:
    results_df.to_csv(output_csv, index=False)
    print("Results saved to bm25_b_level_results.csv")

BM25 retrieval results saved to bm25_b_level_results.csv


In [None]:
# Read the results back in once the "relevance" column has been filled in by hand
df = pd.read_csv("bm25_b_level_results.csv")

# Normalize the labels to integers: anything that is not a number (including
# blank cells) becomes NaN first and is then counted as relevance 0
relevance_numeric = pd.to_numeric(df["relevance"], errors="coerce")
df["relevance"] = relevance_numeric.fillna(0).astype(int)

# Discounted cumulative gain
def dcg(scores):
    """Return the DCG of the relevance `scores`, given in rank order.

    The item at 0-based position i contributes scores[i] / log2(i + 2), so the
    first item is not discounted (log2(2) == 1). An empty sequence yields 0.
    """
    total = 0
    for position, gain in enumerate(scores):
        total += gain / np.log2(position + 2)
    return total

# NDCG@5 for every query, keyed by the query text
ndcg_scores = {}

# Calculate NDCG for each query
for query, group in df.groupby("query"):
    # The (at most) 5 results of this query in predicted rank order
    top_k = group.sort_values("rank").head(5)

    # DCG of the ranking as produced by BM25
    relevance = top_k["relevance"].tolist()
    actual_dcg = dcg(relevance)

    # Ideal DCG = the same labels re-ordered best-first.
    # NOTE: only the retrieved (annotated) items are considered, so relevant
    # videos that were never retrieved do not lower the score.
    ideal_relevance = sorted(relevance, reverse=True)
    ideal_dcg = dcg(ideal_relevance)

    # A query with no relevant result at all has ideal_dcg == 0; score it 0
    ndcg = actual_dcg / ideal_dcg if ideal_dcg > 0 else 0
    ndcg_scores[query] = ndcg

# Print per-query and average NDCG@5
print("NDCG@5 per query:")
for query, score in ndcg_scores.items():
    # Only mark the query with an ellipsis when it was actually cut off
    # (previously "..." was appended to every query, truncated or not)
    label = query if len(query) <= 50 else f"{query[:50]}..."
    print(f"- {label}: {score:.3f}")

mean_ndcg = np.mean(list(ndcg_scores.values()))
print(f"\nMean NDCG@5 across all queries: {mean_ndcg:.3f}")

NDCG@5 per query:
- a chef chopping vegetables in a kitchen...: 1.000
- a crowded city street at night with neon signs...: 0.924
- a firefighter rescuing a person from a burning bui...: 0.877
- a golden retriever sitting on a grassy field...: 0.920
- a peaceful beach with palm trees and waves crashin...: 1.000
- a person jogging through a park in the morning...: 0.712
- a person near luxury cars in cloudy weather speaki...: 0.865
- a red sports car in front of a building...: 0.724
- a snowy mountain with a clear blue sky...: 0.924
- a surfer riding a big wave...: 0.000

Mean NDCG@5 across all queries: 0.795


In [None]:
# Save the sampled data for the 100 videos
#df_raw[df_raw['video_id'].isin(sampled_video_ids['video_id'].to_list())].to_csv("sampled_data_100_videos.csv", index=False)