# Set the Working Directory

In [None]:
import os

# Every path used by the notebook is resolved relative to the directory the
# kernel was started in.
working_dir = os.getcwd()

# Guard clause: stop immediately if that directory is not usable.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")

print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load configuration file.
# The encoding is given explicitly so that non-ASCII values (e.g. trace names)
# decode the same way on every platform instead of depending on the locale.
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty YAML document parses to None and a non-mapping document has no keys;
# report both cases with a message that names the file and the missing key.
if not isinstance(config, dict) or "working_trace" not in config:
    raise KeyError(f"'working_trace' is not defined in {config_path}")

# Resolve trace directory: <working_dir>/data/<working_trace>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

In [None]:
from utils import read_file_ids

# Collect the file IDs for the selected trace directory.
# NOTE(review): `overwrite=False` presumably reuses a previously cached ID list
# instead of regenerating it; confirm against utils.read_file_ids.
file_ids = read_file_ids(
    overwrite=False,
    trace_dir=trace_dir,
)

# Report how many IDs were returned.
print(f"📁 Total video files found: {len(file_ids)}")

# Build Multi-Feature Embedding Index

In [None]:
from utils import build_file_embeddings

# (feature name, embedding directory) pairs, in the order they are handed to
# build_file_embeddings. Written as an insertion-ordered mapping and flattened
# to a list of tuples.
# NOTE(review): the directories are presumably relative to `trace_dir`; confirm
# against utils.build_file_embeddings.
features_embedding_dirs = list(
    {
        "video_audio_content": "features/video_audio_embedding",
        "video_visual_content": "features/video_visual_embedding",
        "llm_generated_description": "features/llm_generated_description_embedding",
        "user_defined_metadata": "features/user_defined_metadata_embedding",
        "llm_generated_keywords": "features/llm_generated_keywords_embedding",
    }.items()
)

# Pickle path passed to the builder as its output location.
output_path = os.path.join(trace_dir, "file_embeddings_full.pickle")

file_embeddings, shapes = build_file_embeddings(
    trace_dir,
    file_ids,
    features_embedding_dirs,
    output_path,
)

# Build Concatenated Embedding Matrix

In [None]:
from utils import build_ordered_embedding_matrix

# Build the concatenated embedding matrix together with the list of IDs that
# later cells use to map matrix indices back to files.
# NOTE(review): row k of the matrix presumably corresponds to ordered_ids[k];
# confirm against utils.build_ordered_embedding_matrix.
concat_embedding_matrix, ordered_ids = build_ordered_embedding_matrix(
    trace_dir,
    file_ids,
    file_embeddings,
    shapes,
)

# Compute Pairwise Cosine Similarity

In [None]:
import numpy as np
import numpy.ma as ma
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise cosine similarity between all rows of the embedding matrix.
similarity_matrix = cosine_similarity(concat_embedding_matrix)

# Mask self-similarity (the diagonal) and the lower triangle so that each
# unordered pair (i, j) stays visible exactly once, at position i < j.
# np.tri builds the boolean lower-triangular mask directly, avoiding a dense
# float64 matrix of ones that would be cast to bool afterwards.
nof_rows, nof_cols = similarity_matrix.shape
mask = np.tri(nof_rows, nof_cols, k=0, dtype=bool)

masked_similarity_matrix = ma.masked_array(similarity_matrix, mask=mask)

print(f"✅ Computed and masked similarity matrix. Shape: {masked_similarity_matrix.shape}")

# Plot CDF of Pairwise Cosine Similarity

In [None]:
import matplotlib.pyplot as plt

# Keep only the unmasked similarities and sort them; the sorted values are the
# x-axis of the empirical CDF.
flattened_values = masked_similarity_matrix.compressed()
sorted_values = np.sort(flattened_values)

# Empirical CDF: the k-th smallest value is assigned probability k / n.
cdf = np.arange(1, sorted_values.size + 1) / sorted_values.size

# The range is reused by the sampling cell to define the similarity bins.
min_similarity, max_similarity = sorted_values.min(), sorted_values.max()
print(f"🔍 Cosine similarity range: {min_similarity:.4f} to {max_similarity:.4f}")
print(f"🎞️ Total number of video pairs: {len(sorted_values):,}")

# Single-axes figure.
fig = plt.figure(figsize=(6, 3))
ax = fig.add_subplot(1, 1, 1)

ax.plot(sorted_values, cdf, linestyle="-", color="#00A4EF", linewidth=4)

# Axis labels.
ax.set_xlabel('Cosine similarity', fontsize=15)
ax.set_ylabel('CDF', fontsize=15)

# Leave a small margin around [0, 1] on the y-axis and tick every 0.2.
ax.set_ylim(-0.05, 1.05)
ax.set_yticks(np.arange(0, 1.2, 0.2))

# Same tick label size on both axes.
ax.tick_params(axis='both', labelsize=15)

ax.grid(ls='--', zorder=3)

# Save the figure under <trace_dir>/figures, creating the folder if needed.
output_dir = os.path.join(trace_dir, "figures")
os.makedirs(output_dir, exist_ok=True)
fig.savefig(
    os.path.join(output_dir, "pairwise_cosine_similarity_cdf.pdf"),
    bbox_inches="tight",
)

plt.show()

# Sample Video Pairs

In [None]:
import pickle
import random

sampled_pairs_path = os.path.join(trace_dir, "sampled_pairs.pickle")

# Sampling configuration: split [min_similarity, max_similarity] into equal-width
# bins and draw up to `nof_sampled_pairs_per_bin` pairs from each of them.
nof_bins = 5
nof_sampled_pairs_per_bin = 10
delta = (max_similarity - min_similarity) / nof_bins

# Set to an int for a reproducible sample. With None the module-level `random`
# state is used, so an earlier `random.seed(...)` call is still honoured.
sampling_seed = None
rng = random if sampling_seed is None else random.Random(sampling_seed)

# np.linspace returns exactly nof_bins + 1 edges and its last edge is exactly
# max_similarity. A float-step np.arange can instead produce an extra bin, and
# fails when all similarities are equal (delta == 0).
bin_edges = np.linspace(min_similarity, max_similarity, nof_bins + 1)

# Collect the unmasked (i < j) entries once instead of re-scanning the whole
# matrix for every bin.
valid_rows, valid_cols = np.nonzero(~np.ma.getmaskarray(masked_similarity_matrix))
valid_values = np.ma.getdata(masked_similarity_matrix)[valid_rows, valid_cols]

sampled_pairs = []
for bin_idx in range(nof_bins):
    lower, upper = bin_edges[bin_idx], bin_edges[bin_idx + 1]

    # Bins are half-open [lower, upper), except the last one, which is closed so
    # that the pair(s) with the maximum similarity are not silently dropped.
    is_last_bin = bin_idx == nof_bins - 1
    if is_last_bin:
        in_bin = (valid_values >= lower) & (valid_values <= upper)
    else:
        in_bin = (valid_values >= lower) & (valid_values < upper)

    # Matrix index pairs as plain Python ints.
    indexes = list(zip(valid_rows[in_bin].tolist(), valid_cols[in_bin].tolist()))
    closing_bracket = "]" if is_last_bin else ")"
    print(f"[{lower:.2f}, {upper:.2f}{closing_bracket}: {len(indexes):,} pairs")

    # random.sample already draws uniformly without replacement, so no prior
    # shuffle is needed.
    sampled_count = min(nof_sampled_pairs_per_bin, len(indexes))
    sampled_pairs += rng.sample(indexes, sampled_count)
    print(f"  Sampled {sampled_count} pairs")

# Sanity checks on internal invariants: bins are disjoint, and only the strict
# upper triangle is unmasked.
assert len(sampled_pairs) == len(set(sampled_pairs)), "Sampled pairs must be unique"
assert all(i < j for i, j in sampled_pairs), "Duplicate pairs in reverse order"

# Convert matrix indices to file IDs
sampled_pairs = [(ordered_ids[i], ordered_ids[j]) for i, j in sampled_pairs]

# Save sampled video pairs to pickle
with open(sampled_pairs_path, "wb") as f:
    pickle.dump(sampled_pairs, f)
print(f"✅ Saved sampled pairs to `{sampled_pairs_path}`")