# Set the Working Directory

In [None]:
import os

# The notebook treats the process's current working directory as its root.
working_dir = os.getcwd()

# Guard clause: abort right away if that path is not an existing directory.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")

print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load configuration file
# The file is read explicitly as UTF-8: without an `encoding` argument, open()
# falls back to the locale's preferred encoding, which can fail on (or silently
# mis-decode) non-ASCII content depending on the platform.
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Resolve trace directory: <working_dir>/data/<value of the "working_trace" config key>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

# Stop here if the configured trace directory is missing, so later cells do not
# fail with less obvious errors.
if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

In [None]:
from utils import read_file_ids

# Ask the helper for the file IDs belonging to the trace directory.
# `overwrite=False` is forwarded unchanged; presumably it makes the helper reuse
# a previously stored ID list instead of regenerating it -- confirm in utils.
file_ids = read_file_ids(
    overwrite=False,
    trace_dir=trace_dir,
)

# Report how many IDs were returned.
print("📁 Total video files found: " + str(len(file_ids)))

# Build Multi-Feature Embedding Index

In [None]:
from utils import build_file_embeddings

# (feature name, embedding directory) pairs handed to build_file_embeddings.
# Written as a mapping for readability and flattened back into the list of
# tuples the helper receives; insertion order is preserved.
# NOTE(review): the directories look relative to `trace_dir` -- confirm in utils.
features_embedding_dirs = list(
    {
        "video_audio_content": "features/video_audio_embedding",
        "video_visual_content": "features/video_visual_embedding",
        "llm_generated_description": "features/llm_generated_description_embedding",
        "user_defined_metadata": "features/user_defined_metadata_embedding",
        "llm_generated_keywords": "features/llm_generated_keywords_embedding",
    }.items()
)

# Pickle path passed to the helper as the output location of the combined embeddings.
output_path = os.path.join(trace_dir, "file_embeddings_full.pickle")

# Build the per-file embeddings for every listed feature.
file_embeddings, shapes = build_file_embeddings(
    trace_dir,
    file_ids,
    features_embedding_dirs,
    output_path,
)

# Load Final Annotation
⚠️ Please ensure [the final annotation has been generated](annotation.ipynb) before running this cell.

In [None]:
from utils import load_pickle_file

# Read the final annotation produced by the annotation notebook.
annotation = load_pickle_file(
    os.path.join(trace_dir, "annotations/final_annotation.pickle")
)

# Summarise the annotation: an entry counts as positive only when its value is
# the literal `True`; every other entry is counted as negative.
total_pairs = len(annotation)
positive_pairs = sum(1 for value in annotation.values() if value is True)
negative_pairs = total_pairs - positive_pairs

# Emit the four summary lines in a single call.
print(
    "\n".join(
        [
            "📊 Annotation statistics:",
            f"  Total video pairs: {total_pairs:,}",
            f"  Positive (similar) pairs: {positive_pairs:,}",
            f"  Negative (not similar) pairs: {negative_pairs:,}",
        ]
    )
)

# Identify the Best Feature Combination and Similarity Threshold

In [None]:
from itertools import combinations
import numpy as np
import pickle
from tqdm.notebook import tqdm
from utils import compute_cosine_similarity, compute_metrics_across_thresholds

# Evaluate every non-empty combination of embedding features against the
# annotation and record, for each combination, the metrics at the threshold
# that maximises its F1-score. Results are cached on disk: if the cache file
# exists it is reused as-is, so delete it to force a re-evaluation (for example
# after the annotation or the embeddings have changed).
output_path = os.path.join(trace_dir, "feature_combo_metrics.pickle")

if os.path.isfile(output_path):
    # Load existing feature combination metrics.
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # this is acceptable only because the cache is written by this notebook.
    # Do not point it at a file from an untrusted source.
    with open(output_path, "rb") as f:
        feature_combo_metrics = pickle.load(f)
    print(f"✅ Loaded cached feature combination metrics from `{output_path}`")
else:
    features = [
        'video_audio_content',
        'video_visual_content',
        'llm_generated_description',
        'user_defined_metadata',
        'llm_generated_keywords',
    ]
    # Candidate similarity thresholds: 0.000, 0.001, ..., 0.999
    thresholds = np.arange(0, 1, 0.001)
    feature_combo_metrics = []

    # r = number of features combined (1 .. all of them)
    for r in range(1, len(features) + 1):
        # The combinations are materialised so tqdm knows the total for its progress bar.
        for combo in tqdm(list(combinations(features, r)), desc=f"Evaluating {r}-feature combinations"):
            similarity, ground_truth = compute_cosine_similarity(annotation, combo, file_embeddings)
            # Each returned metric is indexed by threshold position.
            accuracy, precision, recall, f1 = compute_metrics_across_thresholds(similarity, ground_truth, thresholds)
            best_idx = np.argmax(f1)  # first threshold reaching the highest F1-score
            # Entry layout: [features, accuracy, precision, recall, f1, threshold]
            feature_combo_metrics.append([
                combo,
                accuracy[best_idx],
                precision[best_idx],
                recall[best_idx],
                f1[best_idx],
                thresholds[best_idx]
            ])

    feature_combo_metrics = sorted(feature_combo_metrics, key=lambda x: -x[4])  # sort by F1-score (descending)

    # Save feature combination metrics to pickle
    with open(output_path, "wb") as f:
        pickle.dump(feature_combo_metrics, f)
    print(f"✅ Saved feature combination metrics to `{output_path}`")

# Display top-100 results. Ranks are 1-based and padded to three characters so
# that rank 100 stays aligned. The loop variables use their own names so they do
# not overwrite the `features` list / `f1` array defined in the branch above.
print("\n🏆 Top 100 Feature Combinations:")
for rank, (combo_features, acc, prec, rec, best_f1, threshold) in enumerate(feature_combo_metrics[:100], start=1):
    print(f"{rank:>3}. {combo_features}")
    print(f"  Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-score: {best_f1:.4f}, Threshold: {threshold:.3f}")

In [None]:
# Take the first entry of the metrics list as the winner (the list is sorted by
# descending F1-score when it is built). Index 0 of an entry holds the feature
# names and index 5 holds the selected threshold.
best_combo = feature_combo_metrics[0]

print(
    "\n🏅 Best Feature Combination:\n"
    f"  Features: {best_combo[0]}, Threshold: {best_combo[5]:.3f}"
)

# Record the winning combination in the configuration, together with the trace
# it was derived from. The feature names are stored as a list and the threshold
# as a plain float rounded to four decimals.
config["best_combo"] = dict(
    from_trace=config["working_trace"],
    features=list(best_combo[0]),
    threshold=round(float(best_combo[5]), 4),
)

# Write the whole configuration back to the file it was loaded from.
with open(config_path, "w") as f:
    yaml.safe_dump(config, f, indent=4)

print(f"\n✅ Saved the best feature combination and similarity threshold to: {config_path}")