# Set the Working Directory

In [None]:
import os

# The process's current working directory serves as the project root.
working_dir = os.getcwd()

# Guard clause: stop immediately when the directory is not usable.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")

print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load the configuration file. The encoding is pinned to UTF-8 so that parsing
# does not depend on the platform's default locale encoding.
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty YAML file loads as None and a missing key would otherwise surface as
# an opaque error, so report the actual problem explicitly.
if not isinstance(config, dict) or "working_trace" not in config:
    raise KeyError(f"'working_trace' is missing from {config_path}")

# Resolve the trace directory: <working_dir>/data/<working_trace>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

In [None]:
from utils import read_file_ids

# Collect the file IDs for the selected trace (overwrite is disabled).
file_ids = read_file_ids(trace_dir=trace_dir, overwrite=False)

n_files = len(file_ids)
print(f"📁 Total video files found: {n_files}")

# Load Metadata and Keywords
⚠️ Please ensure that [user-defined metadata has been processed](user_defined_metadata.ipynb) and [LLM-generated keywords have been created](llm_generated_keywords.ipynb) before running this cell.

In [None]:
from utils import load_pickle_file

# Both pickle files are read from the trace directory.
metadata_path = os.path.join(trace_dir, "metadata.pickle")
keywords_path = os.path.join(trace_dir, "keywords.pickle")

metadata = load_pickle_file(metadata_path)
keywords = load_pickle_file(keywords_path)

# Build Multi-Feature Embedding Index

In [None]:
from utils import build_file_embeddings

# Pair every configured feature with its "features/<feature>_embedding" path
# (presumably resolved against trace_dir by build_file_embeddings — verify).
features_embedding_dirs = [
    (name, os.path.join("features", f"{name}_embedding"))
    for name in config["best_combo"]["features"]
]

output_path = os.path.join(trace_dir, "file_embeddings_best.pickle")

file_embeddings, shapes = build_file_embeddings(
    trace_dir, file_ids, features_embedding_dirs, output_path
)

# Compute Binary Similarity Matrix

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from utils import build_ordered_embedding_matrix

# Build a concatenated embedding matrix
concat_embedding_matrix, ordered_ids = build_ordered_embedding_matrix(
    trace_dir, file_ids, file_embeddings, shapes
)

# Keep only the strict upper triangle (k=1) of the cosine similarity matrix, so
# each unordered pair appears once and self-similarity is dropped.
similarity_matrix = np.triu(cosine_similarity(concat_embedding_matrix), k=1)
threshold = config["best_combo"]["threshold"]

# Threshold, then mask again: the zeros written by np.triu would compare as
# "similar" for any negative threshold. For thresholds >= 0 this is identical
# to a plain `similarity_matrix > threshold`.
binary_similarity = np.triu(similarity_matrix > threshold, k=1)

print(f"✅ Computed binary similarity matrix with shape: {similarity_matrix.shape}")

# Perform Video Sequence Analysis

## Task 1: Values of $|\mathcal{V}|_{\max} / L$ for Each Window

In [None]:
import matplotlib.pyplot as plt
from utils import analyze_temporal_windows

# Window lengths L to compare in the same figure
window_lengths = [5, 10]

# Only the per-window normalized orders are needed here; the second return
# value is discarded.
start_normalized_orders, _ = analyze_temporal_windows(
    ordered_ids, keywords, binary_similarity, window_lengths=window_lengths, step=1
)

# One colour / line style per window length (cycled if there are more lengths)
colors = ["#F25022", "#00A4EF", "#7FBA00", "#FFB900"]
linestyles = ["-", "--", "-.", ":"]

fig, ax = plt.subplots(figsize=(6.4, 4.8))

for idx, length in enumerate(window_lengths):
    values = start_normalized_orders[length]
    window_index = np.arange(1, len(values) + 1)  # windows are numbered from 1
    ax.plot(
        window_index,
        values,
        color=colors[idx % len(colors)],
        linestyle=linestyles[idx % len(linestyles)],
        linewidth=1.5,
        label=fr"$L$ = {length}",
    )

# x-axis
ax.tick_params(axis='x', labelsize=14)
ax.set_xlabel(r"Window index $m$", fontsize=24)

# y-axis: slightly padded around [0, 1]
ax.set_ylim(-0.05, 1.05)
ax.set_yticks(np.arange(0, 1.2, 0.2))
ax.tick_params(axis='y', labelsize=14)
ax.set_ylabel(r"Values of $|\mathcal{V}|_{\max} / L$", fontsize=24)

ax.legend(fontsize=16, loc='upper right')
ax.grid(ls='--')

# Save the figure under <trace_dir>/figures
output_dir = os.path.join(trace_dir, "figures")
os.makedirs(output_dir, exist_ok=True)
figure_path = os.path.join(output_dir, "normalized_orders.pdf")
fig.savefig(figure_path, bbox_inches="tight")

plt.show()

## Task 2: Keyword Occurrences in Each Window

In [None]:
import matplotlib.pyplot as plt
from utils import analyze_temporal_windows

# Set parameters for analysis
length = 5
top_k = 6

# Analyze video sequence
start_normalized_orders, start_components_keywords = analyze_temporal_windows(
    ordered_ids, keywords, binary_similarity, window_lengths=[length], step=1
)

# Track keyword occurrences across windows.
# occurrence_map[kw][m] is 1.0 when kw appears in the window starting at m.
# Every other entry holds a large negative sentinel: after scaling by the row
# offset it falls far below the visible y-range, so the point is not drawn.
absent_marker = -100.0
occurrence_map = {}
occurrence_counts = {}
nof_windows = len(start_normalized_orders[length])

for start, components_keywords in start_components_keywords[length]:
    for _, keywords_in_component in components_keywords:
        for kw in keywords_in_component:
            if kw not in occurrence_map:
                occurrence_map[kw] = np.full(nof_windows, absent_marker)
                # Start at 0: the increment below already counts this occurrence.
                occurrence_counts[kw] = 0
            occurrence_map[kw][start] = 1.0
            occurrence_counts[kw] += 1

# Rank keywords by the number of windows they appear in. This gives the same
# order as summing the sentinel-filled arrays, but states the intent directly.
top_k = min(top_k, len(occurrence_map))
ranked_keywords = sorted(
    (
        (kw, int(np.count_nonzero(marks == 1.0)))
        for kw, marks in occurrence_map.items()
    ),
    key=lambda item: -item[1],
)[:top_k]

# Visualization settings
colors = ["#F25022", "#00A4EF", "#7FBA00", "#FFB900"]
# The palette has fewer entries than top_k may request; switch marker shape each
# time the colours wrap around so that legend entries stay distinguishable.
markers = ["o", "s", "^", "D"]

fig, ax = plt.subplots(figsize=(6.4, 4.8))

x = np.arange(1, nof_windows + 1)
y_offset = np.arange(1, top_k + 1)  # one horizontal row per ranked keyword

for idx, (kw, _) in enumerate(ranked_keywords):
    y = occurrence_map[kw] * y_offset[idx]
    ax.scatter(
        x,
        y,
        color=colors[idx % len(colors)],
        marker=markers[(idx // len(colors)) % len(markers)],
        label=kw,
    )

ax.tick_params(axis='x', labelsize=24)
ax.set_xlabel(r"Window index $m$", fontsize=24)

# Leave head-room above the top row for the legend; the lower limit also hides
# the sentinel points.
margin = (top_k + 2.5) / 20
ax.set_ylim(1 - margin, top_k + 2.5 + margin)
ax.set_yticks(y_offset, labels=[""] * top_k)
ax.tick_params(axis='y', labelsize=24)
ax.set_ylabel("Keyword occurrences", fontsize=24)

ax.legend(ncol=2, loc='upper center', fontsize=16)
ax.grid(ls='--')

output_dir = os.path.join(trace_dir, "figures")
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, "keyword_occurrences.pdf"), bbox_inches="tight")

plt.show()

## Task 3: Visualize Windows and Key Frames

In [None]:
import itertools
from utils import analyze_temporal_windows

# Set parameters for analysis
length = 10
visualization_mode = "start"  # Options: "start" and "sequential"

# Analyze video sequence
start_normalized_orders, start_components_keywords = analyze_temporal_windows(
    ordered_ids, keywords, binary_similarity, window_lengths=[length], step=1
)

# Determine start index based on visualization mode
if visualization_mode == "sequential":
    target_order = 5  # <== Change this! It sets the target component order
    # Compare with a tolerance: normalized * length is a float product and is
    # not always exactly an integer (e.g. 0.7 * 10 == 7.000000000000001).
    window_orders = np.asarray(start_normalized_orders[length], dtype=float) * length
    matches = np.flatnonzero(np.isclose(window_orders, target_order))
    if len(matches) < 1:
        raise ValueError(f"No window found with normalized order {target_order / length:.2f}.")
    # Summarize the first match WITHOUT consuming the cycle, so that the first
    # run of the visualization cell (which calls next(index_cycle)) shows this
    # same window instead of skipping ahead to the second match.
    index_cycle = itertools.cycle(matches)
    start = matches[0]

elif visualization_mode == "start":
    start = 0  # <== Change this! It sets the window start index
    if not (0 <= start < len(start_normalized_orders[length])):
        raise IndexError("Start index out of range.")

else:
    raise ValueError(f"Unsupported inspection mode: {visualization_mode}")

# Display summary info
# round() rather than int(): int() truncates, which under-reports the order
# whenever the float product lands just below the integer.
order_info = int(round(start_normalized_orders[length][start] * length))
# Each entry is a (window start, components) pair; components is a list of
# pairs whose second element is the keyword collection of that component.
keyword_info = start_components_keywords[length][start]
print(f"ℹ️ Window start index: {start}")
print(f"Largest connected component order: {order_info}")
print(f"Associated keywords: {keyword_info[1][0][1] if keyword_info[1] else []}")

if visualization_mode == "sequential":
    print("\n⚠️ Manually loop through the cell below to visualize each window!")

> If you selected `sequential` mode, re-run the cell below repeatedly to step through the matching windows one at a time 🔽

In [None]:
from matplotlib.patches import FancyArrowPatch
import random
from utils import display_metadata_and_videos

# In sequential mode each run of this cell moves on to the next matching window.
if visualization_mode == "sequential":
    start = next(index_cycle)

window_end = start + length
nodes = range(start, window_end)

# Collect every pair (src < dst) inside the window that is flagged as similar.
edges = [
    (src, dst)
    for src in nodes
    for dst in range(src + 1, window_end)
    if binary_similarity[src, dst]
]

# Visualize connected components: nodes sit on a horizontal line and each
# similar pair is joined by an arc bending randomly above or below it.
fig, ax = plt.subplots(figsize=(15, 4))

for src, dst in edges:
    bend = random.choice([-1, 1]) * 0.25
    ax.add_patch(
        FancyArrowPatch(
            [src, 0], [dst, 0],
            connectionstyle=f"arc3,rad={bend}",
            arrowstyle="-|>", color="gray",
        )
    )

# Node markers are drawn above the arrows (zorder=3), each labelled by its index.
ax.scatter(nodes, np.zeros(len(nodes)), s=1250, color="#007ACC", zorder=3)

for node in nodes:
    ax.text(
        node, 0, str(node),
        color="white", fontsize=12, fontweight="bold",
        ha='center', va='center',
    )

ax.set_xlim([min(nodes) - 1, max(nodes) + 1])
ax.set_ylim([-3, 3])
ax.axis('off')

# Save the figure under <trace_dir>/figures
output_dir = os.path.join(trace_dir, "figures")
os.makedirs(output_dir, exist_ok=True)
figure_path = os.path.join(output_dir, "window_connected_components.pdf")
fig.savefig(figure_path, bbox_inches="tight")

plt.show()

# Visualize metadata and videos
video_ids = [ordered_ids[idx] for idx in nodes]
display_metadata_and_videos(video_ids, metadata, trace_dir)

In [None]:
import cv2
from tqdm.notebook import tqdm

# Sample video frames: keep every `sample_every_n_frames`-th decoded frame.
frames = {}
sample_every_n_frames = 30

for file_id in tqdm(video_ids, desc="Sampling video frames"):
    video_path = os.path.join(trace_dir, "videos", f"{file_id}.mp4")
    cap = cv2.VideoCapture(video_path)

    try:
        if not cap.isOpened():
            print(f"❌ Error opening video: {file_id}")
            continue

        frames[file_id] = []
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            if frame_count % sample_every_n_frames == 0:
                frames[file_id].append(frame)
    finally:
        # Release the capture on every path, including the failed-open one.
        cap.release()

print("✅ Completed frame sampling.")
print("Sampled frame counts:", [len(frames[fid]) for fid in frames])

# Visualize the first sampled frame from each video.
# squeeze=False keeps `axes` two-dimensional even for a single video.
fig, axes = plt.subplots(
    nrows=1, ncols=len(video_ids), figsize=(20, 40), squeeze=False
)

for ax, file_id in zip(axes[0], video_ids):
    sampled = frames.get(file_id)
    if sampled:
        # Frames are converted from BGR to RGB before display.
        ax.imshow(cv2.cvtColor(sampled[0], cv2.COLOR_BGR2RGB))
    else:
        # The video could not be opened or has fewer than
        # `sample_every_n_frames` frames: leave a labelled placeholder
        # instead of failing with KeyError / IndexError.
        print(f"⚠️ No sampled frame available for video: {file_id}")
        ax.text(0.5, 0.5, "no frame", ha="center", va="center",
                transform=ax.transAxes)
    ax.set_xticks([])
    ax.set_yticks([])

output_dir = os.path.join(trace_dir, "figures")
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, "window_key_frames.pdf"), bbox_inches="tight")

plt.show()