# Retrieve and download the raw Youtube video

In [1]:
from pytubefix import YouTube

In [2]:
video_link = "https://www.youtube.com/watch?v=45KmZUc0CzA"
data_folder = "../data"


yt_item = YouTube(video_link)
print(yt_item.title)

stream = yt_item.streams.get_highest_resolution()
stream.download(output_path=f"{data_folder}/raw", filename=f"{yt_item.title}.mp4")

codec = stream.codecs[0]
print(codec)
fps = stream.fps
print(fps)
height, width = stream.width, stream.height
print(height, width)

Janja Garnbret Goes Gold at Paris 2024 ðŸ¥‡ ðŸ‘‘ | Full Replay all climbs
avc1.42001E
25
640 360


# Capture highlight moments from the video

## Naive and personal annotation
My first instinct is to try and select moments from the video that I personally consider as highlight moments, and then try to figure out which aspects from the video can be used to automate the process.

Here follows the video highlight moments timestamps:

Janja Garnbret Goes Gold at Paris 2024 ðŸ¥‡ ðŸ‘‘ | Full Replay all climbs
* 0:59 - 1:07
* 1:26 - 1:35
* 2:30 - 2:45
* 2:52 - 3:00
* 4:42 - 5:17
* 7:04 - 7:34

Specific Olympic Games transisions (Fade-in = FI // Fade-out)
* 0:58-0:59 (FI)
* 1:06-1:07
* 2:51-2:52 (FI)
* 3:00-3:01
* 4:42-4:43 (FI)
* 5:16:5-17

## Observations 

For this particular video, the video edits could be a great clue, as the olympic rings appear during transitions before and after a replay. There is also a specific sound at the beginning of the replay, when the olympic rings appear on the screen. It is important to note that this solution would not generalize well with any other video.

A more generalizable solution would be to detect when a single person is being the subject of the frame; we would expect these frames to "matter more" and, if there are enough consecutive frames, it would constitute a highlight moment.

Another general solution related to bouldering is the use of pose estimation, and to find a way to detect rapid changes in poses. This idea would enhance the person detection idea. The issue is how to determine what a "rapid change in poses" actually means.

Crowd noise could be an indicator of the relevance of a moment during the video, but this is hard to isolate from the commentators, and I feel that crowd noise was tuned down in the video.

---

Before going toward coding, I will rate the ideas on specific criterions to help me decide which solution to go to.

| Idea | Ease to Develop | Generalization to Other Videos | Expected Relevance |
|------|------------------|-------------------------------|--------------------|
| Olympic Rings (Sound is optional) | +++ | + | +++ |
| Single Person Detection | +++ | +++ | ++ |
| Pose Estimation and Rapid Changes | + | +++ | +++ |
| Crowd Noise Analysis | + | + | ++ |

Considering the project timeline, I will develop the most easy solutions which are the Olympic Rings (visually, and optionally with the sound approach), as well as the Single Person Detection.

# Olympic Rings and Sound

## Olympic Rings video transition detection

My first intuition when trying to detect the video transitions is to try and find classic DL Computer Vision techniques. 
Let's first set the fade-in and fade-out video sequences timestamps, and make a function to extract these video sequences.


In [73]:
import cv2
from datetime import time
from loguru import logger
from pathlib import Path
import torch
import clip
from PIL import Image

In [68]:
# Define fade-in and fade-out timestamps and durations
fade_in_timestamps = [
    time(minute=0, second=58, microsecond=800000),
    time(minute=2, second=51, microsecond=250000),
    time(minute=4, second=42, microsecond=400000),
]
fade_in_duration = time(second=1, microsecond=0)

fade_out_timestamps = [
    time(minute=1, second=6, microsecond=500000),
    time(minute=3, second=0, microsecond=300000),
    time(minute=5, second=16, microsecond=750000),
]
fade_out_duration = time(second=1, microsecond=0)

In [69]:
def retrieve_video_sequence(cap: cv2.VideoCapture, start_time: time, duration: time):
    """Given a video capture object, start time, and duration, retrieves the corresponding video frames."""

    start_frame = ((start_time.microsecond / 1e6) + start_time.second + start_time.minute * 60) * fps    # fps is defined globally from the stream metadata
    end_frame = ((duration.microsecond / 1e6) + duration.second + duration.minute * 60) * fps + start_frame
    
    frames = []
    for frame_num in range(int(start_frame), int(end_frame)):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            logger.warning(f"Could not read frame {frame_num}. Stopping retrieval.")
            break
        frames.append(frame)
    
    return frames


def write_video_sequence(frames, output_path: str, fps: float):
    """Writes a sequence of video frames to a video file.
    Note: output video is saved in .mp4 format using the 'mp4v' codec."""
    if not frames:
        logger.warning("No frames to write.")
        return
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Using 'mp4v' codec for .mp4 files
    out = cv2.VideoWriter(output_path, fourcc, fps, (height, width)) # width and height are defined globally from the stream metadata
    
    for frame in frames:
        out.write(frame)
    
    out.release()

In [70]:
def extract_fade_in_out_sequences(video_path: str) -> tuple[list, list]:
    """Extracts fade-in and fade-out sequences from the video based on predefined timestamps."""

    assert Path(video_path).is_file(), f"Video file {video_path} does not exist."

    cap = cv2.VideoCapture(video_path)

    fade_in_sequences = []
    for timestamp in fade_in_timestamps:
        sequence = retrieve_video_sequence(cap, timestamp, fade_in_duration)
        print(f"Writing fade-in sequence for timestamp {timestamp} with {len(sequence)} frames.")
        write_video_sequence(sequence, f"{data_folder}/sequences/fade_in_{timestamp.minute}_{timestamp.second}.mp4", fps) # fps is defined globally from the stream metadata
        fade_in_sequences.append(sequence)
    
    fade_out_sequences = []
    for timestamp in fade_out_timestamps:
        sequence = retrieve_video_sequence(cap, timestamp, fade_out_duration)
        print(f"Writing fade-out sequence for timestamp {timestamp} with {len(sequence)} frames.")
        write_video_sequence(sequence, f"{data_folder}/sequences/fade_out_{timestamp.minute}_{timestamp.second}.mp4", fps)  # fps is defined globally from the stream metadata
        fade_out_sequences.append(sequence)
    
    cap.release()

    return fade_in_sequences, fade_out_sequences

fade_in_sequences, fade_out_sequences = extract_fade_in_out_sequences(f"{data_folder}/raw/Janja Garnbret Goes Gold at Paris 2024 ðŸ¥‡ ðŸ‘‘ | Full Replay all climbs.mp4")

Writing fade-in sequence for timestamp 00:00:58.800000 with 25 frames.
Writing fade-in sequence for timestamp 00:02:51.250000 with 25 frames.
Writing fade-in sequence for timestamp 00:04:42.400000 with 25 frames.
Writing fade-out sequence for timestamp 00:01:06.500000 with 25 frames.
Writing fade-out sequence for timestamp 00:03:00.300000 with 25 frames.
Writing fade-out sequence for timestamp 00:05:16.750000 with 25 frames.


Now that we have the sequences, let's embed them and compare them together. This will tell us how "alike" the sequences are, and help us decide a threshold to recognize the sequence.

Then, we can then process all video frames and compare them to the mean of the sequences embeddings, and see if the sequences are correctly recognized.

In [75]:
def embed_video_sequence(frames, model, preprocess, device):
    """Embed the video sequence using a pre-trained model (e.g., CLIP, TimeSformer)."""

    sequence_input = [ preprocess(Image.fromarray(frame)).to(device) for frame in frames ]
    sequence_input = torch.stack(sequence_input)  # Stack frames into a batch tensor

    with torch.no_grad():
        frames_features =  model.encode_image(sequence_input) # Handle batch processing

    return frames_features

# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

fade_in_sequences_embeddings = [embed_video_sequence(seq, model, preprocess, device) for seq in fade_in_sequences]
fade_out_sequences_embeddings = [embed_video_sequence(seq, model, preprocess, device) for seq in fade_out_sequences]

In [84]:
def cosine_similarity(embeddings1, embeddings2):
    """Compute cosine similarity between two sets of embeddings."""
    similarity = torch.nn.CosineSimilarity(dim=1)(embeddings1, embeddings2)
    return similarity

# Compare fade-in embeddings between each other
for i, fade_in_emb in enumerate(fade_in_sequences_embeddings):
    for j, fade_in_emb2 in enumerate(fade_in_sequences_embeddings):
        similarity = cosine_similarity(fade_in_emb, fade_in_emb2)
        print(f"Similarity between fade-in sequence {i} and fade-in sequence {j}: {similarity.mean().item()}")

print("\n")

# Compare fade-out embeddings between each other
for i, fade_out_emb in enumerate(fade_out_sequences_embeddings):
    for j, fade_out_emb2 in enumerate(fade_out_sequences_embeddings):
        similarity = cosine_similarity(fade_out_emb, fade_out_emb2)
        print(f"Similarity between fade-out sequence {i} and fade-out sequence {j}: {similarity.mean().item()}")

Similarity between fade-in sequence 0 and fade-in sequence 0: 1.0
Similarity between fade-in sequence 0 and fade-in sequence 1: 0.81201171875
Similarity between fade-in sequence 0 and fade-in sequence 2: 0.85498046875
Similarity between fade-in sequence 1 and fade-in sequence 0: 0.81201171875
Similarity between fade-in sequence 1 and fade-in sequence 1: 1.0
Similarity between fade-in sequence 1 and fade-in sequence 2: 0.841796875
Similarity between fade-in sequence 2 and fade-in sequence 0: 0.85498046875
Similarity between fade-in sequence 2 and fade-in sequence 1: 0.841796875
Similarity between fade-in sequence 2 and fade-in sequence 2: 1.0


Similarity between fade-out sequence 0 and fade-out sequence 0: 1.0
Similarity between fade-out sequence 0 and fade-out sequence 1: 0.884765625
Similarity between fade-out sequence 0 and fade-out sequence 2: 0.94775390625
Similarity between fade-out sequence 1 and fade-out sequence 0: 0.884765625
Similarity between fade-out sequence 1 and fade-ou