## EMBEDDING

In [None]:
!pip install huggingface_hub transformers av tqdm

In [1]:
import numpy as np
import av
import torch
import os
import json
from tqdm import tqdm

from transformers import VivitConfig, VivitModel, VivitImageProcessor
from huggingface_hub import hf_hub_download

# Seed NumPy's global RNG so the random clip window chosen by
# sample_frame_indices (which calls np.random.randint) is repeatable.
np.random.seed(0)

def read_video_pyav(container, indices):
    """Decode the requested frames of a video into one stacked RGB array.

    container: object exposing ``seek`` and ``decode(video=0)``.
    indices: ordered frame indices to keep; the first and last entries
        bound the range of frames that is scanned.

    Returns ``np.stack`` of the selected frames, each converted with
    ``to_ndarray(format="rgb24")``.
    """
    # Rewind so frame numbering starts from the beginning of the stream
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]

    rgb_frames = []
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing past the last requested index is needed: stop decoding
        if position > last_wanted:
            break
        if position < first_wanted or position not in indices:
            continue
        rgb_frames.append(frame.to_ndarray(format="rgb24"))
    return np.stack(rgb_frames)

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Pick ``clip_len`` frame indices from a random window of the video.

    clip_len: number of indices to return.
    frame_sample_rate: spacing factor; the window spans
        ``int(clip_len * frame_sample_rate)`` frames.
    seg_len: total number of frames available in the video.

    Returns an ``np.int64`` array of ``clip_len`` indices, all within
    ``[0, seg_len - 1]``.

    Raises ValueError if the video has fewer frames than the window needs.
    """
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len < converted_len:
        raise ValueError(
            f"Video has {seg_len} frames but {converted_len} are required to "
            f"sample {clip_len} frames at rate {frame_sample_rate}"
        )

    if seg_len == converted_len:
        # Exactly one window fits. np.random.randint(low, high) would raise
        # here because low == high, so use the whole video directly.
        end_idx = converted_len
    else:
        end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len

    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; clip so the last index stays inside
    # the window before truncating to integers.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

def get_video_frame_count(container):
    """Return how many frames ``container.decode(video=0)`` yields.

    The count is obtained by iterating over every decoded frame.
    """
    return sum(1 for _ in container.decode(video=0))

# Folder whose video files will be embedded
video_folder_path = "../Videos/"

# Both the image processor and the ViViT model come from the same
# pretrained checkpoint, so the name is spelled out only once.
checkpoint_name = "google/vivit-b-16x2-kinetics400"
image_processor = VivitImageProcessor.from_pretrained(checkpoint_name)
model = VivitModel.from_pretrained(checkpoint_name)

# Maps video file name -> embedding (filled in by the processing loop)
embeddings = dict()

# Collect the video files (.mov / .mp4) up front. Sorting fixes the
# processing order (os.listdir order is arbitrary), which keeps the seeded
# random frame sampling reproducible, and a concrete list lets tqdm show a
# total.
video_files = sorted(
    name for name in os.listdir(video_folder_path)
    if name.endswith(('.mov', '.mp4'))
)

# Loop through each video file in the folder
for video_file in tqdm(video_files):
    # Build the path from the same folder that was listed above
    video_path = os.path.join(video_folder_path, video_file)

    try:
        # The context manager closes the container even if decoding fails
        with av.open(video_path) as container:
            frame_count = get_video_frame_count(container)

            # Sample 32 frames
            indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=frame_count)
            video_frames = read_video_pyav(container, indices)

        # Prepare video frames for model
        inputs = image_processor(list(video_frames), return_tensors="pt")

        # Forward pass; no gradients are needed for feature extraction
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states

        # Report the shape of every layer's hidden state
        for layer_idx, hidden_state in enumerate(hidden_states):
            print(f"Layer {layer_idx} hidden state shape: {hidden_state.shape}")

        # Keep the last layer's hidden state as the embedding, converted to
        # nested lists so it can be serialized to JSON
        embedding = hidden_states[-1].detach().numpy().tolist()
        embeddings[video_file] = embedding

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next video
        print(f"Error processing video {video_file}: {e}")


# Write all embeddings to a single JSON file
with open('video_embeddings.json', 'w') as out_file:
    json.dump(embeddings, out_file)

# Read the file back to check that it round-trips
with open('video_embeddings.json', 'r') as in_file:
    saved_embeddings = json.load(in_file)

# Summarize what was stored for each video
for video_name in saved_embeddings:
    embedding = saved_embeddings[video_name]
    print(f"Video: {video_name}, Embedding Length: {len(embedding)}")



ModuleNotFoundError: No module named 'av'

## KNN

In [None]:
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load the .ndjson file into a dictionary: each non-empty line is a JSON
# object mapping video names to embeddings
embeddings_dict = {}
file_path = '/mnt/data/data (2).ndjson'  # Replace with the actual path of .ndjson file
with open(file_path, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        data = json.loads(line)
        for video_name, embedding in data.items():
            embeddings_dict[video_name] = embedding

if not embeddings_dict:
    raise ValueError(f"No embeddings found in {file_path}")

# Prepare the embeddings for KNN
# Row i of the array corresponds to video_names[i]
video_names = list(embeddings_dict)
embeddings_list = list(embeddings_dict.values())
# scikit-learn needs a 2-D (n_samples, n_features) array, so flatten each
# embedding; this is a no-op for embeddings that are already flat vectors
embeddings_array = np.asarray(embeddings_list).reshape(len(embeddings_list), -1)

# Create and fit the KNN model
# Never ask for more neighbors than there are samples
n_neighbors = min(5, len(video_names))  # Adjust the number of neighbors as needed
knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto')
knn.fit(embeddings_array)

# Querying the model
# Replace 'your_video_name.mov' with an actual video name from the dataset
video_name = 'your_video_name.mov'  # Update with an actual video name
video_embedding = embeddings_dict[video_name]

# Reshape to a single-row 2-D array and use KNN
video_embedding_reshaped = np.asarray(video_embedding).reshape(1, -1)
distances, indices = knn.kneighbors(video_embedding_reshaped)

# Print Neighbors and Distances
for rank, (neighbor_idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    neighbor_video_name = video_names[neighbor_idx]
    print(f"Neighbor {rank}: {neighbor_video_name}, Distance: {distance}")
