<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Pipeline/Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DINOv2 embeddings

This notebook generates and saves DINOv2 embeddings of pre-selected frames from a video dataset.

In [1]:
# Mount Google Drive into the Colab file system under /content/drive.

from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Folder name of the dataset, used to build the Drive and local paths.
dataset_dir = "Fish clips"

# Dataset identifier; prefix of the frame selection table's file name.
dataset_name = "fishClips"

# Frame selection method; part of the selection table's file name and of the
# folder the embeddings are saved to.
frame_selection_method = "motionAbsdiff_10"

# Key used to look up the embedding function in the method dictionary.
embedding_method = "default"

In [3]:
# DINOv2 version to use.
import torch

# The model is only used for inference in this notebook, so switch it to eval
# mode explicitly (nn.Module instances are in training mode by default).
# .eval() returns the module itself, so lvm is still the loaded model.
lvm = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14').eval()

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:00<00:00, 113MB/s]


In [4]:
# Transformations to use
import torchvision.transforms as T

# Channel-wise ImageNet statistics. DINOv2's reference preprocessing normalises
# inputs with these values; without it the model sees raw [0, 1] pixel values.
# NOTE: adding the normalisation changes the resulting embeddings, so
# embeddings saved with the previous pipeline must be regenerated.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = T.Compose([
    T.Resize(224),
    T.CenterCrop(224),
    T.ToTensor(),
    # Keep at most the first three channels so the three-channel normalisation
    # below also works for images that carry an extra (e.g. alpha) channel.
    T.Lambda(lambda tensor: tensor[:3]),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

In [5]:
# Copy dataset
import shutil

data_source = f"/content/drive/My Drive/UvA/M Thesis/Data/{dataset_dir}"

# Derive the local folder from the configured dataset folder instead of
# hard-coding it, so changing dataset_dir keeps every path consistent.
data_dir = f"/content/{dataset_dir}"

# Copy the folder to destination. dirs_exist_ok=True keeps the cell re-runnable:
# without it copytree raises FileExistsError when data_dir already exists.
shutil.copytree(data_source, data_dir, dirs_exist_ok=True)

'/content/Fish clips'

In [6]:
# Auxiliary functions.
import os
import numpy as np
from PIL import Image

def generate_default_embedding(frame, transform):
  '''
  Computes the default embedding of a single frame with the loaded model.

  :param: frame: the frame as a numpy array.
  :param: transform: the torchvision transforms object applied to the frame before it is passed to the model.
  :return: the embedding as a numpy array.
  '''

  pil_image = Image.fromarray(frame)
  tensor = transform(pil_image)

  # Keep at most the first three channels and add a leading batch dimension.
  batch = tensor[:3].unsqueeze(0)

  # Note: lvm is the loaded large vision model to generate the embedding.
  # Gradients are not needed for inference.
  with torch.no_grad():
    model_output = lvm(batch)

  # Take the output of the only item in the batch.
  return model_output[0].squeeze().numpy()

# Method dictionary for later ease of use: maps a method name to the function
# that generates the embedding.
embedding_methods = dict(default=generate_default_embedding)

In [7]:
# Read frame selection table.
import pandas as pd

selection_table_name = f'{dataset_name}_{frame_selection_method}.csv'

# Read from data_dir (the local copy of the dataset) rather than rebuilding the
# path from dataset_dir, so the table is always looked up where the data was
# actually copied to.
frames_df = pd.read_csv(f'{data_dir}/Selected frames/{selection_table_name}')

In [8]:
import cv2
import numpy as np
from IPython.display import clear_output

generate_embedding = embedding_methods[embedding_method]

save_dir = f'{data_dir}/Embeddings/{frame_selection_method}/{embedding_method}'
os.makedirs(save_dir, exist_ok = True)

nr_videos = len(frames_df.index)

# Collects (video file, frame index) pairs for frames that could not be read.
# A frame index of None marks a video that had no frames to embed at all.
failed_frames = []

# enumerate() provides a positional counter for the progress display, so the
# percentage stays correct even if the DataFrame index is not 0..n-1.
for video_nr, (_row_label, row) in enumerate(frames_df.iterrows(), start=1):
    video_file = row['video']

    # NOTE(review): eval() executes whatever expression is stored in the CSV.
    # It is kept because the exact format of the 'frames' column is not visible
    # here; if the column only holds plain list literals, ast.literal_eval is
    # the safer choice.
    frame_indices = eval(row['frames'])

    # Built from data_dir for consistency with save_dir above.
    video_path = f'{data_dir}/Clips/{video_file}'

    cap = cv2.VideoCapture(video_path)

    embeddings = []
    nr_failed = 0

    try:
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()

            if not ret:
                failed_frames.append((video_file, frame_idx))
                nr_failed += 1
                continue

            # Convert frame to RGB (from BGR)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            embeddings.append(generate_embedding(frame, transform))
    finally:
        # Release the capture even if generating an embedding raises.
        cap.release()

    clear_output(wait=True)
    print(f'Number of videos: {nr_videos}')
    print(f'Progress: {video_nr / nr_videos * 100:.2f}%')
    if nr_failed > 0:
        print('WARNING: failed to read frame.')

    if not embeddings:
        # np.concatenate raises ValueError on an empty list, which would abort
        # the whole run. Skip the video instead and make sure it shows up in
        # failed_frames even when its frame list was empty.
        if nr_failed == 0:
            failed_frames.append((video_file, None))
        continue

    video_embedding = np.concatenate(embeddings, axis=0)

    # Save concatenated embeddings as a NumPy array
    video_name = video_file.replace(".mp4", "")

    save_path = f'{save_dir}/{video_name}.npy'

    np.save(save_path, video_embedding)

Number of videos: 220
Progress: 100.00%


In [9]:
# Report whether any frame was recorded as unreadable during embedding.
print('Success! All frames read.' if not failed_frames
      else 'WARNING: some frames were not read.')

Success! All frames read.


In [10]:
# Copy embeddings to Drive.
drive_dir = f"/content/drive/My Drive/UvA/M Thesis/Data/{dataset_dir}/Embeddings/{frame_selection_method}/{embedding_method}"

# dirs_exist_ok=True lets the copy succeed when the target folder already
# exists on Drive (e.g. from an earlier run); without it copytree raises
# FileExistsError. Existing files with the same name are overwritten.
shutil.copytree(save_dir, drive_dir, dirs_exist_ok=True)

'/content/drive/My Drive/UvA/M Thesis/Data/Fish clips/Embeddings/motionAbsdiff_10/default'