<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Pipeline/Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DINOv2 embeddings

This notebook generates and saves DINOv2 embeddings of pre-selected frames from a video dataset.

In [2]:
# Mount Google Drive at /content/drive so that paths under it can be used by the rest of the notebook.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Folder name of the dataset inside the Drive data directory (used to build the data and save paths).
dataset_dir = 'AK fish'

# Dataset name used as the prefix of the frame selection table's file name.
dataset_name = 'AK-fish'

# Frame selection method; the part after the last '_' is later parsed as the number of frames per video.
frame_selection_method = 'motionAbsdiff_10'

# Entry point name passed to torch.hub.load when loading the DINOv2 model.
DINO_model = 'dinov2_vitg14_reg'

# Key into the embedding_methods dictionary that selects the feature extraction function.
feature_extraction = 'clf'

image_size = 448    # Target side length (in pixels) used when downsampling the frames; see calculate_new_dimensions.

patch_size = 14     # A characteristic of the DINOv2 model; downsampled dimensions are rounded down to multiples of it.

# Identifier used in the save paths; underscores become hyphens, e.g. 'dinov2-vitg14-reg-clf'.
embedding_method = f'{DINO_model}-{feature_extraction}'.replace('_', '-')

immediate_copy = True   # If the embeddings should immediately be copied to Drive.

skip_existing = True    # If existing embeddings should not be calculated again.

In [4]:
# Specify directory to save embeddings
import os

# The Drive root is spelled 'MyDrive', the same spelling data_dir uses, so that
# every path in the notebook refers to the mounted Drive in one consistent way.
drive_save_dir = f"/content/drive/MyDrive/UvA/M Thesis/Data/{dataset_dir}/Embeddings/{embedding_method}/{image_size}"

# Create the target folder up front; later cells write into it per video.
os.makedirs(drive_save_dir, exist_ok=True)

In [7]:
# DINOv2 version to use.
import torch

# Downloads (or reuses the cached copy of) the pretrained model named by DINO_model.
lvm = torch.hub.load('facebookresearch/dinov2', DINO_model)

# The model is only used for inference here, so switch it to evaluation mode.
# This keeps any train-only layers (e.g. dropout / stochastic depth) deterministic.
# Assigning the result avoids printing the whole model as the cell output.
lvm = lvm.eval()

Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitg14_reg4_pretrain.pth
100%|██████████| 4.23G/4.23G [00:20<00:00, 227MB/s]


In [14]:
# Location of the dataset on the mounted Drive. Nothing is copied in this cell;
# the frame selection table and the video clips are read through this path.
data_dir = f"/content/drive/MyDrive/UvA/M Thesis/Data/{dataset_dir}"

In [15]:
# Auxiliary functions.
import os
import numpy as np
from PIL import Image

def calculate_new_dimensions(frame, image_size, patch_size):
  '''
  Calculates the downsampled dimensions of images.

  The shorter side of the frame is scaled to image_size and the longer side is
  scaled by the same factor, so the aspect ratio is preserved. Both dimensions
  are then rounded down to the nearest multiple of patch_size.

  :param: frame: the frame to downsample; its shape starts with (height, width).
  :param: image_size: the downsampled length of the smaller side of the frame.
  :param: patch_size: the patch size of the LVM which will be used for embeddings.
  :return: tuple (new_width, new_height) with the downsampled dimensions.
  '''

  # Only the first two axes are needed, so frames without a channel axis work too.
  height, width = frame.shape[:2]

  # Scale so that the SHORTER side becomes image_size (as documented above).
  if width < height:
    new_width = image_size
    new_height = int(height * image_size / width)

  else:
    new_height = image_size
    new_width = int(width * image_size / height)

  # Ensure that both dimensions are multiples of the patch size (round down).
  new_width = (new_width // patch_size) * patch_size
  new_height = (new_height // patch_size) * patch_size

  return new_width, new_height

def generate_default_embedding(frame, transform):
  '''
  Produces the default embedding of a single frame.

  :param: frame: the frame as a numpy array.
  :param: transform: the torchvision transforms object with the necessary transformations.
  :return: the embedding as a numpy array.
  '''

  pil_image = Image.fromarray(frame)

  # Keep at most three entries along the first axis (presumably the colour
  # channels) and add a leading batch axis.
  batch = transform(pil_image)[:3].unsqueeze(0)

  # Note: lvm is the loaded large vision model to generate the embedding.
  # Gradients are not needed for inference.
  with torch.no_grad():
    first_output = lvm(batch)[0]

  return first_output.squeeze().numpy()

# Method dictionary for later ease of use: maps a feature extraction name to the function that computes it.
embedding_methods = {'clf' : generate_default_embedding}

In [17]:
# Read frame selection table.
import pandas as pd

# The file name follows the pattern '<dataset_name>_<frame_selection_method>.csv'.
selection_table_name = f'{dataset_name}_{frame_selection_method}.csv'

# The table's 'video' and 'frames' columns are accessed later when generating embeddings.
frames_df = pd.read_csv(f'{data_dir}/Selected frames/{selection_table_name}')

In [18]:
import ast
import torchvision.transforms as T
import cv2
import numpy as np
from IPython.display import clear_output
import shutil

# Generates one embedding per selected frame of every video in frames_df.
# Each embedding is saved as '<video_name>_<frame_idx>.npy' in the runtime and,
# if immediate_copy is enabled, copied to the matching folder on Drive.
# Frames that cannot be read are collected in failed_frames.

def _copy_to_drive(local_path, drive_path):
  '''
  Copies one saved embedding to Drive, creating the destination folder if needed.

  :param: local_path: path of the embedding file in the runtime.
  :param: drive_path: destination path of the embedding file on Drive.
  '''

  os.makedirs(os.path.dirname(drive_path), exist_ok=True)
  shutil.copy(local_path, drive_path)

# Select method of feature extraction.
generate_embedding = embedding_methods[feature_extraction]

save_dir = f'/content/Embeddings/{embedding_method}/{image_size}'
os.makedirs(save_dir, exist_ok = True)

nr_videos = len(frames_df.index)

failed_frames = []

# One resize pipeline per distinct frame shape, so that videos with a different
# resolution are not squeezed into the dimensions of the first frame seen.
# NOTE(review): no T.Normalize step is applied. The DINOv2 reference preprocessing
# normalises with the ImageNet mean/std -- confirm whether leaving it out is intended.
# It is not added here so new embeddings stay comparable with already saved ones.
transforms_by_shape = {}

# Counting with enumerate keeps the progress correct for any DataFrame index.
for video_nr, (_, row) in enumerate(frames_df.iterrows(), start=1):
  # Show progress
  clear_output(wait=True)
  print(f'Number of videos: {nr_videos}')
  print(f'Progress: {video_nr / nr_videos * 100:.2f}%')

  video_file = row['video']

  # The frame indices are stored as the text of a Python list; literal_eval parses
  # it without executing arbitrary code from the CSV.
  frame_indices = ast.literal_eval(row['frames'])

  video_name = video_file.replace(".mp4", "")

  video_path = f'{data_dir}/Clips/{video_file}'

  video_save_dir = f'{save_dir}/{video_name}'
  os.makedirs(video_save_dir, exist_ok=True)

  drive_video_dir = f'{drive_save_dir}/{video_name}'

  cap = cv2.VideoCapture(video_path)

  try:
    for frame_idx in frame_indices:
      # Destinations of the embedding in the runtime and on Drive.
      save_path = f'{video_save_dir}/{video_name}_{frame_idx}.npy'
      drive_path = f'{drive_video_dir}/{video_name}_{frame_idx}.npy'

      # Check if embedding already exists and skip if it does (optional).
      if skip_existing:

        # Skip if embedding already exists in Drive.
        if os.path.exists(drive_path):
          continue

        # Skip if already exists in runtime.
        if os.path.exists(save_path):

          # Copy to Drive if immediate copy is enabled.
          if immediate_copy:
            _copy_to_drive(save_path, drive_path)

          continue

      # From here on the embedding has to be computed. This part must run
      # regardless of skip_existing, otherwise disabling the option would
      # produce no embeddings at all.
      cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
      ret, frame = cap.read()

      if not ret:
        failed_frames.append((video_file, frame_idx))
        print('WARNING: failed to read frame.')
        continue

      # Set (or reuse) the transformations for this frame size.
      frame_shape = frame.shape[:2]
      transform = transforms_by_shape.get(frame_shape)

      if transform is None:
        new_width, new_height = calculate_new_dimensions(frame, image_size, patch_size)

        transform = T.Compose([
            T.Resize((new_height, new_width)),
            T.ToTensor()
            ])

        transforms_by_shape[frame_shape] = transform

      # Convert frame to RGB (from BGR)
      frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

      embedding = generate_embedding(frame, transform)

      # Save embedding as a NumPy array
      np.save(save_path, embedding)

      # Copy file to Drive (optional).
      if immediate_copy:
        _copy_to_drive(save_path, drive_path)

      # Clearing memory
      frame = None
      embedding = None

  finally:
    # Release the video even if embedding a frame raised an error.
    cap.release()
    cap = None

Number of videos: 887
Progress: 100.00%


In [None]:
# Report whether every selected frame could be read from its video.
if not failed_frames:
  print('Success! All frames read.')

else:
  print('WARNING: some frames were not read.')

Success! All frames read.


In [None]:
# Copy embeddings to Drive if they have not been already.
if not immediate_copy:
  # drive_save_dir is created when it is defined, so the copy has to merge into
  # the existing folder; without dirs_exist_ok copytree raises FileExistsError.
  shutil.copytree(save_dir, drive_save_dir, dirs_exist_ok=True)

else:
  print('Embeddings have already been copied to Drive.')

Embeddings have already been copied to Drive.


In [None]:
# Check if there are any missing frame embeddings
import ast

nr_frames = int(frame_selection_method.split('_')[-1])    # Correct number of embedded frames per video

# Read the table from data_dir, the same location the embeddings were generated from.
# A separate name is used so that frames_df keeps its original layout and the
# embedding cell can still be re-run after this one.
frames_by_video = pd.read_csv(f'{data_dir}/Selected frames/{selection_table_name}',
                              index_col = 'video')

for item in os.listdir(drive_save_dir):
  item_path = f'{drive_save_dir}/{item}'

  # Only the per-video folders are of interest.
  if not os.path.isdir(item_path):
    continue

  embedding_files = os.listdir(item_path)

  # Videos with the expected number of embeddings need no further inspection.
  if len(embedding_files) == nr_frames:
    continue

  print('VIDEO:', item)

  # The selected frames are stored as the text of a Python list; literal_eval
  # parses it without executing arbitrary code from the CSV.
  selected_frames = ast.literal_eval(frames_by_video.loc[f'{item}.mp4', 'frames'])

  # Frame indices recovered from the file names '<video_name>_<frame_idx>.npy'.
  found_frames = {int(file_name.split('_')[-1].replace('.npy', ''))
                  for file_name in embedding_files}

  # List every selected frame for which no embedding file was found.
  for frame_idx in selected_frames:
    if frame_idx not in found_frames:
      print(frame_idx)

  print()

VIDEO: UVHZNUPH

VIDEO: SZPVDMHZ

VIDEO: QDSYAFGA

VIDEO: ISOMHLHH

VIDEO: XZTDNQCJ

VIDEO: BJECMPAB

