<a href="https://colab.research.google.com/github/ZsofiaK/masterthesis/blob/main/Implementation/Auxilliary/Process_checks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Checking process correctness for some thesis elements

## Checking AK fish embeddings

In [1]:
vitg14_embedding_dir = '/content/drive/MyDrive/UvA/M Thesis/Data/AK fish/Embeddings/dinov2-vitg14-clf/448'
vitg14_reg_embedding_dir = '/content/drive/MyDrive/UvA/M Thesis/Data/AK fish/Embeddings/dinov2-vitg14-reg-clf/448'

In [2]:
# Mount Drive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import os

print('Number of videos embedded with vitg14:', len(os.listdir(vitg14_embedding_dir)))
print('Number of videos embedded with vitg14-reg:', len(os.listdir(vitg14_reg_embedding_dir)))

Number of videos embedded with vitg14: 887
Number of videos embedded with vitg14-reg: 397


In [11]:
# Check if embeddings in two directories are the same.
import numpy as np
from IPython.display import clear_output

progress = 0
nr_videos = len(os.listdir(vitg14_reg_embedding_dir))

unequal = []
not_found = []

missing_embeddings = []

for video in os.listdir(vitg14_reg_embedding_dir):
  video_vitg14_reg_dir = f'{vitg14_reg_embedding_dir}/{video}'
  video_vitg14_dir = f'{vitg14_embedding_dir}/{video}'

  progress += 1
  clear_output(wait=True)
  print(f'Progress: {progress/nr_videos * 100:.2f}%')

  if len(os.listdir(video_vitg14_reg_dir)) < 10:
    missing_embeddings.append((video, len(os.listdir(video_vitg14_reg_dir))))
    continue

  for embedding in os.listdir(video_vitg14_reg_dir):
    vitg14_reg_embedding = np.load(f'{video_vitg14_reg_dir}/{embedding}')

    try:
      vitg14_embedding = np.load(f'{video_vitg14_dir}/{embedding}')

      if not np.array_equal(vitg14_reg_embedding, vitg14_embedding):
        unequal.append(embedding)

    except:
      not_found.append(embedding)

Progress: 100.00%


In [12]:
print('Number of unequal embeddings:', len(unequal))
print('Number of not found embeddings:', len(not_found))
print('Number of incomplete embeddings:', len(missing_embeddings))

Number of unequal embeddings: 0
Number of not found embeddings: 0
Number of incomplete embeddings: 3


In [14]:
print('Incomplete embeddings in ViT-g14-reg:')
for embedding in missing_embeddings:
  print(embedding)

Incomplete embeddings in ViT-g14-reg:
('UVHZNUPH', 7)
('SZPVDMHZ', 9)
('KPKBAKGZ', 7)


In [15]:
# Check if all embeddings are there in the ViT-g14 directory:
missing_embeddings_vitg14 = []

progress = 0
nr_videos = len(os.listdir(vitg14_embedding_dir))

for video in os.listdir(vitg14_embedding_dir):
  video_dir = f'{vitg14_embedding_dir}/{video}'

  embeddings = [file for file in os.listdir(video_dir) if file.endswith('.npy')]

  if len(embeddings) < 10:
    missing_embeddings_vitg14.append((video, len(embeddings)))

  progress += 1
  clear_output(wait=True)
  print(f'Progress: {progress / nr_videos * 100:.2f}%')

Progress: 100.00%


In [17]:
print('Incomplete embeddings in ViT-g14:')

for embedding in missing_embeddings_vitg14:
  print(embedding)

print(f'Total of {sum([item[1] for item in missing_embeddings_vitg14])} embeddings missing.')

Incomplete embeddings in ViT-g14:
('UVHZNUPH', 7)
('SZPVDMHZ', 9)
('QDSYAFGA', 4)
('ISOMHLHH', 4)
('XZTDNQCJ', 7)
('BJECMPAB', 9)
Total of 40 embeddings missing.


In [22]:
# CONCLUSION: Remove vitg14-reg directory as it is the same as the other one but incomplete.
# vitg14-clf directory will be renamed to vitg14-reg-clf.
import shutil
shutil.rmtree('/content/drive/MyDrive/UvA/M Thesis/Data/AK fish/Embeddings/dinov2-vitg14-reg-clf')

In [24]:
# Check if directory has correct size.
print('Number of DINOv2-vitG14 register embeddings:',
      len(os.listdir(vitg14_reg_embedding_dir)))

Number of DINOv2-vitG14 register embeddings: 887
