### Setup

In [1]:
import os
# Switch the working directory to the project folder so the relative paths
# used in the rest of the notebook resolve against it.
project_root = "/content/drive/MyDrive/Colab Notebooks/MADE/Final_Project"
os.chdir(project_root)

In [2]:
import cv2
import glob

from tqdm.notebook import tqdm

from google.colab.patches import cv2_imshow

In [3]:
# Directories with one sub-folder per video (relative to the working directory).
train_frames_dir = "data/frames/train/"
val_frames_dir = "data/frames/validation/"

# Print how many entries each split directory contains (train first, then validation).
for frames_dir in (train_frames_dir, val_frames_dir):
  print(len(os.listdir(frames_dir)))

84
21


### Utils

In [4]:
# Frontal-face Haar cascade loaded from the cascade directory exposed by cv2.data.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# CascadeClassifier does not raise when the XML cannot be loaded; it just stays
# empty and the failure would only surface later inside detectMultiScale.
# Fail here instead, with a message that points at the actual cause.
if face_cascade.empty():
  raise IOError("Could not load 'haarcascade_frontalface_default.xml' from cv2.data.haarcascades")

In [5]:
def get_face_bbox(image_path):
  """Detect the largest frontal face in an image and return its bounding box.

  Args:
    image_path: Path to an image file to be read with `cv2.imread`.

  Returns:
    A tuple `(x1, y1, x2, y2)` of plain Python ints with the top-left and
    bottom-right corners of the largest detected face (by area), or `None`
    when the image cannot be read or no face is detected.
  """
  img = cv2.imread(image_path)
  # cv2.imread reports a missing/unreadable file by returning None rather than
  # raising; without this guard cvtColor fails with an opaque cv2.error.
  if img is None:
    return None

  gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  # Positional arguments are scaleFactor=1.3 and minNeighbors=5.
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)

  if len(faces) == 0:
    return None

  # Each detection is (x, y, w, h); keep the one with the largest area.
  # max() returns the first maximal element, so the earliest detection still
  # wins on ties. Convert to built-in ints so callers get plain Python values.
  x, y, w, h = (int(v) for v in max(faces, key=lambda f: f[2] * f[3]))

  return x, y, x + w, y + h

In [6]:
# image_path = glob.glob(train_frames_dir + "*/*.jpg")[0]
# image_path

In [7]:
# img = cv2.imread(image_path)
# bbox = get_face_bbox(image_path)
# if bbox:
#   x1, y1, x2, y2 = bbox
# else:
#   print("No face")

In [8]:
# img = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)
# cv2_imshow(img)

In [9]:
# cropped_face = img[y1:y2, x1:x2]
# cv2_imshow(cropped_face)

### Main

In [10]:
import json

In [11]:
def update_list_of_items(filepath, new_item):
  """Append `new_item` to the JSON list stored at `filepath` and write it back.

  A missing or empty file is treated as an empty list, so the very first call
  creates the file instead of failing.

  Args:
    filepath: Path of the JSON file holding a list.
    new_item: JSON-serialisable value to append to that list.

  Raises:
    json.JSONDecodeError: If the file has non-empty content that is not valid JSON.
  """
  try:
    with open(filepath, "r") as f:
      content = f.read()
  except FileNotFoundError:
    # First run: nothing has been recorded yet.
    content = ""

  list_of_items = json.loads(content) if content.strip() else []
  list_of_items.append(new_item)

  with open(filepath, "w") as f:
    json.dump(list_of_items, f)

  print(f"Updated '{filepath}'.\n")

In [12]:
# Path of the JSON file listing the train sub-folders that have already been
# processed; it is read below and appended to after each sub-folder finishes.
train_already_proccessed_file = "notebooks/train_already_proccessed_sub_folders"

In [13]:
# Load the list of train sub-folders recorded as already processed.
with open(train_already_proccessed_file, "r") as checkpoint_file:
  train_already_proccessed_sub_folders = json.load(checkpoint_file)

len(train_already_proccessed_sub_folders)

84

In [14]:
# train frames
# For every sub-folder not yet recorded as processed, detect the largest face
# in each frame and store its bbox as "x1 y1 x2 y2" in a .txt file next to the
# frame. Frames without a detected face are collected in face_not_found_frames.

face_not_found_frames = []

# List the directory once instead of on every iteration.
train_sub_folders = sorted(os.listdir(train_frames_dir))
n_train_sub_folders = len(train_sub_folders)

for i, sub_folder in enumerate(train_sub_folders):
  if sub_folder in train_already_proccessed_sub_folders:
    print(f"{ i + 1} / {n_train_sub_folders}, {sub_folder} Already processed, skipping it.")
    continue

  print(f"{ i + 1} / {n_train_sub_folders}, {sub_folder}")
  for frame in tqdm(sorted(os.listdir(os.path.join(train_frames_dir, sub_folder)))):
    # Skip bbox files written by a previous (partial) run.
    if frame.endswith(".txt"):
      continue

    frame_path = os.path.join(train_frames_dir, sub_folder, frame)
    bbox = get_face_bbox(frame_path)

    if not bbox:
      face_not_found_frames.append(frame_path)
      continue

    # Swap only the file extension. str.replace(".jpg", ".txt") leaves any
    # non-".jpg" name unchanged, so the bbox text would overwrite the frame.
    bbox_filepath = os.path.splitext(frame_path)[0] + ".txt"
    with open(bbox_filepath, "w") as fin:
      fin.write(" ".join(map(str, bbox)))

  # Record the sub-folder only after all of its frames were handled.
  update_list_of_items(filepath=train_already_proccessed_file, new_item=sub_folder)

1 / 84, subject_50_Vid_1 Already processed, skipping it.
2 / 84, subject_50_Vid_2 Already processed, skipping it.
3 / 84, subject_50_Vid_3 Already processed, skipping it.
4 / 84, subject_50_Vid_4 Already processed, skipping it.
5 / 84, subject_50_Vid_5 Already processed, skipping it.
6 / 84, subject_50_Vid_6 Already processed, skipping it.
7 / 84, subject_51_Vid_7 Already processed, skipping it.
8 / 84, subject_52_Vid_7 Already processed, skipping it.
9 / 84, subject_53_Vid_2 Already processed, skipping it.
10 / 84, subject_53_Vid_3 Already processed, skipping it.
11 / 84, subject_53_Vid_4 Already processed, skipping it.
12 / 84, subject_53_Vid_5 Already processed, skipping it.
13 / 84, subject_54_Vid_6 Already processed, skipping it.
14 / 84, subject_55_Vid_6 Already processed, skipping it.
15 / 84, subject_56_Vid_1 Already processed, skipping it.
16 / 84, subject_56_Vid_2 Already processed, skipping it.
17 / 84, subject_56_Vid_3 Already processed, skipping it.
18 / 84, subject_56_Vid

In [20]:
# Number of .jpg files found one level below the train frames directory.
train_jpg_pattern = os.path.join(train_frames_dir, "*", "*.jpg")
len(glob.glob(train_jpg_pattern))

64152

In [21]:
# Number of .txt files found one level below the train frames directory.
train_txt_pattern = os.path.join(train_frames_dir, "*", "*.txt")
len(glob.glob(train_txt_pattern))

53748

In [16]:
# Path of the JSON file listing the validation sub-folders that have already
# been processed; it is read below and appended to after each sub-folder finishes.
val_already_proccessed_file = "notebooks/val_already_proccessed_sub_folders"

In [17]:
# Load the list of validation sub-folders recorded as already processed.
with open(val_already_proccessed_file, "r") as checkpoint_file:
  val_already_proccessed_sub_folders = json.load(checkpoint_file)

len(val_already_proccessed_sub_folders)

21

In [18]:
# validation frames
# For every sub-folder not yet recorded as processed, detect the largest face
# in each frame and store its bbox as "x1 y1 x2 y2" in a .txt file next to the
# frame. Frames without a detected face are collected in face_not_found_frames.

face_not_found_frames = []

# List the directory once instead of on every iteration.
val_sub_folders = sorted(os.listdir(val_frames_dir))
n_val_sub_folders = len(val_sub_folders)

for i, sub_folder in enumerate(val_sub_folders):
  if sub_folder in val_already_proccessed_sub_folders:
    print(f"{ i + 1} / {n_val_sub_folders}, {sub_folder} Already processed, skipping it.")
    continue

  print(f"{ i + 1} / {n_val_sub_folders}, {sub_folder}")
  for frame in tqdm(sorted(os.listdir(os.path.join(val_frames_dir, sub_folder)))):
    # Skip bbox files written by a previous (partial) run.
    if frame.endswith(".txt"):
      continue

    frame_path = os.path.join(val_frames_dir, sub_folder, frame)
    bbox = get_face_bbox(frame_path)

    if not bbox:
      face_not_found_frames.append(frame_path)
      continue

    # Swap only the file extension. str.replace(".jpg", ".txt") leaves any
    # non-".jpg" name unchanged, so the bbox text would overwrite the frame.
    bbox_filepath = os.path.splitext(frame_path)[0] + ".txt"
    with open(bbox_filepath, "w") as fin:
      fin.write(" ".join(map(str, bbox)))

  # Record the sub-folder only after all of its frames were handled.
  update_list_of_items(filepath=val_already_proccessed_file, new_item=sub_folder)

1 / 21, subject_1_Vid_1 Already processed, skipping it.
2 / 21, subject_1_Vid_2 Already processed, skipping it.
3 / 21, subject_1_Vid_3 Already processed, skipping it.
4 / 21, subject_1_Vid_4 Already processed, skipping it.
5 / 21, subject_1_Vid_5 Already processed, skipping it.
6 / 21, subject_2_Vid_6 Already processed, skipping it.
7 / 21, subject_3_Vid_1 Already processed, skipping it.
8 / 21, subject_3_Vid_2 Already processed, skipping it.
9 / 21, subject_3_Vid_3 Already processed, skipping it.
10 / 21, subject_3_Vid_4 Already processed, skipping it.
11 / 21, subject_3_Vid_5 Already processed, skipping it.
12 / 21, subject_3_Vid_7 Already processed, skipping it.
13 / 21, subject_4_Vid_6 Already processed, skipping it.
14 / 21, subject_5_Vid_6 Already processed, skipping it.
15 / 21, subject_6_Vid_6 Already processed, skipping it.
16 / 21, subject_7_Vid_1 Already processed, skipping it.
17 / 21, subject_7_Vid_2 Already processed, skipping it.
18 / 21, subject_7_Vid_3 Already process

In [19]:
# Number of frames for which no bbox was returned. The validation cell above
# re-creates this list, so frames from the train loop are not counted here.
len(face_not_found_frames)

0

In [22]:
# Number of .jpg files found one level below the validation frames directory.
val_jpg_pattern = os.path.join(val_frames_dir, "*", "*.jpg")
len(glob.glob(val_jpg_pattern))

21971

In [23]:
# Number of .txt files found one level below the validation frames directory.
val_txt_pattern = os.path.join(val_frames_dir, "*", "*.txt")
len(glob.glob(val_txt_pattern))

18791