In [1]:
from scipy.io import loadmat
from pathlib import Path
import random
import pandas as pd
import stone

# Root folders: where the dataset is read from and where the generated files are written
dataset_folderpath = Path("/dataset")
output_folderpath = Path("/output")

# Seed the `random` module once, up front: frames are sampled with `random.sample` further down,
# and without a fixed seed a different set of frames would be selected on every run
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Input locations inside the dataset folder
split_filepath = dataset_folderpath / "YouTubeFaces" / "meta_data" / "meta_and_splits.mat"
videos_folderpath = dataset_folderpath / "YouTubeFaces" / "frame_images_DB"
gender_metadata_folder = dataset_folderpath / "Additional_Labels"

# Location of the exported metadata table
metadata_output_filepath = output_folderpath / "metadata.csv"

In [3]:
%%sh
# List the files available in the additional labels folder of the dataset
ls /dataset/Additional_Labels

females.txt
males.txt


In [4]:
"""
Read the triplets describing whether two videos feature the same face or not from the corresponding MatLab file.
According to the dataset doc:
'The Splits is a data structure dividing the data set to 10 independent splits.
Each triplet in the Splits is in the format of (index1, index2, is_same_person), where index1 and index2 are the indices in the mat_names structure.
All together 5000 pairs divided equaly to 10 independent splits, with 2500 same pairs and 2500 not-same pairs.'
"""
def read_triplets_from_file(mat_filepath:"str | Path") -> list[tuple[str, str, int]]:
    """
    Read the (video_1, video_2, is_same_person) triplets stored in a MATLAB file.

    The 'Splits' entry of the file holds 1-based indices into its 'video_names' entry;
    every index is resolved to the name of the video it points to.

    Args:
        mat_filepath: Location of the '.mat' file, as a string or a `Path` (forwarded to `loadmat`).

    Returns:
        One (video_name_1, video_name_2, is_same_person) tuple per pair, in the order of the file.
        The values are converted to native Python `str` and `int` objects.
    """
    # Read the MATLAB file containing the list of splits
    data_dict = loadmat(mat_filepath)
    video_names = data_dict['video_names']

    def name_of(matlab_index) -> str:
        # The file was generated in MATLAB where indexes start at 1, whereas they begin at 0 in Python.
        # Converting to `int` first keeps the subtraction out of fixed-width NumPy integer arithmetic.
        return str(video_names[int(matlab_index) - 1][0][0])

    result = []
    for split_list in data_dict['Splits']:
        # `split_list` holds one row of indexes per triplet component: transpose it to iterate over triplets
        for video_index_1, video_index_2, is_same_person in zip(*split_list):
            result.append((name_of(video_index_1), name_of(video_index_2), int(is_same_person)))

    return result

# Load every pair of videos listed in the splits file
triplets = read_triplets_from_file(split_filepath)
# Show the first and the last five triplets as a quick sanity check
triplets[:5], triplets[-5:]

([('Sadie_Frost/1', 'Sadie_Frost/5', 1),
  ('Saied_Hadi_al_Mudarissi/0', 'Saied_Hadi_al_Mudarissi/2', 1),
  ('Lucio_Stanca/3', 'Lucio_Stanca/4', 1),
  ('Mary_Carey/0', 'Mary_Carey/4', 1),
  ('Liu_Ye/0', 'Liu_Ye/5', 1)],
 [('Chris_Rock/1', 'Einars_Repse/4', 0),
  ('Beatriz_Merino/2', 'Tim_Howard/0', 0),
  ('Frank_Caliendo/1', 'Paul_Reiser/2', 0),
  ('Bertie_Ahern/1', 'Festus_Mogae/1', 0),
  ('Ben_Curtis/0', 'Hank_Stram/4', 0)])

In [5]:
from tqdm import tqdm

"""
For a given video, choose randomly one or more images (default=1) from that video.
Also extract the position of the face in the chosen images.
In the YouTubeFaces dataset, this position does not need to be predicted as it is given in the "{celeb_name}.labeled_faces.txt" file.
Store this information in a dictionary, along with the name of the subject.
"""
def choose_image_and_find_face_location(video_name:str, nb_images:int=1) -> list[dict]:
    """
    Randomly pick frames of a video and look up the annotated face position of each of them.

    Args:
        video_name: Folder of the video relative to `videos_folderpath`; its first path component
            is used as the name of the subject.
        nb_images: Number of distinct frames to sample from the video.

    Returns:
        One dictionary per sampled frame with the keys "x", "y", "width" and "height" (read from
        the "{celeb_name}.labeled_faces.txt" file), plus "filepath" and "subject_name".

    Raises:
        ValueError: If the video folder holds fewer than `nb_images` entries (raised by `random.sample`).
        KeyError: If a sampled frame has no entry in the annotation file.
    """
    # Choose the images in the video. `iterdir` yields entries in arbitrary order, so they are sorted
    # first: for a given state of the `random` module the same frames are then always selected.
    video_folderpath = videos_folderpath / video_name
    all_images = sorted(video_folderpath.iterdir())
    chosen_images = random.sample(all_images, nb_images)

    # Load the face annotations of the subject, indexed by image key
    celeb_name = video_name.split('/')[0]
    face_location_filepath = videos_folderpath / f"{celeb_name}.labeled_faces.txt"
    df = pd.read_csv(face_location_filepath, sep=',', header=None, index_col=0, names=["key", "x", "y", "width", "height"], usecols=[0,2,3,4,5])

    # Format the result as a list of dictionaries
    result = []
    for img_filepath in chosen_images:
        # The annotation file identifies an image by its path relative to the videos folder, written with backslashes
        img_key = "\\".join(img_filepath.relative_to(videos_folderpath).parts)
        face_position = df.loc[img_key].to_dict()
        face_position["filepath"] = img_filepath
        face_position["subject_name"] = celeb_name
        result.append(face_position)

    return result

"""
For each triplet:
- Select a single image to represent each of the two videos and find its filepath
- Find the position of the face of the subject in each image.
- Keep track of the name of the subject
"""
def expand_triplets(triplet_list:list[tuple]) -> list[tuple[dict, dict, int]]:
    """
    Replace both videos of every triplet by one sampled frame and its face information.

    Args:
        triplet_list: (video_name_1, video_name_2, label) tuples.

    Returns:
        One (info_1, info_2, label) tuple per input triplet, in the same order, where each `info`
        is a dictionary produced by `choose_image_and_find_face_location`.
    """
    new_triplet_list = []
    for video_name_1, video_name_2, label in tqdm(triplet_list):
        if video_name_1 != video_name_2:
            info_1 = choose_image_and_find_face_location(video_name_1, 1)[0]
            info_2 = choose_image_and_find_face_location(video_name_2, 1)[0]
        else:
            # Both sides point to the same video: sample the two images at once so that they cannot be identical
            info_1, info_2 = choose_image_and_find_face_location(video_name_1, 2)
        new_triplet_list.append((info_1, info_2, label))

    return new_triplet_list
        
# Sample one frame per video for every pair and attach the face position and subject name
new_triplets = expand_triplets(triplets)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:35<00:00, 52.11it/s]


In [6]:
# Format the data as a Pandas Dataframe so it can be exported easily to a '.csv' file and for ease of analysis.
# Each pair becomes one flat record: the keys of the first image get the suffix "_1", the keys of
# the second image the suffix "_2", and the label of the pair is stored under "label".
records = []
for info_1, info_2, label in new_triplets:
    record = {f"{key}_1": value for key, value in info_1.items()}
    record.update({f"{key}_2": value for key, value in info_2.items()})
    record["label"] = label
    records.append(record)

# Build the table in a single call from the list of records, then export it
metadata_df = pd.DataFrame(records)
metadata_df.to_csv(metadata_output_filepath, index=False)
metadata_df

Unnamed: 0,x_1,y_1,width_1,height_1,filepath_1,subject_name_1,x_2,y_2,width_2,height_2,filepath_2,subject_name_2,label
0,152,147,124,124,/dataset/YouTubeFaces/frame_images_DB/Sadie_Fr...,Sadie_Frost,159,50,53,53,/dataset/YouTubeFaces/frame_images_DB/Sadie_Fr...,Sadie_Frost,1
1,174,89,56,56,/dataset/YouTubeFaces/frame_images_DB/Saied_Ha...,Saied_Hadi_al_Mudarissi,174,86,54,54,/dataset/YouTubeFaces/frame_images_DB/Saied_Ha...,Saied_Hadi_al_Mudarissi,1
2,246,152,120,120,/dataset/YouTubeFaces/frame_images_DB/Lucio_St...,Lucio_Stanca,187,97,63,63,/dataset/YouTubeFaces/frame_images_DB/Lucio_St...,Lucio_Stanca,1
3,228,93,115,115,/dataset/YouTubeFaces/frame_images_DB/Mary_Car...,Mary_Carey,160,137,153,153,/dataset/YouTubeFaces/frame_images_DB/Mary_Car...,Mary_Carey,1
4,242,140,112,112,/dataset/YouTubeFaces/frame_images_DB/Liu_Ye/0...,Liu_Ye,142,97,106,106,/dataset/YouTubeFaces/frame_images_DB/Liu_Ye/5...,Liu_Ye,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,173,102,56,56,/dataset/YouTubeFaces/frame_images_DB/Chris_Ro...,Chris_Rock,212,92,103,103,/dataset/YouTubeFaces/frame_images_DB/Einars_R...,Einars_Repse,0
4996,187,114,61,61,/dataset/YouTubeFaces/frame_images_DB/Beatriz_...,Beatriz_Merino,208,80,67,67,/dataset/YouTubeFaces/frame_images_DB/Tim_Howa...,Tim_Howard,0
4997,135,53,47,47,/dataset/YouTubeFaces/frame_images_DB/Frank_Ca...,Frank_Caliendo,158,87,76,76,/dataset/YouTubeFaces/frame_images_DB/Paul_Rei...,Paul_Reiser,0
4998,131,72,70,70,/dataset/YouTubeFaces/frame_images_DB/Bertie_A...,Bertie_Ahern,172,64,29,29,/dataset/YouTubeFaces/frame_images_DB/Festus_M...,Festus_Mogae,0


In [7]:
def parse_gender_file(gender_filepath:Path) -> set:
    """
    Collect the subject names listed in a gender label file.

    Every line of the file is cut at its last underscore and the part before it is kept as the
    name of the subject. A line that contains no underscore contributes an empty name.

    Args:
        gender_filepath: Text file with one entry per line.

    Returns:
        The set of distinct names found in the file.
    """
    # `rpartition` splits on the last underscore only; its first element is empty when there is none
    return {entry.rpartition("_")[0] for entry in gender_filepath.read_text().splitlines()}

def compute_gender_lists(gender_folderpath:Path) -> tuple[set, set]:
    """
    Parse the "females.txt" and "males.txt" files found in a folder.

    Args:
        gender_folderpath: Folder containing both label files.

    Returns:
        The set of names read from "females.txt" followed by the set read from "males.txt".
    """
    # The order of the file names fixes the order of the returned sets
    return tuple(
        parse_gender_file(gender_folderpath / label_filename)
        for label_filename in ("females.txt", "males.txt")
    )

def find_gender(name:str) -> str:
    """
    Look up the gender label of a subject in the global `female_set` and `male_set`.

    Returns "F" or "M" depending on the set containing the name, and "N/A" when the name is
    in neither of them.
    """
    # The female set is checked first, so it takes precedence if a name is present in both sets
    if name in female_set:
        return "F"
    return "M" if name in male_set else "N/A"

# Build the two name sets once; `find_gender` reads them as globals
female_set, male_set = compute_gender_lists(gender_metadata_folder)
# Label the subject of each image of the pair with "F", "M" or "N/A"
metadata_df["gender_1"] = metadata_df["subject_name_1"].apply(find_gender)
metadata_df["gender_2"] = metadata_df["subject_name_2"].apply(find_gender)
# Write the file again so that it includes the two gender columns
metadata_df.to_csv(metadata_output_filepath, index=False)
metadata_df

Unnamed: 0,x_1,y_1,width_1,height_1,filepath_1,subject_name_1,x_2,y_2,width_2,height_2,filepath_2,subject_name_2,label,gender_1,gender_2
0,152,147,124,124,/dataset/YouTubeFaces/frame_images_DB/Sadie_Fr...,Sadie_Frost,159,50,53,53,/dataset/YouTubeFaces/frame_images_DB/Sadie_Fr...,Sadie_Frost,1,F,F
1,174,89,56,56,/dataset/YouTubeFaces/frame_images_DB/Saied_Ha...,Saied_Hadi_al_Mudarissi,174,86,54,54,/dataset/YouTubeFaces/frame_images_DB/Saied_Ha...,Saied_Hadi_al_Mudarissi,1,M,M
2,246,152,120,120,/dataset/YouTubeFaces/frame_images_DB/Lucio_St...,Lucio_Stanca,187,97,63,63,/dataset/YouTubeFaces/frame_images_DB/Lucio_St...,Lucio_Stanca,1,M,M
3,228,93,115,115,/dataset/YouTubeFaces/frame_images_DB/Mary_Car...,Mary_Carey,160,137,153,153,/dataset/YouTubeFaces/frame_images_DB/Mary_Car...,Mary_Carey,1,F,F
4,242,140,112,112,/dataset/YouTubeFaces/frame_images_DB/Liu_Ye/0...,Liu_Ye,142,97,106,106,/dataset/YouTubeFaces/frame_images_DB/Liu_Ye/5...,Liu_Ye,1,M,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,173,102,56,56,/dataset/YouTubeFaces/frame_images_DB/Chris_Ro...,Chris_Rock,212,92,103,103,/dataset/YouTubeFaces/frame_images_DB/Einars_R...,Einars_Repse,0,M,M
4996,187,114,61,61,/dataset/YouTubeFaces/frame_images_DB/Beatriz_...,Beatriz_Merino,208,80,67,67,/dataset/YouTubeFaces/frame_images_DB/Tim_Howa...,Tim_Howard,0,F,M
4997,135,53,47,47,/dataset/YouTubeFaces/frame_images_DB/Frank_Ca...,Frank_Caliendo,158,87,76,76,/dataset/YouTubeFaces/frame_images_DB/Paul_Rei...,Paul_Reiser,0,,M
4998,131,72,70,70,/dataset/YouTubeFaces/frame_images_DB/Bertie_A...,Bertie_Ahern,172,64,29,29,/dataset/YouTubeFaces/frame_images_DB/Festus_M...,Festus_Mogae,0,M,M


In [9]:
def predict_skin_tone(img_filepath:Path):
    """
    Predict the skin tone for a frame, using its cropped/aligned counterpart.

    The aligned image is located by replacing "frame_images_DB" with "aligned_images_DB" in the
    path of the frame and by prefixing its file name with "aligned_detect_".

    Args:
        img_filepath: Path of the original frame.

    Returns:
        The "skin_tone" entry of the first element of "faces" in the result of `stone.process`.
    """
    # Derive the location of the cropped/aligned image from the original filepath
    aligned_location = str(img_filepath).replace("frame_images_DB", "aligned_images_DB")
    folder, separator, filename = aligned_location.rpartition("/")
    aligned_filepath = Path(folder + separator + "aligned_detect_" + filename)

    # Run the prediction on the cropped/aligned image and keep the first reported face
    detected_faces = stone.process(aligned_filepath, image_type="color")["faces"]
    return detected_faces[0]["skin_tone"]

# Register `progress_apply` on pandas objects so that the predictions below show a progress bar
tqdm.pandas()
# Predict the skin tone for both images of every pair
metadata_df["skin_tone_1"] = metadata_df["filepath_1"].progress_apply(predict_skin_tone)
metadata_df["skin_tone_2"] = metadata_df["filepath_2"].progress_apply(predict_skin_tone)
# Write the file again so that it includes the two skin tone columns
metadata_df.to_csv(metadata_output_filepath, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:45<00:00, 17.52it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:44<00:00, 17.57it/s]


In [11]:
# Export the current state of the metadata table to the '.csv' file
metadata_df.to_csv(metadata_output_filepath, index=False)