In [1]:
import cv2
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import time
import torch
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
from sklearn.model_selection import train_test_split
from torch.utils.data.dataset import Dataset
from tqdm import tqdm

METADATA_DIR = 'data/metadata'
FACES_DIR = 'faces_saved'

%matplotlib inline

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [2]:
%%time

with open('FACES_PATHS.pickle', 'rb') as handle:
    FACES_PATHS = pickle.load(handle)

with open('FACES_LABELS.pickle', 'rb') as handle:
    FACES_LABELS = pickle.load(handle)

len(FACES_PATHS), len(FACES_LABELS)

CPU times: user 124 ms, sys: 52 ms, total: 176 ms
Wall time: 174 ms


(1085181, 1085181)

In [3]:
def train_dev_test_split(faces_paths, faces_labels, split=[0.98, 0.01, 0.01]):
    VIDEO_NAMES = []
    VIDEO_LABELS = []
    for file in os.listdir(METADATA_DIR):
        df = pd.read_json(os.path.join(METADATA_DIR, file))
        video_names_file = list(map(lambda x: x.split('.')[0], df.index))
        video_labels_file = list(df['label'].values)
        VIDEO_NAMES = VIDEO_NAMES + video_names_file
        VIDEO_LABELS = VIDEO_LABELS + video_labels_file
    train_names, dev_names, train_labels, dev_labels = train_test_split(VIDEO_NAMES, VIDEO_LABELS, train_size=split[0], random_state=13)
    dev_names, test_names, dev_labels, test_labels = train_test_split(dev_names, dev_labels, train_size=split[1] / (split[1] + split[2]), random_state=13)
#     print(len(train_names), len(train_labels), len(dev_names), len(dev_labels), len(test_names), len(test_labels))
    train_faces, train_target = [], []
    dev_faces, dev_target = [], []
    test_faces, test_target = [], []
    for i in tqdm(range(len(faces_paths))):
        name = faces_paths[i].split('/')[-2]
        if name in train_names:
            train_faces.append(faces_paths[i])
            train_target.append(faces_labels[i])
        elif name in dev_names:
            dev_faces.append(faces_paths[i])
            dev_target.append(faces_labels[i])
        elif name in test_names:
            test_faces.append(faces_paths[i])
            test_target.append(faces_labels[i])
    return train_faces, dev_faces, test_faces, train_target, dev_target, test_target

train_faces, dev_faces, test_faces, train_target, dev_target, test_target = train_dev_test_split(FACES_PATHS, FACES_LABELS)
len(train_faces), len(dev_faces), len(test_faces), len(train_target), len(dev_target), len(test_target)

100%|██████████| 1085181/1085181 [30:07<00:00, 600.34it/s]


(1063586, 10792, 10803, 1063586, 10792, 10803)

In [4]:
with open('train_faces.pickle', 'wb') as handle:
    pickle.dump(train_faces, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('dev_faces.pickle', 'wb') as handle:
    pickle.dump(dev_faces, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('test_faces.pickle', 'wb') as handle:
    pickle.dump(test_faces, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('train_target.pickle', 'wb') as handle:
    pickle.dump(train_target, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('dev_target.pickle', 'wb') as handle:
    pickle.dump(dev_target, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('test_target.pickle', 'wb') as handle:
    pickle.dump(test_target, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
print('TRAIN')
print('Number of fake faces: {}/{} ({:.2f}%)'.format(sum(train_target), len(train_target), np.mean(train_target) * 100))
print('Number of real faces: {}/{} ({:.2f}%)'.format(len(train_target) - sum(train_target), len(train_target), (1 - np.mean(train_target)) * 100))
print('DEV')
print('Number of fake faces: {}/{} ({:.2f}%)'.format(sum(dev_target), len(dev_target), np.mean(dev_target) * 100))
print('Number of real faces: {}/{} ({:.2f}%)'.format(len(dev_target) - sum(dev_target), len(dev_target), (1 - np.mean(dev_target)) * 100))
print('TEST')
print('Number of fake faces: {}/{} ({:.2f}%)'.format(sum(test_target), len(test_target), np.mean(test_target) * 100))
print('Number of real faces: {}/{} ({:.2f}%)'.format(len(test_target) - sum(test_target), len(test_target), (1 - np.mean(test_target)) * 100))

TRAIN
Number of fake faces: 888206/1063586 (83.51%)
Number of real faces: 175380/1063586 (16.49%)
DEV
Number of fake faces: 8860/10792 (82.10%)
Number of real faces: 1932/10792 (17.90%)
TEST
Number of fake faces: 9183/10803 (85.00%)
Number of real faces: 1620/10803 (15.00%)
