# Linear Autoencoder with Mask R-CNN Data

In [31]:
import cv2
import json
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import shutil
import torch
from albumentations import Compose, PadIfNeeded
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms.functional import to_tensor

In [29]:
import sys
sys.path.append(os.path.join('..', '..', 'models', 'vit', 'efficient-vit', 'transforms'))
from albu import IsotropicResize

In [2]:
# Select the compute device once, then report which one was picked
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
    print("CUDA unavailable :(")
else:
    print("CUDA is available!")
    print(f"Current device: {torch.cuda.get_device_name(device)}")

CUDA is available!
Current device: NVIDIA GeForce RTX 3060 Ti


## Data Pre-Processing

In [3]:
# Read the metadata for this DFDC chunk (video filename -> label / split / original video)
with open(
    os.path.join('..', '..', 'dfdc_train_part_45', 'metadata.json'),
    mode='r',
    encoding='utf-8',
) as f:
    metadata = json.loads(f.read())

display(metadata)

{'gthvvygfcj.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'lxeqbyddvt.mp4'},
 'yunqitmhjo.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'hbrgvmrtnn.mp4'},
 'mwizcjywkd.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'stdavraahk.mp4'},
 'ymswdhnnyp.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'gwzttezkpv.mp4'},
 'lsfsintfky.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'xrtmepdfyh.mp4'},
 'bfkfrtmiub.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'bpkpvrqduq.mp4'},
 'ryuocfdklj.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'rohuegtohf.mp4'},
 'jrtgsrmvfu.mp4': {'label': 'REAL', 'split': 'train'},
 'xuszvuvabr.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'lauifordgs.mp4'},
 'tltyusajlj.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'kdhfvtnpik.mp4'},
 'ipouzwmwit.mp4': {'label': 'FAKE',
  'split': 'train',
  'original': 'icbzbcwlgf.mp4'},
 'dbixwxtuxs.mp4': {'label': 'REAL', 'split'

In this notebook, we'll be using the output from the Mask R-CNN instead of the raw data itself. A challenge is that because of the masking, all of the images are different sizes. To assess feasibility, we need to see what the distribution of those
sizes is like. From there we can configure a standard size to resize all of them to, with padding.

In [39]:
# Root of the Mask R-CNN output and the folder holding the un-rescaled masked frames
data_root = os.path.join("..", "..", "masked_frames")
data_originals = os.path.join(data_root, "originals")

# One folder per (label, split) pair, named "<label>_<split>"
real_train, fake_train, real_val, fake_val, real_test, fake_test = (
    os.path.join(data_originals, f"{label}_{split}")
    for split in ("train", "val", "test")
    for label in ("real", "fake")
)

In [7]:
# Go through all of the folders in each and quantify their sizes

def compute_frame_sizes(real_dir, fake_dir):
    """Collect the pixel height and width of every frame under two label directories.

    Both directories are walked the same way: each sub-directory is treated as one
    video's folder and every file inside it is loaded with ``cv2.imread``.

    Args:
        real_dir: Directory containing one sub-directory of frames per real video.
        fake_dir: Directory containing one sub-directory of frames per fake video.

    Returns:
        A ``(heights, widths)`` tuple of equal-length lists with one entry per readable
        frame; frames from ``real_dir`` come first, then frames from ``fake_dir``.
    """
    import warnings  # local import: only needed to report unreadable files

    heights = []
    widths = []
    for label_dir in (real_dir, fake_dir):
        for video_name in os.listdir(label_dir):
            video_dir = os.path.join(label_dir, video_name)
            # Stray files next to the video folders (e.g. OS metadata) are not videos
            if not os.path.isdir(video_dir):
                continue
            for frame_name in os.listdir(video_dir):
                frame_path = os.path.join(video_dir, frame_name)
                img = cv2.imread(frame_path)
                # cv2.imread signals an unreadable/non-image file by returning None
                if img is None:
                    warnings.warn(f"Skipping unreadable frame: {frame_path}")
                    continue
                # cv2 arrays are (height, width, channels)
                heights.append(img.shape[0])
                widths.append(img.shape[1])

    return heights, widths

In [19]:
# Frame sizes across the whole training split (real + fake)
heights, widths = compute_frame_sizes(real_dir=real_train, fake_dir=fake_train)

In [20]:
# NOTE: Image sizes are usually quoted as width x height (e.g. 1920x1080), but cv2 arrays are indexed
# height x width (so 1080x1920)
train_pd = pd.DataFrame({'Heights': heights, 'Widths': widths})
print("Training Set Stats:", train_pd.describe(), sep="\n")

Training Set Stats:
           Heights       Widths
count  9006.000000  9006.000000
mean    965.184099   602.677437
std     133.848509   228.361786
min     294.000000   165.000000
25%     921.000000   397.000000
50%     988.000000   642.000000
75%    1037.000000   757.000000
max    1685.000000  1590.000000


In [40]:
# Frame sizes across the whole validation split (real + fake)
heights_v, widths_v = compute_frame_sizes(real_dir=real_val, fake_dir=fake_val)

In [41]:
# Summary statistics of frame heights/widths for the validation split
val_pd = pd.DataFrame({'Heights': heights_v, 'Widths': widths_v})
print("Validation Set Stats:", val_pd.describe(), sep="\n")

Validation Set Stats:
           Heights       Widths
count  1500.000000  1500.000000
mean    948.962000   556.397333
std     154.168429   222.555416
min     637.000000   168.000000
25%     902.000000   371.250000
50%     978.000000   608.000000
75%    1025.000000   708.000000
max    1623.000000  1086.000000


In [21]:
# Frame sizes across the whole test split (real + fake)
heights_t, widths_t = compute_frame_sizes(real_dir=real_test, fake_dir=fake_test)

In [22]:
# Summary statistics of frame heights/widths for the test split
test_pd = pd.DataFrame({'Heights': heights_t, 'Widths': widths_t})
print("Test Set Stats:", test_pd.describe(), sep="\n")

Test Set Stats:
           Heights       Widths
count  3003.000000  3003.000000
mean    962.570763   602.338661
std     126.916398   231.539521
min     306.000000   166.000000
25%     926.000000   405.000000
50%     990.000000   645.000000
75%    1042.000000   760.000000
max    1238.000000  1401.000000


From these stats, it seems that most of the images are laid out in portrait orientation, with height greater than width. This makes sense since we ran Mask R-CNN on people, who tend to be taller than they are wide in frame. In order to stack images for batching, we need all of them to have the same dimensions. Thus we can use isotropic resizing to scale the images to the median height of the training set. The isotropic resize function scales based on the max side length, so we will
pass the median height there. Width should already be lower in most cases. Then, we will use padding to make the image a square so that everything is a uniform size.

In [55]:
# Side length (pixels) of the square every frame is resized and padded to: the median training-set height
TARGET_SIDE = 988

rescale_transform = Compose([
    # Aspect-preserving resize driven by the longer side (see the IsotropicResize helper imported from albu)
    IsotropicResize(max_side=TARGET_SIDE, interpolation_down=cv2.INTER_AREA, interpolation_up=cv2.INTER_CUBIC),
    # PadIfNeeded defaults both min_height and min_width to 1024, so both must be set explicitly;
    # passing only min_width would pad the height up to 1024 and produce non-square output
    PadIfNeeded(min_height=TARGET_SIDE, min_width=TARGET_SIDE, border_mode=cv2.BORDER_CONSTANT),
])

NOTE: The following cell resizes the images and stores them in a directory. Only run it if those files need to be regenerated.

In [57]:
# # Purge the rescaled folder each time
# data_rescaled = os.path.join(data_root, 'rescaled')
# if os.path.isdir(data_rescaled):
#     shutil.rmtree(data_rescaled)

# # Now create the folder and setup its subdirectories
# os.mkdir(data_rescaled)
# rescale_train = os.path.join(data_rescaled, 'train')
# rescale_val = os.path.join(data_rescaled, 'val')
# rescale_test = os.path.join(data_rescaled, 'test')

# subdirs = {
#     rescale_train: [real_train, fake_train],
#     rescale_val: [real_val, fake_val],
#     rescale_test: [real_test, fake_test],
# }

# for subdir in subdirs.keys():
#     os.mkdir(subdir)


# # Each subdirectory corresponds to a group - train, validation, or test. Each of those has two original folders (fake
# # and real) to absorb. We will go through the 2 original folders for each subdirectory and store all the images
# # together for ease.
# for subdir in subdirs.keys():  # train, val, test
#     for label_group in subdirs[subdir]:  # real, fake
#         for img_group in os.listdir(label_group):  # video id
#             for img in os.listdir(os.path.join(label_group, img_group)):
#                 rescaled = rescale_transform(image=cv2.imread(os.path.join(label_group, img_group, img)))['image']
                
#                 frame_num = img.split('.')[0][5:]  # Cut by extension to get frame# and then skip 'frame'
#                 new_name = img_group + '_' + frame_num + '.jpg'
#                 cv2.imwrite(os.path.join(subdir, new_name), rescaled)

## Dataset Object

In [None]:
class FrameDataset(Dataset):
    """Dataset over frame images stored flat in a single directory.

    Frame files are expected to be named ``<video id>_<frame number>.<ext>``. Every such
    file found in ``img_dir`` becomes one item, ordered by video id and then frame number
    so that indexing is reproducible between runs.

    Each item is a ``(video id, image tensor, label)`` tuple where the label is 0 for a
    video marked ``'REAL'`` in the metadata and 1 otherwise.
    """

    # File extensions treated as frame images; anything else in img_dir is ignored
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, metadata: dict, img_dir: str, transform=None, scale: float = 0.1) -> None:
        """Index the frames in ``img_dir`` and resolve a label for each one.

        Args:
            metadata: Mapping from video name to a dict with a ``'label'`` entry. Keys may
                be the bare video id or the id with a ``.mp4`` suffix.
            img_dir: Directory holding the frame images.
            transform: Optional callable applied to each loaded image, invoked
                albumentations-style as ``transform(image=img)['image']``.
            scale: Factor applied to both image axes before conversion to a tensor.

        Raises:
            KeyError: If a video id found in ``img_dir`` has no entry in ``metadata``.
        """
        super().__init__()
        self.img_dir = img_dir
        self.transform = transform
        self.scale = scale

        # Index the frames that actually exist instead of assuming a fixed set of frame numbers
        entries = []
        for filename in os.listdir(img_dir):
            stem, ext = os.path.splitext(filename)
            if ext.lower() not in self._IMAGE_EXTENSIONS or '_' not in stem:
                continue
            vid_name, frame_num = stem.rsplit('_', 1)
            entries.append((vid_name, self._frame_sort_key(frame_num), filename))
        entries.sort()

        self.id = []
        self.frames = []
        self.labels = []
        label_cache = {}
        for vid_name, _, filename in entries:
            if vid_name not in label_cache:
                label_cache[vid_name] = self._label_for(metadata, vid_name)
            self.id.append(vid_name)
            self.frames.append(filename)
            self.labels.append(label_cache[vid_name])

    @staticmethod
    def _frame_sort_key(frame_num: str):
        """Order numeric frame numbers numerically, ahead of any non-numeric suffixes."""
        return (0, int(frame_num)) if frame_num.isdigit() else (1, frame_num)

    @staticmethod
    def _label_for(metadata: dict, vid_name: str) -> int:
        """Return 0 for a REAL video and 1 otherwise, accepting keys with or without '.mp4'."""
        for key in (vid_name, f'{vid_name}.mp4'):
            if key in metadata:
                return 0 if metadata[key]['label'] == 'REAL' else 1
        raise KeyError(f"No metadata entry for video '{vid_name}' (also tried '{vid_name}.mp4')")

    def __len__(self) -> int:
        """Number of indexed frames."""
        return len(self.labels)

    def __getitem__(self, index):
        """Load one frame and return ``(video id, image tensor, label)``.

        Raises:
            FileNotFoundError: If the frame file cannot be read as an image.
        """
        frame_path = os.path.join(self.img_dir, self.frames[index])
        img = cv2.imread(frame_path)
        # cv2.imread returns None rather than raising when the file is missing or unreadable
        if img is None:
            raise FileNotFoundError(f'Could not read frame image: {frame_path}')

        if self.transform is not None:
            img = self.transform(image=img)['image']

        # cv2 always yields (height, width, channels). When an image is taller than it is wide,
        # swap the two spatial axes (a transpose, not a rotation) so the first axis is never
        # the longer one and all items share the same layout.
        if img.shape[0] > img.shape[1]:
            img = img.swapaxes(0, 1)

        img = cv2.resize(img, (0, 0), fx=self.scale, fy=self.scale, interpolation=cv2.INTER_AREA)
        return self.id[index], to_tensor(img), self.labels[index]

In [26]:
# Spot-check the array shape cv2 reports for a single masked test frame
cv2.imread(os.path.join(fake_test, 'amsrzsvcij', 'frame1.jpg')).shape

(748, 354, 3)