In [None]:
import os, re, time
import functools
import cv2, json
import numpy as np

# Deepfake Data Preprocessor

This notebook preprocesses the training data: currently, that means creating a difference-blend-mode image file  
for every frame of each faked video, plus a final "fakerprint" image (see the code comments below). You should be able to preprocess  
Kaggle's small sample dataset in about 20 minutes. The DJV video player (https://darbyjohnston.github.io/DJV/) can play  
the JPG difference-image sequences as if they were a video (Premiere can do this too, but QuickTime cannot).

Make sure to set **DATA_LAKE** correctly below before preprocessing (and the **run_sample_data** flag, if applicable).

#### Main Function Table of Contents:
+ **summarize_data()** - Takes a cursory look at the dataset and prints out some metrics.
+ **faked_video_pairs()** - Creates list of fake/original video pairs from metadata.json files.
+ **create_fakerframes_cv2()** - Creates the actual difference and fakerprint image files.
+ **preprocess_data()** - Orchestrates calls to faked_video_pairs() and create_fakerframes_cv2() on the dataset.


In [None]:
# Note: set flag to False to switch to the real dataset
run_sample_data = True

if run_sample_data:
    # Sample-data layout: these are right if you create a data directory
    # in the repo root and unzip the kaggle sample data file there.
    DATA_LAKE = '../data'
    DATA_TRAIN_PART = 'train_sample_videos'
    ROOT_FAKER_FRAMES = f'{DATA_LAKE}/train_sample_frames'
else:
    # Real dataset. Only DATA_LAKE should need adjusting; the rest is built
    # off DATA_LAKE and the kaggle partition directory structure, with 'IDX'
    # standing in for the partition number.
    DATA_LAKE = '/Volumes/My Book/deepfake-detect-datalake'
    DATA_TRAIN_PART = 'dfdc_train_part_IDX'
    ROOT_FAKER_FRAMES = f'{DATA_LAKE}/dfdc_frames_part_IDX'

ROOT_DATA_TRAIN = f'{DATA_LAKE}/{DATA_TRAIN_PART}'

# Per-partition name/path helpers: substitute partition index i for the
# 'IDX' placeholder. The sample-data names contain no placeholder, so in
# sample mode every index maps to the same single directory.
def trainpart(i):
    return DATA_TRAIN_PART.replace('IDX', str(i))

def traindir(i):
    return ROOT_DATA_TRAIN.replace('IDX', str(i))

def fakerdir(i):
    return ROOT_FAKER_FRAMES.replace('IDX', str(i))


In [None]:
# Reports whether filename can be opened for reading. Only a missing file
# yields False; any other error raised by open() (for instance a
# PermissionError) is left to propagate to the caller.
def file_exists(filename):
    try:
        handle = open(filename)
    except FileNotFoundError:
        return False
    handle.close()
    return True

# This fcn takes just an initial glance at the video and metadata
# in the training set and prints out some informational metrics.
def summarize_data(istart, istop=None, netsum=True, verbose=False):
#{
    """Print video/metadata consistency metrics for a range of partitions.

    For every partition index in range(istart, istop) the partition's
    metadata.json is read and five counters are collected: metadata entries,
    entries whose 'split' is 'train', entries whose video file opens,
    entries labelled 'FAKE', and FAKE entries whose 'original' file opens.

    Args:
        istart: first partition index, or (when istop is None) the
            exclusive stop index of a range starting at 0.
        istop: exclusive stop index; None means "use istart as the stop".
        netsum: when True, also print the counters summed over all
            partitions visited (all zeros if the range is empty).
        verbose: when True, print the counters for each partition.

    Returns:
        None; everything is reported through print().

    A PermissionError while reading a partition is printed and the counts
    gathered so far for that partition are kept. Other errors (for example
    a missing metadata.json) are not caught here and propagate.
    """
    def _print_counts(stats, tail=""):
        # Shared four-line report used for both per-partition and net output.
        numvids, numtrain, vidsfounds, numfakes, origfound = stats
        print(f"  Numb of primary video meta-references/train split: {numvids}/{numtrain}")
        print(f"  Numb of actual primary videos files found on disk: {vidsfounds}")
        print(f"  Numb of fake/original meta-pairs: {numfakes}")
        print(f"  Numb of actual video pairs found on disk: {origfound}{tail}")

    if istop is None:
        istart, istop = 0, istart

    summation = []
    for i in range(istart, istop):
    #{
        numvids, numtrain, vidsfounds, numfakes, origfound = 0, 0, 0, 0, 0
        try:
        #{
            with open(f"{traindir(i)}/metadata.json") as jsonfile:
            #{
                metadata = json.load(jsonfile)
                for vidname, meta in metadata.items():
                #{
                    vname = f"{traindir(i)}/{vidname}"
                    numvids += 1

                    if meta['split'] == 'train': numtrain += 1

                    if file_exists(vname): vidsfounds += 1

                    if meta['label'] == 'FAKE':
                        oname = f"{traindir(i)}/{meta['original']}"
                        if file_exists(oname): origfound += 1
                        numfakes += 1
                #}
            #}
        #}
        except PermissionError as err: print("ERROR:", err)

        stats = (numvids, numtrain, vidsfounds, numfakes, origfound)

        if verbose:
            print(f"Partition: {trainpart(i)}:")
            _print_counts(stats)

        if (numvids != vidsfounds) or (numvids != numtrain) or (numfakes != origfound):
            print(f"  DISCREPENCY found in partition {trainpart(i)}")

        if netsum: summation.append(stats)
    #}

    if netsum:
    #{
        # Start from explicit zeros so an empty partition range produces an
        # all-zero summary instead of failing on an empty reduction.
        totals = (0, 0, 0, 0, 0)
        for stats in summation:
            totals = tuple(t + s for t, s in zip(totals, stats))

        print(f"\nNet dataset summary:")
        _print_counts(totals, tail="\n")
    #}
#}

# Run the summary: a verbose look at the single sample partition, or a
# net summary over partitions 0..49 of the real dataset.
if run_sample_data:
    summarize_data(1, verbose=True)
else:
#{
    summarize_data(50)

    # Partitions 18 and 35 appear slightly incomplete. C'est la vie.
    # summarize_data(18, 19, netsum=False, verbose=True)
    # summarize_data(35, 36, netsum=False, verbose=True)
#}


In [None]:
# Builds the list of usable (deepfake, original) video path pairs for one
# partition, driven by its metadata.json and by which files open on disk.
def faked_video_pairs(ipart):
#{
    """Return [(fake_path, original_path), ...] for partition ipart.

    An entry is kept only when its own video file opens, its label is
    'FAKE', and the file named by its 'original' field opens as well.
    A PermissionError is printed and whatever pairs were collected up to
    that point are returned.
    """
    found = []

    try:
    #{
        with open(f"{traindir(ipart)}/metadata.json") as jsonfile:
        #{
            for vidname, meta in json.load(jsonfile).items():
            #{
                fake_path = f"{traindir(ipart)}/{vidname}"
                # Same check order as before: file presence first, then label.
                if not (file_exists(fake_path) and meta['label'] == 'FAKE'):
                    continue

                orig_path = f"{traindir(ipart)}/{meta['original']}"
                if not file_exists(orig_path):
                    continue

                found.append((fake_path, orig_path))
            #}
        #}
    #}
    except PermissionError as err:
        print("ERROR:", err)

    return found
#}

# Given a video name pair: where pc_pair[0] is a deepfake, pc_pair[1] its original/parent,
# this function creates one "fakerframe" JPEG image (a difference blend mode image) per
# video frame pair (expect 300 per 10s video at 30fps), and a single 'fakerprint' (a kind
# of video fingerprint), which is the scaled sum of the fakerframes. Saves files to datapath.
def create_fakerframes_cv2(pc_pair, datapath):
#{
    """Write per-frame difference JPEGs and one summed 'fakerprint' JPEG.

    Args:
        pc_pair: (fake_video_path, original_video_path).
        datapath: existing directory that receives fakerframe<N>.jpg
            (N counting from 0) and fakerprint.jpg.

    Returns:
        None.

    Frames are read from both videos in lockstep until either one stops
    delivering frames. The fakerprint is the per-pixel sum of all
    fakerframes; if its maximum exceeds 255 it is scaled down so the
    maximum becomes 255, otherwise it is written unchanged. When no frame
    pair could be read at all, a warning is printed and no fakerprint is
    written. Both captures are released even if an error occurs.
    """
    video = cv2.VideoCapture(pc_pair[0])
    orig = cv2.VideoCapture(pc_pair[1])

    count, fakerprint = 0, None
    try:
    #{
        while video.isOpened() and orig.isOpened():
        #{
            # Each fakerframe created similarly to the Photoshop
            # difference blend-mode b/w the faked frame and its original
            # https://helpx.adobe.com/photoshop/using/blending-modes.htm
            vsuccess, videoframe = video.read()
            osuccess, origframe = orig.read()
            if not (vsuccess and osuccess): break

            fakerframe = cv2.subtract(cv2.max(videoframe, origframe), cv2.min(videoframe, origframe))
            cv2.imwrite(f"{datapath}/fakerframe{count}.jpg", fakerframe)

            # Accumulate in int64: a uint16 sum wraps around once a pixel's
            # total passes 65535 (already reachable with 300 frames of 255).
            if fakerprint is None: fakerprint = fakerframe.astype(np.int64)
            else: fakerprint += fakerframe

            count += 1
        #}
    #}
    finally:
    #{
        video.release()
        orig.release()
    #}

    if fakerprint is None:
        print(f"WARNING: no frame pairs could be read for {pc_pair[0]}; no fakerprint written")
        return

    # Compress into 0..255 only when the sum exceeds that range. Integer
    # arithmetic keeps the brightest pixel at exactly 255 and avoids the
    # divide-by-zero of an all-black sum.
    fkmax = int(fakerprint.max())
    if fkmax > 255: fakerprint = (fakerprint * 255) // fkmax

    # JPEG is an 8-bit format, so hand the encoder uint8 data explicitly.
    cv2.imwrite(f"{datapath}/fakerprint.jpg", fakerprint.astype(np.uint8))
#}

In [None]:
%%time

# This fcn puts it together: generates the video pairs, defines a datapath per
# pair, and generates all the fakerframes and the fakerprint for that pair.
def preprocess_data(istart, istop=None, batchsz=10):
#{
    """Preprocess every fake/original video pair in a range of partitions.

    Args:
        istart: first partition index, or (when istop is None) the
            exclusive stop index of a range starting at 0.
        istop: exclusive stop index; None means "use istart as the stop".
        batchsz: number of processed pairs per timing report.

    Returns:
        None. Progress and timings are printed.

    For each pair, the output goes to <fakerdir(i)>/<fake video file name
    without extension>; output directories (including missing parents) are
    created as needed. Per-pair times accumulate across partitions and an
    average is printed each time batchsz pairs have been collected.
    """
    if istop is None: istart, istop = 0, istart

    batchtimes, batchnum = [], 1
    for i in range(istart, istop):
    #{
        parttime = time.time()
        fpairs = faked_video_pairs(i)
        # makedirs also creates a missing parent and tolerates an existing dir.
        os.makedirs(fakerdir(i), exist_ok=True)
        print(f"Preprocessing {len(fpairs)} video pairs in partition {trainpart(i)}:")
        for pc_pair in fpairs:
        #{
            # This is some finagled code to pickup somewhere mid-partition
            #if re.split(r'[/.]', pc_pair[0])[-2] == 'wmoigsbnem': pickup = True
            #if not pickup: continue

            pairstart = time.time()

            # Name the output dir after the fake video's file name minus its
            # extension; path functions handle extra dots or no extension.
            vidstem = os.path.splitext(os.path.basename(pc_pair[0]))[0]
            datapath = f"{fakerdir(i)}/{vidstem}"
            os.makedirs(datapath, exist_ok=True)
            create_fakerframes_cv2(pc_pair, datapath)

            batchtimes.append(time.time() - pairstart)
            if len(batchtimes) == batchsz:
                print(f"  Average preprocess time per pair for batch {batchnum}: {np.average(batchtimes):.2f} sec")
                batchnum += 1; batchtimes = []
        #}
        print(f"  Total partition {trainpart(i)} preprocessing time: {(time.time()-parttime):.2f} sec")
    #}
#}

# Kick off preprocessing of the first partition only, in either mode.
if run_sample_data:
    preprocess_data(1)
else:
#{
    # With arg=1, preprocesses just the first partition (1334 videos).
    # On my iMac, this averaged ~28 sec/video (or 10.5 hrs total).
    preprocess_data(1)
#}
