# Deepfake Data Preprocessor

This Notebook is for preprocessing the training data, i.e. (currently anyway) creating difference-blend-mode image files   
for every frame of the faked video, plus a final "fakerprint" image (see comments below). You should be able to preprocess  
kaggle's little sample dataset in only like 20 minutes. This video player: https://darbyjohnston.github.io/DJV/ can play  
the JPG difference-image sequences as if they were a video if you want (Premiere can do this too, but not Quicktime).

Make sure to correctly set the **DATA_LAKE** below before preprocessing (and the **run_sample_data** flag, if applicable).

#### Main Function Table of Contents:
+ **summarize_data()** - Just takes a cursory look at the dataset and prints out some metrics.
+ **create_fakerframes_cv2()** - Creates the actual difference and fakerprint image files.
+ **preprocess_data()** - Orchestrates calls to faked_video_pairs() and create_fakerframes_cv2() on the dataset.


In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os, re, time
import functools
import cv2, json
import numpy as np

# For managing relative imports from notebook
if '..' not in sys.path: sys.path.append('..')

import config.config as dfc
import deepfake.dfutillib as df
import deepfake.preprocessor as dproc

In [None]:
# Build the project's Preprocessor and run it.
# NOTE(review): the meaning of minqueued=None is defined in
# deepfake.preprocessor, not in this notebook -- confirm there.
dproc.Preprocessor(minqueued=None).run()

Preprocessing epoch block 426:
  Total process time: 0.00 min
Preprocessing epoch block 382:
  Processed 10 videos, running average time/pair: 4.17 sec
  Processed 20 videos, running average time/pair: 3.53 sec
  Processed 30 videos, running average time/pair: 3.84 sec
  Processed 40 videos, running average time/pair: 3.98 sec
  Processed 50 videos, running average time/pair: 3.70 sec
  Processed 60 videos, running average time/pair: 3.89 sec
  Processed 70 videos, running average time/pair: 3.97 sec
  Processed 80 videos, running average time/pair: 3.94 sec
  Processed 90 videos, running average time/pair: 3.75 sec
  Processed 100 videos, running average time/pair: 3.71 sec
  Processed 110 videos, running average time/pair: 3.54 sec
  Processed 120 videos, running average time/pair: 3.71 sec
  Processed 130 videos, running average time/pair: 3.73 sec
  Processed 140 videos, running average time/pair: 4.05 sec
  Processed 150 videos, running average time/pair: 3.61 sec
  Total process 

In [None]:
# This fcn takes just an initial glance at the video and metadata
# in the training set and prints out some informational metrics.
#
# Args (istart/istop follow range() conventions):
#   istart:  first partition index, or the stop index when istop is None.
#   istop:   one past the last partition index (None => range(0, istart)).
#   netsum:  when True, also print totals summed over all visited partitions.
#   verbose: when True, print the per-partition counts as well.
#
# Returns None; all results are printed. An empty partition range is valid
# and yields an all-zero net summary. A PermissionError while reading a
# partition is reported and that partition keeps the counts gathered so far.
def summarize_data(istart, istop=None, netsum=True, verbose=False):
#{
    # Like range args
    if istop is None:
        istart, istop = 0, istart

    # Running totals in the order:
    # (numvids, numtrain, vidsfounds, numfakes, origfound).
    # Starting from zeros keeps an empty range from failing.
    totals = [0, 0, 0, 0, 0]

    for i in range(istart, istop):
    #{
        numvids, numtrain, vidsfounds, numfakes, origfound = 0, 0, 0, 0, 0
        try:
        #{
            partdir = df.traindir(i)
            with open(f"{partdir}/metadata.json") as jsonfile:
                metadata = json.load(jsonfile)

            for vidname, meta in metadata.items():
            #{
                numvids += 1
                if meta['split'] == 'train': numtrain += 1
                if df.file_exists(f"{partdir}/{vidname}"): vidsfounds += 1

                # A FAKE entry names the original video it was derived from;
                # count the pair as found only if that original is on disk.
                if meta['label'] == 'FAKE':
                    numfakes += 1
                    if df.file_exists(f"{partdir}/{meta['original']}"):
                        origfound += 1
            #}
        #}
        except PermissionError as err: print("ERROR:", err)

        if verbose:
            print(f"Partition: {df.trainpart(i)}:")
            print(f"  Numb of primary video meta-references/train split: {numvids}/{numtrain}")
            print(f"  Numb of actual primary videos files found on disk: {vidsfounds}")
            print(f"  Numb of fake/original meta-pairs: {numfakes}")
            print(f"  Numb of actual video pairs found on disk: {origfound}")

        if (numvids != vidsfounds) or (numvids != numtrain) or (numfakes != origfound):
            print(f"  DISCREPENCY found in partition {df.trainpart(i)}")

        if netsum:
            partstats = (numvids, numtrain, vidsfounds, numfakes, origfound)
            totals = [tot + stat for tot, stat in zip(totals, partstats)]
    #}

    if netsum:
    #{
        numvids, numtrain, vidsfounds, numfakes, origfound = totals

        print(f"\nNet dataset summary:")
        print(f"  Numb of primary video meta-references/train split: {numvids}/{numtrain}")
        print(f"  Numb of actual primary videos files found on disk: {vidsfounds}")
        print(f"  Numb of fake/original meta-pairs: {numfakes}")
        print(f"  Numb of actual video pairs found on disk: {origfound}\n")
    #}
#}

# Summarize whichever dataset the config selects: the single sample
# partition (verbosely), or the first 50 production partitions (net only).
if dfc.DATA_SOURCE == 'sample':
    summarize_data(1, verbose=True)
elif dfc.DATA_SOURCE == 'production':
#{
    summarize_data(50, verbose=False)

    # Partitions 18 and 35 appear slightly incomplete.
    # summarize_data(18, 19, netsum=False, verbose=True)
    # summarize_data(35, 36, netsum=False, verbose=True)
#}


In [None]:
%%time

# Generates video pairs from metadata file, defines a datapath per
# pair, and generates fakerframes and fakerprint for that pair.
#
# Args (istart/istop follow range() conventions):
#   istart:  first partition index, or the stop index when istop is None.
#   istop:   one past the last partition index (None => range(0, istart)).
#   batchsz: number of pairs per timing report.
#
# Returns None. Output directories are created as needed, and progress and
# timing are printed. A trailing partial batch is reported after the last
# partition so that its timing is not lost.
def preprocess_data(istart=1, istop=None, batchsz=10):
#{
    # Like range args
    if istop is None: istart, istop = 0, istart

    # batchtimes deliberately carries over partition boundaries, so a batch
    # may span two partitions.
    batchtimes, batchnum = [], 1
    for i in range(istart, istop):
    #{
        parttime = time.time()
        fpairs = df.faked_video_pairs(i)

        # exist_ok avoids the check-then-create race and also creates any
        # missing parent directories.
        partdir = df.fakerdir(i)
        os.makedirs(partdir, exist_ok=True)

        print(f"Preprocessing {len(fpairs)} video pairs in partition {df.trainpart(i)}:")
        for pc_pair in fpairs:
        #{
            # This is some finagled code to pickup somewhere mid-partition
            #if re.split(r'[/.]', pc_pair[0])[-2] == 'wmoigsbnem': pickup = True
            #if not pickup: continue

            pairstart = time.time()

            # The output directory is named after the first video's file
            # name without its extension.
            datapath = f"{partdir}/{re.split(r'[/.]', pc_pair[0])[-2]}"
            os.makedirs(datapath, exist_ok=True)
            df.create_diff_frames(pc_pair[0], pc_pair[1], datapath)

            batchtimes.append(time.time() - pairstart)
            if len(batchtimes) == batchsz:
                print(f"  Average preprocess time per pair for batch {batchnum}: {np.average(batchtimes):.2f} sec")
                batchnum += 1; batchtimes = []
        #}
        print(f"  Total partition {df.trainpart(i)} preprocessing time: {(time.time()-parttime):.2f} sec")
    #}

    # Report whatever is left over when the pair count is not a multiple
    # of batchsz.
    if batchtimes:
        print(f"  Average preprocess time per pair for batch {batchnum}: {np.average(batchtimes):.2f} sec")
#}

# Both recognised data sources run the preprocessor with its defaults.
if dfc.DATA_SOURCE in ('sample', 'production'):
#{
    # For 'production': with no arguments, preprocesses just the first
    # partition (1334 videos).
    # On my iMac, this averaged ~28 sec/video (or 10.5 hrs total).
    preprocess_data()
#}


In [None]:
# Just copies all the fakerprints into a single
# directory for testing/debugging purposes.

import shutil

# IPython shell capture: `ls -tr1` lists one entry per line, sorted by
# modification time with the oldest first, so the numbering below
# follows that order.
dirlist = !ls -tr1 {df.fakerdir()}
for i, dfile in enumerate(dirlist): 
    # Each entry is treated as a directory holding a fakerprint.jpg.
    # NOTE(review): shutil.copyfile does not create directories, so the
    # sibling "fakerprints" directory presumably must already exist -- confirm.
    shutil.copyfile(f"{df.fakerdir()}/{dfile}/fakerprint.jpg",
                    f"{df.fakerdir()}/../fakerprints/fakerprint{i}.jpg")