In [2]:
%load_ext autoreload
%autoreload 2

import sys, random, time
import psycopg2, json
import numpy as np

from collections import defaultdict

# For managing relative imports from notebook
if '..' not in sys.path: sys.path.append('..')

import config.config as dfc
import deepfake.dfutillib as df
import deepfake.postgresdb as pgdb

In [3]:
# Connection smoke test for dfc.DATABASE: enter and immediately leave the
# handle's context without running any statements inside it.
with pgdb.PostgreSqlHandle(verbose=True):
    pass

Connecting to the PostgreSQL database...
PostgreSQL version:
  ('PostgreSQL 11.6 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-9), 64-bit',)


In [5]:
%%time

# This function reads each partition's metadata.json file, compiles a corresponding
# list of all videos, split into order-randomized training and validation sets,
# then assigns these to epoch blocks and inserts everything into the database.

def create_videos_data(istart, istop=None, validation_split=0.1, epochsz=200):
#{
    """Compile the train/validate video lists from partition metadata and insert them into the DB.

    For every partition index in the requested range, the partition's
    metadata.json is read and one pgdb.VideoTuple is created per entry whose
    video file exists (entries not labelled 'REAL' additionally require the
    file named by their 'original' field to exist). Each partition is split at
    random into "train" and "validate" videos, the two global lists are
    shuffled, and consecutive slices of them are tagged with a shared blk_id.
    Everything from the start of the last block onward is tagged blk_id = -1
    (back-fill). Finally both lists are handed to
    PostgreSqlHandle.populate_database().

    The split and the ordering draw from the global `random` and `np.random`
    generators; seed both beforehand if a reproducible layout is needed.

    Args:
        istart, istop: partition index range, interpreted like range()
            arguments; with a single argument partitions 0..istart-1 are read.
        validation_split: fraction of each partition's usable videos sampled
            for validation; also fixes the validation share of every block as
            int(epochsz*validation_split).
        epochsz: total number of videos (train + validate) per epoch block.

    Raises:
        ValueError: if epochsz/validation_split do not leave at least one
            training and one validation video per block.
    """
    # Like range args
    if istop is None: istart, istop = 0, istart

    # Per-block quotas. Both are used as range() steps further down, so reject
    # non-positive values here, before any partition has been parsed.
    nvalids_blk = int(epochsz*validation_split)
    ntrains_blk = epochsz - nvalids_blk
    if nvalids_blk <= 0 or ntrains_blk <= 0:
        raise ValueError(
            f"epochsz={epochsz} with validation_split={validation_split} gives "
            f"{ntrains_blk} training and {nvalids_blk} validation videos per block; "
            "both must be positive")

    vtrains, vvalids = [],[]
    for i in range(istart, istop):
    #{
        vpart, initial = [], time.time()
        try:
            partdir = df.traindir(i)
            with open(f"{partdir}/metadata.json") as jsonfile:
                metadata = json.load(jsonfile)

            # Store all valid tuples from the partition metadata file
            for vidname, meta in metadata.items():
            #{
                if not df.file_exists(f"{partdir}/{vidname}"): continue
                vtup = pgdb.VideoTuple(vidname=vidname, partition=i, label=meta['label'])
                if meta['label'] != 'REAL':
                    # Non-REAL entries are only kept when their 'original' file is present
                    if not df.file_exists(f"{partdir}/{meta['original']}"): continue
                    vtup.origname = meta['original']
                vpart.append(vtup)
            #}
        # Best effort: report the partition and carry on with whatever was collected
        except PermissionError as err: print("ERROR:", err)

        # Randomly select a validation subset
        nvalids = round(validation_split*len(vpart))
        vindices = set(random.sample(range(len(vpart)), nvalids))

        # Separate into respective split
        for j, vtup in enumerate(vpart):
        #{
            if j in vindices:
                vtup.split = "validate"
                vvalids.append(vtup)
            else:
                vtup.split = "train"
                vtrains.append(vtup)
        #}

        print((f"Partition {i}: {len(vpart)} valid videos, "
               f"parse time: {time.time()-initial:.3f} sec"))
    #}

    print("\nTotal dataset:")
    print(f"  {len(vtrains)} training videos")
    print(f"  {len(vvalids)} validation videos")
    print("\nAssigning epoch video-blocks...")

    # Randomize video orders
    np.random.shuffle(vtrains)
    np.random.shuffle(vvalids)

    # Start offsets of every block; zip() stops at the shorter of the two lists
    blocks = list(zip(range(0, len(vtrains), ntrains_blk),
                      range(0, len(vvalids), nvalids_blk)))
    if not blocks:
        # Happens when either list is empty; the code below needs the offsets
        # of at least one block, so stop before touching the database.
        print("  No epoch video-blocks could be formed; database left untouched")
        return

    for blkid, (ti, vi) in enumerate(blocks):
        for vtup in vtrains[ti:ti+ntrains_blk]: vtup.blk_id = blkid
        for vtup in vvalids[vi:vi+nvalids_blk]: vtup.blk_id = blkid

    # blkid/ti/vi now describe the LAST block. That block is re-tagged as
    # back-fill below, so blkid equals the number of blocks that keep their id.
    print(f"  {blkid} epoch video-blocks assigned")

    # Save straggler remainders to back-fill unreadable video
    train_fill, valid_fill = vtrains[ti:], vvalids[vi:]
    print("\nRemaining back-fill videos:")
    print(f"  {len(train_fill)} training videos")
    print(f"  {len(valid_fill)} validation videos")

    for vtup in train_fill: vtup.blk_id = -1
    for vtup in valid_fill: vtup.blk_id = -1

    # Insert epoch video-blocks into DB
    print("\nAttempting database insert of:")
    print(f"  {len(vtrains[:ti])} training videos")
    print(f"  {len(vvalids[:vi])} validation videos")
    print(f"  {len(train_fill)+len(valid_fill)} back-fill videos\n")

    with pgdb.PostgreSqlHandle() as db_handle:
        if db_handle.initialize_database(): db_handle.populate_database(vtrains, vvalids)
        else: print("Populate aborted")
#}
               

#create_videos_data(50)

Partition 0: 1334 valid videos, parse time: 0.057 sec
Partition 1: 1699 valid videos, parse time: 0.073 sec
Partition 2: 1748 valid videos, parse time: 0.071 sec
Partition 3: 1455 valid videos, parse time: 0.059 sec
Partition 4: 1701 valid videos, parse time: 0.070 sec
Partition 5: 2483 valid videos, parse time: 0.100 sec
Partition 6: 3464 valid videos, parse time: 0.141 sec
Partition 7: 2473 valid videos, parse time: 0.119 sec
Partition 8: 1816 valid videos, parse time: 0.075 sec
Partition 9: 1736 valid videos, parse time: 0.071 sec
Partition 10: 3192 valid videos, parse time: 0.130 sec
Partition 11: 2118 valid videos, parse time: 0.086 sec
Partition 12: 2225 valid videos, parse time: 0.090 sec
Partition 13: 3694 valid videos, parse time: 0.152 sec
Partition 14: 2464 valid videos, parse time: 0.098 sec
Partition 15: 2273 valid videos, parse time: 0.090 sec
Partition 16: 2061 valid videos, parse time: 0.083 sec
Partition 17: 2430 valid videos, parse time: 0.096 sec
Partition 18: 2683 v

In [21]:
import deepfake.modelutil as mutil

# Build a loader for the 'train' split and pull 180 items from its lazy
# generator; the yielded values are discarded, only the side effects matter.
# NOTE(review): the saved output of this cell ends in
# "AttributeError: 'NoneType' object has no attribute 'flatten'" - presumably
# raised inside the loader for an unreadable video; confirm in modelutil.
trloader = mutil.ModelLoader(split='train')
gtor = trloader.lazy_loader()
for i in range(180):
    next(gtor)

MLoaderID: 139991405383504-train, loading blk_id: 479
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_6/jbucifddgs
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_38/ijgmcbwkjy
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_16/cspadzkkxy
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_47/gqopkqbkfc
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_31/vrrtdmetws
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_24/uftrzkegbd
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_24/aaoyvnkeyo
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_11/zccgqwyjjf
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_31/iqebgcigjj
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_18/cecqflguyc
/home/ec2-user/SageMaker/ebs/deepfake-detect-datalake/dfdc_frames_part_20/znclzrxipl
/home/ec2-us

AttributeError: 'NoneType' object has no attribute 'flatten'