In [1]:
%load_ext autoreload
%autoreload 2

import sys, random, time
import psycopg2, json
import numpy as np

from collections import defaultdict

# For managing relative imports from notebook
if '..' not in sys.path: sys.path.append('..')

import config.config as dfc
import deepfake.dfutillib as df
import deepfake.postgresdb as pgdb

In [2]:
%%time

# This function reads each partition's metadata.json file, compiles a corresponding
# list of all videos, splits it into order-randomized training and validation sets,
# then assigns these to epoch blocks and inserts everything into the database.

def _load_partition_videos(part):
#{
    """Collect a VideoTuple for every usable video listed in one partition.

    Reads `<df.traindir(part)>/metadata.json` and keeps an entry only when
    `df.file_exists` reports the video file present. Entries labelled 'REAL'
    are kept as-is; an entry with any other label is kept only if the file
    named by its 'original' field is also present, and then `origname` is set
    on the tuple.

    A PermissionError is reported on stdout and whatever was collected so far
    is returned; any other I/O or JSON error propagates to the caller.
    """
    videos = []
    try:
        partdir = df.traindir(part)
        with open(f"{partdir}/metadata.json") as jsonfile:
            metadata = json.load(jsonfile)

        # Store all valid tuples from the partition metadata file
        for vidname, meta in metadata.items():
        #{
            if not df.file_exists(f"{partdir}/{vidname}"): continue

            vtup = pgdb.VideoTuple(vidname=vidname, partition=part, label=meta['label'])
            if meta['label'] == 'REAL':
                videos.append(vtup)
            elif df.file_exists(f"{partdir}/{meta['original']}"):
                vtup.origname = meta['original']
                videos.append(vtup)
        #}
    except PermissionError as err: print("ERROR:", err)

    return videos
#}


def create_videos_data(istart, istop=None, validation_split=0.1, epochsz=200, seed=None):
#{
    """Build the train/validation video lists and hand them to the database.

    For each partition index in the requested range the partition metadata is
    parsed, a random `validation_split` fraction of its videos is marked
    split="validate" and the rest split="train". Both lists are then shuffled
    and cut into epoch blocks: each block takes `int(epochsz*validation_split)`
    validation videos and the remainder of `epochsz` as training videos, and
    every video in it gets the block's `blk_id`.

    The last block produced (which may be only partially filled) and anything
    after it are marked `blk_id = -1` and reported as "back-fill" videos, so
    the printed block count is the number of blocks that keep a real id.

    Finally a `pgdb.PostgreSqlHandle` is opened; if `initialize_database()`
    succeeds, `populate_database(vtrains, vvalids)` is called with the full
    lists (back-fill videos included).

    Args:
        istart, istop: partition index range, interpreted like `range()`;
            with a single argument the range is `0 .. istart-1`.
        validation_split: fraction of videos assigned to validation.
        epochsz: total number of videos (train + validation) per epoch block.
        seed: optional int; when given, seeds both `random` and `np.random`
            (global state) so the split and block assignment are repeatable.
            Must be acceptable to `np.random.seed` (0 <= seed < 2**32).

    Raises:
        ValueError: if `epochsz` and `validation_split` do not leave at least
            one training and one validation video per block. Checked before
            any partition is read.
    """
    # Like range args
    if istop is None: istart, istop = 0, istart

    # Per-block step sizes. A zero step would make range() fail only after all
    # partitions had been parsed, so reject unusable settings up front.
    valid_step = int(epochsz*validation_split)
    train_step = epochsz - valid_step
    if valid_step <= 0 or train_step <= 0:
        raise ValueError(
            f"epochsz={epochsz} with validation_split={validation_split} gives "
            f"{valid_step} validation and {train_step} training videos per block; "
            f"both must be positive")

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    vtrains, vvalids = [], []
    for i in range(istart, istop):
    #{
        initial = time.time()
        vpart = _load_partition_videos(i)

        # Randomly select a validation subset
        nvalids_part = round(validation_split*len(vpart))
        vindices = set(random.sample(range(len(vpart)), nvalids_part))

        # Separate into respective split
        for j, vtup in enumerate(vpart):
        #{
            if j in vindices:
                vtup.split = "validate"
                vvalids.append(vtup)
            else:
                vtup.split = "train"
                vtrains.append(vtup)
        #}

        print((f"Partition {i}: {len(vpart)} valid videos, "
               f"parse time: {time.time()-initial:.3f} sec"))
    #}

    print("\nTotal dataset:")
    print(f"  {len(vtrains)} training videos")
    print(f"  {len(vvalids)} validation videos")
    print("\nAssigning epoch video-blocks...")

    # Randomize video orders
    np.random.shuffle(vtrains)
    np.random.shuffle(vvalids)

    # Create epoch blocks
    rgt = range(0, len(vtrains), train_step)
    rgv = range(0, len(vvalids), valid_step)

    # Pre-bind the loop variables: with no videos at all the loop body never
    # runs, and everything (i.e. nothing) falls through to back-fill.
    blkid, ti, vi = 0, 0, 0
    for blkid, (ti, vi) in enumerate(zip(rgt, rgv)):
        for vtup in vtrains[ti:ti+train_step]: vtup.blk_id = blkid
        for vtup in vvalids[vi:vi+valid_step]: vtup.blk_id = blkid
    # blkid is the index of the last block, which is demoted to back-fill
    # below, so it equals the number of blocks that keep their id.
    print(f"  {blkid} epoch video-blocks assigned")

    # Save straggler remainders to back-fill unreadable video
    backfill_trains, backfill_valids = vtrains[ti:], vvalids[vi:]
    print("\nRemaining back-fill videos:")
    print(f"  {len(backfill_trains)} training videos")
    print(f"  {len(backfill_valids)} validation videos")

    for vtup in backfill_trains: vtup.blk_id = -1
    for vtup in backfill_valids: vtup.blk_id = -1

    # Insert epoch video-blocks into DB
    print("\nAttempting database insert of:")
    print(f"  {len(vtrains[:ti])} training videos")
    print(f"  {len(vvalids[:vi])} validation videos")
    print(f"  {len(backfill_trains)+len(backfill_valids)} back-fill videos\n")

    with pgdb.PostgreSqlHandle() as db_handle:
        if not db_handle.initialize_database(): print("Populate aborted")
        else: db_handle.populate_database(vtrains, vvalids)
#}
               

# Process partitions 0 through 49: a single positional argument is treated as
# the stop index, like range(50). The call goes through initialize_database()
# before populating; the recorded output shows a confirmation prompt there.
create_videos_data(50)

Partition 0: 1334 valid videos, parse time: 0.153 sec
Partition 1: 1699 valid videos, parse time: 0.163 sec
Partition 2: 1748 valid videos, parse time: 0.159 sec
Partition 3: 1455 valid videos, parse time: 0.135 sec
Partition 4: 1701 valid videos, parse time: 0.163 sec
Partition 5: 2483 valid videos, parse time: 0.209 sec
Partition 6: 3464 valid videos, parse time: 0.297 sec
Partition 7: 2473 valid videos, parse time: 0.242 sec
Partition 8: 1816 valid videos, parse time: 0.163 sec
Partition 9: 1736 valid videos, parse time: 0.149 sec
Partition 10: 3192 valid videos, parse time: 0.277 sec
Partition 11: 2118 valid videos, parse time: 0.184 sec
Partition 12: 2225 valid videos, parse time: 0.187 sec
Partition 13: 3694 valid videos, parse time: 0.302 sec
Partition 14: 2464 valid videos, parse time: 0.206 sec
Partition 15: 2273 valid videos, parse time: 0.188 sec
Partition 16: 2061 valid videos, parse time: 0.176 sec
Partition 17: 2430 valid videos, parse time: 0.202 sec
Partition 18: 2683 v

Are you sure you want reinitialize this database?
[N/y] y



Commencing database initialization...
Database initialization complete.

Commencing database population...
Database population complete.

CPU times: user 6.85 s, sys: 6.47 s, total: 13.3 s
Wall time: 24.2 s
