# Process raw videos into a SWIN-consumable format

In [1]:
import os
import sys
import string
import random
import shutil

In [2]:
# Relative path of the folder whose entries are listed as the raw data sources.
source_folder = "../raw_videos"
# Relative path of the output folder; it receives the 'train', 'val' and 'test'
# sub-folders plus the per-split annotation text files.
destination_folder = "../processed_videos"

In [3]:
# Probability with which a video from a non-test source is assigned to the
# validation split; otherwise it goes to the training split.
validation_prob = 0.2

In [4]:
# Remove any output left over from a previous run so stale files do not
# mix with the new split. On a first run the folder does not exist yet;
# that case is expected, so only FileNotFoundError is tolerated and any
# other failure (e.g. permissions) still surfaces.
try:
    shutil.rmtree(destination_folder)
except FileNotFoundError:
    pass

In [5]:
# Create the output root and one sub-folder per dataset split.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(destination_folder, exist_ok=True)
for split_name in ('val', 'train', 'test'):
    split_path = os.path.join(destination_folder, split_name)
    os.makedirs(split_path, exist_ok=True)

In [6]:
# List of all source folders directly under source_folder.
# Only real directories are kept and hidden entries (e.g. .DS_Store,
# .ipynb_checkpoints) are skipped, because every entry is later listed
# as a folder of video types. The result is sorted so the processing
# order does not depend on the file system.
raw_folders = sorted(
    entry
    for entry in os.listdir(source_folder)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(source_folder, entry))
)

In [7]:
# Get symbol
def get_symbol(folder_name):
    """Map a folder name to its integer class label.

    The name is lower-cased and searched for the keywords below in order;
    the first keyword found as a substring decides the label.

    Raises:
        ValueError: if none of the keywords occurs in the folder name.
    """
    lowered = folder_name.lower()

    # Ordered (keyword, label) pairs: earlier entries take precedence when
    # a name contains more than one keyword.
    keyword_labels = (
        ('done', 0),
        ('water', 1),
        ('poop', 2),
        ('dad', 3),
        ('mom', 4),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label

    raise ValueError('Symbol could not be found for folder %s' % lowered)

In [8]:
# Copy every raw video into the train/val/test layout under a random file
# name and collect one "<file name> <label>" line per video for each split.
train_list = []
val_list = []
test_list = []
for each_folder in raw_folders:
    # One source for data
    one_source = os.path.join(source_folder, each_folder)

    # Hidden entries and stray files (e.g. .DS_Store) are not data sources
    if each_folder.startswith('.') or not os.path.isdir(one_source):
        continue

    # Types of videos
    for one_type in os.listdir(one_source):
        type_folder = os.path.join(one_source, one_type)

        # Skip hidden entries and loose files before labelling; otherwise
        # get_symbol() would reject them or os.listdir() would fail on them
        if one_type.startswith('.') or not os.path.isdir(type_folder):
            continue

        symbol = get_symbol(one_type)

        # For each file within the folder for this type
        for one_file in os.listdir(type_folder):
            # This is the filename within the folder
            source_filename = os.path.join(type_folder, one_file)

            # Hidden files and nested folders are not videos; copying them
            # would add bogus '.mp4' samples to the dataset
            if one_file.startswith('.') or not os.path.isfile(source_filename):
                continue

            # Random filename
            ran_file = ''.join(random.choice(string.ascii_lowercase) for _ in range(20)) + '.mp4'
            annotation = ran_file + ' ' + str(symbol)

            # NOTE: the 'alex' match is case-sensitive
            if 'alex' not in each_folder:
                if random.random() < validation_prob:
                    # A validation_prob share of the data goes to validation
                    dest_folder = os.path.join(destination_folder, "val")
                    val_list.append(annotation)
                else:
                    # The remainder goes to train
                    dest_folder = os.path.join(destination_folder, "train")
                    train_list.append(annotation)
            else:
                # Make test set completely unique: every video from an
                # 'alex' source folder is held out
                dest_folder = os.path.join(destination_folder, "test")
                test_list.append(annotation)

            # Copy the video into its split folder under the random name
            destination_filename = os.path.join(dest_folder, ran_file)
            shutil.copyfile(source_filename, destination_filename)

# Write one annotation file per split into the output folder; each line
# holds one collected "<file name> <label>" entry.
annotation_files = (
    ('bsl_train_video.txt', train_list),
    ('bsl_val_video.txt', val_list),
    ('bsl_test_video.txt', test_list),
)
for list_name, entries in annotation_files:
    with open(os.path.join(destination_folder, list_name), 'w') as outfi:
        outfi.writelines(entry + '\n' for entry in entries)