# Process raw videos into SWIN consumable

In [1]:
import os
import cv2
import sys
import time
import string
import random
import shutil

In [2]:
source_folder = "../raw_videos"
destination_folder = "../processed_videos"

In [3]:
validation_prob = 0.2

In [4]:
# Remove existing folder
shutil.rmtree(destination_folder)

In [5]:
# Empty the existing folder
os.makedirs(destination_folder, exist_ok=True)
for each_folder in ['val', 'train', 'test']:
    os.makedirs(os.path.join(destination_folder, each_folder), exist_ok=True)

In [6]:
# List of all folders 
raw_folders = os.listdir(source_folder)

In [7]:
# Get symbol
def get_symbol(folder_name):
    folder_name = folder_name.lower()
    if 'done' in folder_name:
        return 0
    elif 'water' in folder_name:
        return 1
    elif 'poop' in folder_name:
        return 2
    elif 'dad' in folder_name:
        return 3
    elif 'mom' in folder_name:
        return 4
    else:
        raise ValueError('Symbol could not be found for folder %s' % folder_name)

In [8]:
def show_updated_video(input_video_path):
    cap = cv2.VideoCapture(input_video_path)
    print(f'Displaying video {input_video_path}')
    while cap.isOpened():
        start = time.perf_counter()
        # Capture the frame
        ret, frame = cap.read()
        
        if frame is not None:
            print('Found frame')
            cv2.imshow('frame', frame)
        else:
            break
    cv2.destroyAllWindows()

In [9]:
def analyze_video_file(input_video_path):
    """
    """
    
    frame_list = []
    # max_frames = 64
    min_frames = 128
    cap = cv2.VideoCapture(input_video_path)
    shape = None
    fps = None
    while cap.isOpened():
        start = time.perf_counter()
        # Capture the frame
        ret, frame = cap.read()
        
        # Get the video input frame rate
        fps = cap.get(cv2.CAP_PROP_FPS)
        # get the video width and height
        shape = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        # print(shape)
        if frame is not None:
            frame_list.append(frame)
            shape = frame.shape[:2]
        else:
            break
    cap.release()
    
    print(f'Analyzing input file {input_video_path} with length {len(frame_list)}')
    output_video_path = input_video_path[:-4] + '_looped.mp4'
    
    #  
    mult = round((min_frames/len(frame_list) + 0.5))
    mult = 1 if mult < 1 else mult
    print(f'\tLooping frames {mult} times')
    
    # # If the video length is too long then cut it short
    # if len(frame_list) > max_frames:
    #     print(f'\tVideo length is too long {len(frame_list)}, cutting it short to {max_frames} frames ')
    #     left_index = int((len(frame_list) - max_frames)/2)
    #     # frame_list = frame_list[left_index:left_index+max_frames]

    print(f'\tOutput length {len(frame_list)}')
    print(f'\tOutput filename {output_video_path}')
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    mp4_out = cv2.VideoWriter(output_video_path, fourcc, fps=fps, frameSize=shape)
    # print(f'Writing frames of shape {shape}')

    # Write the frames 
    for frame in frame_list: 
        # print(f'Writing frame shape {frame.shape}')
        mp4_out.write(frame)
    mp4_out.release()
        
    # show_updated_video(output_video_path)
    return output_video_path, fps, shape, len(frame_list)

In [10]:
# 
train_list = []
val_list = []
test_list = []
frame_counts = []
video_shapes = []
video_fps = []
for each_folder in raw_folders:
    # One source for data
    one_source = os.path.join(source_folder, each_folder)
    
    # Types of videos 
    type_folders = os.listdir(one_source)
    
    # For each type of data
    for one_type in type_folders:
        symbol = get_symbol(one_type)
        
        # Get all the files within that folder for this type
        files_within_folder = os.listdir(os.path.join(one_source, one_type))

        # For each file within the folder
        for one_file in files_within_folder:
            
                # This is the filename within the folder
                source_filename = os.path.join(one_source, one_type, one_file)
                
                if '.ipynb_checkpo' in source_filename or 'looped' in source_filename:
                    continue
                
                source_filename, fps, frame_shape, num_frames = analyze_video_file(source_filename)
                video_shapes.append(frame_shape)
                video_fps.append(fps)
                frame_counts.append(num_frames)

                # Random filename
                ran_file = ''.join(random.choice(string.ascii_lowercase) for _ in range(20)) + '.mp4'

                if 'alex' not in each_folder.lower():
                    # 10% of the data goes to validation
                    if random.random() < validation_prob:
                        dest_folder = os.path.join(destination_folder, "val")
                        val_list.append(ran_file + ' ' + str(symbol))
                    else:
                        # Remaining 90% goes to train
                        dest_folder = os.path.join(destination_folder, "train")
                        train_list.append(ran_file + ' ' + str(symbol))
                else:
                    # Make test set completely unique
                    dest_folder = os.path.join(destination_folder, "test")
                    test_list.append(ran_file + ' ' + str(symbol))
                
                # 
                destination_filename = os.path.join(dest_folder, ran_file)
                shutil.copyfile(source_filename, destination_filename)
                print(f'moving {source_filename} to {destination_filename}')
                
                # If it is a looped file then remove it from source
                if 'looped' in source_filename:
                    print(f'Removing temporary file {source_filename}')
                    os.remove(source_filename)

with open(os.path.join(destination_folder, 'bsl_train_video.txt'), 'w') as outfi:
    for x in train_list:
        outfi.write(x + '\n')

with open(os.path.join(destination_folder, 'bsl_val_video.txt'), 'w') as outfi:
    for x in val_list:
        outfi.write(x + '\n')

with open(os.path.join(destination_folder, 'bsl_test_video.txt'), 'w') as outfi:
    for x in test_list:
        outfi.write(x + '\n')

Analyzing input file ../raw_videos/dan-round-1/water/wqter-8.mp4 with length 61
	Looping frames 3 times
	Output length 61
	Output filename ../raw_videos/dan-round-1/water/wqter-8_looped.mp4
moving ../raw_videos/dan-round-1/water/wqter-8_looped.mp4 to ../processed_videos/train/ordxpljdvfqbzqkifeto.mp4
Removing temporary file ../raw_videos/dan-round-1/water/wqter-8_looped.mp4
Analyzing input file ../raw_videos/dan-round-1/water/water-5.mp4 with length 41
	Looping frames 4 times
	Output length 41
	Output filename ../raw_videos/dan-round-1/water/water-5_looped.mp4
moving ../raw_videos/dan-round-1/water/water-5_looped.mp4 to ../processed_videos/train/ltynloebpidehdvlurnt.mp4
Removing temporary file ../raw_videos/dan-round-1/water/water-5_looped.mp4
Analyzing input file ../raw_videos/dan-round-1/water/water-2.mp4 with length 75
	Looping frames 2 times
	Output length 75
	Output filename ../raw_videos/dan-round-1/water/water-2_looped.mp4
moving ../raw_videos/dan-round-1/water/water-2_looped.mp

In [11]:
import numpy as np
bins, counts = np.unique(frame_counts, return_counts=True)

In [12]:
np.sum(counts[bins>30])

275