## Please run vid2img_ite first to change video clips to frames (if you added new data)

In [1]:
import json
import glob
import os
import numpy as np
import collections
import shutil
from pathlib import Path
from env_vars import VIDEOS_DIR, PREPROCESSED_DATA_ROOT, RAW_DATA_ROOT, FRAMES_DIR
for var in [VIDEOS_DIR, PREPROCESSED_DATA_ROOT, RAW_DATA_ROOT, FRAMES_DIR]:
    print(var)
       
DEMO_DIR = os.path.join(RAW_DATA_ROOT, 'demo_videos') #'demo_videos'
WRITE_TO_DEMO = False  # Set to False if don't want to rewrite to demo (as its slow)

dataset_split_name = 'pc_101' # name of dataset setup
dataset_split_path = os.path.join(PREPROCESSED_DATA_ROOT, dataset_split_name) # path to save 

#read list of classes
with open(os.path.join(dataset_split_path,'actions_label_map.txt')) as f:
    categories = f.readlines()
    categories = [c.strip().replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '') for c in categories]
    dict_categories = {}
    for i, category in enumerate(categories):
        dict_categories[category] = i


C:\Users\User1\Desktop\projects\ITE_APAMS\ite_dataset\videos
C:\Users\User1\Desktop\projects\ITE_APAMS\ite_dataset
C:\Users\User1\Desktop\projects\ITE_APAMS
C:\Users\User1\Desktop\projects\ITE_APAMS\ite_dataset\frames


In [2]:
#get paths of all videos
video_files_paths = glob.glob(os.path.join(VIDEOS_DIR,'*', '*', '*.MP4'))
video_fpaths_dict = dict()

#make dict video paths with the video filename as keys to avoid duplicates
video_filenames = list()
for video_file_path in video_files_paths:
    video_fname = os.path.basename(video_file_path)
    if video_fname in video_filenames:
        print(video_file_path)
    else:
        video_filenames.append(video_fname)
        video_fpaths_dict[video_fname] = video_file_path

In [3]:
#split video files by section
section_dict = collections.defaultdict(list)
for video_filename in video_filenames:
    section_dict[video_filename[0]].append(video_filename)


In [4]:
#split into train val test by videos

train, val, test = [], [], [] # for counting purposes

split_filename_map = collections.defaultdict()
for section, vid_filenames in section_dict.items():
    np.random.seed(35) # set seed for repeatability
    np.random.shuffle(vid_filenames)
    num_vids = len(vid_filenames)
    train_idx = round(num_vids*0.8)
    val_idx = round(num_vids*0.9)
    
    train_vids = vid_filenames[:train_idx]
    val_vids = vid_filenames[train_idx:val_idx]
    test_vids = vid_filenames[val_idx:] 
    
    if WRITE_TO_DEMO:
        Path(DEMO_DIR).mkdir(exist_ok=True)
        for vid_list, name in [(val_vids, 'val'), (test_vids, 'test')]:
            dest_dir = os.path.join(DEMO_DIR, name)
            Path(dest_dir).mkdir(exist_ok=True)
            for vid_fname in vid_list:
                #copy video
                video_fpath = video_fpaths_dict[vid_fname]
                vid_dest_path = os.path.join(dest_dir, vid_fname)
                shutil.copyfile(video_fpath, vid_dest_path)
                #copy csv file too
                if os.path.exists(video_fpath+'.csv'):
                    shutil.copyfile(video_fpath+'.csv', vid_dest_path+'.csv')
                else:
                    print(video_fpath+'.csv')
    
    train.extend(train_vids)
    val.extend(val_vids)
    test.extend(test_vids)
    
    for vid_fnames, split in zip([train_vids, val_vids, test_vids], ['train', 'val', 'test']):
        for fname in vid_fnames:
            split_filename_map[fname] = split


tot = len(train)+len(val)+len(test)
print(f'total number of samples: {tot}')
print(f'number of train samples: {len(train)}')
print(f'number of val samples: {len(val)}')
print(f'number of test samples: {len(test)}')
print(f'ratio of train: {len(train)/tot}')
print(f'ratio of val: {len(val)/tot}')
print(f'ratio of test: {len(test)/tot}')

total number of samples: 489
number of train samples: 391
number of val samples: 50
number of test samples: 48
ratio of train: 0.7995910020449898
ratio of val: 0.10224948875255624
ratio of test: 0.09815950920245399


In [5]:
clip_paths_by_split_map = collections.defaultdict(list)
action_dirs = glob.glob(os.path.join(FRAMES_DIR, '*'))

no_action_samples = []
action_samples = []

for folder in action_dirs:
    
#     print(folder) 
    clips = glob.glob(os.path.join(folder, '*'))
    
    for clip_path in clips:
        
        #find videoname of clip:
        parts = os.path.basename(clip_path).split('_')
        if 'no_action' in clip_path:
            videofilename = '_'.join(parts[4:])
        else:
            videofilename = '_'.join(parts[2:])
        videofilename+='.MP4'

        
        
        class_name = clip_path.split('\\')[-2]
            
        #change invalid classes to no_action
        if class_name not in categories:
            class_name = 'no_action'

        class_idx = dict_categories[class_name]

        num_frames = len(glob.glob(os.path.join(clip_path, '*')))
        if class_name == 'no_action':
            no_action_samples.append((videofilename, clip_path, num_frames, class_idx))
        else: 
            action_samples.append((videofilename, clip_path, num_frames, class_idx))
            
print(f'number of positive (action) samples: {len(action_samples)}')
print(f'number of negative (no_action) samples: {len(no_action_samples)}')
sample_size = min(len(action_samples), len(no_action_samples))
action_samples = [action_samples[i] for i in np.random.choice(len(action_samples), size=sample_size, replace=False)]
no_action_samples = [no_action_samples[i] for i in np.random.choice(len(no_action_samples), size=sample_size, replace=False)]
print(f'number of positive (action) samples after resampling: {len(action_samples)}')
print(f'number of negative (no_action) samples after resampling: {len(no_action_samples)}')


#write to files
train_file = open(os.path.join(dataset_split_path, 'train_videofolder.txt'), 'w+')
val_file = open(os.path.join(dataset_split_path, 'val_videofolder.txt'), 'w+')
test_file = open(os.path.join(dataset_split_path, 'test_videofolder.txt'), 'w+')
split_file_dict = {
    'train':train_file,
    'val':val_file,
    'test':test_file
}
for videofilename, clip_path, num_frames, class_idx in action_samples+no_action_samples:
#         find train/val/test of sample:
    split = split_filename_map[videofilename]
#     print(split)
#     clip_paths_by_split_map[split].append()
    file = split_file_dict[split]

    file.write(f'{clip_path},{num_frames},{class_idx}\n')
        
train_file.close()
val_file.close()
test_file.close() 

number of positive (action) samples: 163
number of negative (no_action) samples: 154
number of positive (action) samples after resampling: 154
number of negative (no_action) samples after resampling: 154


In [6]:
train_file = open(os.path.join(dataset_split_path, 'train_videofolder.txt'), 'r')
val_file = open(os.path.join(dataset_split_path, 'val_videofolder.txt'), 'r')
test_file = open(os.path.join(dataset_split_path, 'test_videofolder.txt'), 'r')

num_vids = len(glob.glob(os.path.join(FRAMES_DIR, '*', '*')))
with open(os.path.join(dataset_split_path, 'stats.csv'), 'w') as f:
    print(f'total number of clips: {num_vids}')
    f.write(f'total number of clips: {num_vids}\n\n')
    
    for file in [train_file, val_file, test_file]:
        f.write(f'{file.name}\n')
        file.seek(0)
#         print(file.name)
        class_counter = collections.defaultdict(list)
        frames_counter = collections.defaultdict(list)
        
        #do counting
        for line in file.readlines():
            _, num_frames, label = line.strip().split(',')
            label = categories[int(label)]
            class_counter[label].append(0)
            frames_counter[label].append(int(num_frames))
            
        all_lengths = []
        num_clips = 0
        
        #summary
        f.write(f'class,num_samples,avg_len\n')
        for (k, v), (key, lengths) in zip(class_counter.items(), frames_counter.items()):
            #get mean of lengths
            all_lengths.extend(lengths)
            avg_len = np.mean(lengths)
            if k!=key:
                print("unmatched keys detected! this shouldn't happen")
                print(f'k: {k}, key: {key}')
            print(f'class: {k}, number of samples: {len(v)}, average length: {avg_len}')
            num_clips+=len(v)
            f.write(f'{k},{len(v)},{avg_len}\n')
        print(f'total number of clips:,{num_clips},average length of clips:,{np.mean(all_lengths)}')
        f.write(f'total number of clips:,{num_clips},average length of clips:,{np.mean(all_lengths)}\n\n')

train_file.close()
val_file.close()
test_file.close()  

total number of clips: 317
class: remove_pc_casing, number of samples: 4, average length: 88.5
class: unplug_power_cable, number of samples: 3, average length: 49.0
class: connect_hdd_power_cable, number of samples: 4, average length: 92.75
class: turn_on_pc, number of samples: 12, average length: 74.0
class: connect_display_cable, number of samples: 4, average length: 119.0
class: disconnect_hdd_data_cable, number of samples: 6, average length: 72.66666666666667
class: switch_off_power_source, number of samples: 4, average length: 117.25
class: connect_atx_cable, number of samples: 4, average length: 133.25
class: insert_hdd, number of samples: 4, average length: 143.75
class: disconnect_atx_cable, number of samples: 4, average length: 112.75
class: remove_odd, number of samples: 4, average length: 99.5
class: switch_off_power, number of samples: 5, average length: 39.0
class: put_back_pc_casing, number of samples: 5, average length: 189.4
class: login_screen, number of samples: 10, a

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [7]:
vid_list_lists = [train, val, test]
filename_list = ['train.csv', 'val.csv', 'test.csv']
for vid_list, filename in zip(vid_list_lists, filename_list):
    with open(os.path.join(dataset_split_path, filename), 'w+') as file:
        for vidname in vid_list:
            file.write(f'{vidname}\n')

