# notes
changed to output method to use fps* clip_duration instead of checking the number of frames extracted from that clip, because 
for certain videos, extracting 2s worth of 30fps videos will result in more than 60 frames.

# Imports


In [1]:
from tools.vid2img import vid2jpg, convert_folder
import os
import glob
from pathlib import Path
import cv2
import numpy as np
import pandas as pd

from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from matplotlib import pyplot as plt
from matplotlib.colors import Normalize
import time

from ops.dataset import TSNDataSet
from ops import dataset_config
from ops.models import TSN
from env_vars import VIDEOS_DIR, PREPROCESSED_DATA_ROOT, RAW_DATA_ROOT

import torch.nn.parallel
import torch.optim
from ops.transforms import *
from torch.nn import functional as F #for softmax 

import pickle

# Functions

In [2]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
         correct_k = correct[:k].contiguous().view(-1).float().sum(0)
         res.append(correct_k.mul_(100.0 / batch_size))
    return res


def parse_shift_option_from_log_name(log_name):
    if 'shift' in log_name:
        strings = log_name.split('_')
        for i, s in enumerate(strings):
            if 'shift' in s:
                break
        return True, int(strings[i].replace('shift', '')), strings[i + 1]
    else:
        return False, None, None
    

def eval_video(video_data, net, this_test_segments, modality):
    net.eval()
    with torch.no_grad():
        i, data, label = video_data
        batch_size = label.numel()
        num_crop = test_crops
        if dense_sample:
            num_crop *= 10  # 10 clips for testing when using dense sample

        if twice_sample:
            num_crop *= 2

        if modality == 'RGB':
            length = 3
        elif modality == 'Flow':
            length = 10
        elif modality == 'RGBDiff':
            length = 18
        else:
            raise ValueError("Unknown modality "+ modality)

        data_in = data.view(-1, length, data.size(2), data.size(3))
        if is_shift:
            data_in = data_in.view(batch_size * num_crop, this_test_segments, length, data_in.size(2), data_in.size(3))
        rst = net(data_in)
        rst = rst.reshape(batch_size, num_crop, -1).mean(1)

        if softmax:
            # take the softmax to normalize the output to probability
            rst = F.softmax(rst, dim=1)

        rst = rst.data.cpu().numpy().copy()

        if net.module.is_shift:
            rst = rst.reshape(batch_size, num_class)
        else:
            rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))

        return i, rst, label

## DATASET VERSIONS, WEIGHTS TO LOAD, SOURCE_FOLDERS, DEST_FOLDERS

In [3]:
#dataset_version = '20210702_rev_all_actions_new_data_ek_ITE_batch1'
dataset_version = '20210709_rev_all_actions_new_data_ek_ITE_batch2'

# all actions
this_weights = f'pretrained/fully_trained/01_all_actions/'\
               f'20210710_TSM_ite_RGB_resnet50_shift8_blockres_avg_segment8_e150_{dataset_version}_dense/'\
                'checkpoint/ckpt.best.pth.tar'

#20210703_TSM_ite_RGB_resnet50_shift8_blockres_avg_segment8_e150_20210702_rev_all_actions_new_data_ek_ITE_batch1_dense
#20210710_TSM_ite_RGB_resnet50_shift8_blockres_avg_segment8_e150_20210709_rev_all_actions_new_data_ek_ITE_batch2_dense

# FOLDERS========================================================================================
# video locations for dividing video into clips for getting the predictions

source_dir = os.path.join(RAW_DATA_ROOT, 'ite_dataset', 'videos') #'demo_videos'
test_csv_filepath = os.path.join(PREPROCESSED_DATA_ROOT, 'ite_dataset', dataset_version, 'test.csv')  # test.csv
video_paths = []

with open(test_csv_filepath, 'r') as test_csv_file:
    for line in test_csv_file.readlines():
        video_filename = line.strip()
        result_paths = glob.glob(os.path.join(source_dir, '**', f'*{video_filename}'), recursive =True)
        if len(result_paths) == 1:
            video_paths.append(result_paths[0])
        else:
            print(f'Error! found 0 or more than 1 video file for test video {video_filename}')

cwd = os.getcwd()

# Fixed param, where the ite_dataset is at
root_data_path = PREPROCESSED_DATA_ROOT

# folder to store the prediction .csv and video files
dt = time.strftime('%Y%m%d%H%M', time.localtime())
output_path = os.path.join(cwd, f'{dt}_{dataset_version}_nseg8_ITE_VIDEO_TEST_results') #- used for just dense, full_res = False, test_crops=1
#output_path = os.path.join(cwd, '202107042048_20210702_rev_all_actions_new_data_ek_ITE_batch1_nseg8_ITE_VIDEO_TEST_results')

Path(output_path).mkdir(exist_ok=True)

# temp directory location for storing the split video from source_dir into 2s clips
tmp_dir = os.path.join(PREPROCESSED_DATA_ROOT, 'tmp')
Path(tmp_dir).mkdir(exist_ok=True)


In [4]:
video_paths

['F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210628\\ALL\\Student2.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210629\\ghost\\edric_290621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210630\\ordeo_239\\irfanakid1_300621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210629\\ghost\\jason_290621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210629\\ghost\\mingzhi2_290621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210630\\ghost\\chanmunhong2_300621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210629\\ordeo_237\\decanay_290621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210630\\ghost\\daniel3_300621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210630\\ordeo_239\\lohshengyi1_300621.MP4',
 'F:\\DS_dev\\data_raw\\ite_dataset\\videos\\210630\\ghost\\yihong2_300621.MP4']

In [5]:
root_data_path

'C:\\DS_data'

## MODEL PARAMETERS
- Inferred by this_weights
- Set by User

In [6]:
#============================================================================================================#
# Fixed parameters based on the model trained
num_segments = 8

#============================================================================================================#
# Currently tested parameters that can be changed
# FOR A MODEL THAT IS TRAINED WITH DENSE_SAMPLE, MUST SET EITHER TO TRUE
dense_sample = True  # True 
twice_sample = False

CLIP_DURATION = 2 #duration of clip to send to model for prediction

#============================================================================================================#
# Parameters that were fixed throughout different models prediction (perhaps could be altered for performance)
# Are these training parameters as well?????
test_crops = 1  #1
full_res = False #False
this_test_segments = 8
#========================# Data Loading, Etc Parameters (changes based on computer, etc)==========================#
batch_size = 2
num_workers = 0

#============================================================================================================#
# Fixed parameters (either from parsing from this_weights, or not really changed)
is_shift, shift_div, shift_place = parse_shift_option_from_log_name(this_weights)
softmax = True
SOFTMAX_THRESH = 0.8

#============================================================================================================#
# Check for device
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

# Load model

In [7]:
this_arch = this_weights.split('TSM_')[1].split('_')[2]
if 'RGB' in this_weights:
        modality = 'RGB'
else:
    modality = 'Flow'
    
num_class, train_list, val_list, root_path, prefix = dataset_config.return_dataset(root_data_path, 'ite',
                                                                                            modality, version = dataset_version)

net = TSN(num_class, this_test_segments if is_shift else 1, modality,
              base_model=this_arch,
              consensus_type='avg',
              img_feature_dim=256,
              pretrain='imagenet',
              is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
              non_local='_nl' in this_weights,
              )

if 'tpool' in this_weights:
    from ops.temporal_shift import make_temporal_pool
    make_temporal_pool(net.base_model, this_test_segments)  # since DataParallel

checkpoint = torch.load(this_weights, map_location=torch.device(dev))
checkpoint = checkpoint['state_dict']

# base_dict = {('base_model.' + k).replace('base_model.fc', 'new_fc'): v for k, v in list(checkpoint.items())}
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
                'base_model.classifier.bias': 'new_fc.bias',
                }
for k, v in replace_dict.items():
    if k in base_dict:
        base_dict[v] = base_dict.pop(k)

net.load_state_dict(base_dict)

input_size = net.scale_size if full_res else net.input_size
if test_crops == 1:
    cropping = torchvision.transforms.Compose([
        GroupScale(net.scale_size),
        GroupCenterCrop(input_size),
    ])
elif test_crops == 3:  # do not flip, so only 5 crops
    cropping = torchvision.transforms.Compose([
        GroupFullResSample(input_size, net.scale_size, flip=False)
    ])
elif test_crops == 5:  # do not flip, so only 5 crops
    cropping = torchvision.transforms.Compose([
        GroupOverSample(input_size, net.scale_size, flip=False)
    ])
elif test_crops == 10:
    cropping = torchvision.transforms.Compose([
        GroupOverSample(input_size, net.scale_size)
    ])
else:
    raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crops))

ite: 33 classes

    Initializing TSN with base model: resnet50.
    TSN Configurations:
        input_modality:     RGB
        num_segments:       8
        new_length:         1
        consensus_module:   avg
        dropout_ratio:      0.8
        img_feature_dim:    256
            
=> base model: resnet50
Adding temporal shift...
=> n_segment per stage: [8, 8, 8, 8]
=> Processing stage with 3 blocks residual
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Processing stage with 4 blocks residual
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Processing stage with 6 blocks residual
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8
=> Processing stage with 3 blocks residual
=> Using fold div: 8
=> Using fold div: 8
=> Using fold div: 8




In [8]:
tmp_dir

'C:\\DS_data\\tmp'

# Preprocess data and create data loaders

In [9]:
def prepare_video(video_path, preprocess = True, dataloaders = True, del_clips = True):
    filename = os.path.basename(video_path)
    
    #get video properties
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)      # OpenCV2 version 2 used "CV_CAP_PROP_FPS"
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))   # float `width`
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) ) # float `height`
    cap.release()
    
    #convert video to clips
    dest_dir = os.path.join(tmp_dir, filename)
    print(dest_dir)
    print(f'fps: {fps}, frame count: {frame_count}, clip duration: {CLIP_DURATION}, filename: {filename}')
    
    if preprocess:
        for i in range(0, int(frame_count/fps), int(CLIP_DURATION)):
            Path(dest_dir).mkdir(exist_ok=True)
            target_filepath = os.path.join(dest_dir, f'{i:04}_{filename}')
            
            # convert clips to images
            ffmpeg_extract_subclip(video_path, i, i + int(CLIP_DURATION), targetname=target_filepath)

        #convert clips to images
        convert_folder(dest_dir, dest_dir)
    
    #generate video file list
    video_folders_file = 'videofolder.txt'
    video_folders_filepath = os.path.join(dest_dir, video_folders_file)
    with open(video_folders_filepath, 'w+') as file:
        for folder in glob.glob(os.path.join(dest_dir, '*')):
            if not os.path.isdir(folder):
                continue
            num_images = len(glob.glob(os.path.join(folder, '*')))
            file.write(f'{folder},{num_images},{-1}\n')
            
    if del_clips:
        for p in glob.glob(os.path.join(dest_dir, '*.MP4')):
            os.remove(p)
            
    #prepare data loaders
    if dataloaders:
        print(f'net.input_mean: {net.input_mean}, net.input_std: {net.input_std}')
        data_loader = torch.utils.data.DataLoader(
                    TSNDataSet(root_path, video_folders_filepath, num_segments=num_segments,
                               new_length=1 if modality == "RGB" else 5,
                               modality=modality,
                               image_tmpl=prefix,
                               test_mode=True,
                               random_shift = False, #use consistent spacing between segments (frames)
                               transform=torchvision.transforms.Compose([
                                   cropping,
                                   Stack(roll=(this_arch in ['BNInception', 'InceptionV3'])),
                                   ToTorchFormatTensor(div=(this_arch not in ['BNInception', 'InceptionV3'])),
                                   GroupNormalize([0.485, 0.456, 0.406], [0.485, 0.456, 0.406]),
                               ]), dense_sample=dense_sample, twice_sample=twice_sample),
                    batch_size=batch_size, shuffle=False,
                    num_workers=num_workers, pin_memory=True,
            )
    else:
        data_loader = None
        
    return {'data_loader': data_loader, 'videofolder':video_folders_filepath, 'fps':fps, 'frame_count':frame_count, 'width': width, 'height':height}
    


In [10]:
video_dict = dict()
for video_path in video_paths:
    video_dict[video_path] = prepare_video(video_path, preprocess = False, dataloaders = True, del_clips=False) 
    #video_dict[video_path] = prepare_video(video_path, preprocess = False, dataloaders = True, del_clips=False) 


C:\DS_data\tmp\Student2.MP4
fps: 30.0, frame count: 45915, clip duration: 2, filename: Student2.MP4
net.input_mean: [0.485, 0.456, 0.406], net.input_std: [0.485, 0.456, 0.406]
=> Using dense sample for the dataset...
video number:765
C:\DS_data\tmp\edric_290621.MP4
fps: 29.97002997002997, frame count: 35488, clip duration: 2, filename: edric_290621.MP4
net.input_mean: [0.485, 0.456, 0.406], net.input_std: [0.485, 0.456, 0.406]
=> Using dense sample for the dataset...
video number:592
C:\DS_data\tmp\irfanakid1_300621.MP4
fps: 30.0, frame count: 25290, clip duration: 2, filename: irfanakid1_300621.MP4
net.input_mean: [0.485, 0.456, 0.406], net.input_std: [0.485, 0.456, 0.406]
=> Using dense sample for the dataset...
video number:422
C:\DS_data\tmp\jason_290621.MP4
fps: 29.97002997002997, frame count: 25006, clip duration: 2, filename: jason_290621.MP4
net.input_mean: [0.485, 0.456, 0.406], net.input_std: [0.485, 0.456, 0.406]
=> Using dense sample for the dataset...
video number:417
C:\D

# Predict

In [11]:
net = torch.nn.DataParallel(net.to(dev))
net.eval()
for video_path in video_paths:
    data_loader = video_dict[video_path]['data_loader']
    data_gen = enumerate(data_loader)
    this_rst_list = []
    label_list = []
    for i, (data, label) in data_gen:
        rst = eval_video((i, data, label), net, this_test_segments, modality)

        for l, r in zip(label, rst[1]): #unpack batch to individual samples
            #save to lists
            this_rst_list.append(r) 
            label_list.append(l) 
            
    video_dict[video_path]['preds']= this_rst_list
    video_dict[video_path]['labels']= label_list


In [12]:
# Save using pickle.....have to stop this to do object placement
import pickle
Path(output_path).mkdir(exist_ok=True)
file_to_write = open( os.path.join(output_path, 'prediction_results'), "wb")
pickle.dump(video_dict, file_to_write)

file_to_write.close()

### Loading video_dict and all its predictions so as to not go through predictions

##### variables required?
cwd = 
output_path =  os.path.join(cwd, f'{dt}_{dataset_version}_nseg{num_segments}_ITE_VIDEO_results')

##### please load categories

import pickle

file_to_read = open(os.path.join(output_path, 'prediction_results'), "rb")

video_dict = pickle.load(file_to_read)

file_to_read.close()

# Load section time info

load scoring table, then get section time info from there

In [13]:
from collections import defaultdict
sequence_dict, marks_dict, section_times_dict = {}, {}, {}
section_dict = defaultdict(list)
sect_dict = defaultdict(list)
with open('scoring_table_danny.csv', 'r') as csv_file:
    lines = csv_file.readlines()
    for line in lines[1:]:
#         print(line)
        section, action, sequence, class_type, time_span, mark = line.split(',')
        if class_type == 'action':
            action_section = f'{action}_{section}'
            sequence_dict[action_section] = int(sequence)
            marks_dict[action_section] = int(mark)
            sect_dict[action].append(section)
            start, end = time_span.split('-')
            section_times_dict[section] = {'start':float(start), 'end':float(end)}
            if section not in section_dict[action]:
                section_dict[action].append(section)

In [14]:
section_times_dict

{'A': {'start': 0.0, 'end': 4.0},
 'B': {'start': 0.0, 'end': 10.0},
 'C': {'start': 4.0, 'end': 20.0},
 'E': {'start': 8.0, 'end': 35.0},
 'F': {'start': 15.0, 'end': 40.0}}

In [15]:
section_timespans = []
for k, v in section_times_dict.items():
    section_timespans.append(((v['start'], v['end']), k))
for video_path in video_paths:
    video_dict[video_path]['sect_timespans'] = section_timespans

In [16]:
#read label map:
with open(os.path.join(root_data_path, 'ite_dataset', dataset_version, 'actions_label_map.txt'), 'r') as file:
    categories = file.readlines()
    categories = [c.strip().replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '') for c in categories]

In [17]:
categories

['connect_alligator_clip',
 'connect_atx_cable',
 'connect_display_cable',
 'connect_hdd_data_cable',
 'connect_hdd_power_cable',
 'connect_odd_data_cable',
 'connect_odd_power_cable',
 'disconnect_atx_cable',
 'disconnect_display_cable',
 'disconnect_hdd_data_cable',
 'disconnect_hdd_power_cable',
 'disconnect_odd_data_cable',
 'disconnect_odd_power_cable',
 'enter_bios_setup_mode',
 'insert_hdd',
 'insert_odd',
 'insert_ram',
 'insert_vga',
 'login_screen',
 'no_action',
 'place_anti_static_mat',
 'put_back_pc_casing',
 'remove_hdd',
 'remove_odd',
 'remove_pc_casing',
 'remove_ram',
 'remove_vga',
 'switch_off_power',
 'switch_off_power_source',
 'turn_on_pc',
 'unplug_power_cable',
 'verify_boot_sequence',
 'wear_wrist_wrap']

# Postprocessing functions

In [18]:
def get_actions_from_sections(inp_sections):
    """
    Gets list of actions for list of sections
    input: list of sections. E.g: ['A', 'B']
    returns: list of actions
    """
    ret = list()
    for action, sections in sect_dict.items():
        if type(sections)!=list:
            sections=[sections]
        #check if the current action's sections are in queried list of sections:
        #(https://www.geeksforgeeks.org/python-check-two-lists-least-one-element-common/)
        if (set(sections)&set(inp_sections)):
            ret.append(action)
    return ret

def get_sections_from_time(minute, section_timespans, offset=0):
    """
    Gets list of sections that pertain to certain time
    minute: current time of prediction
    offset: offset to add to minute
    """
    minute += offset
    sections = list()
    for timespan, section in section_timespans:
        if timespan[0] <= minute < timespan[1]:
            sections.append(section)
    return sections

def get_actions_from_time(minute, offset=0):
    """
    gets list of actions that should be allowed to be predicted by the model for current time (minute)
    """
    sections = get_sections_from_time(minute, offset)
    actions = get_actions_from_sections(sections)
    return actions

def output_mask(actions):
    """
    Gets mask to ignore actions
    actions: list of actions to allow
    """   
    return [1 if action in actions or action == 'no_action' else 0 for action in categories ]

def get_video_properties(video_path):
    #get video properties
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)      # OpenCV2 version 2 used "CV_CAP_PROP_FPS"
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))   # float `width`
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) ) # float `height`
    cap.release()
    return fps, frame_count, width, height

def get_preds_from_csv(csv_path):
    preds = list()
    with open(csv_path) as file:
        for line in file.readlines()[1:]:
            pred = line.strip().split(',')[1:]
#             pred = [float(i) for i in str_pred.split(',')]
            preds.append(pred)
    return preds

def process_preds(preds, softmax_thresh=False, suppress_to = None):
    pred_idxs = []
    for pred in preds:
        if softmax_thresh:
            if max(pred)<softmax_thresh:
                pred_idxs.append(suppress_to)
            else:
                pred_idxs.append(np.argmax(pred))
    return pred_idxs

def get_clips_paths(video_folders_filepath):
    #get video clip paths from videofolder.txt
    clip_paths = list()
    with open(video_folders_filepath, 'r') as file:
        for line in file.readlines():
            clip_path = line.split(',')[0]+'.MP4'
            clip_paths.append(clip_path)
    return clip_paths

def get_clips_lengths(video_folders_filepath):
    #get video clip paths from videofolder.txt
    clip_lengths = list()
    with open(video_folders_filepath, 'r') as file:
        for line in file.readlines():
            clip_num_frames = int(line.split(',')[1])
            clip_lengths.append(clip_num_frames)
    return clip_lengths

def save_preds_to_csv(video_path):
    file_name = os.path.basename(video_path)
    video_folders_filepath = video_dict[video_path]['videofolder']
#     print(video_folders_filepath)
    preds = video_dict[video_path]['preds']
    labels = video_dict[video_path]['labels']
        
    #get video clip paths from videofolder.txt
    clip_lengths = get_clips_lengths(video_folders_filepath)
    print('number of preds: ', len(preds), 'number of clips: ',len(clip_lengths))
    assert len(preds)==len(clip_lengths)
    
    # write results to csv file
    csv_filepath = os.path.join(output_path, 'pred_'+file_name+'.csv')
    csv_file = open(csv_filepath, 'w+')
    cats_string = ','.join(categories)
    csv_file.write(f'frame_pos,{cats_string}\n')
    i = 1
    for raw_pred, label, clip_length in zip(preds, labels, clip_lengths):
        #used to open clip to find number of frames
        # now will just use value from videofolder.txt file to find that number
        #get video properties
        fps, frame_count, width, height = get_video_properties(video_path)
        if clip_length > fps*CLIP_DURATION:
            clip_length = fps*CLIP_DURATION #to avoid this problem: https://video.stackexchange.com/questions/23373/ffmpeg-not-creating-exact-duration-clip
            
        for j in range(int(clip_length)):
            pred_str = ','.join([str(i) for i in raw_pred])
            csv_file.write(f'{i},{pred_str}\n')
            i+= 1
    csv_file.close()

# Write predictions to CSV

In [19]:
for video_path in video_paths:
    save_preds_to_csv(video_path)
    

number of preds:  765 number of clips:  765
number of preds:  592 number of clips:  592
number of preds:  422 number of clips:  422
number of preds:  417 number of clips:  417
number of preds:  475 number of clips:  475
number of preds:  1086 number of clips:  1086
number of preds:  661 number of clips:  661
number of preds:  647 number of clips:  647
number of preds:  866 number of clips:  866
number of preds:  497 number of clips:  497


# Process predictions 
Split by section using time, then suppress


In [20]:
CONF_THRES = 0.4 # previously 0.5 lead to worse scores than no confidence threshold
USE_CONF_THRES = False

USE_SECTION_SUPPRESION = True

In [21]:
for video_path in video_paths:
    #read predictions from csv:
    file_name = os.path.basename(video_path)
    csv_filepath = os.path.join(output_path, 'pred_'+ file_name +'.csv')
    preds = get_preds_from_csv(csv_filepath)    

    section_timespans = video_dict[video_path]['sect_timespans']
    print(section_timespans)
    
    #get video properties
    fps, frame_count, width, height = get_video_properties(video_path)
    
    #process predictions
    processed_preds = list()
    predicted_actions = list()
    pred_act_sect = list()
    per_frame_sections = list()
    processed_preds_unmasked = list()
    
    for idx, pred in enumerate(preds):
        pred = [float(p) for p in pred]
        #current minute into the video
        current_minute = idx/fps/60
        
        #get allowed sections and actions for current time
        sections = get_sections_from_time(current_minute, section_timespans)
        actions = get_actions_from_sections(sections)
        
        #get output mask
        pred_mask = output_mask(actions)
        
        if USE_SECTION_SUPPRESION:
            #mask out invalid predictions (out of section, etc)
            masked_preds = np.array(pred)*np.array(pred_mask)
        else:
            # use whatever raw prediction that comes out
            masked_preds = np.array(pred)
        
      
        if USE_CONF_THRES:
            # for actions (suppressed or not), check whether it meets a confidence criteria --- ADDED 22/6/2021
            if np.max(masked_preds) > CONF_THRES:
                predicted_action = categories[np.argmax(masked_preds)]      
            else:
                predicted_action = 'no_action'
        else:
            #get predicted action --- ORIGINAL, 22/6/2021
            predicted_action = categories[np.argmax(masked_preds)]
            
        predicted_action_no_mask = categories[np.argmax(pred)]
            
            
                
        #find the section that the action belongs to
        #some actions may belong to multiple sections
        #so we find the overlap from get_sections_from_time and the sect_dict
        sections_for_this_action = sect_dict[predicted_action]
        #put into list if it isn't
        if type(sections_for_this_action)!=list:
            sections_for_this_action=[sections_for_this_action]
        current_section = set(sections_for_this_action)&set(sections)
        current_section = ''.join(list(current_section))
        
#         if idx%600==0:
#             print(current_section)
#             print(f'{predicted_action}_{current_section}')

        per_frame_sections.append(current_section)
        processed_preds.append(masked_preds)
        predicted_actions.append(predicted_action)
        pred_act_sect.append(f'{predicted_action}_{current_section}')
        processed_preds_unmasked.append(predicted_action_no_mask)
        
    video_dict[video_path]['processed_preds'] = processed_preds
    video_dict[video_path]['predicted_actions'] = predicted_actions
    video_dict[video_path]['pred_act_sect'] = pred_act_sect
    video_dict[video_path]['per_frame_sections'] = per_frame_sections
    video_dict[video_path]['processed_preds_unmasked'] = processed_preds_unmasked
    

[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.0), 'F')]
[((0.0, 4.0), 'A'), ((0.0, 10.0), 'B'), ((4.0, 20.0), 'C'), ((8.0, 35.0), 'E'), ((15.0, 40.

# Write processed predictions to CSV

In [22]:
for video_path in video_paths:
    #read predictions from csv:
    file_name = os.path.basename(video_path)
    csv_filepath = os.path.join(output_path, 'processed_pred_'+file_name+'.csv')
    act_sect_preds = video_dict[video_path]['pred_act_sect']
    print(len(act_sect_preds))
    with open(csv_filepath, 'w+') as file:
        file.write(f'frame_pos,pred\n')
        for idx, pred in enumerate(act_sect_preds):
            file.write(f'{idx+1},{pred}\n')

45900
34928
25290
24603
28025
64072
39630
38173
51960
29323


# Write processed unmasked predictions to CSV
(without timespan filters)

In [23]:
for video_path in video_paths:
    #read predictions from csv:
    file_name = os.path.basename(video_path)
    csv_filepath = os.path.join(output_path, 'processed_pred_unmasked_'+file_name+'.csv')
    preds = video_dict[video_path]['processed_preds_unmasked']
    print(len(preds))
    with open(csv_filepath, 'w+') as file:
        file.write(f'frame_pos,pred\n')
        for idx, pred in enumerate(preds):
            file.write(f'{idx+1},{pred}\n')

45900
34928
25290
24603
28025
64072
39630
38173
51960
29323


# Calculate section timings

In [24]:
for video_path in video_paths:
    #read predictions from csv:
    file_name = os.path.basename(video_path)
    csv_filepath = os.path.join(output_path, 'processed_pred_'+file_name+'.csv')
    act_sect_preds = video_dict[video_path]['pred_act_sect']
#     print(act_sect_preds)
    sects = [pred[-1] if 'no_action' not in pred else 0 for pred in act_sect_preds]
    firsts_lasts_dict = {'A':{'first':None, 'last':None},
                        'B':{'first':None, 'last':None},
                       'C':{'first':None, 'last':None},
                       'E':{'first':None, 'last':None},
                       'F':{'first':None, 'last':None}}
    
    #find the first and last occurence of each section action
    for i, section in enumerate(sects):
        if section in firsts_lasts_dict.keys():
            if firsts_lasts_dict[section]['first'] is None:
                firsts_lasts_dict[section]['first'] = i
            firsts_lasts_dict[section]['last'] = i
    
    #remove sections where no action was detected:
    temp = dict()
    for k, v in firsts_lasts_dict.items():
        if v['first'] is not None:
            temp[k] = v
    firsts_lasts_dict = temp
    
    #calculate midpoint of overlaps
    for i, (k, v) in enumerate(firsts_lasts_dict.items()):
        if i < len(list(firsts_lasts_dict.keys()))-1:
            current_last = v['last'] 
#                 current_last = current_last if current_last is not None else 0
            next_sect = list(firsts_lasts_dict.keys())[i+1]
            next_first = firsts_lasts_dict[next_sect]['first']
#                 next_first = next_first if next_first is not None else 0
    #         print(current_last, next_first, (current_last+next_first)//2, (current_last+next_first)//2+1)
    #         print(k, next_sect)

#             print(firsts_lasts_dict)
            firsts_lasts_dict[k]['last'] = (current_last+next_first)//2
            firsts_lasts_dict[next_sect]['first'] = (current_last+next_first)//2+1
        fps = video_dict[video_path]['fps']
        m = v['last']/fps/60
        v['time'] = m
#     print(video_path)
#     print(firsts_lasts_dict)
    #write to csv
    
    csv_filepath = os.path.join(output_path, 'section_timings_'+ file_name +'.csv')
    with open(csv_filepath, 'w') as file:
        file.write(f'section,minute_end\n')
        for s in ['A','B','C','E','F']:
            if s in firsts_lasts_dict.keys():
                time = firsts_lasts_dict[s]['time']
                file.write(f'{s},{time}\n')
            else:
                file.write(f'{s},{-1}\n')
        
    video_dict[video_path]['section_times'] = firsts_lasts_dict
    
    

# Write predictions to video
### warning: will take long time!

In [27]:
#TODO: read preds from processed_preds csv file 
# def write_preds_to_video(video_path):
for video_path in video_paths[1:]: #video_paths[14:19] - TO SELECT A FEW
    
    file_name = os.path.basename(video_path)
    print(f'processing {file_name}')
    video_folders_filepath = video_dict[video_path]['videofolder']
    
    #get labels
#     df = get_gt_onehot_labels(video_path)
#     labels = df[df==1].stack().reset_index().drop(['level_0', 0],1).level_1
    
    #get preds
    act_sect_preds = video_dict[video_path]['pred_act_sect']
    preds = act_sect_preds#video_dict[video_path]['processed_preds']
    
    #get section labels
    per_frame_sections = video_dict[video_path]['per_frame_sections']
    
    #get video clip paths from videofolder.txt
    clip_paths = get_clips_paths(video_folders_filepath)
    print(len(preds), len(clip_paths))
#     assert len(preds)==len(clip_paths)
    
    fps, frame_count, width, height = get_video_properties(video_path)
    print(len(preds),frame_count)
    label_height = 60
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    
    output_video_filepath = os.path.join(output_path, 'nodelay_pred_'+file_name)
    out = cv2.VideoWriter(output_video_filepath ,fourcc, fps, (width, height+label_height))
    print(f'writing to: {output_video_filepath}')

    i=0
    for clip_path in clip_paths:        
        
        #open clip video
        current_clip_frame_count = 0
        cap = cv2.VideoCapture(clip_path) 
        while True:
            _, img = cap.read()  

            if not _:
#                 print("No Image")
                cap.release()
                break
            elif current_clip_frame_count >= CLIP_DURATION*fps:
                cap.release()
                break
            elif i >= len(preds):
                print(f'pred limit reached. i={i}, clip_path = {clip_path}')
                cap.release()
                break
            pred = preds[i].split('_')[:-1]
            pred = ' '.join(pred)
            current_section = per_frame_sections[i]
            
#             print(current_section)
#             if i%100==0:
#                 print(pred)

            label_area = np.zeros([label_height, width, 3]).astype('uint8') + 255
    
#             print(pred)

            cv2.putText(label_area, 
                        f'Prediction: {pred}.  ',# ({max(pred):.6f}) Label: {labels[i]}',
                        (5, int(label_height-20)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.7, 
                        (0, 0, 0), 
                        2)
            img = np.concatenate((img, label_area), axis=0)
            
            out.write(img)
            i+= 1
            current_clip_frame_count += 1
    out.release()
        

processing edric_290621.MP4
34928 592
34928 35488
writing to: F:\DS_dev\temporal-shift-module\202107042048_20210702_rev_all_actions_new_data_ek_ITE_batch1_nseg8_ITE_VIDEO_TEST_results\nodelay_pred_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1164_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1166_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1168_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1170_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1172_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1174_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1176_edric_290621.MP4
pred limit reached. i=34928, clip_path = C:\DS_data\tmp\edric_290621.MP4\1178_edric_290621.MP4
pred limit reached. i=3492

# Score predictions


In [38]:
#read actions_label_map.txt
#read label map:
#     with open(os.path.join(PREPROCESSED_DATA_ROOT, 'ite_dataset', 'actions_label_map.txt'), 'r') as file:
#         categories = file.readlines()
#         categories = [c.strip().replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '') for c in categories]
#or since categories has been read earlier in the notebook, don't need to read it here.

iou_means = list()
iou_per_class_all_videos_df = pd.DataFrame(columns=categories)

for video_path in video_paths:
    file_name = os.path.basename(video_path)
    print(file_name)
    
    # read GT and convert to per-frame labels
    gt_csv = video_path +'.csv'
    gt_labels = pd.read_csv(gt_csv)
    #remove no_action rows
    gt_labels = gt_labels[gt_labels['action']!='no_action']
    gt_labels.sort_values('z_start', inplace=True)
    #get frame count and fps of video
    frame_count = video_dict[video_path]['frame_count']
    fps = video_dict[video_path]['fps']
#     base_data = np.zeros((frame_count, len(categories)))
    gt_labels_onehot = pd.DataFrame(0, index=np.arange(frame_count), columns=categories)
    gt_labels_onehot.no_action = 1
    for index, row in gt_labels.iterrows():
        action, z_start, z_end = row
        
        #ignore action if it isn't in actions_label_map.txt
        if action not in categories:
            continue
#         print(action, z_start, z_end)
        frame_pos_start = int(z_start*fps)
        frame_pos_end = int(z_end*fps)
#         print(frame_pos_start, frame_pos_end)
        gt_labels_onehot[action][frame_pos_start:frame_pos_end]=1
        gt_labels_onehot.no_action[frame_pos_start:frame_pos_end]=0
        
    #read predictions file
    output_csv_filepath = os.path.join(output_path, 'processed_pred_unmasked_'+ file_name + '.csv')
    preds_per_frame = pd.read_csv(output_csv_filepath)
    print(len(preds_per_frame), len(gt_labels_onehot))
    
#     assert len(preds_per_frame) == len(gt_labels_onehot)
    #convert to one-hot labels
    
    preds_onehot = pd.DataFrame(0, index=np.arange(frame_count), columns=categories)
    for index, row in preds_per_frame.iterrows():
        frame_pos, action = row
#         print(frame_pos, action)
        preds_onehot[action][frame_pos-1] = 1
    
    intersection_per_class = (gt_labels_onehot*preds_onehot).sum()
    union_per_class = (gt_labels_onehot|preds_onehot).sum()
#     union_per_class = gt_labels_onehot.sum()+preds_onehot.sum()
    iou_per_class = intersection_per_class/union_per_class
    mean_iou = iou_per_class.mean()
#     print(f'intersection: {intersection}, union: {union}')
    iou_means.append(mean_iou)
    iou_per_class_all_videos_df=iou_per_class_all_videos_df.append(iou_per_class, ignore_index=True)
#     break


Student2.MP4
45900 45915
edric_290621.MP4
34928 35488
irfanakid1_300621.MP4
25290 25290
jason_290621.MP4
24603 25006
mingzhi2_290621.MP4
28025 28496
chanmunhong2_300621.MP4
64072 65092
decanay_290621.MP4
39630 39630
daniel3_300621.MP4
38173 38781
lohshengyi1_300621.MP4
51960 51960
yihong2_300621.MP4
29323 29805


In [43]:
iou_per_class_all_videos_df.to_csv('iou_per_class_all_videos.csv')

In [39]:
print('iou mean per video:')
iou_per_class_all_videos_df.mean(axis=1, skipna=True)

iou mean per video:


0    0.242401
1    0.058902
2    0.272405
3    0.085856
4    0.097485
5    0.029131
6    0.253188
7    0.086856
8    0.276567
9    0.041162
dtype: float64

In [40]:
print('iou mean per class:')
iou_per_class_all_videos_df.mean(axis=0)

iou mean per class:


connect_alligator_clip        0.061950
connect_atx_cable             0.102127
connect_display_cable         0.070853
connect_hdd_data_cable        0.097434
connect_hdd_power_cable       0.138730
connect_odd_data_cable        0.087476
connect_odd_power_cable       0.140265
disconnect_atx_cable          0.051385
disconnect_display_cable      0.070887
disconnect_hdd_data_cable     0.095071
disconnect_hdd_power_cable    0.079208
disconnect_odd_data_cable     0.090311
disconnect_odd_power_cable    0.077021
enter_bios_setup_mode         0.065044
insert_hdd                    0.222426
insert_odd                    0.104843
insert_ram                    0.158235
insert_vga                    0.156324
login_screen                  0.191378
no_action                     0.692466
place_anti_static_mat         0.198856
put_back_pc_casing            0.110267
remove_hdd                    0.217084
remove_odd                    0.116442
remove_pc_casing              0.146839
remove_ram               

In [41]:
print('model score:')
iou_per_class_all_videos_df.mean(axis=1, skipna=True).mean()

model score:


0.1443953083313043

# Model output analysis

In [None]:
maxes = []
for video_path in video_paths:
#     print(video_dict[video_path])
    file_name = os.path.basename(video_path)
    preds = video_dict[video_path]['preds']
    for pred in preds:
        maxes.append(max(pred))

In [None]:
plt.hist(maxes, bins=100)
plt.show()

In [None]:
plt.boxplot(maxes)
# plt.show()