## Frame-by-Frame processing

In [None]:
import os

import cv2
import math
import numpy as np
import moviepy 
import torch

from matplotlib import pyplot as plt



from rt_alphaction.config import cfg
from rt_alphaction.modeling.detector import build_detection_model
from rt_alphaction.utils.checkpoint import ActionCheckpointer
from rt_alphaction.utils.comm import get_world_size


from my_utils.gen_utils import create_experiment_folder
from my_utils.video_processing import get_video_info, get_frame_from_video
from my_utils.slicing import get_slice_bboxes, generate_sliding_window_gif

from my_utils.video_processing import segment_crop_video
from my_utils.ava_preprocessing import ava_preprocessing_cv2, clip_constructor, prepare_collated_batches, prepare_collated_batches_v2
from my_utils.ava_postprocessing import concatenate_results
from my_utils.visualization import action_visualizer_frame_index

from my_utils.gen_utils import parse_label_file

from my_utils.ava_postprocessing import clip_boxes_tensor, map_bbox_from_prep_to_crop, map_bbox_from_crop_to_orig


from tqdm import tqdm


### 1. CONFIG
#### 1.1 Main Parameters

In [None]:
# Which action-detection model variant to run.
model_name = 'VMAEv2'


# Detection and clip-sampling settings.
person_threshold = 0.3  # confidence threshold on actor; 0.6 is the default
sampling_rate = 3  # sampling rate; 4 is the default
top_k = 5  # number of actions per person
video_path = '../input_dir/Fighting_14.mp4'
stream = False

# Sliding-window patch geometry.
slice_height = 600
slice_width = 800
overlap_ratio = 0

# Start frame index and input length; recorded in exp_dict below.
starting_frame_index = 900
length_input = 300

# One nested record that gathers every setting of this experiment.
exp_dict = dict(
    model_name=model_name,
    model_params=dict(
        person_threshold=person_threshold,
        sampling_rate=sampling_rate,
    ),
    orig_post_processing=dict(top_k=top_k),
    aggregation=dict(method={}, params={}),
    video_path=video_path,
    slicing_params=dict(
        slice_height=slice_height,
        slice_width=slice_width,
        overlap_ratio=overlap_ratio,
    ),
    video_params=dict(
        st_frame_index=starting_frame_index,
        length_input=length_input,
    ),
)

exp_dict


In [None]:
# Base name of the video file, cut at its first '.'.
video_name = os.path.basename(video_path).partition('.')[0]
# Optional per-experiment output folder (currently disabled):
#output_directory = f'../output_dir/{video_name}/{model_name}/patch_batch/' 
#output_directory = create_experiment_folder(output_directory, 'exp')
#output_directory


#### 1.2 Model Config Setup

In [None]:
def cfg_create(model_name, person_threshold, sampling_rate, test_videos_batch=1, num_workers=1):
    """Configure the module-level ``cfg`` for inference and return it.

    The global ``cfg`` object is modified in place: the model's YAML file is
    merged first, then individual keys are overridden one by one.

    Args:
        model_name: Either ``'VMAEv2'`` or ``'VMAE'``; selects the config file
            and the checkpoint path.
        person_threshold: Value stored in ``MODEL.STM.PERSON_THRESHOLD``.
        sampling_rate: Value stored in ``DATA.SAMPLING_RATE``.
        test_videos_batch: Value stored in ``TEST.VIDEOS_PER_BATCH``.
        num_workers: Value stored in ``DATALOADER.NUM_WORKERS``.

    Returns:
        The same (mutated) global ``cfg`` object.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    # model name -> (config file, checkpoint file)
    model_files = {
        'VMAEv2': ('../config_files/VMAEv2-ViTB-16x4.yaml',
                   '../checkpoints/VMAEv2_ViTB_16x4.pth'),
        'VMAE': ('../config_files/VMAE-ViTB-16x4.yaml',
                 '../checkpoints/VMAE_ViTB_16x4.pth'),
    }

    # Reject unknown models up front; previously this surfaced later as an
    # unbound-variable error on `config_file`.
    if model_name not in model_files:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {sorted(model_files)}"
        )
    config_file, weight_path = model_files[model_name]

    cfg.merge_from_file(config_file)

    # change model weight path
    cfg.merge_from_list(["MODEL.WEIGHT", weight_path])

    # change output dir
    cfg.merge_from_list(["OUTPUT_DIR", "../output_dir/"])

    # change person threshold
    cfg.merge_from_list(["MODEL.STM.PERSON_THRESHOLD", person_threshold])

    # change sampling rate
    cfg.merge_from_list(["DATA.SAMPLING_RATE", sampling_rate])

    # change path for data_dir
    cfg.merge_from_list(["DATA.PATH_TO_DATA_DIR", "/work/ava"])

    # folder name of annotations
    cfg.merge_from_list(["AVA.ANNOTATION_DIR", "annotations/"])

    # file names of frame lists
    cfg.merge_from_list(["AVA.TRAIN_LISTS", ['sample.csv']])
    cfg.merge_from_list(["AVA.TEST_LISTS", ['sample.csv']])

    # file names of predicted bounding boxes
    cfg.merge_from_list(["AVA.TRAIN_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']])
    cfg.merge_from_list(["AVA.TEST_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']])

    # file name of exclusions
    cfg.merge_from_list(["AVA.EXCLUSION_FILE", 'ava_sample_train_excluded_timestamps_v2.2.csv'])

    # number of videos per batch in the test scenario
    cfg.merge_from_list(["TEST.VIDEOS_PER_BATCH", test_videos_batch])

    # number of data-loader workers
    cfg.merge_from_list(["DATALOADER.NUM_WORKERS", num_workers])

    return cfg


    

In [None]:
# Apply the notebook settings to the config; cfg_create mutates and returns the imported `cfg`.
cfg = cfg_create(model_name, person_threshold, sampling_rate)

In [None]:
# Instantiate the detection model described by the config.
model = build_detection_model(cfg)

In [None]:
debug = True
if debug:
    # Model input sizes are expected to be divisible by this value; otherwise
    # zeros are padded (left and bottom, per the original note — TODO confirm sides).
    print("cfg.DATALOADER.SIZE_DIVISIBILITY: ", cfg.DATALOADER.SIZE_DIVISIBILITY)

    # Frame sampling rate used when constructing clips.
    self_sample_rate = cfg.DATA.SAMPLING_RATE
    print("cfg.DATA.SAMPLING_RATE: ", self_sample_rate)

    # Number of frames in one clip.
    self_video_length = cfg.DATA.NUM_FRAMES
    print("cfg.DATA.NUM_FRAMES: ", self_video_length)

    # Clip length multiplied by the sampling rate.
    self_seq_len = self_video_length * self_sample_rate
    print("self_seq_len: ", self_seq_len)

    # Number of action classes configured for the model.
    self_num_classes = cfg.MODEL.STM.ACTION_CLASSES
    print("cfg.MODEL.STM.ACTION_CLASSES: ", self_num_classes)

    # Normalisation statistics and channel-order flag.
    self_data_mean, self_data_std = cfg.DATA.MEAN, cfg.DATA.STD
    self_use_bgr = cfg.AVA.BGR
    print("Augmentation params: ", self_data_mean, self_data_std, self_use_bgr)

    # Test-time scale limits and forced-flip flag.
    self_jitter_min_scale = cfg.DATA.TEST_MIN_SCALES
    self_jitter_max_scale = cfg.DATA.TEST_MAX_SCALE
    self_test_force_flip = cfg.AVA.TEST_FORCE_FLIP

    print(
        "scale and flip params",
        self_jitter_min_scale,
        self_jitter_max_scale,
        self_test_force_flip,
    )

In [None]:
# Clip length (cfg.DATA.NUM_FRAMES) multiplied by the sampling rate.
seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
print(seq_len)

In [None]:
# Move the model to the CUDA device (requires a GPU to be available).
model.to("cuda")

### 2. Loading Weights

In [None]:
# Output directory taken from the config; passed to the checkpointer as its save_dir.
output_dir = cfg.OUTPUT_DIR
output_dir

In [None]:
# Load the checkpoint referenced by cfg.MODEL.WEIGHT through the project's checkpointer
# (presumably this restores the weights into `model` — confirm in ActionCheckpointer).
checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
checkpointer.load(cfg.MODEL.WEIGHT)

In [None]:
# Value of the WORLD_SIZE environment variable as an int; 1 when it is not set.
num_gpus = int(os.environ.get("WORLD_SIZE", 1))
num_gpus

In [None]:
# Read the MEM_ACTIVE flag from the model config (displayed for inspection).
mem_active = cfg.MODEL.STM.MEM_ACTIVE
mem_active  

In [None]:
# Device that input tensors are moved to before inference.
device = torch.device("cuda")

In [None]:
# World size as reported by the project's communication helper (displayed for inspection).
num_devices = get_world_size()
num_devices

In [None]:
# Switch the model to evaluation mode (assumes `model` is a torch.nn.Module).
model.eval()

### 3. VIDEO Info and Slicing Visualization


In [None]:
# Metadata of the input video; the 'height' and 'width' entries are read from it below.
video_info = get_video_info(video_path)

In [None]:
# Display the video metadata.
video_info

In [None]:
# Merge the video metadata into the experiment record, next to the frame-range settings.
exp_dict['video_params'].update(video_info)
exp_dict

In [None]:
# Frame size of the video as reported in video_info.
frame_height = video_info['height']
frame_width = video_info['width']

In [None]:
# Patch boxes covering the frame; each entry is later unpacked as (x1, y1, x2, y2).
# NOTE(review): the meaning of the positional `False` and of a single overlap ratio
# depends on get_slice_bboxes' signature — confirm there.
patches_coordinates = get_slice_bboxes(frame_height, frame_width, slice_height, slice_width, False, overlap_ratio)

In [None]:
# Number of patches per frame.
len(patches_coordinates)

In [None]:
# Display the patch boxes.
patches_coordinates

In [None]:
# [width, height] of every patch, derived from its (x1, y1, x2, y2) box.
patches_shapes = [[x2 - x1, y2 - y1] for x1, y1, x2, y2 in patches_coordinates]

In [None]:
# Display the [width, height] of each patch.
patches_shapes

In [None]:
def crop_frame(frame, patches_coordinates):
    """Cut ``frame`` into one sub-array per patch box.

    Args:
        frame: Array indexed as ``frame[rows, cols]``.
        patches_coordinates: Iterable of ``(x1, y1, x2, y2)`` boxes.

    Returns:
        List of slices ``frame[y1:y2, x1:x2]``, in the order of the boxes.
    """
    crops = []
    for left, top, right, bottom in patches_coordinates:
        crops.append(frame[top:bottom, left:right])
    return crops

In [None]:
# Display the sequence length computed earlier.
seq_len

In [None]:
def apply_threshold(scores, threshold):
    """Return, for each row of ``scores``, the indices whose score exceeds ``threshold``.

    Args:
        scores: Tensor whose first dimension enumerates batch entries.
        threshold: Scores strictly greater than this value are kept.

    Returns:
        A list with one entry per batch element, each a list of the
        positions along the remaining dimension that passed the threshold.
    """
    above = scores > threshold
    return [
        torch.nonzero(above[row]).squeeze(-1).tolist()
        for row in range(scores.shape[0])
    ]

In [None]:
def create_selected_detections_list(all_detections_tensor, indices_list):
    """Gather the chosen detections of every batch entry.

    Args:
        all_detections_tensor: Tensor indexed as ``[batch, detection, ...]``.
        indices_list: One list of detection indices per batch entry.

    Returns:
        A list whose ``b``-th item is ``all_detections_tensor[b, indices_list[b]]``.
    """
    return [
        all_detections_tensor[row, keep]
        for row, keep in enumerate(indices_list)
    ]


In [None]:
def batch_thresholding(inter_class_logits, inter_pred_bboxes, inter_action_logits, person_threshold):
    """Keep only the detections whose objectness score exceeds ``person_threshold``.

    Only the last element of each input sequence is used.

    Args:
        inter_class_logits: Sequence of class-logit tensors; the last one is
            indexed as ``[patch, detection, class]`` and class 0 is taken as
            the objectness score after a softmax over the class dimension.
        inter_pred_bboxes: Sequence of box tensors indexed ``[patch, detection, ...]``.
        inter_action_logits: Sequence of action-logit tensors indexed
            ``[patch, detection, ...]``; a sigmoid converts them to scores.
        person_threshold: Objectness scores strictly above this value are kept.

    Returns:
        Tuple ``(selected_obj_scores, selected_action_scores, selected_bboxes)``;
        each is a list with one tensor per patch holding the kept detections.
    """
    # torch.softmax is used rather than the functional alias `F`, which this
    # notebook only imports in a later cell.
    # Objectness score per patch and detection (original note: nr_patches x 100 — TODO confirm).
    obj_scores = torch.softmax(inter_class_logits[-1], dim=-1)[:, :, 0]

    # Action scores from the last action logits (original note: nr_patches x 100 x 80 — TODO confirm).
    action_scores = torch.sigmoid(inter_action_logits[-1])

    # Per patch: indices of the detections that pass the threshold.
    list_val_det_indices = apply_threshold(obj_scores, person_threshold)

    # Per patch: objectness scores of the kept detections.
    selected_obj_scores = create_selected_detections_list(obj_scores, list_val_det_indices)

    # Per patch: action scores of the kept detections.
    selected_action_scores = create_selected_detections_list(action_scores, list_val_det_indices)

    # Per patch: bounding boxes of the kept detections.
    selected_bboxes = create_selected_detections_list(inter_pred_bboxes[-1], list_val_det_indices)

    return selected_obj_scores, selected_action_scores, selected_bboxes

### Inference

In [None]:
import torch.nn.functional as F


cap = cv2.VideoCapture(video_path)

# Fail loudly when the video cannot be opened; otherwise the loop below would
# end immediately and leave an empty result dict without any hint of the cause.
if not cap.isOpened():
    cap.release()
    raise IOError(f"Could not open video file: {video_path}")


# Sliding buffer with one preprocessed patch tensor per frame.
frames_tensor_list = []

central_frames = [] # list of central frames
central_frames_id = []
starting_inferece = False  # not used in this cell

frame_id = -1


# Span of consecutive frames needed to sample cfg.DATA.NUM_FRAMES frames at `sampling_rate`
# (46 for sampling rate 3 when cfg.DATA.NUM_FRAMES is 16).
buffer_size = sampling_rate * (cfg.DATA.NUM_FRAMES - 1) + 1

# central frame id -> list with one result array per patch
temp_results_dict = {}

try:
    # Read frames from video
    while True:

        ret, frame = cap.read()
        if not ret:
            break

        frame_id += 1

        # add first central frame or add new frame if we have already added first central frame
        if frame_id == sampling_rate * (cfg.DATA.NUM_FRAMES // 2) or len(central_frames) != 0:
            central_frames.append(frame)
            central_frames_id.append(frame_id)

        # list of slices of current frame; one H_patch x W_patch x 3 array per patch
        frame_slices = crop_frame(frame, patches_coordinates)

        # preprocessed patches of the current frame
        # (assumed layout: channels x n_patches x H x W — TODO confirm in ava_preprocessing_cv2)
        frames_prep = ava_preprocessing_cv2(frame_slices, cfg)

        # swap the first two axes (assumed result: n_patches x channels x H x W) and buffer it
        frames_tensor_list.append(frames_prep.permute(1, 0, 2, 3))

        # run inference once the buffer spans a full clip
        if len(frames_tensor_list) == buffer_size:

            temp_results_dict[central_frames_id[0]] = []

            # stack along a new time axis, then reorder to
            # (assumed) n_patches x channels x buffer_size x H x W
            video_tensor = torch.stack(frames_tensor_list).permute(1, 2, 0, 3, 4)

            # keep every `sampling_rate`-th frame -> cfg.DATA.NUM_FRAMES frames per clip
            clip_tensor = video_tensor[:, :, 0::sampling_rate, :, :]

            # one collated batch per patch; items 0, 2, 3 and 4 are used below
            list_collated_batches = prepare_collated_batches_v2(clip_tensor, cfg)

            # clips of all patches stacked into a single batch
            slow_video = torch.stack([torch.squeeze(item[0]) for item in list_collated_batches])

            # hardcoding fast_video None for MAE-STMixer
            fast_video = None

            whwh = torch.stack([torch.squeeze(item[2]) for item in list_collated_batches])
            # boxes and labels are taken from the first patch only
            boxes = list_collated_batches[0][3]
            labels = list_collated_batches[0][4]

            slow_video = slow_video.to(device)
            if fast_video is not None:
                fast_video = fast_video.to(device)
            whwh = whwh.to(device)

            # INFERENCE
            with torch.no_grad():
                inter_class_logits, inter_pred_bboxes, inter_action_logits, B, N = model(slow_video, fast_video, whwh, boxes, labels)

            selected_obj_scores, selected_action_scores, selected_bboxes = batch_thresholding(inter_class_logits, inter_pred_bboxes, inter_action_logits, person_threshold)

            top_values = [[] for _ in range(B)]
            top_indices = [[] for _ in range(B)]

            output_objectness_np = [[] for _ in range(B)]
            output_bbox_frame_np = [[] for _ in range(B)]

            top_indices_np = [[] for _ in range(B)]
            top_values_np = [[] for _ in range(B)]

            for i in range(B):
                # width and height of the preprocessed patch, read from whwh
                w = whwh[i, 0].int()
                h = whwh[i, 1].int()
                # clip boxes to the preprocessed patch, then map them back:
                # preprocessed patch -> cropped patch -> original frame
                selected_bboxes[i] = clip_boxes_tensor(selected_bboxes[i],
                                                       height=h,
                                                       width=w)
                # patches_shapes holds [width, height]; passed here as (height, width)
                selected_bboxes[i] = map_bbox_from_prep_to_crop(selected_bboxes[i],
                                                                (patches_shapes[i][1], patches_shapes[i][0]),
                                                                (h, w))
                selected_bboxes[i] = map_bbox_from_crop_to_orig(selected_bboxes[i], patches_coordinates[i][:2])

                # top_k highest action scores per kept detection
                top_values[i], top_indices[i] = torch.topk(selected_action_scores[i], k=top_k, dim=1)

                output_objectness_np[i] = np.reshape(selected_obj_scores[i].cpu().numpy(), (-1, 1))
                output_bbox_frame_np[i] = selected_bboxes[i].cpu().numpy()

                # shifting to ava dataset labeling
                top_indices_np[i] = top_indices[i].cpu().numpy() + 1
                top_values_np[i] = top_values[i].cpu().numpy()

                # one row per detection: objectness | bbox | top_k action ids | top_k action scores
                agg_result = np.concatenate((output_objectness_np[i],
                                             output_bbox_frame_np[i],
                                             top_indices_np[i],
                                             top_values_np[i]), axis=1)

                temp_results_dict[central_frames_id[0]].append(agg_result)

            # slide the window forward by one frame
            del frames_tensor_list[0]
            del central_frames[0]
            del central_frames_id[0]
finally:
    # release the capture even if preprocessing or inference raises
    cap.release()


In [None]:
# Central-frame ids for which results were produced.
temp_results_dict.keys()