In [1]:
import os

import cv2
import math
import numpy as np
import moviepy 
import torch

from matplotlib import pyplot as plt



from alphaction.config import cfg
from alphaction.modeling.detector import build_detection_model
from alphaction.utils.checkpoint import ActionCheckpointer
from alphaction.utils.comm import get_world_size


from my_utils.gen_utils import create_experiment_folder
from my_utils.video_processing import get_video_info, get_frame_from_video
from my_utils.slicing import get_slice_bboxes, generate_sliding_window_gif

from my_utils.video_processing import segment_crop_video
from my_utils.my_ava_preprocessing import ava_preprocessing_cv2, clip_constructor, prepare_collated_batches
from my_utils.ava_postprocessing import concatenate_results
from my_utils.visualization import action_visualizer_frame_index

from my_utils.gen_utils import parse_label_file

from my_utils.ava_postprocessing import clip_boxes_tensor, map_bbox_from_prep_to_crop, map_bbox_from_crop_to_orig


from tqdm import tqdm


### 1. CONFIG
#### 1.1 Main Parameters

In [6]:
model_name = 'VMAEv2'


person_threshold = 0.3 # confidence threshold on actor. 0.6 is the defualt
sampling_rate = 3 # sampling rate: 4 is the defualt
top_k = 5 # number of actions per person
video_path = '../input_dir/markt2_fight.mp4' # path to video

slice_height = 800 # patch height
slice_width = 1000 # patch width
overlap_ratio = 0.2 # patch overlap

starting_frame_index = 50 # starting frame index, defualt = 0
length_input = 100 # number of frames to be processed

exp_dict = {'model_name': model_name,
            'model_params': {'person_threshold': person_threshold, 
                             'sampling_rate': sampling_rate},
            'orig_post_processing':{'top_k': top_k},
            'aggregation': {'method': {}, 
                            'params': {}},
            'video_path': video_path,
            'slicing_params': {'slice_height': slice_height, 
                               'slice_width': slice_width, 
                               'overlap_ratio':overlap_ratio},
            'video_params': {'st_frame_index': starting_frame_index, 
                             'length_input':length_input
                             }
           }

exp_dict


{'model_name': 'VMAEv2',
 'model_params': {'person_threshold': 0.3, 'sampling_rate': 3},
 'orig_post_processing': {'top_k': 5},
 'aggregation': {'method': {}, 'params': {}},
 'video_path': '../input_dir/markt2_fight.mp4',
 'slicing_params': {'slice_height': 800,
  'slice_width': 1000,
  'overlap_ratio': 0.2},
 'video_params': {'st_frame_index': 50, 'length_input': 100}}

In [7]:
video_name = os.path.basename(video_path).split('.')[0]
output_directory = f'../output_dir/{video_name}/{model_name}/patch_batch/' 
output_directory = create_experiment_folder(output_directory, 'exp')
output_directory



'../output_dir/markt2_fight/VMAEv2/patch_batch/exp_5'

#### 1.2 Model Config Setup

In [8]:
if model_name == 'VMAEv2':
    config_file = '../config_files/VMAEv2-ViTB-16x4.yaml'
if model_name == 'VMAE':
    config_file = '../config_files/VMAE-ViTB-16x4.yaml'


In [9]:
cfg.merge_from_file(config_file)

In [10]:
# change model weight path
if model_name == 'VMAEv2':
    cfg.merge_from_list(["MODEL.WEIGHT", "../checkpoints/VMAEv2_ViTB_16x4.pth"])
if model_name == 'VMAE':
    cfg.merge_from_list(["MODEL.WEIGHT", "../checkpoints/VMAE_ViTB_16x4.pth"])

# change output dir
cfg.merge_from_list(["OUTPUT_DIR", "../output_dir/"])

# change person threshold
cfg.merge_from_list(["MODEL.STM.PERSON_THRESHOLD", person_threshold])

# change sampling rate
cfg.merge_from_list(["DATA.SAMPLING_RATE", sampling_rate])

# change path for data_dir
cfg.merge_from_list(["DATA.PATH_TO_DATA_DIR", "/work/ava"])

# folder name of annotations
cfg.merge_from_list(["AVA.ANNOTATION_DIR", "annotations/"])

# file name of  frame_lists
cfg.merge_from_list(["AVA.TRAIN_LISTS", ['sample.csv']])
cfg.merge_from_list(["AVA.TEST_LISTS", ['sample.csv']])

# file name of predicted_bboxes
cfg.merge_from_list(["AVA.TRAIN_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']])
cfg.merge_from_list(["AVA.TEST_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']])

# file name of exlusions
cfg.merge_from_list(["AVA.EXCLUSION_FILE", 'ava_sample_train_excluded_timestamps_v2.2.csv'])

# number of batches in test scenario
cfg.merge_from_list(["TEST.VIDEOS_PER_BATCH", 1])

# number of workers
cfg.merge_from_list(["DATALOADER.NUM_WORKERS", 1])


### 2. ARGS

In [11]:
debug = True
if debug:
    # The shape of model input should be divisible into this. Otherwise, padding 0 to left and bottum. 
    print("cfg.DATALOADER.SIZE_DIVISIBILITY: ", cfg.DATALOADER.SIZE_DIVISIBILITY)
    
    # Sampling rate in constructing the clips.
    self_sample_rate =  cfg.DATA.SAMPLING_RATE
    print("cfg.DATA.SAMPLING_RATE: ", cfg.DATA.SAMPLING_RATE)
    
    # Length of clip
    self_video_length = cfg.DATA.NUM_FRAMES
    print("cfg.DATA.NUM_FRAMES: ", cfg.DATA.NUM_FRAMES)
    
    # Length of sequence frames from which a clip is constructed.
    self_seq_len = self_video_length * self_sample_rate
    print("self_seq_len: ", self_seq_len)
    
    self_num_classes = cfg.MODEL.STM.ACTION_CLASSES
    print("cfg.MODEL.STM.ACTION_CLASSES: ", self_num_classes)
    
    # Augmentation params.
    self_data_mean = cfg.DATA.MEAN
    self_data_std = cfg.DATA.STD
    self_use_bgr = cfg.AVA.BGR
    print("Augmentation params: ", self_data_mean, self_data_std, self_use_bgr)
    
    self_jitter_min_scale = cfg.DATA.TEST_MIN_SCALES
    self_jitter_max_scale = cfg.DATA.TEST_MAX_SCALE
    self_test_force_flip = cfg.AVA.TEST_FORCE_FLIP

    print("scale and flip params", self_jitter_min_scale, self_jitter_max_scale, self_test_force_flip)

cfg.DATALOADER.SIZE_DIVISIBILITY:  32
cfg.DATA.SAMPLING_RATE:  3
cfg.DATA.NUM_FRAMES:  16
self_seq_len:  48
cfg.MODEL.STM.ACTION_CLASSES:  80
Augmentation params:  [0.45, 0.45, 0.45] [0.225, 0.225, 0.225] False
scale and flip params [256] 1333 False


In [12]:
seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
print(seq_len)

48


### 3. VIDEO Info and Slicing Visualization


In [13]:
video_info = get_video_info(video_path)

In [14]:
video_info

{'frame_count': 368,
 'frame_rate': 16.999,
 'width': 2592,
 'height': 1944,
 'fps': 16.999}

In [15]:
exp_dict['video_params'].update(video_info)
exp_dict

{'model_name': 'VMAEv2',
 'model_params': {'person_threshold': 0.3, 'sampling_rate': 3},
 'orig_post_processing': {'top_k': 5},
 'aggregation': {'method': {}, 'params': {}},
 'video_path': '../input_dir/markt2_fight.mp4',
 'slicing_params': {'slice_height': 800,
  'slice_width': 1000,
  'overlap_ratio': 0.2},
 'video_params': {'st_frame_index': 50,
  'length_input': 100,
  'frame_count': 368,
  'frame_rate': 16.999,
  'width': 2592,
  'height': 1944,
  'fps': 16.999}}

In [16]:
frame_height = video_info['height']
frame_width = video_info['width']

In [17]:
patches_coordinates = get_slice_bboxes(frame_height, frame_width, slice_height, slice_width, False, overlap_ratio)

In [18]:
len(patches_coordinates)

9

In [19]:
patches_coordinates

[[0, 0, 1000, 800],
 [800, 0, 1800, 800],
 [1592, 0, 2592, 800],
 [0, 640, 1000, 1440],
 [800, 640, 1800, 1440],
 [1592, 640, 2592, 1440],
 [0, 1144, 1000, 1944],
 [800, 1144, 1800, 1944],
 [1592, 1144, 2592, 1944]]

In [20]:
sample_frame = get_frame_from_video(video_path, 0)

In [21]:
if False:
    generate_sliding_window_gif(sample_frame, patches_coordinates, gif_filename='sliding_window.gif')

In [22]:
from IPython.display import display, Image


In [23]:
if False:
    display(Image(filename='sliding_window.gif'))

In [24]:
if True:
    !rm sliding_window.gif

rm: cannot remove 'sliding_window.gif': No such file or directory


### 4. building model

In [25]:
model = build_detection_model(cfg)

In [26]:
model.to("cuda")

STMDetector(
  (backbone): ViT(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): La

### 5. loading weight

In [27]:
output_dir = cfg.OUTPUT_DIR
output_dir

'../output_dir/'

In [28]:
checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
checkpointer.load(cfg.MODEL.WEIGHT)

{}

In [29]:
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
num_gpus

1

In [30]:
mem_active = cfg.MODEL.STM.MEM_ACTIVE
mem_active  

False

In [31]:
device = torch.device("cuda")

In [32]:
num_devices = get_world_size()
num_devices

1

In [33]:
model.eval()

STMDetector(
  (backbone): ViT(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): La

## 6. Inference

In [34]:
patches_coordinates

[[0, 0, 1000, 800],
 [800, 0, 1800, 800],
 [1592, 0, 2592, 800],
 [0, 640, 1000, 1440],
 [800, 640, 1800, 1440],
 [1592, 640, 2592, 1440],
 [0, 1144, 1000, 1944],
 [800, 1144, 1800, 1944],
 [1592, 1144, 2592, 1944]]

In [35]:
file_path = 'labels.txt'  # Specify the path to your text file
label_dict = parse_label_file(file_path)
print(label_dict)

{1: 'bend/bow (at the waist)', 2: 'crawl', 3: 'crouch/kneel', 4: 'dance', 5: 'fall down', 6: 'get up', 7: 'jump/leap', 8: 'lie/sleep', 9: 'martial art', 10: 'run/jog', 11: 'sit', 12: 'stand', 13: 'swim', 14: 'walk', 15: 'answer phone', 16: 'brush teeth', 17: 'carry/hold (an object)', 18: 'catch (an object)', 19: 'chop', 20: 'climb (e.g., a mountain)', 21: 'clink glass', 22: 'close (e.g., a door, a box)', 23: 'cook', 24: 'cut', 25: 'dig', 26: 'dress/put on clothing', 27: 'drink', 28: 'drive (e.g., a car, a truck)', 29: 'eat', 30: 'enter', 31: 'exit', 32: 'extract', 33: 'fishing', 34: 'hit (an object)', 35: 'kick (an object)', 36: 'lift/pick up', 37: 'listen (e.g., to music)', 38: 'open (e.g., a window, a car door)', 39: 'paint', 40: 'play board game', 41: 'play musical instrument', 42: 'play with pets', 43: 'point to (an object)', 44: 'press', 45: 'pull (an object)', 46: 'push (an object)', 47: 'put down', 48: 'read', 49: 'ride (e.g., a bike, a car, a horse)', 50: 'row boat', 51: 'sail 

In [36]:
temp_results_dict = {}

add_patch_index = True

for patch_index, patch_coordinates in tqdm(enumerate(patches_coordinates), desc='Processing patches'):
    
    # cropping and segmenting input video based on patch_coordinates and temporal window  
    cropped_video = segment_crop_video(video_path, 
                                      frame_index=starting_frame_index, 
                                      length=length_input, 
                                      crop=patch_coordinates)
    
    crop_height, crop_width = cropped_video[0].shape[:2]
    
    
    prep_video = ava_preprocessing_cv2(cropped_video, cfg)
    prep_height, prep_width = prep_video.shape[-2:]
    
    
    prep_clips, center_frames = clip_constructor(prep_video, 
                                                 rate_sample=cfg.DATA.SAMPLING_RATE, 
                                                 num_frames=cfg.DATA.NUM_FRAMES)
    
    list_collated_batches = prepare_collated_batches(prep_clips, center_frames, cfg)
    

    for batch, center_frame_index in tqdm(zip(list_collated_batches, center_frames), 
                                          desc='Processing frames', 
                                          total=len(list_collated_batches)):
        
        # updating frame index based on starting_frame_index
        cur_frame_index = center_frame_index + starting_frame_index
        
        # adding current frame index to result dict
        if cur_frame_index not in temp_results_dict:
            temp_results_dict[cur_frame_index] = []
        
        # passing batch to model
        with torch.no_grad():
            slow_video, fast_video, whwh, boxes, labels, metadata, idx = batch
            clips_height, clips_width = slow_video.shape[-2:]
            slow_video = slow_video.to(device)
            if fast_video is not None:
                fast_video = fast_video.to(device)
            whwh = whwh.to(device)
    
            # INFERENCE
            action_score_list, box_list, objectness_score_list = model(slow_video, fast_video, whwh, boxes, labels)
            #print(slow_video.shape)
        
        # Removing batch dimension
        if len(box_list) != 0:
           
            output_bbox = box_list[0]
            output_action = action_score_list[0]
            output_objectness = objectness_score_list[0]
        
            if output_bbox.shape[0] != 0:
            
                # denormalizing bboxes w.r.t. clips shape
                output_bbox_inp = output_bbox.clone()

                output_bbox_inp[:, 0] = output_bbox[:, 0] * clips_width
                output_bbox_inp[:, 1] = output_bbox[:, 1] * clips_height
                output_bbox_inp[:, 2] = output_bbox[:, 2] * clips_width
                output_bbox_inp[:, 3] = output_bbox[:, 3] * clips_height
    
                # clipping bbonx coordinates with prep shape because clip shape is right/bottum padded version of prep shape.
                output_bbox_prep = clip_boxes_tensor(output_bbox_inp, 
                                                     height=whwh[0,1], 
                                                     width = whwh[0,0])
    
                # Scaling bboxes from prep shape to crop shape
                output_bbox_crop = map_bbox_from_prep_to_crop(output_bbox_prep, 
                                                              (crop_height, crop_width), 
                                                              (prep_height, prep_width))
    
                # mapping from crop to original frame
                output_bbox_frame = map_bbox_from_crop_to_orig(output_bbox_crop , patch_coordinates[:2])
                
                # getting top_k action: scores and indices
                top_values, top_indices = torch.topk(output_action, k=top_k, dim=1)
    
                output_objectness_np = np.reshape(output_objectness.cpu().numpy(), (-1, 1))
                output_bbox_frame_np = output_bbox_frame.cpu().numpy()
                
                # shifting to ava dataset labeling
                top_indices_np = top_indices.cpu().numpy() + 1
                top_values_np = top_values.cpu().numpy()
                
                
                
                
                if add_patch_index:
                    # adding patch index to result.
                    patch_index_np = np.full((output_objectness_np.shape[0], 1), patch_index)
                    agg_result = np.concatenate((output_objectness_np, 
                                                 output_bbox_frame_np, 
                                                 top_indices_np, 
                                                 top_values_np,
                                                 patch_index_np), axis=1)
                else:
                    agg_result = np.concatenate((output_objectness_np, 
                                                 output_bbox_frame_np, 
                                                 top_indices_np, 
                                                 top_values_np), axis=1)
                
                
                temp_results_dict[cur_frame_index].append(agg_result)
        

Processing patches: 0it [00:00, ?it/s]
Processing frames:   0%|                                                                                                                                | 0/53 [00:00<?, ?it/s][A
Processing frames:   2%|██▎                                                                                                                     | 1/53 [00:02<01:56,  2.23s/it][A
Processing frames:   4%|████▌                                                                                                                   | 2/53 [00:02<00:59,  1.16s/it][A
Processing frames:   6%|██████▊                                                                                                                 | 3/53 [00:03<00:39,  1.25it/s][A
Processing frames:   8%|█████████                                                                                                               | 4/53 [00:03<00:30,  1.59it/s][A
Processing frames:   9%|███████████▎                              

Processing frames:  68%|████████████████████████████████████████████████████████████████████████████████▊                                      | 36/53 [00:13<00:06,  2.79it/s][A
Processing frames:  70%|███████████████████████████████████████████████████████████████████████████████████                                    | 37/53 [00:13<00:05,  2.80it/s][A
Processing frames:  72%|█████████████████████████████████████████████████████████████████████████████████████▎                                 | 38/53 [00:13<00:05,  2.80it/s][A
Processing frames:  74%|███████████████████████████████████████████████████████████████████████████████████████▌                               | 39/53 [00:14<00:04,  2.80it/s][A
Processing frames:  75%|█████████████████████████████████████████████████████████████████████████████████████████▊                             | 40/53 [00:14<00:04,  2.80it/s][A
Processing frames:  77%|█████████████████████████████████████████████████████████████████████████████████

Processing frames:  34%|████████████████████████████████████████▍                                                                              | 18/53 [00:06<00:12,  2.87it/s][A
Processing frames:  36%|██████████████████████████████████████████▋                                                                            | 19/53 [00:06<00:11,  2.87it/s][A
Processing frames:  38%|████████████████████████████████████████████▉                                                                          | 20/53 [00:07<00:11,  2.88it/s][A
Processing frames:  40%|███████████████████████████████████████████████▏                                                                       | 21/53 [00:07<00:11,  2.88it/s][A
Processing frames:  42%|█████████████████████████████████████████████████▍                                                                     | 22/53 [00:07<00:10,  2.89it/s][A
Processing frames:  43%|███████████████████████████████████████████████████▋                             

Processing frames:   0%|                                                                                                                                | 0/53 [00:00<?, ?it/s][A
Processing frames:   2%|██▎                                                                                                                     | 1/53 [00:00<00:17,  2.99it/s][A
Processing frames:   4%|████▌                                                                                                                   | 2/53 [00:00<00:17,  2.89it/s][A
Processing frames:   6%|██████▊                                                                                                                 | 3/53 [00:01<00:17,  2.87it/s][A
Processing frames:   8%|█████████                                                                                                               | 4/53 [00:01<00:17,  2.87it/s][A
Processing frames:   9%|███████████▎                                                                     

Processing frames:  68%|████████████████████████████████████████████████████████████████████████████████▊                                      | 36/53 [00:12<00:05,  2.87it/s][A
Processing frames:  70%|███████████████████████████████████████████████████████████████████████████████████                                    | 37/53 [00:12<00:05,  2.89it/s][A
Processing frames:  72%|█████████████████████████████████████████████████████████████████████████████████████▎                                 | 38/53 [00:13<00:05,  2.87it/s][A
Processing frames:  74%|███████████████████████████████████████████████████████████████████████████████████████▌                               | 39/53 [00:13<00:04,  2.85it/s][A
Processing frames:  75%|█████████████████████████████████████████████████████████████████████████████████████████▊                             | 40/53 [00:13<00:04,  2.86it/s][A
Processing frames:  77%|█████████████████████████████████████████████████████████████████████████████████

Processing frames:  34%|████████████████████████████████████████▍                                                                              | 18/53 [00:06<00:12,  2.84it/s][A
Processing frames:  36%|██████████████████████████████████████████▋                                                                            | 19/53 [00:06<00:11,  2.83it/s][A
Processing frames:  38%|████████████████████████████████████████████▉                                                                          | 20/53 [00:07<00:11,  2.83it/s][A
Processing frames:  40%|███████████████████████████████████████████████▏                                                                       | 21/53 [00:07<00:11,  2.83it/s][A
Processing frames:  42%|█████████████████████████████████████████████████▍                                                                     | 22/53 [00:07<00:10,  2.82it/s][A
Processing frames:  43%|███████████████████████████████████████████████████▋                             

slow_video.shape, fast_video, whwh, boxes, labels


(torch.Size([1, 3, 16, 256, 320]),
 None,
 tensor([[320., 256., 320., 256.]], device='cuda:0'),
 (None,),
 (None,))

#### creating dict with keys equal to actual frames and values as np.array of shape Nx16 or Nx15.

In [37]:
all_results_dict = concatenate_results(temp_results_dict, top_k=top_k, patch_index=True)

In [47]:
ss = all_results_dict[94][:,:]

In [49]:
ss.shape

(39, 16)

In [78]:
import numpy as np

def nms_action_detection(detections, agnostic=True, type_score='obj_score', nms_method='IOU', iou_threshold=0.5):
    """
    Apply Non-Maximum Suppression (NMS) on action detection results.

    Parameters:
        detections (numpy.ndarray): Array of shape Nx15 where N is the number of detected objects.
                                    Each row contains [obj_score, x1, y1, x2, y2, action_id1, action_id2, ...,
                                    action_id5, action_score1, action_score2, ..., action_score5].
        agnostic (bool): If True, NMS will be applied to all detections together.
                         If False, NMS will be applied separately for each class.
        type_score (str): Type of score to be used for sorting: 'obj_score', 'action_score', or 'joint'.
        nms_method (str): NMS method to be used: 'IOU' or 'IOS'.
        iou_threshold (float): Threshold for IOU or IOS.

    Returns:
        numpy.ndarray: Filtered detections after applying NMS.
    """

    # Sort detections based on score
    if type_score == 'obj_score':
        score_index = 0
    elif type_score == 'action_score':
        score_index = slice(-5, None)  # Selecting action score columns
    elif type_score == 'joint':
        score_index = 0 if agnostic else slice(-5, None)
        detections[:, score_index] *= detections[:, -5:].max(axis=1)  # Using the maximum action score

    sorted_indices = np.argsort(detections[:, score_index])[::-1]
    detections = detections[sorted_indices]

    # Perform NMS
    def compute_iou(box1, box2):
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        intersection = max(0, x2 - x1) * max(0, y2 - y1)

        if nms_method == 'IOU':
            union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + \
                    (box2[2] - box2[0]) * (box2[3] - box2[1]) - intersection
        elif nms_method == 'IOS':
            min_area = min((box1[2] - box1[0]) * (box1[3] - box1[1]),
                           (box2[2] - box2[0]) * (box2[3] - box2[1]))
            union = min_area

        iou = intersection / union if union > 0 else 0
        return iou

    keep_indices = []
    while len(detections) > 0:
        keep_indices.append(detections[0])

        iou_scores = np.array([compute_iou(detections[0, 1:5], detections[i, 1:5]) for i in range(1, len(detections))])

        if agnostic:
            mask = iou_scores <= iou_threshold
        else:
            ref_actions = np.repeat(detections[0, 5:10][np.newaxis, :], len(detections)-1, axis=0)
            mask = np.logical_or(iou_scores <= iou_threshold, detections[1:, 5:10] != ref_actions)

        detections = detections[1:][mask]

    return np.array(keep_indices)

# Example usage:
# detections = np.array([[0.9, 100, 100, 200, 200, 0, 0, 0, 0, 0, 0.8, 0.7, 0.6, 0.5, 0.4],
#                        [0.8, 150, 150, 250, 250, 1, 1, 1, 1, 1, 0.7, 0.6, 0.5, 0.4, 0.3],
#                        [0.7, 120, 120, 220, 220, 2, 2, 2, 2, 2, 0.6, 0.5, 0.4, 0.3, 0.2]])
# filtered_detections = nms_action_detection(detections, agnostic=True, type_score='action_score', nms_method='IOU')
# print(filtered_detections)


In [82]:
def nms_action_detection_per_cat(detections, type_score='obj_score', nms_method='IOU', iou_threshold=0.5):
    """
    Apply Non-Maximum Suppression (NMS) on action detection results.

    Parameters:
        detections (numpy.ndarray): Array of shape Nx15 where N is the number of detected objects.
                                    Each row contains [obj_score, x1, y1, x2, y2, action_id1, action_id2, ...,
                                    action_id5, action_score1, action_score2, ..., action_score5].
        agnostic (bool): If True, NMS will be applied to all detections together.
                         If False, NMS will be applied separately for each action ID.
        type_score (str): Type of score to be used for sorting: 'obj_score', 'action_score', or 'joint'.
        nms_method (str): NMS method to be used: 'IOU' or 'IOS'.
        iou_threshold (float): Threshold for IOU or IOS.

    Returns:
        numpy.ndarray: Filtered detections after applying NMS.
    """

    # Sort detections based on score
    if type_score == 'obj_score':
        score_index = 0
    elif type_score == 'action_score':
        score_index = slice(-5, None)  # Selecting action score columns
    elif type_score == 'joint':
        score_index = 0 if agnostic else slice(-5, None)
        detections[:, score_index] *= detections[:, -5:].max(axis=1)  # Using the maximum action score

    sorted_indices = np.argsort(detections[:, score_index])[::-1]
    detections = detections[sorted_indices]
    
    # Perform NMS
    def compute_iou(box1, box2):
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        intersection = max(0, x2 - x1) * max(0, y2 - y1)

        if nms_method == 'IOU':
            union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + \
                    (box2[2] - box2[0]) * (box2[3] - box2[1]) - intersection
        elif nms_method == 'IOS':
            min_area = min((box1[2] - box1[0]) * (box1[3] - box1[1]),
                           (box2[2] - box2[0]) * (box2[3] - box2[1]))
            union = min_area

        iou = intersection / union if union > 0 else 0
        return iou


    unique_action_ids = np.unique(detections[:, 5])
    
    

    keep_indices = []
    for action_id in unique_action_ids:
        action_mask = detections[:, 5] == action_id
        action_detections = detections[action_mask]

        while len(action_detections) > 0:
            keep_indices.append(action_detections[0])

            iou_scores = np.array([compute_iou(action_detections[0, 1:5], action_detections[i, 1:5]) for i in range(1, len(action_detections))])

            mask = iou_scores <= iou_threshold
            action_detections = action_detections[1:][mask]

    return np.array(keep_indices)

In [86]:
ss_id = nms_action_detection_per_cat(ss, nms_method="IOS")

In [95]:
def apply_nms_to_dict(detections_dict, agnostic=True, type_score='obj_score', nms_method='IOS', iou_threshold=0.5):
    """
    Apply NMS post-processing on a dictionary of action detections.

    Parameters:
        detections_dict (dict): Dictionary where keys are frame indices and values are arrays of action detections.
        agnostic (bool): If True, NMS will be applied to all detections together.
                         If False, NMS will be applied separately for each action ID.
        type_score (str): Type of score to be used for sorting: 'obj_score', 'action_score', or 'joint'.
        nms_method (str): NMS method to be used: 'IOU' or 'IOS'.
        iou_threshold (float): Threshold for IOU or IOS.

    Returns:
        dict: Dictionary with NMS-applied detections.
    """

    nms_applied_dict = {}
    for frame, detections in detections_dict.items():
        nms_applied_detections = nms_action_detection(detections, agnostic=agnostic, type_score=type_score, nms_method=nms_method, iou_threshold=iou_threshold)
        nms_applied_dict[frame] = nms_applied_detections

    return nms_applied_dict


In [96]:
ss[20,:]

array([9.94045496e-01, 1.34600000e+03, 6.55000000e+02, 1.45200000e+03,
       9.19000000e+02, 1.40000000e+01, 1.70000000e+01, 8.00000000e+01,
       7.90000000e+01, 7.40000000e+01, 9.91903603e-01, 5.57146549e-01,
       2.39134595e-01, 8.62589404e-02, 6.32744804e-02, 4.00000000e+00])

In [97]:
post_all_results_dict = apply_nms_to_dict(all_results_dict)

In [98]:
from my_utils.visualization import action_visualizer_frame_index


In [119]:
def action_visualizer_frame_index(all_results_dict, 
                                  video_path, 
                                  label_dict, 
                                  output_directory,
                                  top_k=5,
                                  interesting_actions_indices = [5, 64, 71, 75],
                                  interesting_actions_labels = {5:'fall', 64:'fight', 71:'kick', 76:'push'},
                                  action_colors = {5: (0, 0, 210), # Blue
                                                   64 : (255, 0, 0), # Red
                                                   71: (255, 165, 0), # Purple
                                                   75: (128, 0, 128)}, # Orange
                                  action_thrshold = {5: 0.3, 64: 0.2, 71: 0.3, 76: 0.3},
                                  other_actions_color = (0, 255, 0), # green
                                  all_actions=False, 
                                  long_text_show=False,
                                  add_patch_index=False,
                                  vis_patch_lines=False,
                                  mode='pic'):
    
    if mode in ['gif', 'movie']:
        vis_frames_list = []
    
    
    with tqdm(total=len(all_results_dict)) as pbar:
        
        for cur_frame, results_frame in all_results_dict.items():
            
            
            # checking results_frame type and converting it into np
            if isinstance(results_frame, list):
                results_frame_np = np.concatenate(results_frame, axis=0)
            elif isinstance(results_frame, np.ndarray):
                results_frame_np = results_frame
            else:
                raise ValueError("Input must be a list or a numpy array")
                
            
            # getting the frame
            frame = get_frame_from_video(video_path, int(cur_frame))
            vis_frame = cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)
            
            # no detection on frame
            if results_frame_np.shape[0] == 0:
                if mode in ['pic', 'gif']:
                    continue
                else:
                    vis_frames_list.append(vis_frame)
                    continue
            
            # extracing object_score, bboxes, top action indices and score, and patch_index (if exists)
            obj_scores_frame = results_frame_np[:, :1]
            bboxes_frame = results_frame_np[:, 1:5]
            top_indices_frame = results_frame_np[:, 5:5+top_k]
            top_values_frame = results_frame_np[:, 5+top_k:5+2*top_k]
            
            if results_frame_np.shape[-1] != 5+2*top_k:
                patch_indices_frame = results_frame_np[:, -1].astype(int)
            else:
                add_patch_index = False
                
            
            # id of interesting actor starting from 0
            id_actor = 0
            
            # if frame contains any interesting action
            interesting_frame = False
            
            # looping over each actor
            for object_score, bbox, top_action_indices, top_action_scores, patch_index in zip(obj_scores_frame, bboxes_frame,
                                                                                              top_indices_frame, top_values_frame, patch_indices_frame):
                x1, y1, x2, y2 = bbox.astype(int)
                
                # if actor is interesting
                interesting_actor = False
                for ind_act, act in enumerate(top_action_indices):
                    if act in interesting_actions_indices and top_action_scores[ind_act] > action_thrshold[int(act)]:
                        interesting_actor = True
                        interesting_frame = True
                        bbox_action_color = action_colors[int(act)]
                        main_interesting_act = int(act) # the main interesting action of actor
                        main_interesting_score = top_action_scores[ind_act]
                        break
                
                # visualization for interesting actor
                if interesting_actor:
                    
                    
                    # plot the bbox of interesting actor 
                    cv2.rectangle(vis_frame, (x1, y1), (x2, y2), bbox_action_color, 2)
                    
                        # text on top of bbox of interesting actor
                    if add_patch_index:
                        id_text = '{}_{}_{}_{}'.format(id_actor, 
                                                       interesting_actions_labels[main_interesting_act], 
                                                       np.round(main_interesting_score, 2), 
                                                       patch_index)
                    else:
                        id_text = '{}_{}_{}'.format(id_actor, 
                                                    interesting_actions_labels[main_interesting_act], 
                                                    np.round(main_interesting_score, 2))
                    id_actor += 1
                    
                    
                    cv2.putText(vis_frame, id_text, (x1+10, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, bbox_action_color, 2)
            
                     # add text containing all actions of interesting actor
                    if long_text_show:
                        long_text = '{}-{}:'.format(id_actor, np.round(object_score, 2)) # add ID of actor for frame visualization
                        # add act and their scores on long text
                        for act, score in zip(top_action_indices, top_action_scores):
                            long_text += '{}_{}-'.format(label_dict[act].replace('(', '').replace(')', '').split('/')[0], 
                                                 (np.round(score, 2)))
                        cv2.putText(vis_frame, long_text[:-1], (20, 100 + 20 * id_actor), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1,
                                bbox_action_color, 2)
                
                # visualization for other actors
                else:
                    if all_actions:
                        # plot bbox of other actors   
                        cv2.rectangle(vis_frame, (x1, y1), (x2, y2), other_actions_color, 2)
                        # add text
                        if add_patch_index:
                            id_text = '{}'.format(patch_index)
                            cv2.putText(vis_frame, id_text, (x1+10, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75,
                                        other_actions_color, 2)
       
            if interesting_frame:
                cv2.putText(vis_frame, str(cur_frame), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4)


            if vis_patch_lines:
                cv2.rectangle(vis_frame, (crop_box[0], crop_box[1]), (crop_box[2], crop_box[3]), (255, 255, 0), 1)
               
        
            if mode in ['gif', 'movie']:
                vis_frames_list.append(vis_frame)
            else:
                if True:
                    frame_path = os.path.join(output_directory_frames, f"frame_{cur_frame}.jpg")
                    sucess = cv2.imwrite(frame_path, cv2.cvtColor(vis_frame, cv2.COLOR_BGR2RGB))
        
        
    # Update tqdm progress bar
    pbar.update(1)
        
    if mode in ['gif', 'movie']:
        return vis_frames_list
    else:
        return True
    

In [124]:
output_directory_frames = os.path.join(output_directory, 'frames')
os.makedirs(output_directory_frames, exist_ok=True)

ou = action_visualizer_frame_index(post_all_results_dict, 
                              video_path, 
                              label_dict, 
                              output_directory_frames,
                              all_actions=True,
                              long_text_show=False,
                                   mode='movie')

  0%|                                                                                                                                                   | 0/53 [00:25<?, ?it/s]


In [125]:
len(ou)

53

In [126]:
from moviepy.editor import ImageSequenceClip

def create_video_from_frames(frames, output_path, fps=20):
    """
    Create a video from a list of frames using moviepy.

    Parameters:
        frames (list): List of frames (each frame is a numpy array or an image file path).
        output_path (str): Path to save the output video file (including file extension, e.g., 'output.mp4').
        fps (int, optional): Frames per second for the output video (default is 30).

    Returns:
        None
    """
    # Create video clip from frames
    clip = ImageSequenceClip(frames, fps=fps)

    # Write video file
    clip.write_videofile(output_path)

create_video_from_frames(ou, 'output.mp4')

Moviepy - Building video output.mp4.
Moviepy - Writing video output.mp4



                                                                                                                                                                               

Moviepy - Done !
Moviepy - video ready output.mp4


In [None]:
interesting_actions_indices = [5, 64, 71, 75]
interesting_actions_labels = {5:'fall', 64:'fight', 71:'kick', 75:'push'}

vis_frames_list = []
vis_iteresting_frames_list = []


output_directory_frames = os.path.join(output_directory, 'frames')
os.makedirs(output_directory_frames, exist_ok=True)


from my_utils.video_processing import get_frame_from_video




# Initialize tqdm with the length of all_results_dict
with tqdm(total=len(all_results_dict)) as pbar:
    for cur_frame, results_frame in all_results_dict.items():
        #results_frame_np = np.concatenate(results_frame, axis=0)
        results_frame_np = results_frame
        obj_scores_frame = results_frame_np[:, :1]
        bboxes_frame = results_frame_np[:, 1:5]
        top_indices_frame = results_frame_np[:, 5:10]
        top_values_frame = results_frame_np[:, 10:]
        
        frame = get_frame_from_video(video_path, cur_frame)
        
        vis_frame = cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)
        vis_inter_frame = cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)
        
        id_actor = 0
        interesting_frame = False
        
        for object_score, bbox, top_action_indices, top_action_scores in zip(obj_scores_frame, bboxes_frame, top_indices_frame, top_values_frame):
            x1, y1, x2, y2 = bbox.astype(int)
            small_text = ''
            
            interesting_actor = False
            for ind, act in enumerate(top_action_indices):
                if act in interesting_actions_indices:
                    interesting_actor = True
                    interesting_frame = True
                    
                    small_text += '{}_{}'.format(interesting_actions_labels[act], 
                                                 np.round(top_action_scores[ind], 2))
            if interesting_actor:
                long_text = '{}-{}:'.format(id_actor, np.round(object_score, 2)) # add ID of actor for frame visualization
                for act, score in zip(top_action_indices, top_action_scores):
                    long_text += '{}_{}-'.format(label_dict[act].replace('(', '').replace(')', '').split('/')[0], 
                                                 (np.round(score, 2)))
                    
            
            if interesting_actor:
                # plot the bbox of interesting actor and adding interesting action
                cv2.rectangle(vis_frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
                cv2.putText(vis_frame, small_text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
                
                cv2.rectangle(vis_inter_frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
                #id_text = '{}:{}'.format(id_actor, small_text)
                id_text = '{}'.format(id_actor)
                id_actor += 1
    
                cv2.putText(vis_inter_frame, id_text, (x1+10, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 0, 0), 2)
                cv2.putText(vis_inter_frame, str(cur_frame), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4)
                
                # add text containing all actions of interesting actor
                if True:
                    cv2.putText(vis_inter_frame, 
                                long_text[:-1], 
                                (20, 100 + 20 * id_actor), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                
                
            else:
                # plot the bbox of other actors
                cv2.rectangle(vis_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.rectangle(vis_inter_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            
        
        #cv2.rectangle(vis_frame, (crop_box[0], crop_box[1]), (crop_box[2], crop_box[3]), (255, 255, 0), 2)
        #cv2.rectangle(vis_inter_frame, (crop_box[0], crop_box[1]), (crop_box[2], crop_box[3]), (255, 255, 0), 2)
        
        
        vis_frames_list.append(vis_frame)
        if interesting_frame:
            vis_iteresting_frames_list.append(vis_inter_frame)
            frame_path = os.path.join(output_directory_frames, f"frame_{cur_frame}.jpg")
            cv2.imwrite(frame_path, cv2.cvtColor(vis_inter_frame, cv2.COLOR_BGR2RGB))
        
        # Update tqdm progress bar
        pbar.update(1)


        
    
    


In [None]:
from moviepy.editor import ImageSequenceClip

def create_video_from_frames(frames, output_path, fps=30):
    """
    Create a video from a list of frames using moviepy.

    Parameters:
        frames (list): List of frames (each frame is a numpy array or an image file path).
        output_path (str): Path to save the output video file (including file extension, e.g., 'output.mp4').
        fps (int, optional): Frames per second for the output video (default is 30).

    Returns:
        None
    """
    # Create video clip from frames
    clip = ImageSequenceClip(frames, fps=fps)

    # Write video file
    clip.write_videofile(output_path)

In [None]:
all_results_dict[124].shape

In [None]:
frame_path

In [127]:
import json
import json_tricks as json_tricks

In [128]:
exp_json_path = os.path.join(output_directory, 'exp.json')

# Save the dictionary as a JSON file
with open(exp_json_path, 'w') as f:
    json.dump(exp_dict, f)


In [129]:
result_json_path = os.path.join(output_directory, 'result.json')

# Save the dictionary as a JSON file
with open(result_json_path, 'w') as f:
    json_tricks.dump(temp_results_dict, f, indent=4)

In [132]:
temp_results_dict[74][0].shape

(1, 16)