In [1]:
import os

import cv2
import math
import numpy as np
import moviepy 
import torch

from matplotlib import pyplot as plt



from my_alphaction.config import cfg
from my_alphaction.modeling.detector import build_detection_model
from my_alphaction.utils.checkpoint import ActionCheckpointer
from my_alphaction.utils.comm import get_world_size



from tqdm import tqdm

In [2]:
from my_alphaction.modeling.stm_decoder.stm_decoder import STMDecoder

In [3]:
# ---- Experiment settings ----------------------------------------------------
model_name = 'VMAEv2'

# Confidence threshold on the actor (0.6 is the default).
person_threshold = 0.6
# Sampling rate (4 is the default).
sampling_rate = 3
# Number of actions reported per person.
top_k = 5
video_path = '../input_dir/markt2_fight.mp4'

# Slicing parameters.
slice_height = 800
slice_width = 1000
overlap_ratio = 0.1

# Portion of the video that is processed.
starting_frame_index = 100
length_input = 200

# Single record gathering every setting above, grouped by purpose.
exp_dict = dict(
    model_name=model_name,
    model_params=dict(person_threshold=person_threshold,
                      sampling_rate=sampling_rate),
    orig_post_processing=dict(top_k=top_k),
    aggregation=dict(method={}, params={}),
    video_path=video_path,
    slicing_params=dict(slice_height=slice_height,
                        slice_width=slice_width,
                        overlap_ratio=overlap_ratio),
    video_params=dict(st_frame_index=starting_frame_index,
                      length_input=length_input),
)



In [4]:
# Map every supported model name to its YAML config file.
# An unknown name is rejected here with a clear message; previously it left
# `config_file` undefined and only failed later with a NameError.
_CONFIG_FILES = {
    'VMAEv2': '../config_files/VMAEv2-ViTB-16x4.yaml',
    'VMAE': '../config_files/VMAE-ViTB-16x4.yaml',
}
if model_name not in _CONFIG_FILES:
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {sorted(_CONFIG_FILES)}"
    )
config_file = _CONFIG_FILES[model_name]


In [5]:
# Load the YAML chosen for `model_name` into the shared `cfg` object; the
# per-key overrides applied afterwards build on top of these values.
cfg.merge_from_file(config_file)

In [6]:
# Collect every (key, value) override first, then apply them one at a time in
# the order listed.
_cfg_overrides = []

# Model weight path, chosen per backbone.
if model_name == 'VMAEv2':
    _cfg_overrides.append(("MODEL.WEIGHT", "../checkpoints/VMAEv2_ViTB_16x4.pth"))
if model_name == 'VMAE':
    _cfg_overrides.append(("MODEL.WEIGHT", "../checkpoints/VMAE_ViTB_16x4.pth"))

_cfg_overrides += [
    # Output directory.
    ("OUTPUT_DIR", "../output_dir/"),
    # Person (actor) confidence threshold.
    ("MODEL.STM.PERSON_THRESHOLD", person_threshold),
    # Sampling rate.
    ("DATA.SAMPLING_RATE", sampling_rate),
    # Path of the data directory.
    ("DATA.PATH_TO_DATA_DIR", "/work/ava"),
    # Folder name of the annotations.
    ("AVA.ANNOTATION_DIR", "annotations/"),
    # File names of the frame lists.
    ("AVA.TRAIN_LISTS", ['sample.csv']),
    ("AVA.TEST_LISTS", ['sample.csv']),
    # File names of the predicted boxes.
    ("AVA.TRAIN_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']),
    ("AVA.TEST_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']),
    # File name of the exclusions.
    ("AVA.EXCLUSION_FILE", 'ava_sample_train_excluded_timestamps_v2.2.csv'),
    # Number of videos per batch in the test scenario.
    ("TEST.VIDEOS_PER_BATCH", 1),
    # Number of data-loader workers.
    ("DATALOADER.NUM_WORKERS", 1),
]

for _cfg_key, _cfg_value in _cfg_overrides:
    cfg.merge_from_list([_cfg_key, _cfg_value])


In [7]:
# Inspect the current value of the flag before it is overridden.
cfg.ViT.USE_CHECKPOINT

True

In [8]:
# Switch ViT.USE_CHECKPOINT off for this session.
# NOTE(review): presumably this disables activation checkpointing in the ViT
# backbone ahead of the ONNX export — confirm against the model code.
cfg.merge_from_list(["ViT.USE_CHECKPOINT", False])

In [9]:
# Confirm that the override took effect (expected value: False).
cfg.ViT.USE_CHECKPOINT

False

In [10]:
debug = True
if debug:
    # Model input shapes should be divisible by this value; otherwise the input
    # is zero-padded (original note: "left and bottom" — TODO confirm the side).
    print("cfg.DATALOADER.SIZE_DIVISIBILITY: ", cfg.DATALOADER.SIZE_DIVISIBILITY)

    # Sampling rate used when constructing the clips.
    self_sample_rate = cfg.DATA.SAMPLING_RATE
    print("cfg.DATA.SAMPLING_RATE: ", self_sample_rate)

    # Length of a clip.
    self_video_length = cfg.DATA.NUM_FRAMES
    print("cfg.DATA.NUM_FRAMES: ", self_video_length)

    # Length of the frame sequence from which one clip is constructed.
    self_seq_len = self_sample_rate * self_video_length
    print("self_seq_len: ", self_seq_len)

    self_num_classes = cfg.MODEL.STM.ACTION_CLASSES
    print("cfg.MODEL.STM.ACTION_CLASSES: ", self_num_classes)

    # Augmentation parameters.
    self_data_mean, self_data_std, self_use_bgr = cfg.DATA.MEAN, cfg.DATA.STD, cfg.AVA.BGR
    print("Augmentation params: ", self_data_mean, self_data_std, self_use_bgr)

    # Test-time scale and flip parameters.
    self_jitter_min_scale, self_jitter_max_scale = cfg.DATA.TEST_MIN_SCALES, cfg.DATA.TEST_MAX_SCALE
    self_test_force_flip = cfg.AVA.TEST_FORCE_FLIP

    print("scale and flip params", self_jitter_min_scale, self_jitter_max_scale, self_test_force_flip)

cfg.DATALOADER.SIZE_DIVISIBILITY:  32
cfg.DATA.SAMPLING_RATE:  3
cfg.DATA.NUM_FRAMES:  16
self_seq_len:  48
cfg.MODEL.STM.ACTION_CLASSES:  80
Augmentation params:  [0.45, 0.45, 0.45] [0.225, 0.225, 0.225] False
scale and flip params [256] 1333 False


In [11]:
# Instantiate the decoder from the merged configuration. No checkpoint is
# loaded in the cells shown, so its weights are whatever the constructor sets.
stm_decoder = STMDecoder(cfg)

In [12]:
import torch

def create_random_variables(variable_info, seed=None):
    """Create standard-normal random tensors matching a shape description.

    Args:
        variable_info: Mapping from argument name to a spec dict. Two spec
            forms are accepted:
              * a list spec, recognised by the presence of a ``"length"`` key,
                whose ``"item_shapes"`` entry is a sequence of dicts each
                holding a ``"shape"``; it yields a list of tensors. The
                ``"length"`` value itself is only a marker and is not checked
                against the number of items.
              * a tensor spec holding a ``"shape"`` key; it yields one tensor.
        seed: Optional integer. When given, all tensors are drawn from a
            dedicated ``torch.Generator`` seeded with it, so repeated calls
            with the same seed return identical values and the global RNG
            state is left alone. When ``None`` (the default) the global
            torch RNG is used.

    Returns:
        dict mapping each argument name to a tensor or a list of tensors.

    Raises:
        ValueError: If a spec has neither a ``"length"`` nor a ``"shape"`` key.
    """
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    def _random_tensor(shape):
        # Passing the size as one tuple also accepts an empty shape (scalar).
        return torch.randn(tuple(shape), generator=generator)

    variables = {}
    for arg_name, info in variable_info.items():
        if "length" in info:
            # List-valued argument: one tensor per described item.
            variables[arg_name] = [
                _random_tensor(item_info["shape"]) for item_info in info["item_shapes"]
            ]
        elif "shape" in info:
            # Single-tensor argument.
            variables[arg_name] = _random_tensor(info["shape"])
        else:
            raise ValueError(f"Invalid variable info for {arg_name}")

    return variables



In [13]:
# Shapes of the four feature maps; the last two dimensions halve from one
# entry to the next.
_feature_shapes = [
    (1, 256, 8, 64, 80),
    (1, 256, 8, 32, 40),
    (1, 256, 8, 16, 20),
    (1, 256, 8, 8, 10),
]

# Description of the inputs: a list spec for `features`, tensor specs for the rest.
input_variable_info = {
    'features': {
        'length': len(_feature_shapes),
        'item_shapes': [{'type': 'Tensor', 'shape': shape} for shape in _feature_shapes],
    },
    'proposal_boxes': {'type': 'Tensor', 'shape': (1, 100, 4)},
    'spatial_queries': {'type': 'Tensor', 'shape': (1, 100, 256)},
    'temporal_queries': {'type': 'Tensor', 'shape': (1, 100, 256)},
}

In [14]:
# Draw random tensors for every entry described in `input_variable_info`.
# No seed is passed or set in the cells shown, so values differ between runs.
input_random_variables = create_random_variables(input_variable_info)

In [15]:
# Keep only the list of feature maps; the box and query tensors generated
# alongside them are not referenced in the cells shown.
features = input_random_variables['features']


In [16]:
# Sanity-check forward pass of the decoder on the random feature maps.
output = stm_decoder(features)

In [17]:
# Display the shape of the decoder output.
output.shape

torch.Size([1, 100, 2])

In [18]:
# Dummy feature maps used only to trace the decoder for export. They were
# previously built and then ignored in favour of `features` from an earlier
# cell; exporting with them keeps this cell independent of that state.
example_inputs = {
    'features': [torch.randn(1, 256, 8, 64, 80),
                 torch.randn(1, 256, 8, 32, 40),
                 torch.randn(1, 256, 8, 16, 20),
                 torch.randn(1, 256, 8, 8, 10)]
}

# Export the model to ONNX format.
# `args` is a one-element tuple so that the whole list of feature maps is
# passed as the decoder's single positional argument, matching the documented
# `args` contract (tuple or tensor) instead of passing a bare list.
# NOTE(review): the list is presumably flattened into four graph inputs, of
# which only the first would receive the name 'features' — confirm the input
# names in the exported file.
torch.onnx.export(stm_decoder,
                  (example_inputs['features'],),
                  'stm_decoder.onnx',
                  input_names=['features'],
                  output_names=['objectness_score'],
                  opset_version=16)

  whwh = torch.tensor(values, device='cpu')
  batch_size = len(whwh)
  assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
  assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
  assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
  if rows * cols == 0:
  assert token_xyzr.size(-1) == 4
  mapping_size = value.new_tensor([value.size(4), value.size(3)]).view(1, 1, 1, 1, -1) * stride
  assert g == G
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(
