In [1]:
import os

import cv2
import math
import numpy as np
import moviepy 
import torch

from matplotlib import pyplot as plt



from alphaction.config import cfg
from alphaction.modeling.detector import build_detection_model
from alphaction.utils.checkpoint import ActionCheckpointer
from alphaction.utils.comm import get_world_size


from my_utils.gen_utils import create_experiment_folder
from my_utils.video_processing import get_video_info, get_frame_from_video
from my_utils.slicing import get_slice_bboxes, generate_sliding_window_gif

from my_utils.video_processing import segment_crop_video
from my_utils.my_ava_preprocessing import ava_preprocessing_cv2, clip_constructor, prepare_collated_batches, prepare_collated_batches_v2
from my_utils.ava_postprocessing import concatenate_results
from my_utils.visualization import action_visualizer_frame_index

from my_utils.gen_utils import parse_label_file

from my_utils.ava_postprocessing import clip_boxes_tensor, map_bbox_from_prep_to_crop, map_bbox_from_crop_to_orig


from tqdm import tqdm


### 1. CONFIG
#### 1.1 Main Parameters

In [2]:
model_name = 'VMAEv2'


person_threshold = 0.3 # confidence threshold on actor. 0.6 is the defualt
sampling_rate = 4 # sampling rate: 4 is the defualt
top_k = 5 # number of actions per person
video_path = '../input_dir/Fighting_14.mp4'

slice_height = 600
slice_width = 800
overlap_ratio = 0

starting_frame_index = 900
length_input = 300

exp_dict = {'model_name': model_name,
            'model_params': {'person_threshold': person_threshold, 
                             'sampling_rate': sampling_rate},
            'orig_post_processing':{'top_k': top_k},
            'aggregation': {'method': {}, 
                            'params': {}},
            'video_path': video_path,
            'slicing_params': {'slice_height': slice_height, 
                               'slice_width': slice_width, 
                               'overlap_ratio':overlap_ratio},
            'video_params': {'st_frame_index': starting_frame_index, 
                             'length_input':length_input
                             }
           }

exp_dict


{'model_name': 'VMAEv2',
 'model_params': {'person_threshold': 0.3, 'sampling_rate': 4},
 'orig_post_processing': {'top_k': 5},
 'aggregation': {'method': {}, 'params': {}},
 'video_path': '../input_dir/Fighting_14.mp4',
 'slicing_params': {'slice_height': 600,
  'slice_width': 800,
  'overlap_ratio': 0},
 'video_params': {'st_frame_index': 900, 'length_input': 300}}

In [3]:
video_name = os.path.basename(video_path).split('.')[0]
#output_directory = f'../output_dir/{video_name}/{model_name}/patch_batch/' 
#output_directory = create_experiment_folder(output_directory, 'exp')
#output_directory


#### 1.2 Model Config Setup

In [4]:
def cfg_create(model_name, person_threshold, sampling_rate, test_videos_batch=1, num_workers=1):
    
    if model_name == 'VMAEv2':
        config_file = '../config_files/VMAEv2-ViTB-16x4.yaml'
        
    if model_name == 'VMAE':
        config_file = '../config_files/VMAE-ViTB-16x4.yaml'
        
    cfg.merge_from_file(config_file)
    
    # change model weight path
    if model_name == 'VMAEv2':
        cfg.merge_from_list(["MODEL.WEIGHT", "../checkpoints/VMAEv2_ViTB_16x4.pth"])
    if model_name == 'VMAE':
        cfg.merge_from_list(["MODEL.WEIGHT", "../checkpoints/VMAE_ViTB_16x4.pth"])

    # change output dir
    cfg.merge_from_list(["OUTPUT_DIR", "../output_dir/"])

    # change person threshold
    cfg.merge_from_list(["MODEL.STM.PERSON_THRESHOLD", person_threshold])

    # change sampling rate
    cfg.merge_from_list(["DATA.SAMPLING_RATE", sampling_rate])

    # change path for data_dir
    cfg.merge_from_list(["DATA.PATH_TO_DATA_DIR", "/work/ava"])

    # folder name of annotations
    cfg.merge_from_list(["AVA.ANNOTATION_DIR", "annotations/"])

    # file name of  frame_lists
    cfg.merge_from_list(["AVA.TRAIN_LISTS", ['sample.csv']])
    cfg.merge_from_list(["AVA.TEST_LISTS", ['sample.csv']])

    # file name of predicted_bboxes
    cfg.merge_from_list(["AVA.TRAIN_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']])
    cfg.merge_from_list(["AVA.TEST_GT_BOX_LISTS", ['ava_sample_predicted_boxes.csv']])

    # file name of exlusions
    cfg.merge_from_list(["AVA.EXCLUSION_FILE", 'ava_sample_train_excluded_timestamps_v2.2.csv'])

    # number of batches in test scenario
    cfg.merge_from_list(["TEST.VIDEOS_PER_BATCH", test_videos_batch])

    # number of workers
    cfg.merge_from_list(["DATALOADER.NUM_WORKERS", num_workers])
    
    
    return cfg


    

In [5]:
cfg = cfg_create(model_name, person_threshold, sampling_rate)

In [6]:
model = build_detection_model(cfg)

In [7]:
debug = True
if debug:
    # The shape of model input should be divisible into this. Otherwise, padding 0 to left and bottum. 
    print("cfg.DATALOADER.SIZE_DIVISIBILITY: ", cfg.DATALOADER.SIZE_DIVISIBILITY)
    
    # Sampling rate in constructing the clips.
    self_sample_rate =  cfg.DATA.SAMPLING_RATE
    print("cfg.DATA.SAMPLING_RATE: ", cfg.DATA.SAMPLING_RATE)
    
    # Length of clip
    self_video_length = cfg.DATA.NUM_FRAMES
    print("cfg.DATA.NUM_FRAMES: ", cfg.DATA.NUM_FRAMES)
    
    # Length of sequence frames from which a clip is constructed.
    self_seq_len = self_video_length * self_sample_rate
    print("self_seq_len: ", self_seq_len)
    
    self_num_classes = cfg.MODEL.STM.ACTION_CLASSES
    print("cfg.MODEL.STM.ACTION_CLASSES: ", self_num_classes)
    
    # Augmentation params.
    self_data_mean = cfg.DATA.MEAN
    self_data_std = cfg.DATA.STD
    self_use_bgr = cfg.AVA.BGR
    print("Augmentation params: ", self_data_mean, self_data_std, self_use_bgr)
    
    self_jitter_min_scale = cfg.DATA.TEST_MIN_SCALES
    self_jitter_max_scale = cfg.DATA.TEST_MAX_SCALE
    self_test_force_flip = cfg.AVA.TEST_FORCE_FLIP

    print("scale and flip params", self_jitter_min_scale, self_jitter_max_scale, self_test_force_flip)

cfg.DATALOADER.SIZE_DIVISIBILITY:  32
cfg.DATA.SAMPLING_RATE:  4
cfg.DATA.NUM_FRAMES:  16
self_seq_len:  64
cfg.MODEL.STM.ACTION_CLASSES:  80
Augmentation params:  [0.45, 0.45, 0.45] [0.225, 0.225, 0.225] False
scale and flip params [256] 1333 False


In [8]:
seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
print(seq_len)

64


In [9]:
model.to("cuda")

STMDetector(
  (backbone): ViT(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): La

### 5. loading weight

In [10]:
output_dir = cfg.OUTPUT_DIR
output_dir

'../output_dir/'

In [11]:
checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
checkpointer.load(cfg.MODEL.WEIGHT)

{}

In [12]:
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
num_gpus

1

In [13]:
mem_active = cfg.MODEL.STM.MEM_ACTIVE
mem_active  

False

In [14]:
device = torch.device("cuda")

In [15]:
num_devices = get_world_size()
num_devices

1

In [16]:
model.eval()

STMDetector(
  (backbone): ViT(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): La

### 3. VIDEO Info and Slicing Visualization


In [17]:
video_info = get_video_info(video_path)

In [18]:
video_info

{'frame_count': 1525,
 'frame_rate': 24.353,
 'width': 1920,
 'height': 1080,
 'fps': 24.353}

In [19]:
exp_dict['video_params'].update(video_info)
exp_dict

{'model_name': 'VMAEv2',
 'model_params': {'person_threshold': 0.3, 'sampling_rate': 4},
 'orig_post_processing': {'top_k': 5},
 'aggregation': {'method': {}, 'params': {}},
 'video_path': '../input_dir/Fighting_14.mp4',
 'slicing_params': {'slice_height': 600,
  'slice_width': 800,
  'overlap_ratio': 0},
 'video_params': {'st_frame_index': 900,
  'length_input': 300,
  'frame_count': 1525,
  'frame_rate': 24.353,
  'width': 1920,
  'height': 1080,
  'fps': 24.353}}

In [20]:
frame_height = video_info['height']
frame_width = video_info['width']

In [21]:
patches_coordinates = get_slice_bboxes(frame_height, frame_width, slice_height, slice_width, False, overlap_ratio)

In [22]:
len(patches_coordinates)

6

In [23]:
patches_coordinates

[[0, 0, 800, 600],
 [800, 0, 1600, 600],
 [1120, 0, 1920, 600],
 [0, 480, 800, 1080],
 [800, 480, 1600, 1080],
 [1120, 480, 1920, 1080]]

In [24]:
def crop_frame(frame, patches_coordinates):
    frame_slices = [frame[y1:y2, x1:x2] for x1, y1, x2, y2 in patches_coordinates]
    return frame_slices
    
    

### Inference

In [25]:
# Open video file
cap = cv2.VideoCapture(video_path)

# Initialize buffer
buffer = []

frames_tensor_list = []


# Read frames from video
while True:
    
    
    ret, frame = cap.read()
    if not ret:
        break

    frame_slices = crop_frame(frame, patches_coordinates)
    frames_prep = ava_preprocessing_cv2(frame_slices, cfg) # torch.Size([3, n_patches, 256, 455])
    
    print(frames_prep.shape)
    frames_tensor_list.append(frames_prep.permute(1, 0, 2, 3))
    
    if len(frames_tensor_list) == 48:
        video_tensor = torch.stack(frames_tensor_list).permute(1, 2, 0, 3, 4)
        clip_tensor = video_tensor[:,:, 0:-1:3,:,:]
        cente_frame_id = [24 for _ in range(10)]
        list_collated_batches = prepare_collated_batches_v2(clip_tensor, cfg)
        
        slow_video = torch.stack([torch.squeeze(item[0]) for item in list_collated_batches])
        fast_video = None
        whwh = torch.stack([torch.squeeze(item[2]) for item in list_collated_batches])
        boxes = list_collated_batches[0][3]
        labels = list_collated_batches[0][4]
        
        
        
        #slow_video, fast_video, whwh, boxes, labels, metadata, idx = list_collated_batches
        #clips_height, clips_width = slow_video.shape[-2:]
        slow_video = slow_video.to(device)
        if fast_video is not None:
            fast_video = fast_video.to(device)
        whwh = whwh.to(device)
    
        # INFERENCE
        action_score_list, box_list, objectness_score_list = model(slow_video, fast_video, whwh, boxes, labels)
        break
        

    # Add frame to buffer
    #buffer.append(frame)
    
    

    # Process frames if buffer size is equal to 48
    #process_frames(buffer)

# Release video capture
cap.release()


torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3, 6, 256, 341])
torch.Size([3,

NameError: name 'features' is not defined

In [39]:
slow_video.shape

torch.Size([6, 3, 16, 256, 352])

In [26]:
len(action_score_list)

4

In [29]:
action_score_list[0].shape

torch.Size([9, 80])

In [30]:
action_score_list[1].shape

torch.Size([3, 80])

In [31]:
action_score_list[2].shape

torch.Size([2, 80])

In [32]:
action_score_list[3].shape

torch.Size([2, 80])

In [38]:
box_list[3].shape

torch.Size([2, 4])

In [None]:
len(action_score_list)

In [None]:
slow_video.shape

In [None]:
 ss = torch.stack([torch.squeeze(item[0]) for item in list_collated_batches])

In [None]:
ss.shape

In [None]:
slow_video.shape

In [None]:
features[0].shape