In [1]:
import os
os.chdir('../')

In [7]:
import json
import torch
import numpy as np
from time import sleep

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline 

import math
import random
from collections import defaultdict
from einops.layers.torch import Rearrange

import torch
from torchvision.transforms import Compose
from torch.utils.data import DataLoader

from src.opts.opts import parser
from src.utils.reproducibility import make_reproducible
from src.models.model import VideoModel
from src.dataset.video_dataset import VideoDataset
from src.dataset.video_dataset import prepare_clips_data
from src.dataset.video_transforms import (
    IdentityTransform,
    GroupScale, 
    GroupCenterCrop, 
    GroupRandomCrop,
    GroupMultiScaleCrop,
    Stack, 
    ToTorchFormatTensor, 
    GroupNormalize, 
    GroupRandomHorizontalFlip
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
args = parser.parse_args(args=[])
args.base_model = "TimeSformer"
args.__dict__

{'holoassist_dir': '/Users/artemmerinov/data/holoassist',
 'raw_annotation_file': '/Users/artemmerinov/data/holoassist/data-annotation-trainval-v1_1.json',
 'split_dir': '/Users/artemmerinov/data/holoassist/data-splits-v1',
 'fine_grained_actions_map_file': '/Users/artemmerinov/data/holoassist/fine_grained_actions_map.txt',
 'dataset_name': 'holoassist',
 'fusion_mode': None,
 'base_model': 'TimeSformer',
 'num_segments': 8,
 'dropout': 0.5,
 'resume': None,
 'start_epoch': 0,
 'num_epochs': 10,
 'batch_size': 16,
 'lr': 0.01,
 'momentum': 0.9,
 'weight_decay': 0.0005,
 'clip_gradient': None,
 'checkpoint_interval': 3,
 'runs_path': 'runs/',
 'num_workers': 4}

In [9]:
if args.dataset_name == 'holoassist':
    num_classes = 1887 # actions
    tr_list_file = "/Users/artemmerinov/data/holoassist/data-splits-v1.train-v1.txt"
    va_list_file = "/Users/artemmerinov/data/holoassist/data-splits-v1.val-v1.txt"
else:
    raise NotImplementedError()

In [10]:
# Fix random state to make results reproducible
make_reproducible(random_seed=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VideoModel(
    num_classes=num_classes, 
    num_segments=args.num_segments, 
    base_model=args.base_model,
    fusion_mode=args.fusion_mode,
    dropout=args.dropout,
    verbose=False,
).to(device)
# print(model)

input_size = model.input_size
crop_size = model.crop_size
scale_size = model.scale_size
input_mean = model.input_mean
input_std = model.input_std
learnable_named_parameters = model.learnable_named_parameters

Making reproducible on seed 0
######USING ATTENTION STYLE:  divided
self.base_model.last_layer_name head


In [24]:
model

VideoModel(
  (base_model): Timesformer(
    (patch_embed): VideoPatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-11): 12 x SpaceTimeBlock(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): VarAttention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (timeattn): VarAttention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_a

In [7]:
tr_clip_path_to_video_arr, tr_clip_start_arr, tr_clip_end_arr, tr_clip_action_id_arr, tr_clip_mistake_arr = prepare_clips_data(
    raw_annotation_file=args.raw_annotation_file,
    holoassist_dir=args.holoassist_dir, 
    split_dir=args.split_dir,
    fine_grained_actions_map_file=args.fine_grained_actions_map_file,
    mode="train",
)

tr_transform = Compose([
    GroupMultiScaleCrop(crop_size, [1, .875]),
    GroupRandomHorizontalFlip(),
    Stack(roll=False),
    ToTorchFormatTensor(div=(args.base_model not in ['BNInception'])),
    GroupNormalize(mean=input_mean, std=input_std),
])

tr_dataset = VideoDataset(
    clip_path_to_video_arr=tr_clip_path_to_video_arr,
    clip_start_arr=tr_clip_start_arr,
    clip_end_arr=tr_clip_end_arr,
    clip_label_arr=tr_clip_action_id_arr,
    num_segments=args.num_segments,
    transform=tr_transform,
    mode="train"
)

There are 7 videos in the list There are 13 videos as video files There are 0 videos that present in the list but are missing as videos.
Number of clips: 507 for mode train


In [18]:
x, y = tr_dataset[123]
x = Rearrange("(t c) h w -> 1 (t c) h w", c=3, t=args.num_segments, h=input_size, w=input_size)(x)
x.size()

torch.Size([1, 24, 224, 224])

In [19]:
# model(
#     torch.rand((2*8, 3, 224, 224)) # n t c h w
# )

In [23]:
model(
    torch.rand((2*8, 3, 224, 224))
)

EinopsError: Shape mismatch, 3 != 0

In [20]:
model(x)

torch.Size([1, 8, 3, 224, 224])


RuntimeError: Output 0 of ReshapeAliasBackward0 is a view and is being modified inplace. This view is the output of a function that returns multiple views. Such functions do not allow the output views to be modified inplace. You should replace the inplace operation by an out-of-place one.