In [1]:
import os

# The project modules are imported below as `src.*`, so the working directory
# must be the one that contains the `src` package (one level above the
# notebook's start directory). Only step up when `src` is not already visible:
# this keeps the cell idempotent, so re-running it does not climb further up
# the directory tree and break the imports.
if not os.path.isdir("src"):
    os.chdir("../")

In [2]:
import gc
import os
import time
import numpy as np
import random
from datetime import datetime
from functools import partial
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from torchvision.transforms import Compose
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR

from src.opts.opts import parser
from src.utils.reproducibility import make_reproducible
from src.utils.model_specs import model_size
from src.models.model import VideoModel
from src.dataset.video_dataset import VideoDataset, prepare_clips_data
from src.dataset.video_transforms import (
    GroupMultiScaleCrop, Stack, ToTorchFormatTensor, GroupNormalize,
)
from src.utils.meters import AverageMeter
from src.utils.metrics import calc_accuracy

%load_ext autoreload
%autoreload 2

In [3]:
# Reproducibility: fix the initial random state through the project helper
# before the model is created.
make_reproducible(random_seed=0)

# Load the default configuration (empty argv, because there is no command line
# inside a notebook) and select the architecture for this run.
args = parser.parse_args([])
args.base_model = "HORST"
print(args)

# Only the HoloAssist dataset is supported here; anything else is rejected.
if args.dataset_name != "holoassist":
    raise NotImplementedError()
num_classes = 1887  # actions

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the video model from the parsed options and move it to the device.
model = VideoModel(
    num_classes=num_classes,
    num_segments=args.num_segments,
    base_model=args.base_model,
    fusion_mode=args.fusion_mode,
    dropout=args.dropout,
    verbose=True,
)
model = model.to(device)

Making reproducible on seed 0
Namespace(holoassist_dir='/Users/artemmerinov/data/holoassist', raw_annotation_file='/Users/artemmerinov/data/holoassist/data-annotation-trainval-v1_1.json', split_dir='/Users/artemmerinov/data/holoassist/data-splits-v1', fine_grained_actions_map_file='/Users/artemmerinov/data/holoassist/fine_grained_actions_map.txt', dataset_name='holoassist', fusion_mode=None, base_model='HORST', num_segments=8, dropout=0.5, resume=None, start_epoch=0, num_epochs=10, lr=0.01, momentum=0.9, weight_decay=0.0005, clip_gradient=None, checkpoint_interval=3, runs_path='runs/', batch_size=16, num_workers=4, prefetch_factor=2)


  from .autonotebook import tqdm as notebook_tqdm
  model = create_fn(


#params: 6144 base_model.backbone_model.model.patch_embed.proj.weight
#params: 128 base_model.backbone_model.model.patch_embed.proj.bias
#params: 128 base_model.backbone_model.model.patch_embed.norm.weight
#params: 128 base_model.backbone_model.model.patch_embed.norm.bias
#params: 128 base_model.backbone_model.model.layers.0.blocks.0.norm1.weight
#params: 128 base_model.backbone_model.model.layers.0.blocks.0.norm1.bias
#params: 676 base_model.backbone_model.model.layers.0.blocks.0.attn.relative_position_bias_table
#params: 49152 base_model.backbone_model.model.layers.0.blocks.0.attn.qkv.weight
#params: 384 base_model.backbone_model.model.layers.0.blocks.0.attn.qkv.bias
#params: 16384 base_model.backbone_model.model.layers.0.blocks.0.attn.proj.weight
#params: 128 base_model.backbone_model.model.layers.0.blocks.0.attn.proj.bias
#params: 128 base_model.backbone_model.model.layers.0.blocks.0.norm2.weight
#params: 128 base_model.backbone_model.model.layers.0.blocks.0.norm2.bias
#params: 655

In [4]:
# Sanity check: list the checkpoint file to confirm it exists at this path.
# NOTE(review): absolute, machine-specific path; presumably the pretrained
# backbone weights used by the model. Confirm and make it configurable.
!ls /Users/artemmerinov/data/backbones/FAttentionRNN-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar

/Users/artemmerinov/data/backbones/FAttentionRNN-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar


In [5]:
# Display the module tree of the instantiated model (cell's last expression).
model

VideoModel(
  (base_model): AttentionRNN(
    (backbone_model): SwinBackbone(
      (model): SwinTransformer(
        (patch_embed): PatchEmbed(
          (proj): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (layers): Sequential(
          (0): SwinTransformerStage(
            (downsample): Identity()
            (blocks): Sequential(
              (0): SwinTransformerBlock(
                (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
                (attn): WindowAttention(
                  (qkv): Linear(in_features=128, out_features=384, bias=True)
                  (attn_drop): Dropout(p=0.0, inplace=False)
                  (proj): Linear(in_features=128, out_features=128, bias=True)
                  (proj_drop): Dropout(p=0.0, inplace=False)
                  (softmax): Softmax(dim=-1)
                )
                (drop_path1): Identity()
                (norm2)

In [7]:

# Read the preprocessing settings exposed by the model. Below, `crop_size`
# feeds GroupMultiScaleCrop, `div` feeds ToTorchFormatTensor and
# `input_mean` / `input_std` feed GroupNormalize.
crop_size, scale_size = model.crop_size, model.scale_size
input_mean, input_std = model.input_mean, model.input_std
div = model.div
learnable_named_parameters = model.learnable_named_parameters

# Multi-GPU wrapping is currently disabled:
# model = torch.nn.DataParallel(model)
# model = torch.nn.DataParallel(model).to(device)

#  ========================= TRAIN DATA =========================
# Build the clip index, the transform chain, the dataset and the loader for
# the training split.

print("tr_dataset", flush=True)

# Per-clip arrays (video path, start, end, action id) for the train split;
# the fifth returned value is not needed here.
tr_clip_path_to_video_arr, tr_clip_start_arr, tr_clip_end_arr, tr_clip_action_id_arr, _ = prepare_clips_data(
    raw_annotation_file=args.raw_annotation_file,
    holoassist_dir=args.holoassist_dir,
    split_dir=args.split_dir,
    fine_grained_actions_map_file=args.fine_grained_actions_map_file,
    mode="train",
)

# Crop -> stack frames -> convert to tensor -> normalise with the model's
# own input statistics.
tr_transform = Compose([
    GroupMultiScaleCrop(input_size=crop_size, scales=[1, .875]),
    Stack(),
    ToTorchFormatTensor(div=div),
    GroupNormalize(mean=input_mean, std=input_std),
])

tr_dataset = VideoDataset(
    clip_path_to_video_arr=tr_clip_path_to_video_arr,
    clip_start_arr=tr_clip_start_arr,
    clip_end_arr=tr_clip_end_arr,
    clip_label_arr=tr_clip_action_id_arr,
    num_segments=args.num_segments,
    transform=tr_transform,
    mode="train",
)

# `prefetch_factor` is only meaningful with worker processes; recent PyTorch
# versions raise ValueError if it is set while num_workers == 0. Pass it only
# when workers are used so single-process debugging runs keep working.
tr_loader_kwargs = {}
if args.num_workers > 0:
    tr_loader_kwargs["prefetch_factor"] = args.prefetch_factor

tr_dataloader = DataLoader(
    dataset=tr_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,
    drop_last=True,  # drop the final incomplete batch
    pin_memory=False,
    **tr_loader_kwargs,
)

#  ========================= VALIDATION DATA =========================
# Build the clip index, the transform chain, the dataset and the loader for
# the validation split.

print("va_dataset", flush=True)

# Per-clip arrays (video path, start, end, action id) for the validation
# split; the fifth returned value is not needed here.
va_clip_path_to_video_arr, va_clip_start_arr, va_clip_end_arr, va_clip_action_id_arr, _ = prepare_clips_data(
    raw_annotation_file=args.raw_annotation_file,
    holoassist_dir=args.holoassist_dir,
    split_dir=args.split_dir,
    fine_grained_actions_map_file=args.fine_grained_actions_map_file,
    mode="validation",
)

# NOTE(review): validation reuses the same GroupMultiScaleCrop as training.
# If that transform samples crops/scales randomly, validation metrics will be
# noisy; a deterministic resize + centre crop may be intended (`scale_size`
# is read from the model but never used). Confirm.
va_transform = Compose([
    GroupMultiScaleCrop(input_size=crop_size, scales=[1, .875]),
    Stack(),
    ToTorchFormatTensor(div=div),
    GroupNormalize(mean=input_mean, std=input_std),
])

va_dataset = VideoDataset(
    clip_path_to_video_arr=va_clip_path_to_video_arr,
    clip_start_arr=va_clip_start_arr,
    clip_end_arr=va_clip_end_arr,
    clip_label_arr=va_clip_action_id_arr,
    num_segments=args.num_segments,
    transform=va_transform,
    mode="validation",
)

# `prefetch_factor` is only meaningful with worker processes; recent PyTorch
# versions raise ValueError if it is set while num_workers == 0. Pass it only
# when workers are used so single-process debugging runs keep working.
va_loader_kwargs = {}
if args.num_workers > 0:
    va_loader_kwargs["prefetch_factor"] = args.prefetch_factor

va_dataloader = DataLoader(
    dataset=va_dataset,
    batch_size=args.batch_size,
    shuffle=False,  # keep evaluation order fixed
    num_workers=args.num_workers,
    drop_last=False,  # evaluate every clip
    pin_memory=False,
    **va_loader_kwargs,
)


tr_dataset


There are 7 videos in the list There are 13 videos as video files There are 0 videos that present in the list but are missing as videos.
Number of clips: 135 for mode train
va_dataset
There are 213 videos in the list There are 13 videos as video files There are 206 videos that present in the list but are missing as videos.
Number of clips: 135 for mode validation


In [8]:

# ============ Loss, optimiser and learning-rate schedule ============

# Classification loss, placed on the same device as the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Plain SGD with momentum and weight decay over all model parameters.
# NOTE(review): `learnable_named_parameters` is read from the model earlier
# but the optimiser receives `model.parameters()`; confirm this is intended.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=args.lr,
    momentum=args.momentum,
    weight_decay=args.weight_decay,
)

# Cosine decay of the learning rate towards eta_min over `num_epochs` steps;
# the scheduler is stepped once per epoch in the training loop.
lr_scheduler = CosineAnnealingLR(optimizer, T_max=args.num_epochs, eta_min=1e-7, last_epoch=-1)

# Optionally resume training from a checkpoint file given by `args.resume`.
# Raises ValueError when the path is set but does not point to a file.
if args.resume:
    if not os.path.isfile(args.resume):
        raise ValueError(f"=> No checkpoint found at {args.resume}")

    # Load the checkpoint file that contains all the states. `map_location`
    # remaps stored tensors onto the current device, so a checkpoint written
    # on a GPU machine can still be loaded on a CPU-only one (and vice versa).
    print(f"=> Loading checkpoint {args.resume}")
    checkpoint = torch.load(f=args.resume, map_location=device)

    # Restore model, optimiser and scheduler state, then continue with the
    # epoch after the one stored in the checkpoint.
    model.load_state_dict(state_dict=checkpoint['model_state_dict'])
    optimizer.load_state_dict(state_dict=checkpoint['optimizer_state_dict'])
    lr_scheduler.load_state_dict(state_dict=checkpoint['lr_scheduler_state_dict'])
    args.start_epoch = checkpoint['epoch'] + 1

In [9]:
    
# ==================== Main train-validation loop =================================
# NOTE(review): only the training phase is implemented in this cell;
# `va_dataloader` is not consumed here.

# Print a progress line every this many training batches.
LOG_EVERY_N_BATCHES = 20

for epoch in range(args.start_epoch, args.num_epochs):

    # Reproducibility.
    # Re-seed with the current epoch index so that a run resumed from a
    # checkpoint starts the epoch from the same random state.
    make_reproducible(random_seed=epoch)

    print(f"\nEpoch {epoch}",
          f"LR={optimizer.param_groups[0]['lr']:.7f}",
          f"time={datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
          flush=True)

    # TRAIN
    # =====
    print("\nTRAIN")

    # Running, sample-weighted averages over the epoch.
    tr_epoch_loss = AverageMeter()
    tr_epoch_acc1 = AverageMeter()
    tr_epoch_acc5 = AverageMeter()

    model.train()
    for tr_batch_id, tr_batch in enumerate(tr_dataloader):

        # Author's shape note: video batch [n, t_c, h, w], labels [n].
        tr_x = tr_batch[0].to(device)
        tr_y = tr_batch[1].to(device)
        tr_n = tr_x.size(0)

        # Forward pass and loss for the train batch.
        tr_preds = model(tr_x)
        tr_loss = criterion(tr_preds, tr_y)

        # Metrics are for reporting only, so keep them out of the autograd graph.
        with torch.no_grad():
            tr_acc1, tr_acc5 = calc_accuracy(preds=tr_preds.detach(), labels=tr_y, topk=(1, 5))

        # Zero the gradients, then back-propagate the loss through all
        # learnable parameters.
        optimizer.zero_grad(set_to_none=True)
        tr_loss.backward()

        # Optionally clip the global gradient norm.
        if args.clip_gradient is not None:
            clip_grad_norm_(parameters=model.parameters(), max_norm=args.clip_gradient)

        # Update the weights using the optimizer.
        optimizer.step()

        # Read the scalar loss once; `.item()` already returns a plain Python
        # number, and every call forces a device synchronisation.
        tr_loss_value = tr_loss.item()

        # Keep track of epoch metrics (weighted by batch size).
        tr_epoch_loss.update(value=tr_loss_value, n=tr_n)
        tr_epoch_acc1.update(value=tr_acc1, n=tr_n)
        tr_epoch_acc5.update(value=tr_acc5, n=tr_n)

        if tr_batch_id % LOG_EVERY_N_BATCHES == 0:
            print(f"tr_batch_id={tr_batch_id:04d}/{len(tr_dataloader):04d}",
                  f"tr_batch_loss={tr_loss_value:.3f}",
                  f"tr_batch_acc@1={tr_acc1:.3f}",
                  f"tr_batch_acc@5={tr_acc5:.3f}",
                  "|",
                  f"tr_epoch_loss={tr_epoch_loss.avg:.3f}",
                  f"tr_epoch_acc@1={tr_epoch_acc1.avg:.3f}",
                  f"tr_epoch_acc@5={tr_epoch_acc5.avg:.3f}",
                  flush=True)

        # Drop references to the per-batch objects before the next iteration
        # and force a collection to keep memory usage down.
        del tr_preds, tr_loss, tr_acc1, tr_acc5, tr_batch, tr_x, tr_y
        gc.collect()
        # torch.cuda.empty_cache() # expensive call

    # Adjust the learning rate after the training epoch.
    lr_scheduler.step()

Making reproducible on seed 0

Epoch 0 LR=0.0100000 time=2024-05-02 22:11:58

TRAIN
tr_batch_id=0000/0008 tr_batch_loss=7.715 tr_batch_acc@1=0.000 tr_batch_acc@5=0.000 | tr_epoch_loss=7.715 tr_epoch_acc@1=0.000 tr_epoch_acc@5=0.000
