In [1]:
import os
import sys
# Add the sibling Video-Swin-Transformer directory (as an absolute path) to the
# module search path so that packages inside it can be imported below.
sys.path.append(os.path.abspath("../Video-Swin-Transformer"))

In [2]:
# Change the working directory to a location that the code prefers.
# The path is relative, so it is resolved against whatever the current
# working directory is when this cell runs.
os.chdir("../Video-Swin-Transformer")

In [3]:
import argparse
import copy
import os.path as osp
import time
import warnings

import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist, set_random_seed
from mmcv.utils import get_git_hash

from mmaction import __version__
from mmaction.apis import train_model
from mmaction.datasets import build_dataset
from mmaction.models import build_model
from mmaction.utils import collect_env, get_root_logger, register_module_hooks

In [4]:
!pip3 install wandb -qqq

In [5]:
import wandb
# Log in to your W&B account; this happens before the run is created with
# wandb.init() further down.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33maswin_thiru[0m (use `wandb login --relogin` to force relogin)


True

In [6]:
# Name of the W&B project; passed as `project=` to wandb.init() later on
wandb_project_name = 'bsl'

In [7]:
# TODO import test functions from mmcv and delete them from mmaction2
# Prefer the implementations shipped with mmcv; fall back to the copies in
# mmaction (with a warning) when mmcv.engine cannot be imported.
try:
    from mmcv.engine import multi_gpu_test, single_gpu_test
except ImportError:  # ModuleNotFoundError is a subclass, so it is covered too
    warnings.warn(
        'DeprecationWarning: single_gpu_test, multi_gpu_test, '
        'collect_results_cpu, collect_results_gpu from mmaction2 will be '
        'deprecated. Please install mmcv through master branch.')
    from mmaction.apis import multi_gpu_test, single_gpu_test

In [8]:
def parse_args(parse_options=None):
    """Build the training argument parser and parse the given options.

    Args:
        parse_options (list[str] | None): Tokens to parse instead of the
            process command line. With ``None`` argparse reads ``sys.argv``.

    Returns:
        argparse.Namespace: The parsed options.

    Side effects:
        Sets ``os.environ['LOCAL_RANK']`` to ``str(args.local_rank)`` when
        that variable is not already present in the environment.
    """
    cli = argparse.ArgumentParser(description='Train a recognizer')

    # Config file plus output / checkpoint locations.
    cli.add_argument('config', help='train config file path')
    cli.add_argument('--work-dir', help='the dir to save logs and models')
    cli.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    cli.add_argument(
        '--load-from', help='the checkpoint file to load from')

    # Evaluation switches.
    cli.add_argument(
        '--validate',
        action='store_true',
        help='whether to evaluate the checkpoint during training')
    cli.add_argument(
        '--test-last',
        action='store_true',
        help='whether to test the checkpoint after training')
    cli.add_argument(
        '--test-best',
        action='store_true',
        help=('whether to test the best checkpoint (if applicable) after '
              'training'))

    # GPU selection: --gpus and --gpu-ids cannot be combined.
    gpu_choice = cli.add_mutually_exclusive_group()
    gpu_choice.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    gpu_choice.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')

    # Reproducibility.
    cli.add_argument('--seed', type=int, default=None, help='random seed')
    cli.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')

    # Free-form config overrides, collected into a dict by DictAction.
    cli.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")

    # Launcher settings.
    cli.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    cli.add_argument('--local_rank', type=int, default=0)

    # argparse treats ``None`` as "read sys.argv[1:]", so a single call covers
    # both the explicit-list and the command-line case.
    args = cli.parse_args(parse_options)

    # Only fills in LOCAL_RANK when the environment does not define it yet.
    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))

    return args

In [9]:
# Set up the configuration file and the checkpoint file
config_file = '../configs/bsl_config.py'
check_point_file = '../configs/swin_tiny_patch244_window877_kinetics400_1k.pth'
# , "model.backbone.pretrained="+check_point_file
# cmd_options = [config_file, "--cfg-options", "model.backbone.use_checkpoint=True", "--load-from", check_point_file,
#                "--seed", "12345"]
# Command-line style tokens that are handed to parse_args() in place of sys.argv
cmd_options = [config_file, "--cfg-options", "model.backbone.use_checkpoint=True", "--load-from", check_point_file,
              "--validate", "--seed", "12345"]

In [10]:
# Distributed training is switched off; the `if distributed:` branches in the
# cells below are therefore skipped.
distributed = False

In [11]:
# Create a configuration object that describes the training and testing
args = parse_args(cmd_options)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)

# Customization for training the BSL data set
# https://mmcv.readthedocs.io/en/latest/_modules/mmcv/runner/epoch_based_runner.html
cfg.workflow = [('train', 1), ('val', 1)]
# cfg.workflow = [('train', 1), ]
cfg.model.cls_head.num_classes = 5

# Work directory precedence: --work-dir > value in the config file >
# ./work_dirs/<config file name without extension>
if args.work_dir is not None:
    cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
    cfg.work_dir = osp.join('./work_dirs',
                            osp.splitext(osp.basename(args.config))[0])

# Checkpoint to resume from / load from: a command-line value wins, otherwise
# keep whatever the config file specifies (None when it specifies nothing)
cfg.resume_from = (args.resume_from if args.resume_from is not None
                   else cfg.get('resume_from', None))
cfg.load_from = (args.load_from if args.load_from is not None
                 else cfg.get('load_from', None))

# One GPU
cfg.gpu_ids = range(1)

# The flag is used to determine whether it is omnisource training
# Omnisource reference: https://arxiv.org/abs/2003.13042
cfg.setdefault('omnisource', False)

# The flag is used to register module's hooks
cfg.setdefault('module_hooks', [])

# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

# dump config
cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))

# init logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' +
            dash_line)
meta['env_info'] = env_info

# log some basic info
logger.info(f'Distributed training: {distributed}')
logger.info(f'Config: {cfg.pretty_text}')

# Set seed for training (skipped when no --seed was given, so that a missing
# seed is not passed on to the random number generators)
if args.seed is not None:
    logger.info(f'Set random seed to {args.seed}, '
                f'deterministic: {args.deterministic}')
    set_random_seed(args.seed, deterministic=args.deterministic)

cfg.seed = args.seed
meta['seed'] = args.seed
meta['config_name'] = osp.basename(args.config)
meta['work_dir'] = osp.basename(cfg.work_dir.rstrip('/\\'))

../Video-Swin-Transformer/configs/_base_/models/swin/swin_tiny.py
../Video-Swin-Transformer/configs/_base_/default_runtime.py


2022-04-08 00:35:12,321 - mmaction - INFO - Environment info:
------------------------------------------------------------
sys.platform: linux
Python: 3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:59:51) [GCC 9.4.0]
CUDA available: True
GPU 0: Tesla T4
CUDA_HOME: /usr/local/cuda
NVCC: Build cuda_11.6.r11.6/compiler.30794723_0
GCC: gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
PyTorch: 1.11.0a0+17540c5
PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201402
  - Intel(R) Math Kernel Library Version 2019.0.5 Product Build 20190808 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.3.3 (Git Hash N/A)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 11.6
  - NVCC architecture flags: -gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode

In [12]:
# Create the dataset
datasets = [build_dataset(cfg.data.train)]

# Validation is setup as a hook that kicks off every 5 iterations
# This is required for wandb to kickk off every iteration
if 1:
    # Create the validation dataset
    val_dataset = copy.deepcopy(cfg.data.val)
    datasets.append(build_dataset(val_dataset))

In [13]:
# NOTE(review): at this point val_dataset is the deep copy of cfg.data.val made
# in the previous cell (a config object, not a built dataset), so this loop
# only binds `batch` to its first item. Looks like leftover debugging --
# confirm whether it can be removed.
for batch in val_dataset:
    break

In [14]:
# Which model to test after training, best or last?
test_option = dict(test_last=args.test_last, test_best=args.test_best)

In [15]:
# Build the model described by cfg.model, passing along the optional
# train_cfg / test_cfg sections of the config (None when they are absent)
model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


# Train the model

In [16]:
import apex
from mmaction.core import DistEvalHook, EvalHook
from mmaction.datasets import build_dataloader, build_dataset
from mmcv_custom.runner import EpochBasedRunnerAmp
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import DistSamplerSeedHook, EpochBasedRunner, OptimizerHook, build_optimizer, get_dist_info

In [17]:
# Get the root logger
logger = get_root_logger(log_level=cfg.log_level)


# Default settings for the workflow dataloaders.
# The per-GPU batch size is divided by the optimizer's `update_interval`
# (presumably the number of gradient-accumulation steps -- TODO confirm).
dataloader_setting = dict(
    videos_per_gpu=cfg.data.get('videos_per_gpu', 1) // cfg.optimizer_config.get('update_interval', 1),
    workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
    num_gpus=len(cfg.gpu_ids),
    dist=distributed,
    seed=cfg.seed)

# Entries of an optional cfg.data.train_dataloader section override the defaults above
dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {}))
# One dataloader per dataset in `datasets`, all built with the same settings
data_loaders = [build_dataloader(ds, **dataloader_setting) for ds in datasets]

# Separate validation dataset, built with test_mode=True, and its own
# non-shuffling dataloader settings (full videos_per_gpu, no update_interval split)
val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
dataloader_setting = dict(
    videos_per_gpu=cfg.data.get('videos_per_gpu', 1),
    workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
    # cfg.gpus will be ignored if distributed
    num_gpus=len(cfg.gpu_ids),
    dist=distributed,
    shuffle=False)

# Entries of an optional cfg.data.val_dataloader section override the defaults above
dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {}))
val_dataloader = build_dataloader(val_dataset, **dataloader_setting)

In [18]:
# build optimizer
optimizer = build_optimizer(model, cfg.optimizer)
# Move the model to the GPU and let apex AMP wrap model and optimizer at
# optimization level O1; both names are rebound to the returned objects.
model, optimizer = apex.amp.initialize(model.cuda(), optimizer, opt_level="O1")

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [19]:
# Switch on the `fp16_enabled` flag on every sub-module that exposes one
for m in model.modules():
    if hasattr(m, "fp16_enabled"):
        m.fp16_enabled = True

In [20]:
# Put the model on the GPU(s) for training
if distributed:
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    # Sets the `find_unused_parameters` parameter in
    # torch.nn.parallel.DistributedDataParallel
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)
else:
    # Non-distributed case: place the model on the first configured GPU and
    # wrap it for the GPU ids listed in cfg.gpu_ids
    model = MMDataParallel(
        model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

In [21]:
# Create the runner object that will drive training
Runner = EpochBasedRunnerAmp
runner = Runner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)

# an ugly workaround to make .log and .log.json filenames the same
runner.timestamp = timestamp

# Optimizer hook settings, taken unchanged from the config
optimizer_config = cfg.optimizer_config

# register hooks
runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None))

# The sampler seed hook is only registered for distributed training
if distributed:
    runner.register_hook(DistSamplerSeedHook())

In [22]:
# 
# eval_cfg = cfg.get('evaluation', {})
# eval_hook = DistEvalHook if distributed else EvalHook
# runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

In [23]:
# apex.amp.initialize() was applied to the model and optimizer unconditionally
# earlier in this notebook, so AMP is in use for this run. (`use_amp` was
# previously undefined, which raised NameError whenever cfg.resume_from was set.)
use_amp = True

# Restore state, in order of precedence: explicit resume checkpoint,
# auto-resume from work_dir/latest.pth, or plain weight loading.
if cfg.resume_from:
    runner.resume(cfg.resume_from, resume_amp=use_amp)
elif cfg.get("auto_resume", False) and osp.exists(osp.join(runner.work_dir, 'latest.pth')):
    # NOTE(review): assumes the runner class provides auto_resume() -- confirm
    runner.auto_resume()
elif cfg.load_from:
    runner.load_checkpoint(cfg.load_from)

2022-04-08 00:35:14,731 - mmaction - INFO - load checkpoint from local path: ../configs/swin_tiny_patch244_window877_kinetics400_1k.pth

size mismatch for cls_head.fc_cls.weight: copying a param with shape torch.Size([400, 768]) from checkpoint, the shape in current model is torch.Size([5, 768]).
size mismatch for cls_head.fc_cls.bias: copying a param with shape torch.Size([400]) from checkpoint, the shape in current model is torch.Size([5]).


## Dashboarding using wandb

In [24]:
# Start a W&B run in the chosen project and pass the training config along with it
wandb.init(project=wandb_project_name, config=cfg)

In [25]:
from mmcv.runner import Hook
from torch.utils.data import DataLoader
from mmaction.apis import single_gpu_test


class WandBHook(Hook):  # noqa: F811
    """Non-distributed hook that logs learning rate and evaluation results.

    Notes:
        If new arguments are added for EvalHook, tools/test.py,
        tools/eval_metric.py may be affected.

    Before every training epoch the learning rate of the optimizer's first
    parameter group is logged. After every validation epoch the model is run
    over ``dataloader`` with ``single_gpu_test``, the dataset's ``evaluate``
    method is called on the results, and the returned metrics are logged.

    Args:
        dataloader (DataLoader): A PyTorch dataloader.
        wandb_obj: Object used for logging; must provide ``log(dict)``
            (e.g. the ``wandb`` module).
        optimizer_obj: Optimizer whose ``param_groups[0]['lr']`` is logged.
            If ``None``, ``runner.optimizer`` is used instead.
        **eval_kwargs: Evaluation arguments fed into the evaluate function
            of the dataset.

    Raises:
        TypeError: If ``dataloader`` is not a PyTorch ``DataLoader``.
    """

    def __init__(self,
                 dataloader,
                 wandb_obj,
                 optimizer_obj,
                 **eval_kwargs):

        if not isinstance(dataloader, DataLoader):
            raise TypeError(f'dataloader must be a pytorch DataLoader, '
                            f'but got {type(dataloader)}')

        self.dataloader = dataloader
        # Keep the objects that were passed in rather than reaching for
        # module-level globals of the same name.
        self.wandb = wandb_obj
        self.optimizer = optimizer_obj
        self.eval_kwargs = eval_kwargs

    def before_train_epoch(self, runner):
        """Called before every train epoch to log the learning rate."""
        optimizer = self.optimizer if self.optimizer is not None else runner.optimizer
        self.wandb.log({"lr": optimizer.param_groups[0]['lr']})

    def after_val_epoch(self, runner):
        """Called after every validation epoch to evaluate the results."""
        self._do_evaluate(runner)

    def _do_evaluate(self, runner):
        """Run the model over the dataloader, evaluate, and log the metrics."""
        results = single_gpu_test(runner.model, self.dataloader)
        eval_res = self.dataloader.dataset.evaluate(results, logger=runner.logger, **self.eval_kwargs)
        self.wandb.log(eval_res)

In [26]:
# Register the W&B hook on the runner, handing it the validation dataloader,
# the wandb module and the optimizer
runner.register_hook(WandBHook(val_dataloader, wandb, optimizer))

In [27]:
# No extra keyword arguments are forwarded to the runner
runner_kwargs = dict()
# Start the run: dataloaders, the workflow stages and cfg.total_epochs as the
# epoch limit
runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)

2022-04-08 00:35:16,724 - mmaction - INFO - Start running, host: root@ip-10-0-0-144, work_dir: /workspace/Video-Swin-Transformer/work_dirs/k400_swin_tiny_patch244_window877.py
2022-04-08 00:35:16,724 - mmaction - INFO - Hooks will be executed in the following order:
before_run:
(VERY_HIGH   ) CosineAnnealingLrUpdaterHook       
(ABOVE_NORMAL) DistOptimizerHook                  
(NORMAL      ) CheckpointHook                     
(VERY_LOW    ) TextLoggerHook                     
 -------------------- 
before_train_epoch:
(VERY_HIGH   ) CosineAnnealingLrUpdaterHook       
(NORMAL      ) WandBHook                          
(LOW         ) IterTimerHook                      
(VERY_LOW    ) TextLoggerHook                     
 -------------------- 
before_train_iter:
(VERY_HIGH   ) CosineAnnealingLrUpdaterHook       
(LOW         ) IterTimerHook                      
 -------------------- 
after_train_iter:
(ABOVE_NORMAL) DistOptimizerHook                  
(NORMAL      ) CheckpointHook     

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.9 task/s, elapsed: 19s, ETA:     0s

2022-04-08 00:37:03,966 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:37:03,967 - mmaction - INFO - 
top1_acc	0.4182
top5_acc	1.0000
2022-04-08 00:37:03,969 - mmaction - INFO - Epoch(val) [1][28]	top1_acc: 0.4182, top5_acc: 1.0000, loss_cls: 1.5456, loss: 1.5456
2022-04-08 00:37:19,872 - mmaction - INFO - Epoch [2][20/111]	lr: 5.159e-05, eta: 0:16:52, time: 0.795, data_time: 0.144, memory: 2833, top1_acc: 0.5250, top5_acc: 1.0000, loss_cls: 1.5009, loss: 1.5009
2022-04-08 00:37:32,880 - mmaction - INFO - Epoch [2][40/111]	lr: 5.801e-05, eta: 0:16:37, time: 0.650, data_time: 0.001, memory: 2833, top1_acc: 0.5500, top5_acc: 1.0000, loss_cls: 1.4020, loss: 1.4020
2022-04-08 00:37:45,885 - mmaction - INFO - Epoch [2][60/111]	lr: 6.442e-05, eta: 0:16:22, time: 0.650, data_time: 0.000, memory: 2835, top1_acc: 0.6500, top5_acc: 1.0000, loss_cls: 1.3075, loss: 1.3075
2022-04-08 00:37:58,914 - mmaction - INFO - Epoch [2][80/111]	lr: 7.084e-05, eta: 0:16:08, time: 0.651, data_t

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.9 task/s, elapsed: 19s, ETA:     0s

2022-04-08 00:38:48,875 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:38:48,877 - mmaction - INFO - 
top1_acc	0.5818
top5_acc	1.0000
2022-04-08 00:38:48,879 - mmaction - INFO - Epoch(val) [2][28]	top1_acc: 0.5818, top5_acc: 1.0000, loss_cls: 1.0687, loss: 1.0687


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


2022-04-08 00:39:04,757 - mmaction - INFO - Epoch [3][20/111]	lr: 8.435e-05, eta: 0:15:07, time: 0.794, data_time: 0.139, memory: 2835, top1_acc: 0.6000, top5_acc: 1.0000, loss_cls: 1.1830, loss: 1.1830
2022-04-08 00:39:17,834 - mmaction - INFO - Epoch [3][40/111]	lr: 9.056e-05, eta: 0:14:56, time: 0.654, data_time: 0.000, memory: 2835, top1_acc: 0.7250, top5_acc: 1.0000, loss_cls: 0.8476, loss: 0.8476
2022-04-08 00:39:30,936 - mmaction - INFO - Epoch [3][60/111]	lr: 9.552e-05, eta: 0:14:45, time: 0.655, data_time: 0.000, memory: 2835, top1_acc: 0.6250, top5_acc: 1.0000, loss_cls: 0.9983, loss: 0.9983
2022-04-08 00:39:44,118 - mmaction - INFO - Epoch [3][80/111]	lr: 9.552e-05, eta: 0:14:34, time: 0.659, data_time: 0.000, memory: 2835, top1_acc: 0.6250, top5_acc: 1.0000, loss_cls: 0.9227, loss: 0.9227
2022-04-08 00:39:57,377 - mmaction - INFO - Epoch [3][100/111]	lr: 9.552e-05, eta: 0:14:23, time: 0.663, data_time: 0.001, memory: 2835, top1_acc: 0.6000, top5_acc: 1.0000, loss_cls: 0.877

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.9 task/s, elapsed: 19s, ETA:     0s

2022-04-08 00:40:34,036 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:40:34,037 - mmaction - INFO - 
top1_acc	0.7455
top5_acc	1.0000
2022-04-08 00:40:34,039 - mmaction - INFO - Epoch(val) [3][28]	top1_acc: 0.7455, top5_acc: 1.0000, loss_cls: 0.5884, loss: 0.5884
2022-04-08 00:40:50,455 - mmaction - INFO - Epoch [4][20/111]	lr: 9.045e-05, eta: 0:13:50, time: 0.821, data_time: 0.151, memory: 2835, top1_acc: 0.7000, top5_acc: 1.0000, loss_cls: 0.8423, loss: 0.8423
2022-04-08 00:41:03,859 - mmaction - INFO - Epoch [4][40/111]	lr: 9.045e-05, eta: 0:13:40, time: 0.670, data_time: 0.000, memory: 2835, top1_acc: 0.6750, top5_acc: 1.0000, loss_cls: 0.8360, loss: 0.8360
2022-04-08 00:41:17,313 - mmaction - INFO - Epoch [4][60/111]	lr: 9.045e-05, eta: 0:13:30, time: 0.673, data_time: 0.001, memory: 2835, top1_acc: 0.8000, top5_acc: 1.0000, loss_cls: 0.4526, loss: 0.4526
2022-04-08 00:41:30,813 - mmaction - INFO - Epoch [4][80/111]	lr: 9.045e-05, eta: 0:13:19, time: 0.675, data_t

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.9 task/s, elapsed: 19s, ETA:     0s

2022-04-08 00:42:21,803 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:42:21,804 - mmaction - INFO - 
top1_acc	0.7273
top5_acc	1.0000
2022-04-08 00:42:21,807 - mmaction - INFO - Epoch(val) [4][28]	top1_acc: 0.7273, top5_acc: 1.0000, loss_cls: 0.7427, loss: 0.7427
2022-04-08 00:42:38,999 - mmaction - INFO - Epoch [5][20/111]	lr: 8.346e-05, eta: 0:12:42, time: 0.859, data_time: 0.174, memory: 2835, top1_acc: 0.7750, top5_acc: 1.0000, loss_cls: 0.5533, loss: 0.5533
2022-04-08 00:42:52,787 - mmaction - INFO - Epoch [5][40/111]	lr: 8.346e-05, eta: 0:12:32, time: 0.689, data_time: 0.001, memory: 2835, top1_acc: 0.7250, top5_acc: 1.0000, loss_cls: 0.7710, loss: 0.7710
2022-04-08 00:43:06,582 - mmaction - INFO - Epoch [5][60/111]	lr: 8.346e-05, eta: 0:12:22, time: 0.690, data_time: 0.000, memory: 2835, top1_acc: 0.7750, top5_acc: 1.0000, loss_cls: 0.6118, loss: 0.6118
2022-04-08 00:43:20,440 - mmaction - INFO - Epoch [5][80/111]	lr: 8.346e-05, eta: 0:12:11, time: 0.693, data_t

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.8 task/s, elapsed: 19s, ETA:     0s

2022-04-08 00:44:11,646 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:44:11,648 - mmaction - INFO - 
top1_acc	0.8364
top5_acc	1.0000
2022-04-08 00:44:11,650 - mmaction - INFO - Epoch(val) [5][28]	top1_acc: 0.8364, top5_acc: 1.0000, loss_cls: 0.3528, loss: 0.3528
2022-04-08 00:44:28,258 - mmaction - INFO - Epoch [6][20/111]	lr: 7.500e-05, eta: 0:11:34, time: 0.830, data_time: 0.145, memory: 2835, top1_acc: 0.7500, top5_acc: 1.0000, loss_cls: 0.4963, loss: 0.4963
2022-04-08 00:44:42,033 - mmaction - INFO - Epoch [6][40/111]	lr: 7.500e-05, eta: 0:11:23, time: 0.689, data_time: 0.000, memory: 2835, top1_acc: 0.7500, top5_acc: 1.0000, loss_cls: 0.5577, loss: 0.5577
2022-04-08 00:44:55,858 - mmaction - INFO - Epoch [6][60/111]	lr: 7.500e-05, eta: 0:11:12, time: 0.691, data_time: 0.000, memory: 2835, top1_acc: 0.7250, top5_acc: 1.0000, loss_cls: 0.6180, loss: 0.6180
2022-04-08 00:45:09,717 - mmaction - INFO - Epoch [6][80/111]	lr: 7.500e-05, eta: 0:11:01, time: 0.693, data_t

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


2022-04-08 00:45:23,528 - mmaction - INFO - Epoch [6][100/111]	lr: 7.500e-05, eta: 0:10:50, time: 0.691, data_time: 0.000, memory: 2835, top1_acc: 0.7500, top5_acc: 1.0000, loss_cls: 0.7327, loss: 0.7327
2022-04-08 00:45:30,826 - mmaction - INFO - Saving checkpoint at 6 epochs


[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.7 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:46:01,609 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:46:01,610 - mmaction - INFO - 
top1_acc	0.7818
top5_acc	1.0000
2022-04-08 00:46:01,612 - mmaction - INFO - Epoch(val) [6][28]	top1_acc: 0.7818, top5_acc: 1.0000, loss_cls: 0.4892, loss: 0.4892
2022-04-08 00:46:18,011 - mmaction - INFO - Epoch [7][20/111]	lr: 6.545e-05, eta: 0:10:25, time: 0.820, data_time: 0.132, memory: 2835, top1_acc: 0.7750, top5_acc: 1.0000, loss_cls: 0.5482, loss: 0.5482
2022-04-08 00:46:31,801 - mmaction - INFO - Epoch [7][40/111]	lr: 6.545e-05, eta: 0:10:13, time: 0.690, data_time: 0.000, memory: 2835, top1_acc: 0.8500, top5_acc: 1.0000, loss_cls: 0.4332, loss: 0.4332
2022-04-08 00:46:45,642 - mmaction - INFO - Epoch [7][60/111]	lr: 6.545e-05, eta: 0:10:02, time: 0.692, data_time: 0.000, memory: 2835, top1_acc: 0.7750, top5_acc: 1.0000, loss_cls: 0.5290, loss: 0.5290
2022-04-08 00:46:59,459 - mmaction - INFO - Epoch [7][80/111]	lr: 6.545e-05, eta: 0:09:50, time: 0.691, data_t

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.7 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:47:51,681 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:47:51,683 - mmaction - INFO - 
top1_acc	0.8000
top5_acc	1.0000
2022-04-08 00:47:51,684 - mmaction - INFO - Epoch(val) [7][28]	top1_acc: 0.8000, top5_acc: 1.0000, loss_cls: 0.5578, loss: 0.5578
2022-04-08 00:48:07,943 - mmaction - INFO - Epoch [8][20/111]	lr: 5.523e-05, eta: 0:09:14, time: 0.813, data_time: 0.150, memory: 2835, top1_acc: 0.8250, top5_acc: 1.0000, loss_cls: 0.4175, loss: 0.4175
2022-04-08 00:48:21,059 - mmaction - INFO - Epoch [8][40/111]	lr: 5.523e-05, eta: 0:09:02, time: 0.656, data_time: 0.000, memory: 2835, top1_acc: 0.8750, top5_acc: 1.0000, loss_cls: 0.3960, loss: 0.3960
2022-04-08 00:48:34,127 - mmaction - INFO - Epoch [8][60/111]	lr: 5.523e-05, eta: 0:08:50, time: 0.653, data_time: 0.000, memory: 2835, top1_acc: 0.9500, top5_acc: 1.0000, loss_cls: 0.2567, loss: 0.2567
2022-04-08 00:48:47,168 - mmaction - INFO - Epoch [8][80/111]	lr: 5.523e-05, eta: 0:08:37, time: 0.652, data_t

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.8 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:49:38,643 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:49:38,644 - mmaction - INFO - 
top1_acc	0.8364
top5_acc	1.0000
2022-04-08 00:49:38,646 - mmaction - INFO - Epoch(val) [8][28]	top1_acc: 0.8364, top5_acc: 1.0000, loss_cls: 0.5330, loss: 0.5330
2022-04-08 00:49:54,602 - mmaction - INFO - Epoch [9][20/111]	lr: 4.477e-05, eta: 0:08:01, time: 0.798, data_time: 0.144, memory: 2835, top1_acc: 0.8250, top5_acc: 1.0000, loss_cls: 0.3992, loss: 0.3992
2022-04-08 00:50:07,605 - mmaction - INFO - Epoch [9][40/111]	lr: 4.477e-05, eta: 0:07:49, time: 0.650, data_time: 0.001, memory: 2835, top1_acc: 0.7750, top5_acc: 1.0000, loss_cls: 0.4382, loss: 0.4382
2022-04-08 00:50:20,621 - mmaction - INFO - Epoch [9][60/111]	lr: 4.477e-05, eta: 0:07:36, time: 0.651, data_time: 0.000, memory: 2835, top1_acc: 0.8500, top5_acc: 1.0000, loss_cls: 0.4290, loss: 0.4290
2022-04-08 00:50:33,654 - mmaction - INFO - Epoch [9][80/111]	lr: 4.477e-05, eta: 0:07:24, time: 0.652, data_t

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.7 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:51:24,495 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:51:24,496 - mmaction - INFO - 
top1_acc	0.8364
top5_acc	1.0000
2022-04-08 00:51:24,498 - mmaction - INFO - Epoch(val) [9][28]	top1_acc: 0.8364, top5_acc: 1.0000, loss_cls: 0.4637, loss: 0.4637
2022-04-08 00:51:41,326 - mmaction - INFO - Epoch [10][20/111]	lr: 3.455e-05, eta: 0:06:49, time: 0.841, data_time: 0.187, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.3097, loss: 0.3097
2022-04-08 00:51:54,420 - mmaction - INFO - Epoch [10][40/111]	lr: 3.455e-05, eta: 0:06:37, time: 0.655, data_time: 0.000, memory: 2835, top1_acc: 0.8750, top5_acc: 1.0000, loss_cls: 0.2690, loss: 0.2690
2022-04-08 00:52:07,573 - mmaction - INFO - Epoch [10][60/111]	lr: 3.455e-05, eta: 0:06:25, time: 0.658, data_time: 0.000, memory: 2835, top1_acc: 0.9500, top5_acc: 1.0000, loss_cls: 0.2009, loss: 0.2009
2022-04-08 00:52:20,771 - mmaction - INFO - Epoch [10][80/111]	lr: 3.455e-05, eta: 0:06:12, time: 0.660, da

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.8 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:53:12,438 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:53:12,439 - mmaction - INFO - 
top1_acc	0.8000
top5_acc	1.0000
2022-04-08 00:53:12,441 - mmaction - INFO - Epoch(val) [10][28]	top1_acc: 0.8000, top5_acc: 1.0000, loss_cls: 0.5908, loss: 0.5908
2022-04-08 00:53:28,541 - mmaction - INFO - Epoch [11][20/111]	lr: 2.500e-05, eta: 0:05:38, time: 0.805, data_time: 0.134, memory: 2835, top1_acc: 0.9000, top5_acc: 1.0000, loss_cls: 0.2518, loss: 0.2518
2022-04-08 00:53:41,977 - mmaction - INFO - Epoch [11][40/111]	lr: 2.500e-05, eta: 0:05:26, time: 0.672, data_time: 0.000, memory: 2835, top1_acc: 0.8250, top5_acc: 1.0000, loss_cls: 0.3926, loss: 0.3926
2022-04-08 00:53:55,463 - mmaction - INFO - Epoch [11][60/111]	lr: 2.500e-05, eta: 0:05:14, time: 0.674, data_time: 0.000, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2422, loss: 0.2422
2022-04-08 00:54:09,011 - mmaction - INFO - Epoch [11][80/111]	lr: 2.500e-05, eta: 0:05:01, time: 0.677, d

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.7 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:55:01,202 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:55:01,203 - mmaction - INFO - 
top1_acc	0.8909
top5_acc	1.0000
2022-04-08 00:55:01,205 - mmaction - INFO - Epoch(val) [11][28]	top1_acc: 0.8909, top5_acc: 1.0000, loss_cls: 0.2615, loss: 0.2615
2022-04-08 00:55:17,660 - mmaction - INFO - Epoch [12][20/111]	lr: 1.654e-05, eta: 0:04:28, time: 0.823, data_time: 0.148, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2321, loss: 0.2321
2022-04-08 00:55:31,156 - mmaction - INFO - Epoch [12][40/111]	lr: 1.654e-05, eta: 0:04:16, time: 0.675, data_time: 0.000, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2572, loss: 0.2572
2022-04-08 00:55:44,684 - mmaction - INFO - Epoch [12][60/111]	lr: 1.654e-05, eta: 0:04:03, time: 0.676, data_time: 0.000, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2598, loss: 0.2598
2022-04-08 00:55:58,236 - mmaction - INFO - Epoch [12][80/111]	lr: 1.654e-05, eta: 0:03:51, time: 0.678, d

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.8 task/s, elapsed: 20s, ETA:     0s

2022-04-08 00:56:50,005 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:56:50,006 - mmaction - INFO - 
top1_acc	0.8545
top5_acc	1.0000
2022-04-08 00:56:50,008 - mmaction - INFO - Epoch(val) [12][28]	top1_acc: 0.8545, top5_acc: 1.0000, loss_cls: 0.2637, loss: 0.2637
2022-04-08 00:57:06,511 - mmaction - INFO - Epoch [13][20/111]	lr: 9.549e-06, eta: 0:03:18, time: 0.825, data_time: 0.150, memory: 2835, top1_acc: 0.9000, top5_acc: 1.0000, loss_cls: 0.2457, loss: 0.2457
2022-04-08 00:57:20,010 - mmaction - INFO - Epoch [13][40/111]	lr: 9.549e-06, eta: 0:03:05, time: 0.675, data_time: 0.001, memory: 2835, top1_acc: 0.9500, top5_acc: 1.0000, loss_cls: 0.1283, loss: 0.1283
2022-04-08 00:57:33,527 - mmaction - INFO - Epoch [13][60/111]	lr: 9.549e-06, eta: 0:02:53, time: 0.676, data_time: 0.000, memory: 2835, top1_acc: 0.9000, top5_acc: 1.0000, loss_cls: 0.2692, loss: 0.2692
2022-04-08 00:57:47,085 - mmaction - INFO - Epoch [13][80/111]	lr: 9.549e-06, eta: 0:02:40, time: 0.678, d

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.9 task/s, elapsed: 19s, ETA:     0s

2022-04-08 00:58:37,617 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 00:58:37,618 - mmaction - INFO - 
top1_acc	0.8545
top5_acc	1.0000
2022-04-08 00:58:37,620 - mmaction - INFO - Epoch(val) [13][28]	top1_acc: 0.8545, top5_acc: 1.0000, loss_cls: 0.3119, loss: 0.3119
2022-04-08 00:58:53,812 - mmaction - INFO - Epoch [14][20/111]	lr: 4.323e-06, eta: 0:02:07, time: 0.809, data_time: 0.134, memory: 2835, top1_acc: 0.9000, top5_acc: 1.0000, loss_cls: 0.3167, loss: 0.3167
2022-04-08 00:59:07,322 - mmaction - INFO - Epoch [14][40/111]	lr: 4.323e-06, eta: 0:01:55, time: 0.675, data_time: 0.001, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2064, loss: 0.2064
2022-04-08 00:59:20,856 - mmaction - INFO - Epoch [14][60/111]	lr: 4.323e-06, eta: 0:01:42, time: 0.677, data_time: 0.000, memory: 2835, top1_acc: 0.9000, top5_acc: 1.0000, loss_cls: 0.3490, loss: 0.3490
2022-04-08 00:59:34,381 - mmaction - INFO - Epoch [14][80/111]	lr: 4.323e-06, eta: 0:01:30, time: 0.676, d

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.8 task/s, elapsed: 19s, ETA:     0s

2022-04-08 01:00:25,399 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 01:00:25,400 - mmaction - INFO - 
top1_acc	0.8545
top5_acc	1.0000
2022-04-08 01:00:25,402 - mmaction - INFO - Epoch(val) [14][28]	top1_acc: 0.8545, top5_acc: 1.0000, loss_cls: 0.2554, loss: 0.2554
2022-04-08 01:00:41,849 - mmaction - INFO - Epoch [15][20/111]	lr: 1.093e-06, eta: 0:00:57, time: 0.822, data_time: 0.147, memory: 2835, top1_acc: 0.8000, top5_acc: 1.0000, loss_cls: 0.3674, loss: 0.3674
2022-04-08 01:00:55,335 - mmaction - INFO - Epoch [15][40/111]	lr: 1.093e-06, eta: 0:00:45, time: 0.674, data_time: 0.000, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2326, loss: 0.2326
2022-04-08 01:01:08,837 - mmaction - INFO - Epoch [15][60/111]	lr: 1.093e-06, eta: 0:00:32, time: 0.675, data_time: 0.000, memory: 2835, top1_acc: 0.9250, top5_acc: 1.0000, loss_cls: 0.2053, loss: 0.2053
2022-04-08 01:01:22,388 - mmaction - INFO - Epoch [15][80/111]	lr: 1.093e-06, eta: 0:00:19, time: 0.678, d

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 55/55, 2.9 task/s, elapsed: 19s, ETA:     0s

2022-04-08 01:02:13,020 - mmaction - INFO - Evaluating top_k_accuracy ...
2022-04-08 01:02:13,021 - mmaction - INFO - 
top1_acc	0.8545
top5_acc	1.0000
2022-04-08 01:02:13,023 - mmaction - INFO - Epoch(val) [15][28]	top1_acc: 0.8545, top5_acc: 1.0000, loss_cls: 0.2477, loss: 0.2477


In [28]:
# Finish the W&B run that was started with wandb.init() above
wandb.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
lr,███▇▇▆▆▅▄▃▃▂▂▁▁
top1_acc,▁▃▆▆▇▆▇▇▇▇█▇▇▇▇
top5_acc,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
lr,0.0
top1_acc,0.85455
top5_acc,1.0


## Unused

In [29]:
# # init distributed env first, since logger depends on the dist info.
# if args.launcher == 'none':
#     distributed = False
# else:
#     distributed = True
#     init_dist(args.launcher, **cfg.dist_params)
#     _, world_size = get_dist_info()
#     cfg.gpu_ids = range(world_size)