In [1]:
import os
import sys
sys.path.append(os.path.abspath("../Video-Swin-Transformer"))

In [2]:
# Change teh working directory to a location that the code prefers
os.chdir("../Video-Swin-Transformer")

In [3]:
import torch
import warnings
import argparse
import numpy as np
import os.path as osp
from sklearn.metrics import confusion_matrix

import mmcv
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.fileio.io import file_handlers
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint
from mmcv.runner.fp16_utils import wrap_fp16_model

from mmaction.models import build_model
from mmaction.utils import register_module_hooks
from mmaction.datasets import build_dataloader, build_dataset

In [4]:
# TODO import test functions from mmcv and delete them from mmaction2
try:
    from mmcv.engine import multi_gpu_test, single_gpu_test
except (ImportError, ModuleNotFoundError):
    warnings.warn(
        'DeprecationWarning: single_gpu_test, multi_gpu_test, '
        'collect_results_cpu, collect_results_gpu from mmaction2 will be '
        'deprecated. Please install mmcv through master branch.')
    from mmaction.apis import multi_gpu_test, single_gpu_test

In [5]:
# Setup the cofiguration and data file
config_file = '../configs/bsl_config.py'
check_point_file = './work_dirs/k400_swin_tiny_patch244_window877.py/best_top1_acc_epoch_10.pth'
cmd_options = [config_file, check_point_file, "--eval", "top_k_accuracy", "--average-clips", "score"]

In [6]:
def parse_args(parse_options=None):
    parser = argparse.ArgumentParser(
        description='MMAction2 test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--out',
        default=None,
        help='output result file in pkl/yaml/json format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase'
        'the inference speed')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g.,'
        ' "top_k_accuracy", "mean_class_accuracy" for video dataset')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        default={},
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        default={},
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    parser.add_argument(
        '--average-clips',
        choices=['score', 'prob', None],
        default=None,
        help='average type when averaging test clips')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument(
        '--onnx',
        action='store_true',
        help='Whether to test with onnx model or not')
    parser.add_argument(
        '--tensorrt',
        action='store_true',
        help='Whether to test with TensorRT engine or not')
    
    if parse_options is None: 
        args = parser.parse_args()
    else:
        args = parser.parse_args(parse_options)
        
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args

In [7]:
args = parse_args(cmd_options)

if args.tensorrt and args.onnx:
    raise ValueError(
        'Cannot set onnx mode and tensorrt mode at the same time.')

# Get the conifguration from the file
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)

# Customization for training the BSL data set
cfg.model.cls_head.num_classes = 5

# Load output_config from cfg
output_config = cfg.get('output_config', {})
if args.out:
    # Overwrite output_config from args.out
    output_config = Config._merge_a_into_b(dict(out=args.out), output_config)

# Load eval_config from cfg
eval_config = cfg.get('eval_config', {})
if args.eval:
    # Overwrite eval_config from args.eval
    eval_config = Config._merge_a_into_b(dict(metrics=args.eval), eval_config)
    
if args.eval_options:
    # Add options from args.eval_options
    eval_config = Config._merge_a_into_b(args.eval_options, eval_config)


assert output_config or eval_config, ('Please specify at least one operation (save or eval the '
     'results) with the argument "--out" or "--eval"')

dataset_type = cfg.data.test.type
if output_config.get('out', None):
    if 'output_format' in output_config:
        # ugly workround to make recognition and localization the same
        warnings.warn('Skip checking `output_format` in localization task.')
    else:
        out = output_config['out']
        # make sure the dirname of the output path exists
        mmcv.mkdir_or_exist(osp.dirname(out))
        _, suffix = osp.splitext(out)
        if dataset_type == 'AVADataset':
            assert suffix[1:] == 'csv', ('For AVADataset, the format of '
                                         'the output file should be csv')
        else:
            assert suffix[1:] in file_handlers, (
                'The format of the output '
                'file should be json, pickle or yaml')


cfg.data.test.test_mode = True

# The flag is used to register module's hooks
cfg.setdefault('module_hooks', [])

../Video-Swin-Transformer/configs/_base_/models/swin/swin_tiny.py
../Video-Swin-Transformer/configs/_base_/default_runtime.py


[]

In [8]:
distributed = False

In [9]:
# build the dataloader
dataset = build_dataset(cfg.data.test, dict(test_mode=True))

dataloader_setting = dict(
    videos_per_gpu=cfg.data.get('videos_per_gpu', 1),
    workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
    dist=distributed,
    shuffle=False)
dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {}))
data_loader = build_dataloader(dataset, **dataloader_setting)

In [10]:
for data in data_loader:
    break

In [11]:
data['imgs'].dtype

torch.float32

In [12]:
def turn_off_pretrained(cfg):
    # recursively find all pretrained in the model config,
    # and set them None to avoid redundant pretrain steps for testing
    if 'pretrained' in cfg:
        cfg.pretrained = None

    # recursively turn off pretrained value
    for sub_cfg in cfg.values():
        if isinstance(sub_cfg, dict):
            turn_off_pretrained(sub_cfg)

In [13]:
# remove redundant pretrain steps for testing
turn_off_pretrained(cfg.model)

# build the model and load checkpoint
model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))

if len(cfg.module_hooks) > 0:
    register_module_hooks(model, cfg.module_hooks)

fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
    wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')

model = MMDataParallel(model, device_ids=[0])

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


load checkpoint from local path: ./work_dirs/k400_swin_tiny_patch244_window877.py/best_top1_acc_epoch_10.pth


In [14]:
cfg.model

{'type': 'Recognizer3D',
 'backbone': {'type': 'SwinTransformer3D',
  'patch_size': (2, 4, 4),
  'embed_dim': 96,
  'depths': [2, 2, 6, 2],
  'num_heads': [3, 6, 12, 24],
  'window_size': (8, 7, 7),
  'mlp_ratio': 4.0,
  'qkv_bias': True,
  'qk_scale': None,
  'drop_rate': 0.0,
  'attn_drop_rate': 0.0,
  'drop_path_rate': 0.1,
  'patch_norm': True},
 'cls_head': {'type': 'I3DHead',
  'in_channels': 768,
  'num_classes': 5,
  'spatial_type': 'avg',
  'dropout_ratio': 0.5},
 'test_cfg': {'average_clips': 'prob', 'max_testing_views': 4}}

In [14]:
cfg.img_norm_cfg

{'mean': [123.675, 116.28, 103.53],
 'std': [58.395, 57.12, 57.375],
 'to_bgr': False}

In [15]:
# These are the expected outputs from the dataset
expected_classes = [x['label'] for x in dataset.video_infos]

# These are the actual outputs
outputs = single_gpu_test(model, data_loader)

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 10/10, 0.3 task/s, elapsed: 33s, ETA:     0s

In [16]:
# Summary of results
eval_res = dataset.evaluate(outputs, **eval_config)
for name, val in eval_res.items():
    print(f'{name}: {val:.04f}')

predicted_classes = [np.argmax(x) for x in outputs]
confusion_matrix(expected_classes, predicted_classes)


Evaluating top_k_accuracy ...

top1_acc	1.0000
top5_acc	1.0000
top1_acc: 1.0000
top5_acc: 1.0000


array([[2, 0, 0, 0, 0],
       [0, 2, 0, 0, 0],
       [0, 0, 2, 0, 0],
       [0, 0, 0, 2, 0],
       [0, 0, 0, 0, 2]])

In [17]:
mean = np.reshape(cfg.img_norm_cfg['mean'], [len(cfg.img_norm_cfg['mean']), 1, 1])
std = np.reshape(cfg.img_norm_cfg['std'], [len(cfg.img_norm_cfg['std']), 1, 1])

In [18]:
import matplotlib.pyplot as plt

In [19]:
# 
predicted_classes = np.array(predicted_classes)
expected_classes = np.array(expected_classes)

# These are were the predictions were incorrect
mismatches = predicted_classes != expected_classes

# These are the mismatched images
for cl in range(5):
    # Location of all the videos that were wrongly classified
    index = (expected_classes == cl) & mismatches
    # For this class find if there were any mismatches
    if np.any(index):
        # Get the first set of images
        img_loc = np.where(index)[0][0]
        vid = dataset[img_loc]['imgs'].numpy()
        
        # For each set of images
        for vid_loc in range(vid.shape[0]):
            # This is one video
            one_vid = vid[vid_loc, :].squeeze()
            
            plt.figure(figsize=(4*8, 4*4))
            # Now extract each frame from the video
            for frm_loc in range(one_vid.shape[1]):
                # 
                one_frm = one_vid[:, frm_loc, :, :].squeeze()
                
                one_frm *= std
                one_frm += mean
                
                plt.subplot(4, 8, frm_loc+1)
                plt.imshow(one_frm.transpose(1, 2, 0).astype('int'))
                plt.axis('off')
            print(one_frm.shape)
            plt.show()

# Unused

In [20]:

# if args.tensorrt:
#     outputs = inference_tensorrt(args.checkpoint, distributed, data_loader,
#                                  dataloader_setting['videos_per_gpu'])
# elif args.onnx:
#     outputs = inference_onnx(args.checkpoint, distributed, data_loader,
#                              dataloader_setting['videos_per_gpu'])
# else:
#     outputs = inference_pytorch(args, cfg, distributed, data_loader)

# rank, _ = get_dist_info()
# if rank == 0:
#     if output_config.get('out', None):
#         out = output_config['out']
#         print(f'\nwriting results to {out}')
#         dataset.dump_results(outputs, **output_config)
#     if eval_config:
#         eval_res = dataset.evaluate(outputs, **eval_config)
#         for name, val in eval_res.items():
#             print(f'{name}: {val:.04f}')