In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import argparse

In [3]:
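# To pin a GPU, run `%env CUDA_VISIBLE_DEVICES=1` here, before CUDA is initialised ("!export ..." only affects a throwaway subshell, not this kernel).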

In [4]:
from os.path import basename
from args import parser
import yaml
from collections import defaultdict
import torch
from torch.autograd import Variable
import numpy as np
import json

from data.utils import update_values

from scripts.test_utils import get_dataset, get_model

In [5]:
# Ref: https://stackoverflow.com/questions/44542605/python-how-to-get-all-default-values-from-argparse

def get_argparse_defaults(parser):
    """Collect the default value of every optional argument of ``parser``.

    Args:
        parser: an ``argparse.ArgumentParser``; its ``_actions`` list is read.

    Returns:
        dict mapping each action's ``dest`` to its ``default``. Required
        actions, the ``help`` action, and actions whose default is
        ``argparse.SUPPRESS`` are left out.
    """
    defaults = {}
    for action in parser._actions:
        if action.required or action.dest == "help":
            continue
        # SUPPRESS is argparse's "leave this attribute unset" sentinel (used by
        # e.g. version actions). Copying it would put the literal string
        # '==SUPPRESS==' into the namespace as if it were a real value.
        if action.default is argparse.SUPPRESS:
            continue
        defaults[action.dest] = action.default
    return defaults

def get_argparse_required(parser):
    """Return the ``dest`` name of every action that ``parser`` marks as required.

    Names come back in the order the actions appear in ``parser._actions``.
    """
    return [act.dest for act in parser._actions if act.required]

In [6]:
# Build `args` without parsing a command line: start from the parser's
# defaults for optional arguments; the remaining values are assigned by hand
# in the next cell.
default_args = get_argparse_defaults(parser)
args = argparse.Namespace(**default_args)

In [7]:
# set required and provided arguments

args.cfgs_file = 'cfgs/anet.yml'
args.split = 'validation'
# checkpoint epoch to evaluate; interpolated into start_from and id below
args.epoch=19

# run name: selects the checkpoint directory and prefixes args.id
_id = "anet-2L-gt-mask"
args.densecap_eval_file = "./tools/densevid_eval/evaluate.py"
args.start_from = f"./checkpoint/{_id}/model_epoch_{args.epoch}.t7"
args.id = f"{_id}-{args.epoch}"

args.val_data_folder = args.split
# validate_per_clip only reads element 0 of each batch, so keep this at 1
args.batch_size = 1
# when True, validate_per_clip moves each clip's features to the GPU
args.cuda = True

In [8]:
# Read the YAML config and hand it, together with args' attribute dict, to the
# project helper update_values.
# safe_load replaces yaml.load(handle): calling yaml.load without an explicit
# Loader is deprecated since PyYAML 5.1 and raises TypeError in PyYAML >= 6.
# Assumes the config holds only plain YAML types (the Namespace printed in the
# next cell shows only numbers, strings, booleans and lists) -- TODO confirm.
with open(args.cfgs_file, 'r') as handle:
    options_yaml = yaml.safe_load(handle)
update_values(options_yaml, vars(args))

  


In [9]:
args

Namespace(attn_dropout=0.2, batch_size=1, cap_dropout=0.2, cfgs_file='cfgs/anet.yml', cuda=True, d_hidden=2048, d_model=1024, dataset='anet', dataset_file='./data/anet/anet_annotations_trainval.json', densecap_eval_file='./tools/densevid_eval/evaluate.py', densecap_references=['./data/anet/val_1.json', './data/anet/val_2.json'], dur_file='./data/anet/anet_duration_frame.csv', epoch=19, feature_root='./data/anet/', gated_mask=False, id='anet-2L-gt-mask-19', image_feat_size=3072, in_emb_dropout=0.1, kernel_list=[1, 2, 3, 4, 5, 7, 9, 11, 15, 21, 29, 41, 57, 71, 111, 161, 211, 251], learn_mask=False, max_prop_num=500, max_sentence_len=20, min_prop_before_nms=200, min_prop_num=50, n_heads=8, n_layers=2, num_workers=2, pos_thresh=0.7, sampling_sec=0.5, slide_window_size=480, slide_window_stride=20, split='validation', start_from='./checkpoint/anet-2L-gt-mask/model_epoch_19.t7', stride_factor=50, val_data_folder='validation', vis_emb_dropout=0.1)

### Load dataset and model

In [10]:
# get_dataset (scripts.test_utils) returns the loader that validate_per_clip
# iterates over and the text processor that get_model needs.
print('loading dataset')
test_loader, text_proc = get_dataset(args)

loading dataset


    Only loading the 'en' tokenizer.

# of words in the vocab: 4563
# of sentences in training: 37421, # of sentences in validation: 17505
# of training videos: 10009
total number of samples (unique videos): 4915
total number of sentences: 17499


In [11]:
# Build the model from the text processor and args; the log line in the output
# shows the weights being initialised from the args.start_from checkpoint.
print('building model')
model = get_model(text_proc, args)

building model
Initializing weights from ./checkpoint/anet-2L-gt-mask/model_epoch_19.t7


In [12]:
def get_frame_to_second(dur_file, dataset, sampling_sec):
    """Build a per-video table of effective sampling intervals.

    Args:
        dur_file: path to a comma-separated file with three fields per row,
            read as video name, duration and frame count (going by the
            variable names; duration is presumably in seconds).
        dataset: dataset identifier; only ``'anet'`` is implemented.
        sampling_sec: nominal sampling interval; used in the per-video formula
            and as the value for the hard-coded missing video.

    Returns:
        dict mapping video name to
        ``duration * int(frames / int(duration) * sampling_sec) / frames``.
        Callers in this notebook use the value as that video's sampling
        interval.

    Raises:
        NotImplementedError: if ``dataset`` is not ``'anet'`` (checked before
            the file is opened).
    """
    if dataset != 'anet':
        raise NotImplementedError

    frame_to_second = {}
    with open(dur_file) as f:
        for line in f:
            if not line.strip():
                # tolerate blank or trailing empty lines instead of failing on unpacking
                continue
            vid_name, vid_dur, vid_frame = [field.strip() for field in line.split(',')]
            duration = float(vid_dur)
            num_frames = float(vid_frame)
            # NOTE(review): int(duration) is 0 for durations below 1, which
            # raises ZeroDivisionError -- behaviour unchanged from the original.
            frames_per_sample = int(num_frames * 1. / int(duration) * sampling_sec)
            frame_to_second[vid_name] = duration * frames_per_sample * 1. / num_frames
    frame_to_second['_0CqozZun3U'] = sampling_sec # a missing video in anet

    return frame_to_second

In [13]:
# Per-video sampling intervals, kept at notebook level for inspection;
# validate_per_clip builds its own table from the same three arguments.
frame_to_second = get_frame_to_second(
    args.dur_file, args.dataset, args.sampling_sec,
)


In [14]:
!ls data/anet/

anet_annotations_trainval.json	val_1.json  validation
anet_duration_frame.csv		val_2.json


In [15]:
# Load one reference annotation file (the same path is the first entry of
# args.densecap_references in the Namespace printed earlier).
with open("./data/anet/val_1.json") as f:
    annotations = json.load(f)

In [16]:
len(annotations)

4917

In [17]:
list(annotations.keys())[:10]

['v_uqiMw7tQ1Cc',
 'v_bXdq2zI1Ms0',
 'v_FsS_NCZEfaI',
 'v_K6Tm5xHkJ5c',
 'v_4Lu8ECLHvK4',
 'v_HWV_ccmZVPA',
 'v_GGSY1Qvo990',
 'v_frePM0YGtQE',
 'v_JTrwGfPJNzU',
 'v_gOniW-yEZ0k']

In [18]:
# Inspect one video's entry: a duration plus parallel lists of
# [start, end] timestamps and sentences.
video_id = 'v_PLek2e8NlKc'

annotations[video_id]

{'duration': 184.6,
 'timestamps': [[0, 4.61],
  [4.61, 82.15],
  [45.23, 58.15],
  [83.07, 85.84],
  [85.84, 148.6],
  [148.6, 184.6],
  [152.29, 166.14]],
 'sentences': ['We see the blue opening screen.',
  ' A lady is guiding a young lady through an exercise.',
  ' The girl sits down, then gets on one knee and sits back down.',
  ' We see a title screen on beige.',
  ' The lady gets up from the floor slowly with instructions on the screen.',
  ' The lady is sitting on a table and shoes how to sit, and stand slowly holding your abdomen.',
  ' The couple behind the lady laugh.']}

In [23]:
def _predict_clip_caption(model, image_feat_clip, num_frame, sampling_sec, args):
    """Run ``model.inference`` on one clip and return its first proposal's caption."""
    with torch.no_grad():
        # ship data to gpu
        if args.cuda:
            image_feat_clip = image_feat_clip.cuda()

        all_proposal_results = model.inference(image_feat_clip,
                                               num_frame,
                                               sampling_sec,
                                               args.min_prop_num,
                                               args.max_prop_num,
                                               args.min_prop_before_nms,
                                               args.pos_thresh,
                                               args.stride_factor,
                                               entire_video=True,
                                               gated_mask=args.gated_mask)
    # first batch element -> first proposal -> last field, which is used as the caption
    return all_proposal_results[0][0][-1]


def validate_per_clip(model, loader, annotations, args, debug=False):
    """Caption every annotated segment of every video yielded by ``loader``.

    For each ``[start, end]`` timestamp of a video, the matching slice of the
    video's features (second axis of ``image_feat``) is passed to
    ``model.inference`` on its own, and the resulting caption is recorded next
    to the reference sentence.

    Args:
        model: object exposing ``eval()`` and ``inference(...)``.
        loader: iterable yielding ``(image_feat, original_num_frame,
            video_prefix)``; only element 0 of each batch is read, so the
            batch size has to be 1.
        annotations: dict keyed by ``"v_" + basename(video_prefix)`` whose
            values hold parallel ``"timestamps"`` and ``"sentences"`` lists.
            A video missing from it raises ``KeyError``.
        args: namespace providing ``dur_file``, ``dataset``, ``sampling_sec``,
            ``cuda``, ``min_prop_num``, ``max_prop_num``,
            ``min_prop_before_nms``, ``pos_thresh``, ``stride_factor`` and
            ``gated_mask``.
        debug: if True, stop after the first video.

    Returns:
        ``(ground_truth, generated)``: two ``defaultdict(list)`` keyed by video
        id, each holding one dict per segment (reference caption and predicted
        caption respectively, plus times, video id and fps; ``generated`` also
        records the frame range).
    """
    model.eval()

    frame_to_second = get_frame_to_second(
        args.dur_file, args.dataset, args.sampling_sec,
    )

    ground_truth = defaultdict(list)
    generated = defaultdict(list)

    for image_feat, original_num_frame, video_prefix in loader:
        # each batch holds one entire video; index 0 because batch size is 1
        video_name = video_prefix[0].split('/')[-1]
        video_id = "v_" + basename(video_prefix[0])

        if video_name not in frame_to_second:
            # fall back to the nominal sampling interval for unknown videos
            frame_to_second[video_name] = args.sampling_sec
            print("cannot find frame_to_second for video {}".format(video_name))
        sampling_sec = frame_to_second[video_name]

        # express the video length in seconds using the rounded integer rate
        num_frames_in_video = original_num_frame[0].item()
        fps = int(np.round(1 / sampling_sec, decimals=1))
        duration_video = num_frames_in_video / fps

        video_annotations = annotations[video_id]
        segments = zip(video_annotations["timestamps"], video_annotations["sentences"])

        try:
            for (clip_start_time, clip_end_time), clip_gt_caption in segments:
                # map the annotated times onto feature-frame indices
                clip_start_frame = int((clip_start_time / duration_video) * num_frames_in_video)
                clip_end_frame = int((clip_end_time / duration_video) * num_frames_in_video)

                # NOTE(review): a segment shorter than one sampled frame gives an
                # empty slice, which is passed to the model as-is -- confirm the
                # model copes with it.
                image_feat_clip = image_feat[:, clip_start_frame:clip_end_frame, :]
                # the model receives the clip length through the batch's own container
                original_num_frame[0] = clip_end_frame - clip_start_frame

                # predict caption for clip (segment)
                clip_pred_caption = _predict_clip_caption(
                    model, image_feat_clip, original_num_frame, sampling_sec, args,
                )

                ground_truth[video_id].append(
                    {
                        "caption": clip_gt_caption,
                        "start_time": clip_start_time,
                        "end_time": clip_end_time,
                        "video_id": video_id,
                        "fps": fps,
                    }
                )
                generated[video_id].append(
                    {
                        "caption": clip_pred_caption,
                        "start_time": clip_start_time,
                        "start_frame": clip_start_frame,
                        "end_time": clip_end_time,
                        "end_frame": clip_end_frame,
                        "video_id": video_id,
                        "fps": fps,
                    }
                )
        finally:
            # undo the in-place edit so a reusable loader (e.g. a list of cached
            # batches) still reports the full frame count on the next pass
            original_num_frame[0] = num_frames_in_video

        if debug:
            break

    return ground_truth, generated

In [25]:
# debug=True stops after the first video, so this is a quick smoke test;
# pass debug=False to run over the whole loader.
ground_truth, generated = validate_per_clip(model, test_loader, annotations, args, debug=True)

skipping kernel sizes greater than 11
Processing time for tIoU: 0.02, mask: 0.00, caption: 0.11
skipping kernel sizes greater than 161
Processing time for tIoU: 0.05, mask: 0.00, caption: 0.11
skipping kernel sizes greater than 29
Processing time for tIoU: 0.06, mask: 0.00, caption: 0.11
skipping kernel sizes greater than 7
Processing time for tIoU: 0.00, mask: 0.00, caption: 0.11
skipping kernel sizes greater than 161
Processing time for tIoU: 0.05, mask: 0.00, caption: 0.11
skipping kernel sizes greater than 111
Processing time for tIoU: 0.05, mask: 0.00, caption: 0.11
skipping kernel sizes greater than 29
Processing time for tIoU: 0.07, mask: 0.00, caption: 0.11


In [26]:
ground_truth

defaultdict(list,
            {'v_PLek2e8NlKc': [{'caption': 'We see the blue opening screen.',
               'start_time': 0,
               'end_time': 4.61,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': ' A lady is guiding a young lady through an exercise.',
               'start_time': 4.61,
               'end_time': 82.15,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': ' The girl sits down, then gets on one knee and sits back down.',
               'start_time': 45.23,
               'end_time': 58.15,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': ' We see a title screen on beige.',
               'start_time': 83.07,
               'end_time': 85.84,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': ' The lady gets up from the floor slowly with instructions on the screen.',
            

In [27]:
generated

defaultdict(list,
            {'v_PLek2e8NlKc': [{'caption': 'a man is standing in a <unk> in a <unk>',
               'start_time': 0,
               'start_frame': 0,
               'end_time': 4.61,
               'end_frame': 9,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': 'a man is standing in a room in a room',
               'start_time': 4.61,
               'start_frame': 9,
               'end_time': 82.15,
               'end_frame': 164,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': 'a man is standing in a room in a <unk>',
               'start_time': 45.23,
               'start_frame': 90,
               'end_time': 58.15,
               'end_frame': 116,
               'video_id': 'v_PLek2e8NlKc',
               'fps': 2},
              {'caption': 'a close up of a video of a video of a video of an intro intro intro intro',
               'start_time': 83.07,
           