# MAD grounding data walkthrough

In [1]:
import json
import h5py
import pprint

In [2]:
# Load the annotation files for the three splits.
# Each file is read inside a `with` block so its handle is closed as soon as
# parsing finishes, instead of staying open until garbage collection.
def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


train = _load_json('./annotations/MAD_train.json')
val   = _load_json('./annotations/MAD_val.json')
test  = _load_json('./annotations/MAD_test.json')

In [3]:
# Number of annotations per split
n_train, n_val, n_test = len(train), len(val), len(test)
print(f'Number of annotations in train set {n_train}')
print(f'Number of annotations in validation set {n_val}')
print(f'Number of annotations in test set {n_test}')
print(f'Total number of annotations {n_train + n_val + n_test}\n')

# Structure of annotations:
# - Each json contains a dictionary whose keys are the sentence IDs.
# - Each sentence ID is unique and is required to read the sentence embeddings from the features file.
# The list preserves file order: train IDs first, then validation, then test.
sentences_ids = [*train, *val, *test]

# Movies per split.
# sorted() is used instead of list(set(...)): string hashing is randomized per
# Python process, so a plain set's iteration order (and therefore movies_ids[0],
# which is used as the example movie further down) could change between kernel
# restarts. Sorting makes the lists reproducible.
train_movies = sorted({v['movie'] for v in train.values()})
val_movies   = sorted({v['movie'] for v in val.values()})
test_movies  = sorted({v['movie'] for v in test.values()})
movies_ids   = train_movies + val_movies + test_movies
print(f'Number of movies in train set {len(train_movies)}')
print(f'Number of movies in validation set {len(val_movies)}')
print(f'Number of movies in test set {len(test_movies)}')
print(f'Total number of movies: {len(movies_ids)}\n')

# Associated with each sentence ID we have a dictionary containing the following keys.
# sentences_ids[1000] is looked up in `train` because the train IDs come first in sentences_ids.
example_annotation = train[sentences_ids[1000]]
print(f'Inner dictionary keys: {list(example_annotation)}\n')
print('Example:')
pprint.pprint(example_annotation)

# Inner Dictionary Documentation
# movie          = unique ID for each movie (not the title). We need this value to read the frame embeddings from the features file.
# movie_duration = duration in seconds
# sentence       = textual annotation
# tokens         = tokenized sentence produced with spaCy. These differ from the token features, which come from the CLIP tokenizer; the spaCy tokens are not used in training, only for statistics.
# timestamps     = raw temporal grounding
# ext_timestamps = extended timestamps, used for training and evaluation (each annotation shorter than 2 seconds was extended to 2 seconds; a similar approach was used in LSMDC).


Number of annotations in train set 280183
Number of annotations in validation set 32064
Number of annotations in test set 72044
Total number of annotations 384291

Number of movies in train set 488
Number of movies in validation set 50
Number of movies in test set 112
Total number of movies: 650

Inner dictionary keys: ['movie', 'sentence', 'timestamps', 'ext_timestamps', 'movie_duration', 'tokens']

Example:
{'ext_timestamps': [1888.785879, 1892.375879],
 'movie': '10202',
 'movie_duration': 7556.0,
 'sentence': ' SOMEONE hurries over to the road and sees a second horseman, '
             'thundering toward them.',
 'timestamps': [1888.785879, 1892.375879],
 'tokens': ['Willow',
            'hurries',
            'over',
            'to',
            'the',
            'road',
            'and',
            'sees',
            'a',
            'second',
            'horseman',
            ',',
            'thundering',
            'toward',
            'them',
            '.']}


In [5]:
# Feature stores (HDF5), opened read-only.
# The handles are left open on purpose: the following cell indexes into them.
lang = h5py.File('./features/CLIP_language_tokens_features.h5', mode='r')
visual = h5py.File('./features/CLIP_frames_features_5fps.h5', mode='r')

In [6]:
# Features documentation
# `lang` is indexed by sentence ID and `visual` by movie ID.
# The shape is read straight from the HDF5 dataset handle (`.shape`) instead of
# slicing with `[:]` first: slicing would load the entire array into memory
# only to discard it, which is wasteful for the per-movie frame features.
example_sentence_features = lang[sentences_ids[1000]]
example_movie_features = visual[movies_ids[0]]

print(f'Number of sentences embeddings: {len(lang)}')
print(f'Sentence tokens embeddings shape [Num tokens, Embedding dim] (example): {example_sentence_features.shape} ')

print(f'Number of movies embeddings: {len(visual)}')
print(f'Movies frames embeddings shape [Num frames, Embedding dim] (example): {example_movie_features.shape} ')

Number of sentences embeddings: 384291
Sentence tokens embeddings shape [Num tokens, Embedding dim] (example): (18, 512) 
Number of movies embeddings: 650
Movies frames embeddings shape [Num frames, Embedding dim] (example): (26086, 512) 
