In [2]:
import os
import json
from glob import glob
import random
# from tqdm import tqdm
# tqdm for notebooks

from tqdm import tqdm_notebook as tqdm
import pandas as pd
from fuzzywuzzy import fuzz
from moviepy.editor import VideoFileClip



# Create a folder for each dataset first

In [3]:
def save_json(content, save_path):
    """Serialize ``content`` to JSON and write it to ``save_path``.

    Args:
        content: any object accepted by ``json.dumps``.
        save_path: destination file; missing parent directories are created.
    """
    parent_dir = os.path.dirname(save_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises;
    # exist_ok avoids the check-then-create race of the exists() test.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_path, 'w') as f:
        f.write(json.dumps(content))
def load_jsonl(filename):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank lines (for
    example a trailing empty line at the end of the file) are skipped
    instead of raising a decode error.
    """
    with open(filename, "r") as f:
        return [json.loads(line) for line in f if line.strip()]
def load_json(filename):
    """Parse the JSON document stored at ``filename`` and return the result."""
    with open(filename, "r") as handle:
        parsed = json.loads(handle.read())
    return parsed

# QVHighlights (QVH)

In [None]:
ann_root = 'Your/path/to/QVHighlights'
# Release annotation files (.jsonl) located directly under ann_root
train_path = f'{ann_root}/highlight_train_release.jsonl'
val_path = f'{ann_root}/highlight_val_release.jsonl'
test_path = f'{ann_root}/highlight_test_release.jsonl'

In [None]:
# Parse the three annotation splits, in train / val / test order
train, val, test = [
    load_jsonl(split_path) for split_path in (train_path, val_path, test_path)
]

In [None]:
def process_QVH(data, relative_time=False, save_float=False, is_test=False):
    """Convert raw QVHighlights annotations into flat sample dicts.

    Args:
        data: iterable of dicts with the keys 'vid', 'qid', 'query',
            'duration' and, unless ``is_test``, 'relevant_windows'
            (a list of ``[start, end]`` pairs).
        relative_time: if True, express each window as a fraction of the
            video duration instead of copying it unchanged.
        save_float: only used together with ``relative_time``. True keeps the
            fractions rounded to two decimals; False stores integer
            percentages.
        is_test: if True, 'relevant_windows' is not read from ``data`` and a
            dummy window ``[[0, 150]]`` is stored instead.

    Returns:
        list of dicts with the keys 'video', 'qid', 'query', 'duration' and
        'relevant_windows'.
    """
    out = []
    for d in data:
        duration = d['duration']
        sample = {
            'video': d['vid'],
            'qid': 'QVHighlight_' + str(d['qid']),
            'query': d['query'],
            'duration': duration,
        }

        if is_test:
            sample['relevant_windows'] = [[0, 150]]  # dummy value
        else:
            windows = d['relevant_windows']
            if relative_time:
                relative_time_windows = []
                for window in windows:
                    start = round(window[0] / duration, 2)
                    end = round(window[1] / duration, 2)
                    if not save_float:
                        # Round instead of truncating: 0.29 * 100 evaluates
                        # to 28.999999999999996, which int() would turn
                        # into 28.
                        start = int(round(start * 100))
                        end = int(round(end * 100))
                    relative_time_windows.append([start, end])
                sample['relevant_windows'] = relative_time_windows
            else:
                sample['relevant_windows'] = windows

        out.append(sample)

    return out

In [None]:
save_float = False
relative_time = False

# Same time-format options for every split; only the test split is flagged
shared_options = {'relative_time': relative_time, 'save_float': save_float}
new_train = process_QVH(train, **shared_options)
new_val = process_QVH(val, **shared_options)
new_test = process_QVH(test, is_test=True, **shared_options)

In [None]:
# save data
# The file-name suffix encodes the time format: '_relative' and/or '_float'
name_suffix = ('_relative' if relative_time else '') + ('_float' if save_float else '')
save_json(new_train, ann_root + '/lavis/train' + name_suffix + '.json')
save_json(new_val, ann_root + '/lavis/val' + name_suffix + '.json')
save_json(new_test, ann_root + '/lavis/test' + name_suffix + '_dummy.json')

# Charades

In [None]:
# Charades metadata tables (comma-separated)
train_df, test_df = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('Your/path/to/Charades_v1_train.csv', 'Your/path/to/Charades_v1_test.csv')
]

In [None]:
# get all unique ids
train_ids = train_df["id"].unique()
print(len(train_ids))

# shuffle reproducibly (fixed seed), then hold out the first 800 ids for validation
random.seed(42)
random.shuffle(train_ids)
val_ids, train_ids = train_ids[:800], train_ids[800:]

len(val_ids), len(train_ids)

In [None]:
ann_root = 'Your/path/to/Charades_STA'
# Charades-STA annotation text files located directly under ann_root
train_path = f'{ann_root}/train.txt'
test_path = f'{ann_root}/test.txt'

In [None]:
def process_charades_STA(data_path, df, video_ids=None, relative_time=False, save_float=False):
    """Parse a Charades-STA annotation file into a list of moment records.

    Every non-empty line of ``data_path`` must have the form
    ``"<id> <start> <end>##<query>"``. Blank lines are skipped.

    Args:
        data_path: path of the annotation text file.
        df: DataFrame with an "id" column. The duration is read from the
            column at position 10 and the objects from the column at
            position 7.
            NOTE(review): presumably the 'length' and 'objects' columns of
            the Charades CSV; confirm against the file.
        video_ids: optional collection of ids to keep; ``None`` keeps all.
        relative_time: if True, the window is expressed as a fraction of the
            duration (``save_float``) or as an integer percentage.
        save_float: if True keep float values, otherwise round to integers.

    Returns:
        list of dicts with the keys 'id', 'query', 'window' (a list holding
        one ``[start, end]`` pair), 'duration' and 'objects'.
    """
    with open(data_path) as f:
        lines = [line.strip() for line in f]

    # A set makes the per-line membership test O(1).
    wanted_ids = set(video_ids) if video_ids is not None else None
    # Metadata rows already looked up, so each video scans ``df`` only once.
    meta_by_id = {}
    out = []

    for line in lines:
        if not line:
            continue

        # format "id start end##query"; split once so '##' may occur in the query
        parts = line.split('##', 1)
        query = parts[1]
        fields = parts[0].split(' ')  # -> [id, start, end]
        video_id = fields[0]

        if wanted_ids is not None and video_id not in wanted_ids:
            continue

        if video_id not in meta_by_id:
            meta_by_id[video_id] = df.loc[df["id"] == video_id].values[0]
        values = meta_by_id[video_id]

        duration = values[10]

        start = float(fields[1])
        end = float(fields[2])
        # clamp annotations that run past the end of the video
        if end > duration:
            end = duration

        if relative_time:
            start = start / duration
            end = end / duration

            if save_float:
                window = [round(start, 2), round(end, 2)]
                assert window[0] >= 0 and window[1] <= 1
            else:
                # Round to the nearest percent. Plain int() truncates float
                # artefacts such as 0.29 * 100 == 28.999999999999996.
                window = [int(round(round(start, 2) * 100)), int(round(round(end, 2) * 100))]
                assert window[0] >= 0 and window[1] <= 100
        else:
            if save_float:
                window = [float(start), float(end)]
            else:
                # round to nearest int
                window = [round(float(start)), round(float(end))]

        # objects are ';'-separated; a missing entry (NaN) is not a string
        objects = values[7]
        if isinstance(objects, str):
            objects = objects.split(';')
        else:
            print('no objects: ', objects, ' for id: ', video_id)
            objects = []

        out.append(
            {
                'id': video_id,
                'query': query,
                'window': [window],
                'duration': duration,
                'objects': objects
            }
        )

    return out

In [None]:
save_float = True
relative_time = False

# Every split gets the same time format: the save cell below derives the file
# names of all three splits from these two flags.
train = process_charades_STA(train_path, train_df, train_ids, relative_time=relative_time, save_float=save_float)
val = process_charades_STA(train_path, train_df, val_ids, relative_time=relative_time, save_float=save_float)
test = process_charades_STA(test_path, test_df, relative_time=relative_time, save_float=save_float)

# Rename the fields of every record to the keys written to the json files
new_train, new_val, new_test = [
    [
        {
            'video': qa['id'],
            'qid': 'Charades-STA_' + str(qa['id']),
            'query': qa['query'],
            'duration': qa['duration'],
            'relevant_windows': qa['window'],
            'objects': qa['objects'],
        }
        for qa in split
    ]
    for split in (train, val, test)
]

len(new_train), len(new_val), len(new_test)

In [None]:
# save data
# The file-name suffix encodes the time format: '_relative' and/or '_float'
name_suffix = ('_relative' if relative_time else '') + ('_float' if save_float else '')
save_json(new_train, ann_root + '/lavis/new_train' + name_suffix + '.json')
save_json(new_val, ann_root + '/lavis/new_val' + name_suffix + '.json')
save_json(new_test, ann_root + '/lavis/test' + name_suffix + '.json')

In [None]:
# For processing without the custom data split, i.e. having only the original train and test split

save_float = True
relative_time = False

# Both splits get the same time format: the save cell below derives the file
# names of both splits from these two flags.
train = process_charades_STA(train_path, train_df, None, relative_time=relative_time, save_float=save_float)
test = process_charades_STA(test_path, test_df, relative_time=relative_time, save_float=save_float)

# No validation split in this variant
new_val = []

# Rename the fields of every record to the keys written to the json files
new_train, new_test = [
    [
        {
            'video': qa['id'],
            'qid': 'Charades-STA_' + str(qa['id']),
            'query': qa['query'],
            'duration': qa['duration'],
            'relevant_windows': qa['window'],
            'objects': qa['objects'],
        }
        for qa in split
    ]
    for split in (train, test)
]

len(new_train), len(new_test)

In [None]:
# save data
# The file-name suffix encodes the time format: '_relative' and/or '_float'
name_suffix = ('_relative' if relative_time else '') + ('_float' if save_float else '')
save_json(new_train, ann_root + '/lavis/train' + name_suffix + '.json')
save_json(new_test, ann_root + '/lavis/test' + name_suffix + '.json')

# NExT-QA

In [None]:
ann_root = 'Your/path/to/NExT_QA'
raw_root = 'Your/path/to/raw/NExT'
# Question CSVs live in the 'nextqa' sub-folder of ann_root
train_path = f'{ann_root}/nextqa/train.csv'
val_path = f'{ann_root}/nextqa/val.csv'
test_path = f'{ann_root}/nextqa/test.csv'
map_vid_vidorID_path = f'{ann_root}/map_vid_vidorID.json'

In [4]:
raw_train = pd.read_csv(train_path, delimiter=',')
raw_val = pd.read_csv(val_path, delimiter=',')
key = ['video', 'question', 'a0', 'a1', 'a2', 'a3', 'a4', 'answer', 'qid', 'type']

# One dict per CSV row, restricted to the columns in ``key``. to_dict('records')
# converts the whole frame at once instead of building a row Series for every
# single cell lookup.
train = raw_train[key].to_dict('records')
val = raw_val[key].to_dict('records')

In [5]:
# Lookup table read from map_vid_vidorID.json; indexed below with the string form of a video id
vid_map = load_json(map_vid_vidorID_path)

In [9]:
def get_video_duration(vid):
    """Return the duration that moviepy reports for the raw video of ``vid``.

    Args:
        vid: key into ``vid_map``; the mapped value plus '.mp4' is the path
            of the file below ``raw_root``.

    Returns:
        ``clip.duration`` of the opened ``VideoFileClip``.
    """
    vid_path = raw_root + "/" + vid_map[vid] + '.mp4'
    clip = VideoFileClip(vid_path)
    try:
        return clip.duration
    finally:
        # Release the reader resources held by the clip; this function is
        # called once per video, so unclosed clips would pile up.
        clip.close()

In [10]:
new_train = []
new_val = []

print('Processing train and val data...')
print('This could lake a while (100 min), because we need to extract the video durations for each video')

# A video can appear in more than one question; cache the durations so each
# video file is probed at most once.
duration_by_video = {}

for split, converted in ((train, new_train), (val, new_val)):
    for qa in split:
        video_key = str(qa['video'])
        if video_key not in duration_by_video:
            duration_by_video[video_key] = get_video_duration(video_key)

        qa_dict = {}
        qa_dict['video'] = vid_map[video_key]
        qa_dict['duration'] = duration_by_video[video_key]
        qa_dict['num_option'] = int(5)
        qa_dict['qid'] = '_'.join([qa['type'], video_key, str(qa['qid'])])
        # answer options get a trailing '.', the question a trailing '?'
        for i in range(5):
            qa_dict['a{}'.format(str(i))] = qa['a{}'.format(str(i))]+'.'
        qa_dict['answer'] = int(qa['answer'])
        qa_dict['question'] = qa['question']+'?'
        converted.append(qa_dict)

In [11]:
# Write the converted train / val splits to the 'lavis' sub-folder of ann_root
for split_name, split_records in (('train', new_train), ('val', new_val)):
    save_json(split_records, ann_root + '/lavis/' + split_name + '.json')

# NExT-GQA

In [None]:
ann_root = 'Your/path/to/NExT_QA'
raw_root = 'Your/path/to/raw/NExT'
# Question CSVs live in the 'nextgqa' sub-folder of ann_root
# train_path = ann_root + '/nextgqa/train.csv'
val_path = f'{ann_root}/nextgqa/val.csv'
test_path = f'{ann_root}/nextgqa/test.csv'
map_vid_vidorID_path = f'{ann_root}/map_vid_vidorID.json'

In [5]:
raw_val = pd.read_csv(val_path, delimiter=',')
raw_test = pd.read_csv(test_path, delimiter=',')
key = ['video_id', 'question', 'a0', 'a1', 'a2', 'a3', 'a4', 'answer', 'qid', 'type']

# One dict per CSV row, restricted to the columns in ``key``. to_dict('records')
# converts the whole frame at once instead of building a row Series for every
# single cell lookup.
val = raw_val[key].to_dict('records')
test = raw_test[key].to_dict('records')

In [6]:
# Lookup table read from map_vid_vidorID.json; indexed below with the string form of a video id
vid_map = load_json(map_vid_vidorID_path)

In [7]:
# Grounding annotations for the val / test splits (gsub_*.json)
time_val, time_test = [
    load_json(ann_root + '/nextgqa/' + file_name)
    for file_name in ('gsub_val.json', 'gsub_test.json')
]

In [8]:
def get_video_duration(vid):
    """Return the duration that moviepy reports for the raw video of ``vid``.

    Args:
        vid: key into ``vid_map``; the mapped value plus '.mp4' is the path
            of the file below ``raw_root``.

    Returns:
        ``clip.duration`` of the opened ``VideoFileClip``.
    """
    vid_path = raw_root + "/" + vid_map[vid] + '.mp4'
    clip = VideoFileClip(vid_path)
    try:
        return clip.duration
    finally:
        # Release the reader resources held by the clip so repeated calls
        # do not accumulate open readers.
        clip.close()

def get_answer_idx(answer, options):
    """Return the position of the first entry of ``options`` equal to ``answer``.

    When no entry matches, an error line is printed and -1 is returned.
    """
    position = next(
        (idx for idx, candidate in enumerate(options) if candidate == answer),
        None,
    )
    if position is None:
        print('Error: answer not in options')
        return -1
    return position

In [9]:
def get_moment_timespan(time_data, video_id, qid):
    """Look up the grounding entry of one question.

    ``time_data`` is indexed by ``str(video_id)``; the entry's 'location'
    mapping is indexed by ``str(qid)``.

    Returns:
        tuple ``(location, duration)`` taken from that entry.
    """
    entry = time_data[str(video_id)]
    return entry['location'][str(qid)], entry['duration']

In [10]:
new_val = []
new_test = []

# Each split is converted with its own grounding file; durations come from the
# grounding data, so no video file is opened here.
for split, grounding, converted in ((val, time_val, new_val), (test, time_test, new_test)):
    for qa in split:
        video_id = qa['video_id']
        record = {
            'video': vid_map[str(video_id)],
            'num_option': int(5),
            'qid': '_'.join([qa['type'], str(video_id), str(qa['qid'])]),
        }

        # answer options get a trailing '.', the question a trailing '?'
        options = [qa['a{}'.format(str(idx))] for idx in range(5)]
        for idx, option in enumerate(options):
            record['a{}'.format(str(idx))] = option+'.'
        record['answer'] = get_answer_idx(qa['answer'], options)
        record['question'] = qa['question']+'?'

        ### GQA specific
        record['relevant_windows'], record['duration'] = get_moment_timespan(grounding, video_id, qa['qid'])

        converted.append(record)

In [11]:
# Write the converted val / test splits to 'lavis/nextgqa' below ann_root
for split_name, split_records in (('val', new_val), ('test', new_test)):
    save_json(split_records, ann_root + '/lavis/nextgqa/' + split_name + '.json')

# ActivityNet Captions

In [None]:
ann_root = "Your/path/to/ActivityNet"
# val_1.json is used as the validation split and val_2.json as the test split
train_path, val_path, test_path = [
    os.path.join(ann_root, file_name)
    for file_name in ("train.json", "val_1.json", "val_2.json")
]


In [None]:
# Parse the three annotation files, in train / val / test order
train, val, test = [
    load_json(split_path) for split_path in (train_path, val_path, test_path)
]

In [None]:
def process_activitynet(data, relative_time=False, save_float=False):
    """Flatten ActivityNet Captions annotations into one record per sentence.

    Args:
        data: dict mapping a video id to a dict with the keys 'duration',
            'sentences' and 'timestamps' (a list of ``[start, end]`` pairs,
            matched to 'sentences' by position).
        relative_time: if True, windows are expressed relative to the video
            duration: two-decimal fractions with ``save_float``, integer
            percentages otherwise.
        save_float: if True keep float values, otherwise round to integers.

    Returns:
        list of dicts with the keys 'video', 'qid', 'query', 'duration' and
        'relevant_windows' (a list holding one ``[start, end]`` pair).

    Raises:
        AssertionError: in relative mode, when a window starts below 0 or
            ends beyond the video duration.
    """
    out = []

    for video_id, sample in data.items():
        duration = sample['duration']
        sentences = sample['sentences']
        timestamps = sample['timestamps']
        for j, (start, end) in enumerate(timestamps):

            if relative_time:
                start = round(start / duration, 2)
                end = round(end / duration, 2)

                if save_float:
                    window = [start, end]
                    assert window[0] >= 0 and window[1] <= 1
                else:
                    # Round to the nearest percent. Plain int() truncates
                    # float artefacts such as 0.29 * 100 == 28.999999999999996.
                    window = [int(round(start * 100)), int(round(end * 100))]
                    assert window[0] >= 0 and window[1] <= 100
            elif save_float:
                window = [float(start), float(end)]
            else:
                # round to nearest int
                window = [round(float(start)), round(float(end))]

            out.append({
                'video': video_id,
                'qid': f'ActivityNet_{video_id}_{j}',
                'query': sentences[j],
                'duration': duration,
                'relevant_windows': [window]
            })

    return out

In [None]:
save_float = False
relative_time = False

# Every split gets the same time format: the save cell below derives the file
# names of all three splits from these two flags.
new_train = process_activitynet(train, relative_time=relative_time, save_float=save_float)
new_val = process_activitynet(val, relative_time=relative_time, save_float=save_float)
new_test = process_activitynet(test, relative_time=relative_time, save_float=save_float)

len(new_train), len(new_val), len(new_test)

In [None]:
# save data
# The file-name suffix encodes the time format: '_relative' and/or '_float'
name_suffix = ('_relative' if relative_time else '') + ('_float' if save_float else '')
save_json(new_train, ann_root + '/lavis/train' + name_suffix + '.json')
save_json(new_val, ann_root + '/lavis/val' + name_suffix + '.json')
save_json(new_test, ann_root + '/lavis/test' + name_suffix + '.json')