In [1]:
import json
import os
import pandas as pd

# Frame rate used to convert a frame index into a timestamp (frame_index / FPS).
FPS = 29.97
# Directory holding one transcript CSV per recording date (`<date>.csv`).
transcript_dir = '../../data/transcripts'
# Read the annotations inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('../../data/annotation.json') as annotation_file:
    annotations = json.load(annotation_file)
# Length of each recording, taken from the `video_end` field of the last annotation
# entry for that date (presumably in seconds, since it is compared against
# frame_index / FPS when clipping windows -- confirm against the annotation schema).
video_lengths = {date: x[-1]['video_end'] for date, x in annotations.items()}

In [2]:
def construct(split, window_length):
    '''
        Construct the start and end times and corresponding transcripts for split `split`
        with w = `window_length`.

        For every frame listed in `{split}.json`, a window of total width `window_length`
        is centred on the frame's timestamp (`frame_index / FPS`) and clipped to
        `[0, video_lengths[date]]`. The `Sentence` values of all transcript rows whose
        (`Global start`, `Global end`) interval overlaps the window are joined with
        single spaces. Each frame dict gains the keys `video_start`, `video_end` and
        `transcript`, and the result is written to
        `context/SR_w{window_length}_{split}.json`.

        Args:
            split: name of the split; `{split}.json` is read from the working directory
                and is expected to map each date to a list of frame dicts.
            window_length: window width as a string, either `'<int>min'` or `'<int>s'`.

        Raises:
            ValueError: if `window_length` ends with neither `min` nor `s`, or if its
                numeric part is not an integer.
            AssertionError: if a clipped window is empty (start >= end).
    '''
    if window_length.endswith('min'):
        window_radius = int(window_length[:-3]) * 60 / 2
    elif window_length.endswith('s'):
        window_radius = int(window_length[:-1]) / 2
    else:
        # Previously any other suffix was silently interpreted as seconds.
        raise ValueError(
            f"window_length must end with 'min' or 's', got {window_length!r}"
        )

    # Context manager so the frames file is closed as soon as it has been parsed.
    with open(f'{split}.json') as frames_file:
        frames = json.load(frames_file)

    data = {}
    for date, date_frames in frames.items():
        print('Processing:', date)
        transcripts = pd.read_csv(f'{transcript_dir}/{date}.csv', encoding='cp1252')
        data[date] = []
        v_length = video_lengths[date]

        for frame in date_frames:
            center = frame['frame_index'] / FPS
            # Clip the window to the playable range of the recording.
            start = max(center - window_radius, 0.0)
            end = min(center + window_radius, v_length)
            assert start < end, f'empty window for {date}: start={start}, end={end}'
            # Two intervals overlap iff each one starts before the other ends.
            # `&` is the logical AND for boolean Series (the original used `*`).
            overlaps = (transcripts['Global start'] < end) & (transcripts['Global end'] > start)
            frame['video_start'] = start
            frame['video_end'] = end
            frame['transcript'] = ' '.join(transcripts.loc[overlaps, 'Sentence'].tolist())
            data[date].append(frame)

    os.makedirs('./context', exist_ok=True)
    with open(f'context/SR_w{window_length}_{split}.json', 'w') as f:
        json.dump(data, f, indent=4)

In [3]:
# Validation split, 1-minute window (30 s on either side of each frame).
construct(split='val', window_length='1min')

Processing: 03152022
Processing: 11152022


In [3]:
# Test split, 1-minute window (30 s on either side of each frame).
construct(split='test', window_length='1min')

Processing: 02282021
Processing: 06092023
Processing: 06262020
Processing: 09122021
Processing: 11152019
Processing: 12022019


In [5]:
# Generate the context files for every split at every window size (in minutes).
for split in ('train', 'val', 'test'):
    for w in (1, 2, 3, 5, 10, 15, 20):
        construct(split=split, window_length=str(w) + 'min')

Processing: 01152020
Processing: 01252020
Processing: 01272021
Processing: 02012021
Processing: 03232022
Processing: 06162021
Processing: 10062019
Processing: 11222019
Processing: 12022021
Processing: 12032022
Processing: 01152020
Processing: 01252020
Processing: 01272021
Processing: 02012021
Processing: 03232022
Processing: 06162021
Processing: 10062019
Processing: 11222019
Processing: 12022021
Processing: 12032022
Processing: 01152020
Processing: 01252020
Processing: 01272021
Processing: 02012021
Processing: 03232022
Processing: 06162021
Processing: 10062019
Processing: 11222019
Processing: 12022021
Processing: 12032022
Processing: 01152020
Processing: 01252020
Processing: 01272021
Processing: 02012021
Processing: 03232022
Processing: 06162021
Processing: 10062019
Processing: 11222019
Processing: 12022021
Processing: 12032022
Processing: 01152020
Processing: 01252020
Processing: 01272021
Processing: 02012021
Processing: 03232022
Processing: 06162021
Processing: 10062019
Processing: 1