# Creating the dataset a-temporal-upgrade

The dataset is derived from the ActivityNet 1.3 Captions dataset. The original dataset has been used heavily in the industry.

The resulting dataset is divided into two tasks:

- Action ordering : presents a question asking to order the scrambled actions based on which action comes earlier in the video
- Moment retrieval : presents a question to retrieve the time of the requested action

In [3]:
# import necessary libraries
import torch 
import datasets
from datasets import load_dataset, DatasetDict, load_from_disk, concatenate_datasets
import yt_dlp
import os
import random
from huggingface_hub import HfApi
import json
import shutil

## Preprocess Dataset

In [4]:
# Seed passed to the dataset shuffles in this notebook so their order is fixed.
SEED = 18220053 # seed for any randomization
# Directory layout keyed by purpose; "videos_download" is the folder that
# download_videos writes to and that the availability filter reads from.
dirs = {
    "videos_download": "videos/download",
    "videos_zip": "videos/zip",
    "saved_processed": "saved/processed",
    "saved": "saved"
}
# NOTE(review): the two constants below repeat values already held in `dirs`
# and are not referenced in the visible cells — confirm before removing.
VIDEOS_DOWNLOAD_PATH = "videos/download"
VIDEOS_ZIP_PATH = 'videos/zip'
# Hub dataset repository that receives the processed datasets and video archives.
DATASET_REPO = 'jwnt4/a-temporal-upgrade'

In [3]:
# Fetch ActivityNet Captions through huggingface datasets, then narrow it down:
# first by the `duration` field (20 to 50 inclusive), then by the number of
# English captions (3 or 4).
ds = load_dataset('Leyo/ActivityNet_Captions', trust_remote_code=True)
ds = ds.filter(lambda row: 20 <= row["duration"] <= 50)
ds = ds.filter(lambda row: 3 <= len(row["en_captions"]) <= 4)
ds

DatasetDict({
    train: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 856
    })
    validation: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 484
    })
    test: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 667
    })
})

In [95]:
# check the dataset
ds['train'][0]

{'video_id': 'v_ogQozSI5V8U',
 'video_path': 'https://www.youtube.com/watch?v=ogQozSI5V8U',
 'duration': 36.54999923706055,
 'captions_starts': [0.0, 7.489999771118164, 19.3700008392334],
 'captions_ends': [7.489999771118164, 18.09000015258789, 36.54999923706055],
 'en_captions': ['We see a hallway with a wooden floor.',
  ' A dog in socks walks slowly out onto the floor as a lady films him.',
  ' The dog turns around and goes back to the other room.']}

### Downloading videos

In [None]:
# Ids of videos whose download raised an error; filled in by MyLogger.error.
unavailable_videos = set()

class MyLogger:
    """Logger object handed to yt-dlp that records the ids of failed videos.

    Debug, info and warning messages are discarded. An error message is
    expected to carry the video id as its third space-separated token; that
    id is added to the module-level ``unavailable_videos`` set.

    Args:
        suppress_warning: when truthy, nothing is printed for failed videos.
    """

    def __init__(self, suppress_warning):
        # The attribute keeps its original spelling so existing readers of it keep working.
        self.supress_warning = suppress_warning

    def debug(self, msg):
        # Anything not tagged '[debug] ' is handed to info(), which drops it.
        if not msg.startswith('[debug] '):
            self.info(msg)

    def info(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        """Record the video id found in ``msg`` and optionally report it."""
        tokens = msg.split(' ')
        video_id = tokens[2].strip("':") if len(tokens) >= 3 else ''
        if not video_id:
            # The message does not have the expected layout: there is no id to
            # record, so surface the raw text instead of raising IndexError.
            if not self.supress_warning:
                print(msg)
            return
        unavailable_videos.add(video_id)
        if not self.supress_warning:
            print(video_id, "is unavailable")


def download_videos(urls, suppress_warning = False):
    """Download the videos in ``urls`` that are not yet in the download folder.

    Files are expected on disk as ``v_<id>.mp4`` inside
    ``dirs['videos_download']``; urls whose file already exists are skipped.

    Args:
        urls: iterable of YouTube watch urls; the id is taken from the text
            after the first ``=``.
        suppress_warning: forwarded to ``MyLogger``; when truthy, failed ids
            are recorded but not printed.

    Returns:
        dict with ``success_downloads``, ``fail_downloads`` and
        ``total_in_disk`` counts for this call.
    """
    # Options handed unchanged to yt_dlp.YoutubeDL.
    ytdlp_config = {
        'extract_flat': 'discard_in_playlist',
        'format': '[height>=360][height<=1080]',
        'fragment_retries': 10,
        'format_sort': ['ext'],
        'ignoreerrors': 'only_download',
        'postprocessors': [{
            'key': 'FFmpegConcat',
            'only_multi_video': True,
            'when': 'playlist'
            }],
        'outtmpl': {'default': f'{dirs["videos_download"]}/v_%(id)s.%(ext)s'},
        'retries': 10,
        'logger' : MyLogger(suppress_warning)
    }

    # Create the folder on first use so the listing below cannot fail.
    os.makedirs(dirs['videos_download'], exist_ok=True)
    vid_downloaded = set(os.listdir(dirs['videos_download']))
    print("total vid in download dir:", len(vid_downloaded))
    # NOTE(review): assumes every download ends up with an .mp4 extension — confirm.
    vid_all = set(["v_" + u.split("=")[1] + ".mp4" for u in urls])
    print("all vid to download:", len(vid_all))
    vid_downloaded = vid_all & vid_downloaded
    vid_to_download = vid_all - vid_downloaded

    num_download, num_downloaded = len(vid_to_download), len(vid_downloaded)
    print("vid already downloaded:", num_downloaded)
    print("remaining vid to download:", num_download)

    # Strip the "v_" prefix and ".mp4" suffix to get back the bare ids.
    requested_ids = {v[2:-4] for v in vid_to_download}
    # `unavailable_videos` persists across calls; forget earlier failures for the
    # ids retried now so the counts below describe this call only.
    unavailable_videos.difference_update(requested_ids)

    if requested_ids:
        with yt_dlp.YoutubeDL(ytdlp_config) as ydl:
            ydl.download([f'https://www.youtube.com/watch?v={vid}' for vid in sorted(requested_ids)])

    failed = len(unavailable_videos & requested_ids)
    success = num_download - failed
    return {
        "success_downloads": success,
        "fail_downloads": failed,
        "total_in_disk": success + num_downloaded
    }

In [33]:
download_videos(ds['train']['video_path'] + ds['test']['video_path'] + ds['validation']['video_path'])

total vid in dir: 990
vid to download: 1548
vid already downloaded: 972
remaining vid to download: 576
srARxP_ocyg is unavailable
tbKBKWCh6rs is unavailable
rIqITS6qMB0 is unavailable
y3Zq6RZZNtc is unavailable
bDiwuABU45I is unavailable
rtJTJ10ppRc is unavailable
jmS3NFo4XCc is unavailable
9UpVdljXQ4E is unavailable
6wTk8QqWxuo is unavailable
1_YFTTzzLrI is unavailable
Nh-RdjyfGNA is unavailable
GBdj6erXjDM is unavailable
54K2F3zAZ0o is unavailable
3YiGMRp-7B4 is unavailable
tMM166j4YEw is unavailable
unLrTQt07kI is unavailable
IJER0EpbxW4 is unavailable
jFZRNe7xFY8 is unavailable
ZlwU7HKcoYs is unavailable
Pmt3R5olRP0 is unavailable
Wyr2o0lsSTU is unavailable
Quj1J31xQFM is unavailable
HVKveVRZ-JY is unavailable
DJyfOeZc2lI is unavailable
N1JcXEim40g is unavailable
TomBet77rDc is unavailable
zMrUSfQ_mzo is unavailable
bHAzuAnnvcU is unavailable
fU4EgYmISro is unavailable
6czh95dpwAA is unavailable
KgfKmcsEMK0 is unavailable
RNrxxPOyHo4 is unavailable
nSuPseBeQI0 is unavailable
701qhm

{'success_downloads': 20, 'fail_downloads': 556, 'total_in_disk': 992}

In [7]:
# Filter only the available videos
ds = ds.filter(lambda e: os.path.isfile(f"{dirs['videos_download']}/{e['video_id']}.mp4"))
ds

DatasetDict({
    train: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 521
    })
    validation: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 336
    })
    test: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 455
    })
})

In [None]:
# Make sure the test and validation splits do not contain any video that also appears in another split

train_val = set(ds['train']['video_id'] + ds['validation']['video_id'])
train_test = set(ds['train']['video_id'] + ds['test']['video_id'])

# Walk each id column once; indexing ds[split][i] would build a whole row per lookup.
# val_set / test_set hold row indices of videos unique to their split,
# val_to_train / test_to_train hold row indices of videos shared with another split.
val_set = []
val_to_train = []
for row_idx, video_id in enumerate(ds['validation']['video_id']):
    if video_id not in train_test:
        val_set.append(row_idx)
    else:
        val_to_train.append(row_idx)

test_set = []
test_to_train = []
for row_idx, video_id in enumerate(ds['test']['video_id']):
    if video_id not in train_val:
        test_set.append(row_idx)
    else:
        test_to_train.append(row_idx)

len(val_set), len(test_set)

(16, 135)

In [9]:
# Rebuild the three splits from the row indices computed in the previous cell.
# NOTE(review): the slice boundaries 70 and 104 are hard-coded and appear to be
# tuned to the 135 unique test rows reported by the previous cell — confirm before re-running on other data.
test_dataset = ds['test'].select(test_set[:70])
val_dataset = concatenate_datasets([
    ds['validation'].select(val_set),
    ds['test'].select(test_set[70:104]) # move some test set to val set
])
# Everything that is shared between splits, plus the leftover unique test rows, goes to train.
train_dataset = concatenate_datasets([
    ds['train'],
    ds['validation'].select(val_to_train),
    ds['test'].select(test_to_train),
    ds['test'].select(test_set[104:]) # remaining test set to train
])
ds_processed = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": val_dataset
})
ds_processed

DatasetDict({
    train: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 1192
    })
    test: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 70
    })
    validation: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 50
    })
})

In [45]:
ds_processed = ds_processed.shuffle(seed=SEED)

In [48]:
ds_processed.save_to_disk("dataset/saved/processed2")

Saving the dataset (0/1 shards):   0%|          | 0/1192 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

In [49]:
ds_processed = load_from_disk("dataset/saved/processed2") 

## Action Ordering Dataset

We use half of the first training set for action ordering and half for moment retrieval.

We use the same test and validation sets for the two datasets since they never contribute to the gradients.

It means that the model does not learn from the test / validation set.

It also means that we now have more validation and test sets.

In [53]:
# Action ordering trains on the first half of the training split. The split
# point is derived from the data so it stays correct if the row count changes.
ao_train_size = len(ds_processed['train']) // 2
ao = DatasetDict({
    "train": ds_processed['train'].select(range(ao_train_size)),
    "validation": ds_processed['validation'],
    "test": ds_processed['test']
})
ao

DatasetDict({
    train: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 596
    })
    validation: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 50
    })
    test: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 70
    })
})

In [135]:
# Largest total caption length (in characters) seen across all splits of `ao`.
ao_maxes = {
    "num_max_caption_len": 0,
}
def find_longest(e):
    """Raise ``ao_maxes['num_max_caption_len']`` if this example is longer.

    The length is the character count of all ``en_captions`` joined together.
    Always returns True.
    """
    total_len = len(''.join(e['en_captions']))
    if total_len > ao_maxes['num_max_caption_len']:
        ao_maxes['num_max_caption_len'] = total_len
    return True

# Scan the caption column directly. Routing find_longest through ao.filter relied
# on a side effect that does not run when `datasets` reuses a cached filter
# result, which would leave the maximum at 0.
for split in ao.values():
    for captions in split['en_captions']:
        find_longest({'en_captions': captions})
ao_maxes

Filter:   0%|          | 0/596 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/70 [00:00<?, ? examples/s]

{'num_max_caption_len': 618}

The robust prompt has more instructions, detail, and an example, but is longer than the normal prompt.

The wording can be chosen later; 'actions' is closely related to 'scenes' and 'moments', but is more commonly used.

In [171]:
robust_prompt = """Your task is to determine the correct chronological order of the scrambled actions from the video.\
 Provide your answer as a comma-separated letters, where each letter ({letters}) represents an action in the video.\
 The order of letters must match the timeline of actions, with earlier letters representing earlier actions. Do not provide any other explanation in your response.
Here is an example: The action E is "a robber meets a cop" and the action F is "the robber runs across the street".\
 If in the video the robber runs after meeting the cop, then action E happens before action F. Therefore, your answer should be "E, F" (this is an example).
Here is the video context: {context}
Here is the question: What is the correct chronological order of the scrambled actions based on when they happen in the video?
Here are the scrambled actions:
{actions}"""

normal_prompt = """The actions below are from the video but the ordering of them is scrambled. Arrange them in the correct chronological order based on when they happen.
The video context is: {context}
The scrambled actions are:
{actions}
The answer format: provide your answer as a comma-separated letters, for example: "A, B". Each letter represent an action, and earlier actions should appear earlier in your answer.\
 Ensure that your answer includes all actions ({letters}) and matches the number of actions provided.
Question: What is the correct chronological order of the scrambled actions based on when they happen in the video?"""

def transform_action_ordering(example):
    """Turn one captioned video into an action-ordering question.

    The first caption becomes the video context. Every other caption is an
    action that receives a letter (A, B, ...) in scrambled order; the answer
    lists those letters in the captions' original order.

    Args:
        example: mapping with ``video_id``, ``duration`` and ``en_captions``.

    Returns:
        dict with ``video_id``, ``question_normal``, ``question_robust``,
        ``answer`` (list of letters) and ``complexity`` (float).
    """
    capt = example['en_captions']
    num_actions = len(capt) - 1

    # Seed a private RNG from SEED and the video id so the scrambling is
    # reproducible and does not depend on the order examples are processed in.
    rng = random.Random(f"{SEED}:{example['video_id']}")
    letter_slots = list(range(1, num_actions + 1))
    rng.shuffle(letter_slots) # scramble

    # letter_slots[k] is the letter slot handed to caption k + 1.
    slot_to_caption = {slot: k + 1 for k, slot in enumerate(letter_slots)}
    answer = [chr(slot + 64) for slot in letter_slots] # letters in caption order
    actions = '\n'.join(
        f'{chr(slot + 64)}. {capt[slot_to_caption[slot]].strip()}'
        for slot in sorted(slot_to_caption)
    ) # actions listed alphabetically by letter

    # Human-readable list of the letters in use: 'A and B', 'A, B, and C', ...
    option_letters = [chr(65 + k) for k in range(num_actions)]
    if num_actions <= 2:
        letters = ' and '.join(option_letters)
    else:
        letters = ', '.join(option_letters[:-1]) + ', and ' + option_letters[-1]

    context = capt[0].strip(" ")
    context = context[:1].upper() + context[1:] # slice so an empty caption does not raise
    question_normal = normal_prompt.format(context=context, letters=letters, actions=actions)
    question_robust = robust_prompt.format(context=context, letters=letters, actions=actions)

    # NOTE(review): the weights sum to 4 but the total is divided by 5, and
    # len(capt) / 3 exceeds 1 for four captions — confirm this scaling is intended.
    ratio_option = len(capt) / 3
    ratio_token = len(''.join(example['en_captions'])) / ao_maxes['num_max_caption_len']
    ratio_duration = example['duration'] / 50
    complexity = (ratio_option * 1.5 + ratio_duration + ratio_token * 1.5) / 5

    return {
        "video_id": example['video_id'],
        "question_normal" : question_normal,
        "question_robust": question_robust,
        "answer": answer,
        "complexity": complexity # adds an option for the model to learn easier questions first
    }


In [172]:
# Before
ao['train'][15]

{'video_id': 'v_VlLq4bAHCXI',
 'video_path': 'https://www.youtube.com/watch?v=VlLq4bAHCXI',
 'duration': 44.560001373291016,
 'captions_starts': [0.0, 7.800000190734863, 35.650001525878906],
 'captions_ends': [7.800000190734863, 35.650001525878906, 44.560001373291016],
 'en_captions': ['These people are outside talking and  walking along the sidewalk.',
  ' There are also people running in their short outfits.',
  " There's also people watching them as they run in the middle of the street."]}

In [173]:
# after
after = transform_action_ordering(ao['train'][15])
for k,v in after.items():
    print(f"{k}:\n{v}\n")

video_id:
v_VlLq4bAHCXI

question_normal:
The actions below are from the video but the ordering of them is scrambled. Arrange them in the correct chronological order based on when they happen.
The video context is: These people are outside talking and  walking along the sidewalk.
The scrambled actions are:
A. There are also people running in their short outfits.
B. There's also people watching them as they run in the middle of the street.
The answer format: provide your answer as a comma-separated letters, for example: "A, B". Each letter represent an action, and earlier actions should appear earlier in your answer. Ensure that your answer includes all actions (A and B) and matches the number of actions provided.
Question: What is the correct chronological order of the scrambled actions based on when they happen in the video?

question_robust:
Your task is to determine the correct chronological order of the scrambled actions from the video. Provide your answer as a comma-separated lett

In [174]:
ao_tf = ao.map(
    function=transform_action_ordering,
    remove_columns=['video_path', 'en_captions']
)
ao_tf

Map:   0%|          | 0/596 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['video_id', 'duration', 'captions_starts', 'captions_ends', 'question_normal', 'question_robust', 'answer', 'complexity'],
        num_rows: 596
    })
    validation: Dataset({
        features: ['video_id', 'duration', 'captions_starts', 'captions_ends', 'question_normal', 'question_robust', 'answer', 'complexity'],
        num_rows: 50
    })
    test: Dataset({
        features: ['video_id', 'duration', 'captions_starts', 'captions_ends', 'question_normal', 'question_robust', 'answer', 'complexity'],
        num_rows: 70
    })
})

In [175]:
for k, v in ao_tf['train'][0].items():
    print(f"{k}:\n{v}\n")

video_id:
v_tMTvOaUYNeg

duration:
23.959999084472656

captions_starts:
[0.23999999463558197, 3.4700000286102295, 9.350000381469727]

captions_ends:
[23.360000610351562, 8.630000114440918, 23.719999313354492]

question_normal:
The actions below are from the video but the ordering of them is scrambled. Arrange them in the correct chronological order based on when they happen.
The video context is: One adult and two small children work together to put leaves on a yard into a paper bag.
The scrambled actions are:
A. The boy puts leaves in the brown bag and the girl picks the leaves up.
B. One adult stands with two children on a lawn amidst many leaves with a black truck in the background.
The answer format: provide your answer as a comma-separated letters, for example: "A, B". Each letter represent an action, and earlier actions should appear earlier in your answer. Ensure that your answer includes all actions (A and B) and matches the number of actions provided.
Question: What is the cor

In [189]:
ao_tf.save_to_disk('dataset/saved/action_ordering_v2')

Saving the dataset (0/1 shards):   0%|          | 0/596 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

### Upload dataset and video to hub

In [190]:
ao_tf = load_from_disk('dataset/saved/action_ordering_v2')
ao_tf

DatasetDict({
    train: Dataset({
        features: ['video_id', 'duration', 'captions_starts', 'captions_ends', 'question_normal', 'question_robust', 'answer', 'complexity'],
        num_rows: 596
    })
    validation: Dataset({
        features: ['video_id', 'duration', 'captions_starts', 'captions_ends', 'question_normal', 'question_robust', 'answer', 'complexity'],
        num_rows: 50
    })
    test: Dataset({
        features: ['video_id', 'duration', 'captions_starts', 'captions_ends', 'question_normal', 'question_robust', 'answer', 'complexity'],
        num_rows: 70
    })
})

In [191]:
videos_to_upload = set(ao_tf['train']['video_id'] + ao_tf['test']['video_id'] + ao_tf['validation']['video_id'])
len(videos_to_upload)

634

In [None]:
if not os.path.isdir("dataset/videos/action_ordering_v2"):
    os.makedirs("dataset/videos/action_ordering_v2")
for vid_id in videos_to_upload:
    shutil.copy(f"dataset/videos/download/{vid_id}.mp4", f"dataset/videos/action_ordering_v2/{vid_id}.mp4")

In [192]:
ao_tf.push_to_hub(DATASET_REPO, 'action_ordering_v2')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwnt4/a-temporal-upgrade/commit/cd2bf1e721ccf8ff089db02c5ee50b8fdb37f6bb', commit_message='Upload dataset', commit_description='', oid='cd2bf1e721ccf8ff089db02c5ee50b8fdb37f6bb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwnt4/a-temporal-upgrade', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwnt4/a-temporal-upgrade'), pr_revision=None, pr_num=None)

In [195]:
# Create the archive
shutil.make_archive('dataset/videos/zip/action_ordering_v2', 'zip', "dataset/videos", 'action_ordering_v2')

'/Users/t-i.jiwanta/dev/ai/a-temporal-upgrade/dataset/videos/zip/action_ordering_v2.zip'

In [196]:
api = HfApi()
api.upload_file(
    path_or_fileobj=f"dataset/videos/zip/action_ordering_v2.zip",
    path_in_repo="videos/action_ordering_v2.zip",
    repo_id=DATASET_REPO,
    repo_type="dataset",
)

action_ordering_v2.zip:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwnt4/a-temporal-upgrade/commit/5458158640f39bc45cd68e4c7baf2bb3122bcd39', commit_message='Upload videos/action_ordering_v2.zip with huggingface_hub', commit_description='', oid='5458158640f39bc45cd68e4c7baf2bb3122bcd39', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwnt4/a-temporal-upgrade', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwnt4/a-temporal-upgrade'), pr_revision=None, pr_num=None)

## Moment retrieval dataset

In [226]:
# load dataset using huggingface datasets
ds_processed = load_from_disk("dataset/saved/processed2")
# Moment retrieval trains on the second half of the training split. The split
# point is derived from the data so it stays correct if the row count changes.
mr_train_start = len(ds_processed['train']) // 2
mr = DatasetDict({
    "train": ds_processed['train'].select(range(mr_train_start, len(ds_processed['train']))),
    "test": ds_processed['test'],
    "validation": ds_processed['validation']
})
mr

DatasetDict({
    train: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 596
    })
    test: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 70
    })
    validation: Dataset({
        features: ['video_id', 'video_path', 'duration', 'captions_starts', 'captions_ends', 'en_captions'],
        num_rows: 50
    })
})

In [256]:
frame_prompt = """Your task is to determine the frame range that best represents an action in the video.\
 Provide your answer as two numbers separated by a comma, where the first number is the first frame correlated to the action and the second number is the last frame correlated to the action.\
 Do not provide any other explanation in your response.
Here is an example: Suppose the video has frames numbered from 1 to N (N is the number of frames sampled). \
If the action in question is "a man sings on the street" and the frames that has the most similarities with this action are 5, 6, and 7 your answer should be: "5, 7" (this is an example).
Number of frames sampled in this video: <num_frames>
Here is the video context: {context}
Here is the action in question: <action>
Here is the question: What is the frame range (start, end) in the video that best represents the action asked?"""

timestamp_prompt = """Your task is to determine the timestamp range that best represents an action in the video. \
Use the provided frame-to-timestamp mapping to associate the timestamps with the actual video frames. \
Find the most similar continuous sequence of timestamp with the action asked.
Provide your answer as two timestamps in the format "mm:ss, mm:ss" (e.g. "00:10, 00:30"), where the first timestamp is the start time of the action and the second timestamp is the end time of the action. \
Do not provide any other explanation in your response. 
Duration of the video: <duration>
Number of frames sampled in this video: <num_frames>
Here is the frame-to-timestamp mapping for this video:
<frame_info>
Here is the video context: {context} 
The action in question is: <action>
The question: What is the timestamp range (start, end) in the video that best represents the action asked?"""

def transform_moment_retrieval(sample):
    """Build moment-retrieval prompts, actions and answers for one video.

    The first caption is used as the video context in both prompts. Every
    other caption is a candidate action; candidates are ordered from longest
    to shortest segment and kept only when the segment lasts at least 2 and at
    most 95% of ``duration``.

    Args:
        sample: mapping with ``duration``, ``en_captions``, ``captions_starts``
            and ``captions_ends``.

    Returns:
        dict with ``prompt_frame``, ``prompt_timestamp``, ``complexity``
        (1 minus the mean fraction of the video covered by the candidate
        segments), ``actions`` and ``answers`` (``[start, end]`` pairs aligned
        with ``actions``).

    Raises:
        ZeroDivisionError: when there is no caption besides the context one,
            or when ``duration`` is 0.
    """
    capt = sample['en_captions']
    starts, ends = sample['captions_starts'], sample['captions_ends']
    duration = sample['duration']

    # [caption index, segment length] for every non-context caption, longest first.
    capt_len = sorted(
        [[i, ends[i] - starts[i]] for i in range(1, len(capt))],
        key=lambda e: e[1],
        reverse=True,
    )

    answers = []
    actions = []
    for idx, length in capt_len:
        if 2 <= length <= 0.95 * duration: # filter out too-short or too-long actions
            answers.append([starts[idx], ends[idx]])
            actions.append(capt[idx].strip())

    # Complexity uses every candidate segment, including the ones filtered out above.
    coverage = [length / duration for _, length in capt_len]
    complexity = 1 - (sum(coverage) / len(coverage))

    context = capt[0].strip()
    return {
        "prompt_frame": frame_prompt.format(context=context),
        "prompt_timestamp": timestamp_prompt.format(context=context),
        "complexity": complexity,
        "actions": actions,
        "answers": answers,
    }

In [257]:
x = transform_moment_retrieval(mr['train'][1])
for k, v in x.items():
    print(f"{k}:\n{v}\n")

prompt_frame:
Your task is to determine the frame range that best represents an action in the video. Provide your answer as two numbers separated by a comma, where the first number is the first frame correlated to the action and the second number is the last frame correlated to the action. Do not provide any other explanation in your response.
Here is an example: Suppose the video has frames numbered from 1 to N (N is the number of frames sampled). If the action in question is "a man sings on the street" and the frames that has the most similarities with this action are 5, 6, and 7 your answer should be: "5, 7" (this is an example).
Number of frames sampled in this video: <num_frames>
Here is the video context: A mom is sitting at the top of a slide with her little baby.
Here is the action in question: <action>
Here is the question: What is the frame range (start, end) in the video that best represents the action asked?

prompt_timestamp:
Your task is to determine the timestamp range t

In [258]:
mr_transformed = mr.map(
    function=transform_moment_retrieval,
    remove_columns=['video_path', 'captions_starts', 'captions_ends', 'en_captions']
)
for k, v in mr_transformed['train'][2].items():
    print(f"{k}:\n{v}\n")

Map:   0%|          | 0/596 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

video_id:
v_uIl5Tj74sLw

duration:
38.56999969482422

prompt_frame:
Your task is to determine the frame range that best represents an action in the video. Provide your answer as two numbers separated by a comma, where the first number is the first frame correlated to the action and the second number is the last frame correlated to the action. Do not provide any other explanation in your response.
Here is an example: Suppose the video has frames numbered from 1 to N (N is the number of frames sampled). If the action in question is "a man sings on the street" and the frames that has the most similarities with this action are 5, 6, and 7 your answer should be: "5, 7" (this is an example).
Number of frames sampled in this video: <num_frames>
Here is the video context: a girl stands in front of a bathroom mirror and vigorously rubs her face.
Here is the action in question: <action>
Here is the question: What is the frame range (start, end) in the video that best represents the action asked?

In [265]:
for k, v in mr_transformed['train'][10].items():
    print(f"{k}:\n{v}\n")

video_id:
v__UPD2IvdQ_M

duration:
49.7400016784668

prompt_frame:
Your task is to determine the frame range that best represents an action in the video. Provide your answer as two numbers separated by a comma, where the first number is the first frame correlated to the action and the second number is the last frame correlated to the action. Do not provide any other explanation in your response.
Here is an example: Suppose the video has frames numbered from 1 to N (N is the number of frames sampled). If the action in question is "a man sings on the street" and the frames that has the most similarities with this action are 5, 6, and 7 your answer should be: "5, 7" (this is an example).
Number of frames sampled in this video: <num_frames>
Here is the video context: An audience is gathered in a gymnasium to watch the gymnast on the pommel horse.
Here is the action in question: <action>
Here is the question: What is the frame range (start, end) in the video that best represents the action 

In [260]:
mr_transformed.save_to_disk('dataset/saved/moment_retrieval_v2')

Saving the dataset (0/1 shards):   0%|          | 0/596 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

In [261]:
mr_transformed = load_from_disk('dataset/saved/moment_retrieval_v2')

In [263]:
videos_to_upload = set(
    mr_transformed['train']['video_id'] + 
    mr_transformed['test']['video_id'] + 
    mr_transformed['validation']['video_id']
)
len(videos_to_upload)

624

In [264]:
if not os.path.isdir("dataset/videos/moment_retrieval_v2"):
    os.makedirs("dataset/videos/moment_retrieval_v2")
for vid_id in videos_to_upload:
    shutil.copy(f"dataset/videos/download/{vid_id}.mp4", f"dataset/videos/moment_retrieval_v2/{vid_id}.mp4")

In [266]:
mr_transformed.push_to_hub(DATASET_REPO, 'moment_retrieval_v2')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwnt4/a-temporal-upgrade/commit/f7acc632236cf072eeb6470fca16c1db30616490', commit_message='Upload dataset', commit_description='', oid='f7acc632236cf072eeb6470fca16c1db30616490', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwnt4/a-temporal-upgrade', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwnt4/a-temporal-upgrade'), pr_revision=None, pr_num=None)

In [267]:
# Create the archive
shutil.make_archive('dataset/videos/zip/moment_retrieval_v2', 'zip', "dataset/videos", 'moment_retrieval_v2')

'/Users/t-i.jiwanta/dev/ai/a-temporal-upgrade/dataset/videos/zip/moment_retrieval_v2.zip'

In [268]:
api = HfApi()
api.upload_file(
    path_or_fileobj=f"dataset/videos/zip/moment_retrieval_v2.zip",
    path_in_repo="videos/moment_retrieval_v2.zip",
    repo_id=DATASET_REPO,
    repo_type="dataset",
)

moment_retrieval_v2.zip:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwnt4/a-temporal-upgrade/commit/9404fca2e897f4bd4b994872a45e118328c1d0df', commit_message='Upload videos/moment_retrieval_v2.zip with huggingface_hub', commit_description='', oid='9404fca2e897f4bd4b994872a45e118328c1d0df', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwnt4/a-temporal-upgrade', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwnt4/a-temporal-upgrade'), pr_revision=None, pr_num=None)

## Expanding data for moment retrieval v2

In [5]:
mr = load_dataset("jwnt4/a-temporal-upgrade", "moment_retrieval_v2")
mr

DatasetDict({
    train: Dataset({
        features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'complexity', 'actions', 'answers'],
        num_rows: 596
    })
    test: Dataset({
        features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'complexity', 'actions', 'answers'],
        num_rows: 70
    })
    validation: Dataset({
        features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'complexity', 'actions', 'answers'],
        num_rows: 50
    })
})

In [6]:
train_1s = mr['train'].filter(lambda e: len(e['actions']) == 1).map(lambda e: {**e, "action": e['actions'][0], "answer": e['answers'][0]})
train_1s

Dataset({
    features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'complexity', 'actions', 'answers', 'action', 'answer'],
    num_rows: 56
})

In [7]:
train_2 = mr['train'].filter(lambda e: len(e['actions']) == 2)
train_2s_arr = []
for i in range(2):
    train_2s_arr.append(train_2.map(lambda e: {**e, "action": e['actions'][i], "answer": e['answers'][i]}))
train_2s = concatenate_datasets(train_2s_arr)
train_2s

Dataset({
    features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'complexity', 'actions', 'answers', 'action', 'answer'],
    num_rows: 842
})

In [8]:
train_3 = mr['train'].filter(lambda e: len(e['actions']) == 3)
train_3s_arr = []
for i in range(3):
    train_3s_arr.append(train_3.map(lambda e: {**e, "action": e['actions'][i], "answer": e['answers'][i]}))
train_3s = concatenate_datasets(train_3s_arr)
train_3s

Dataset({
    features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'complexity', 'actions', 'answers', 'action', 'answer'],
    num_rows: 315
})

In [9]:
# One row per (video, action) pair: merge the expanded subsets, drop the
# per-video list columns with remove_columns (no identity map needed), then
# shuffle with the fixed seed.
mr_train = concatenate_datasets((train_1s, train_2s, train_3s))
mr_train = mr_train.remove_columns(['answers', 'actions', 'complexity'])
mr_train = mr_train.shuffle(seed=SEED)
mr_train

Dataset({
    features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'action', 'answer'],
    num_rows: 1213
})

In [10]:
# Spot-check a single row of the expanded training split.
mr_train[1]

{'video_id': 'v_0JgcRWHCi4c',
 'duration': 22.780000686645508,
 'prompt_frame': 'Your task is to determine the frame range that best represents an action in the video. Provide your answer as two numbers separated by a comma, where the first number is the first frame correlated to the action and the second number is the last frame correlated to the action. Do not provide any other explanation in your response.\nHere is an example: Suppose the video has frames numbered from 1 to N (N is the number of frames sampled). If the action in question is "a man sings on the street" and the frames that has the most similarities with this action are 5, 6, and 7 your answer should be: "5, 7" (this is an example).\nNumber of frames sampled in this video: <num_frames>\nHere is the video context: A woman is seen standing in a living room with a dog holding a frisbee.\nHere is the action in question: <action>\nHere is the question: What is the frame range (start, end) in the video that best represents t

In [11]:
# Expand the test split into one row per (video, action) pair.
# For each supported action count, keep the rows with exactly that many actions
# and emit one copy per list position, exposing that position's action/answer
# pair as scalar `action` / `answer` columns.
test_by_count = {}
for n_actions in (1, 2, 3):
    # Loop values are bound as lambda defaults so each callable carries its own
    # copy rather than reading the loop variable at call time.
    test_subset = mr['test'].filter(lambda e, n=n_actions: len(e['actions']) == n)
    test_by_count[n_actions] = concatenate_datasets([
        test_subset.map(
            lambda e, pos=pos: {**e, "action": e['actions'][pos], "answer": e['answers'][pos]}
        )
        for pos in range(n_actions)
    ])
test_1s, test_2s, test_3s = (test_by_count[n] for n in (1, 2, 3))

# Same ordering as before: 1-action rows, then 2-action, then 3-action.
mr_test = concatenate_datasets([test_1s, test_2s, test_3s])
# Drop the per-video columns directly instead of via an identity `map`.
mr_test = mr_test.remove_columns(['answers', 'actions', 'complexity'])
mr_test

Dataset({
    features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'action', 'answer'],
    num_rows: 135
})

In [12]:
# Expand the validation split into one row per (video, action) pair.
# For each supported action count, keep the rows with exactly that many actions
# and emit one copy per list position, exposing that position's action/answer
# pair as scalar `action` / `answer` columns.
val_by_count = {}
for n_actions in (1, 2, 3):
    # Loop values are bound as lambda defaults so each callable carries its own
    # copy rather than reading the loop variable at call time.
    val_subset = mr['validation'].filter(lambda e, n=n_actions: len(e['actions']) == n)
    val_by_count[n_actions] = concatenate_datasets([
        val_subset.map(
            lambda e, pos=pos: {**e, "action": e['actions'][pos], "answer": e['answers'][pos]}
        )
        for pos in range(n_actions)
    ])
val_1s, val_2s, val_3s = (val_by_count[n] for n in (1, 2, 3))

# Same ordering as before: 1-action rows, then 2-action, then 3-action.
mr_val = concatenate_datasets([val_1s, val_2s, val_3s])
# Drop the per-video columns directly instead of via an identity `map`.
mr_val = mr_val.remove_columns(['answers', 'actions', 'complexity'])
mr_val

Dataset({
    features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'action', 'answer'],
    num_rows: 104
})

In [13]:
# Bundle the three expanded splits under their split names.
# Note: this rebinds `mr`, which until now held the dataset loaded from the Hub.
expanded_splits = {}
expanded_splits["train"] = mr_train
expanded_splits["test"] = mr_test
expanded_splits["validation"] = mr_val
mr = DatasetDict(expanded_splits)

In [14]:
# Display the DatasetDict to check each split's features and row count.
mr

DatasetDict({
    train: Dataset({
        features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'action', 'answer'],
        num_rows: 1213
    })
    test: Dataset({
        features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'action', 'answer'],
        num_rows: 135
    })
    validation: Dataset({
        features: ['video_id', 'duration', 'prompt_frame', 'prompt_timestamp', 'action', 'answer'],
        num_rows: 104
    })
})

In [15]:
# Spot-check one row of the train split through the DatasetDict.
mr['train'][17]

{'video_id': 'v_LSCQ1yqocHg',
 'duration': 24.59000015258789,
 'prompt_frame': 'Your task is to determine the frame range that best represents an action in the video. Provide your answer as two numbers separated by a comma, where the first number is the first frame correlated to the action and the second number is the last frame correlated to the action. Do not provide any other explanation in your response.\nHere is an example: Suppose the video has frames numbered from 1 to N (N is the number of frames sampled). If the action in question is "a man sings on the street" and the frames that has the most similarities with this action are 5, 6, and 7 your answer should be: "5, 7" (this is an example).\nNumber of frames sampled in this video: <num_frames>\nHere is the video context: There\'s a man standing in a kitchen and washing his hands in steel kitchen sink.\nHere is the action in question: <action>\nHere is the question: What is the frame range (start, end) in the video that best rep

In [16]:
# Publish the expanded splits to the Hub repo named by DATASET_REPO in the
# config cell, so the repository id is not repeated as a literal here.
# NOTE(review): the source data was loaded from the "moment_retrieval_v2"
# config, while this call writes to "moment_retrieval" — confirm this is the
# intended target config.
mr.push_to_hub(DATASET_REPO, config_name="moment_retrieval")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwnt4/a-temporal-upgrade/commit/3b24df1d52d409244af14bc6ec500a4dbc63c708', commit_message='Upload dataset', commit_description='', oid='3b24df1d52d409244af14bc6ec500a4dbc63c708', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwnt4/a-temporal-upgrade', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwnt4/a-temporal-upgrade'), pr_revision=None, pr_num=None)