# Annotations generation
This notebook uses the Gemma LLM to generate new annotations over previously unused narrations.  
In particular, for each group of 3 consecutive narrations sampled, two annotations are generated.

## Setup and load data

In [1]:
# list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/gemma/pytorch/2b-it/2/config.json
/kaggle/input/gemma/pytorch/2b-it/2/gemma-2b-it.ckpt
/kaggle/input/gemma/pytorch/2b-it/2/tokenizer.model
/kaggle/input/annotations-ego4d/annotations/ego4d.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/av_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/nlq_val.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_hands_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/vq_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/nlq_train.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/av_train.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_sta_train.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/manifest.ver
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_oscc-pnr_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_hands_t

In [2]:
# setup the environment and install necessary packages
!pip install -q -U immutabledict sentencepiece 
!git clone https://github.com/google/gemma_pytorch.git
!mkdir /kaggle/working/gemma/
!mv /kaggle/working/gemma_pytorch/gemma/* /kaggle/working/gemma/

import sys
import contextlib
import os
import torch

# add the gemma_pytorch directory to the system path
sys.path.append("/kaggle/working/gemma_pytorch/") 

# import necessary modules from Gemma
from gemma.config import GemmaConfig, get_config_for_7b, get_config_for_2b
from gemma.model import GemmaForCausalLM
from gemma.tokenizer import Tokenizer

# load the model

# define constants
VARIANT = "2b-it" 
MACHINE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Machine type: {MACHINE_TYPE}")
weights_dir = '/kaggle/input/gemma/pytorch/2b-it/2/' 
# kagglehub.download('gemma/pytorch/1.1-2b-it/1/')

# verify the weights directory and its contents
if os.path.exists(weights_dir):
    print("Weights directory exists. Listing contents:")
    print(os.listdir(weights_dir))
else:
    print("Weights directory does not exist.")

# context manager to set the default tensor type
@contextlib.contextmanager
def _set_default_tensor_type(dtype: torch.dtype):
  """Sets the default torch dtype to the given dtype."""
  torch.set_default_dtype(dtype)
  yield
  torch.set_default_dtype(torch.float)

# load the model configuration
model_config = get_config_for_2b() if "2b" in VARIANT else get_config_for_7b()
model_config.tokenizer = os.path.join(weights_dir, "tokenizer.model")

# set the device
device = torch.device(MACHINE_TYPE)

# load the model weights and set the model to evaluation mode
with _set_default_tensor_type(model_config.get_dtype()):
  model = GemmaForCausalLM(model_config)
  ckpt_path = os.path.join(weights_dir, f'gemma-{VARIANT}.ckpt')
  model.load_weights(ckpt_path)
  model = model.to(device).eval()

Cloning into 'gemma_pytorch'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 177 (delta 64), reused 68 (delta 46), pack-reused 68[K
Receiving objects: 100% (177/177), 2.16 MiB | 2.39 MiB/s, done.
Resolving deltas: 100% (91/91), done.
Machine type: cuda
Weights directory exists. Listing contents:
['config.json', 'gemma-2b-it.ckpt', 'tokenizer.model']


  return self.fget.__get__(instance, owner)()


In [3]:
# check input list
input_directory = '/kaggle/input/'
for root, dirs, files in os.walk(input_directory):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/gemma/pytorch/2b-it/2/config.json
/kaggle/input/gemma/pytorch/2b-it/2/gemma-2b-it.ckpt
/kaggle/input/gemma/pytorch/2b-it/2/tokenizer.model
/kaggle/input/annotations-ego4d/annotations/ego4d.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/av_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/nlq_val.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_hands_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/vq_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/nlq_train.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/av_train.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_sta_train.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/manifest.ver
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_oscc-pnr_test_unannotated.json
/kaggle/input/annotations-ego4d/annotations/v1/annotations/fho_hands_t

In [None]:
# useful imports

import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
from tqdm import tqdm

In [None]:
# get video uids of the val file

nlq_val_path = '/kaggle/input/annotations-ego4d/annotations/v1/annotations/nlq_val.json'
with open(nlq_val_path, 'r') as f:
    data = json.load(f)

videos_uids_val = []
for video in data['videos']:    
    video_uid = video.get('video_uid')
    videos_uids_val.append(video_uid)

In [5]:
# get video uids of the test file

nlq_test_path = '/kaggle/input/annotations-ego4d/annotations/v1/annotations/nlq_test_unannotated.json'
with open(nlq_test_path, 'r') as f:
    data = json.load(f)

videos_uids_test = []
for video in data['videos']:    
    video_uid = video.get('video_uid')
    videos_uids_test.append(video_uid)

In [6]:
# get video uids present in version 1 but not 2 (egovlp pretrained features only for videos of second version)
not_present_videos_path = '/kaggle/input/not-present-videos/ego4d_uids_video_difference_v1_v2.txt'
with open(not_present_videos_path, 'r') as file:    
    lines = file.readlines()

not_present_videos = [line.strip() for line in lines]

print(not_present_videos)

['4a08e95b-cb52-4621-b679-87f617893e19', 'cbacc898-22f4-4a83-81ae-8d421791feb2', '20164675-63be-45a8-93d1-6ab5cad76012', '0b3fb06d-aed7-44ce-ba44-c079aea84da2', '864a2391-63d5-4f64-9ba2-cf1367c178c2', '24f035b3-fa40-4846-9546-fb61271e5caa', '2c3173b5-08d1-4051-a700-c677574df125', '29bc686e-8f5c-49de-a23e-c0b802e47d4d', 'fb6ff052-5ea7-4647-a0a3-ebc74133c86b', 'e7ef4970-4d3e-4fd3-9a90-bb81a0873c86', '34b0812d-21a9-48ad-a2f3-435455f169f0', '6bbea7d8-d5fb-4684-9a53-278fbffad5a1', 'a66da989-08eb-4f6a-98ea-d6402b81ad89', 'ff3a727c-2bcd-4a33-95eb-5d18d385ea70', '136f0c55-b87c-45cc-bcaa-ff2c39bae889', '703d550a-0a84-4bcf-9b45-e25c864ade70', 'e2454ff3-44f1-45d0-9e77-011657400b01', '7a39a702-dcd1-46c4-82aa-00c97bda423e', '54c5d370-6738-48b2-8eab-39d75138e118', '7f267263-b89f-43a7-b591-faa33ef5affe', '32a80da2-6b71-472a-92f3-9e2e5596146c', 'ea27cc27-037f-4c63-b418-faea630faf8e', '865733f5-97b6-4380-a418-1fd6510e0f5e', '6d2c4747-1232-48d3-8922-afcafc112bce', '0a74808e-4f55-4bd4-abbd-78f3435ea5bc',

In [7]:
# create a dataframe of narrations and their info
narrations_path = '/kaggle/input/annotations-ego4d/annotations/v1/annotations/narration.json'
with open(narrations_path, 'r') as f:
    data = json.load(f)

# list of narrations with their info
records = []
for video_uid, content in tqdm(data.items()):
        #Filter out videos which are "not complete", videos belonging to nlq_val.json, nlq_test.json and videos for which we don't have visual features
        if (content.get('status') != 'complete') or (video_uid in videos_uids_val) or (video_uid in videos_uids_test) or (video_uid in not_present_videos):
            continue
        
        # retrieve narrations of the first annotator
        narration_pass_1 = content.get('narration_pass_1', {})
        narrations = narration_pass_1.get('narrations', [])
        
        for narration in narrations:
            # get the text of the narration
            narration_text = narration.get('narration_text')
            # remove the #C or similar patterns at the beginning of each narration
            if narration_text.startswith("#"):
                narration_text = narration_text[2:]
            
            # create a record with all the narration info
            record = {
                'video_uid': video_uid,
                'annotation_uid': narration.get('annotation_uid'),
                'narration_text': narration_text,
                'timestamp_sec': narration.get('timestamp_sec'),
                'timestamp_frame': narration.get('timestamp_frame'),
            }
            records.append(record)

df = pd.DataFrame(records)

100%|██████████| 9645/9645 [00:04<00:00, 2372.20it/s]


In [8]:
pd.set_option('display.max_columns', None)
df

Unnamed: 0,video_uid,annotation_uid,narration_text,timestamp_sec,timestamp_frame
0,77cc4654-4eec-44c6-af05-dbdf71f9a401,920182f7-5385-488b-99f9-caf8f0d9fe6b,C interacts with a woman X,0.00000,0
1,77cc4654-4eec-44c6-af05-dbdf71f9a401,920182f7-5385-488b-99f9-caf8f0d9fe6b,C walks into the kitchen,4.53806,136
2,77cc4654-4eec-44c6-af05-dbdf71f9a401,920182f7-5385-488b-99f9-caf8f0d9fe6b,C opens a shelf,12.92098,388
3,77cc4654-4eec-44c6-af05-dbdf71f9a401,920182f7-5385-488b-99f9-caf8f0d9fe6b,C brings out a basket from the shelf,15.10264,453
4,77cc4654-4eec-44c6-af05-dbdf71f9a401,920182f7-5385-488b-99f9-caf8f0d9fe6b,C puts back the basket into the shelf,17.15749,515
...,...,...,...,...,...
2329802,ad4f61f0-4c0f-4ce1-bdaa-e57b79250527,9cdb0f18-81e5-4288-bf2e-3b44e5977107,C arranges a duvet on the bed,8.31072,249
2329803,ad4f61f0-4c0f-4ce1-bdaa-e57b79250527,9cdb0f18-81e5-4288-bf2e-3b44e5977107,C picks a blanket from the bed,39.01209,1170
2329804,ad4f61f0-4c0f-4ce1-bdaa-e57b79250527,9cdb0f18-81e5-4288-bf2e-3b44e5977107,C arranges the duvet on the bed,35.01310,1050
2329805,ad4f61f0-4c0f-4ce1-bdaa-e57b79250527,9cdb0f18-81e5-4288-bf2e-3b44e5977107,C folds the blanket on the bed,42.24332,1267


In [9]:
# sort narrations inside the same clip (some narrations are not in the correct order in the original file)
df_sorted = df.groupby('annotation_uid', group_keys=False).apply(lambda x: x.sort_values('timestamp_sec'))
df_sorted = df_sorted.reset_index(drop=True)
df_sorted

  df_sorted = df.groupby('annotation_uid', group_keys=False).apply(lambda x: x.sort_values('timestamp_sec'))


Unnamed: 0,video_uid,annotation_uid,narration_text,timestamp_sec,timestamp_frame
0,268b8232-2e94-47c6-928b-99ec175340c5,0003222a-d2cb-4e5d-9846-cd0c5ca9ae9c,C flips the page,2715.831482,81474
1,268b8232-2e94-47c6-928b-99ec175340c5,0003222a-d2cb-4e5d-9846-cd0c5ca9ae9c,C wipes the book with the cloth,2717.087122,81512
2,268b8232-2e94-47c6-928b-99ec175340c5,0003222a-d2cb-4e5d-9846-cd0c5ca9ae9c,C puts the book in the shelf,2725.421392,81762
3,268b8232-2e94-47c6-928b-99ec175340c5,0003222a-d2cb-4e5d-9846-cd0c5ca9ae9c,C picks the book,2729.706112,81891
4,268b8232-2e94-47c6-928b-99ec175340c5,0003222a-d2cb-4e5d-9846-cd0c5ca9ae9c,C flips the page,2732.238932,81967
...,...,...,...,...,...
2329802,a06ac945-e65f-4259-9563-00bbcf62436f,ffff7cd6-7d12-42d4-ae92-f1bfc2fd6e88,The man Q walks around the dining table.,533.928115,16017
2329803,a06ac945-e65f-4259-9563-00bbcf62436f,ffff7cd6-7d12-42d4-ae92-f1bfc2fd6e88,The man Q fetches water with a water bottle f...,541.369925,16240
2329804,a06ac945-e65f-4259-9563-00bbcf62436f,ffff7cd6-7d12-42d4-ae92-f1bfc2fd6e88,C opens a refrigerator with his right hand.,542.337345,16269
2329805,a06ac945-e65f-4259-9563-00bbcf62436f,ffff7cd6-7d12-42d4-ae92-f1bfc2fd6e88,C places the plate into the middle section of...,545.366795,16360


In [10]:
# retrieve information about clips, useful to compute starting and ending time for the queries ground thruth based on the clip reference system

# load ego4d.json
ego4d_path = '/kaggle/input/annotations-ego4d/annotations/ego4d.json'
with open(ego4d_path, 'r') as f:
    ego4d_data = json.load(f)

ego4d_data.keys()
# create a DataFrame for clips
clip_records = []
for clip in tqdm(ego4d_data['clips']):    
    clip_record = {
        'clip_uid': clip['clip_uid'],
        'video_uid': clip['video_uid'],
        'video_start_sec': clip['video_start_sec'],
        'video_end_sec': clip['video_end_sec']
    }
    clip_records.append(clip_record)
clip_df = pd.DataFrame(clip_records)

100%|██████████| 12283/12283 [00:00<00:00, 766760.47it/s]


In [11]:
clip_df

Unnamed: 0,clip_uid,video_uid,video_start_sec,video_end_sec
0,000eba33-8d14-446a-b016-19bd50e9a3b9,ab2bf67b-efc0-4448-8c91-a4cecb29691f,0.021029,480.021029
1,0014331f-18b1-4200-b4cd-bf55a08aa4fe,a67789f8-3788-4a8d-aba8-9b2c2945d457,1868.087695,2168.087695
2,00182baf-e3fe-4bee-9416-825555bc4506,786fdc37-a1a2-4576-83f7-0f8e5da4579a,0.021029,480.021029
3,001fe47b-c00a-4fc7-9f94-40ede6b009f5,b99ceea4-7fa3-407c-9b9f-1347645d23f2,719.954362,1199.954362
4,0021cb1c-2009-4469-b4c4-76829b9c1cda,0836e1a4-11e6-4b31-bd39-f8e083fdadb3,1079.987695,1379.987695
...,...,...,...,...
12278,ffddab3a-dbec-49b0-b114-5a1e3d805efa,8b1d4210-a174-4617-8311-df4b32785d28,612.987695,912.987695
12279,ffe18e54-5a16-4067-9376-e3d4acb6f8dc,8f0f8606-b696-4bb2-909d-703e4e076d80,270.000000,570.000000
12280,ffe2261f-b973-4fbd-8824-06f8334afdc5,f7b7c31e-cfdb-4029-b850-9f7ba98e42c2,180.021029,660.021029
12281,ffe5e9b1-bd91-4744-aa9e-c2f560412ca9,edf2c742-f89c-402b-9228-11fda87201f1,0.000000,480.000000


## Group n consecutive narrations

In [12]:
# check if all the narrations in each sample belong to the same clip (annotation_uid and clip_uid don't match exactly)

def check_narrations_within_clip(sample_narrations, clip_df):
    
    if sample_narrations is None:
        return False, None, None, None
    
    video_uid = sample_narrations['video_uid'].iloc[0]
    min_timestamp = sample_narrations['timestamp_sec'].min()
    max_timestamp = sample_narrations['timestamp_sec'].max()
    
    # filter clips by video_uid and timestamp range
    valid_clips = clip_df[(clip_df['video_uid'] == video_uid) &
                          (clip_df['video_start_sec'] <= min_timestamp) &
                          (clip_df['video_end_sec'] >= max_timestamp)]
    
    # if there's exactly one matching clip, all narrations belong to the same clip
    if len(valid_clips) == 1:
        clip = valid_clips.iloc[0]
        return True, clip['clip_uid'], clip['video_start_sec'], clip['video_end_sec']
    return False, None, None, None

In [13]:
# sample

import random
n = 4 # we will have n-1 narrations in each sample e.g n=4 -> 3 narrations for each sample, 
# we take n initially because we use the timestamp of the last+1-th narration as final timestamp for our group sample of narrations

# good samples
sampling_good_records = []

def sample_n_consecutive_records(group, n):
    N = len(group)
    if N >= n:
        i = random.randint(0, N-n)
        while i <= N - n: # manage the end of the clip
            sample = group.iloc[i:i + n]
            # if all the first n-1 narrations extracted are good, then return the sample. 
            if not sample.iloc[:-1]['narration_text'].str.strip().str.contains(('nsure')).any():
                return sample.copy()
            # else, we begin our sampling starting from the narration following the last "#unsure" one.
            unsure_index = sample[sample['narration_text'].str.strip().str.contains(('nsure'))].index[-1] #-1 last one
            i = unsure_index + 1 
    return None #if no good sample is found, we don't return anything. 

# group by 'annotation_uid' and do the sampling
grouped = df_sorted.groupby('annotation_uid')

for annotation_uid, group in tqdm(grouped):
    
    sample = sample_n_consecutive_records(group, n)
    is_valid, clip_uid, video_start_sec, video_end_sec = check_narrations_within_clip(sample, clip_df)
    if is_valid:
        sample['clip_uid'] = clip_uid
        sample['video_start_sec'] = video_start_sec
        sample['video_end_sec'] = video_end_sec
        
        sampling_good_records.append(sample)
        
samplings_df = pd.concat(sampling_good_records).reset_index(drop=True)

samplings_df

100%|██████████| 44258/44258 [02:34<00:00, 285.89it/s]


Unnamed: 0,video_uid,annotation_uid,narration_text,timestamp_sec,timestamp_frame,clip_uid,video_start_sec,video_end_sec
0,74427b5d-2780-4680-957a-62f532766672,000d16f1-dd58-456c-b6c6-e5e6d91fbb57,C picks a turner spoon from the pan on the fl...,861.178859,25835,f7a11713-a2cf-4f07-a6ee-f94a55f2cc43,694.021029,1173.987695
1,74427b5d-2780-4680-957a-62f532766672,000d16f1-dd58-456c-b6c6-e5e6d91fbb57,C hits the turner spoon on the pan cover with...,863.080549,25892,f7a11713-a2cf-4f07-a6ee-f94a55f2cc43,694.021029,1173.987695
2,74427b5d-2780-4680-957a-62f532766672,000d16f1-dd58-456c-b6c6-e5e6d91fbb57,C lifts the pan cover from the pan on the coo...,865.457859,25963,f7a11713-a2cf-4f07-a6ee-f94a55f2cc43,694.021029,1173.987695
3,74427b5d-2780-4680-957a-62f532766672,000d16f1-dd58-456c-b6c6-e5e6d91fbb57,C places the pan cover on the pan on the floor.,867.361219,26020,f7a11713-a2cf-4f07-a6ee-f94a55f2cc43,694.021029,1173.987695
4,cb6dbc68-561e-4f73-ad98-ba69d4fcf701,0012f8ba-2427-4ef8-9822-d3311a4b5921,C talks to man X,538.899758,16166,02de9500-d574-446e-95fc-3e82c367385a,299.998698,599.965365
...,...,...,...,...,...,...,...,...
18551,dc0ef581-bb29-464e-8ada-0953230d8f7d,ffc47179-dcff-441c-a032-ff1edf3ecc34,C walks forward a bit.,586.102258,17582,e355d07d-844b-44ff-bfab-c1fb98d48b6c,539.987695,810.987695
18552,f262e5b6-6bf0-42b3-b8a6-2ea1b61c75af,ffd7735d-866e-4298-a1fb-111877c9cdfa,C stares in boutique,2.944149,88,168b41cd-cf36-4b28-b704-bbd500663c9f,0.021029,480.021029
18553,f262e5b6-6bf0-42b3-b8a6-2ea1b61c75af,ffd7735d-866e-4298-a1fb-111877c9cdfa,C fixes cable on the socket,9.660709,289,168b41cd-cf36-4b28-b704-bbd500663c9f,0.021029,480.021029
18554,f262e5b6-6bf0-42b3-b8a6-2ea1b61c75af,ffd7735d-866e-4298-a1fb-111877c9cdfa,C operates mobile phone in the room,23.101209,692,168b41cd-cf36-4b28-b704-bbd500663c9f,0.021029,480.021029


In [14]:
sample_dicts = [] # list of dictionaries, one per sample

for i in tqdm(range(0, len(samplings_df) - n + 1, n)):
      
    # we can just iterate through them and build the dictionaries because the samples are already sorted
    
    # retrieve useful fields
    text = samplings_df.loc[i:i+n-2, 'narration_text'].tolist()            
    start = samplings_df.loc[i, 'timestamp_sec']
    end = samplings_df.loc[i+n-1, 'timestamp_sec']
    clip_uid = samplings_df.loc[i, 'clip_uid']
    video_uid = samplings_df.loc[i, 'video_uid']
    video_start_sec = samplings_df.loc[i, 'video_start_sec'] 
    video_end_sec = samplings_df.loc[i, 'video_end_sec'] 
    
    sample_dict = {
        "text": text,
        "start": start,
        "end": end,
        "clip_uid": clip_uid,
        "video_uid": video_uid,
        "video_start_sec": video_start_sec, # start of the clip inside the video
        "video_end_sec": video_end_sec # end of the clip inside the video
    }
   
    sample_dicts.append(sample_dict)

100%|██████████| 4639/4639 [00:00<00:00, 5824.64it/s]


In [15]:
p=10
# visualize first p elements of the list
print(sample_dicts[:p])

[{'text': [' C picks a turner spoon from the pan on the floor with her right hand.', ' C hits the turner spoon on the pan cover with her right hand.', ' C lifts the pan cover from the pan on the cooker with the turner spoon.'], 'start': 861.1788586, 'end': 867.3612186, 'clip_uid': 'f7a11713-a2cf-4f07-a6ee-f94a55f2cc43', 'video_uid': '74427b5d-2780-4680-957a-62f532766672', 'video_start_sec': 694.0210286, 'video_end_sec': 1173.9876952666666}, {'text': [' C talks to man X', ' Man X  taps the black piece', ' Man X points the black piece'], 'start': 538.899757950745, 'end': 542.685037950745, 'clip_uid': '02de9500-d574-446e-95fc-3e82c367385a', 'video_uid': 'cb6dbc68-561e-4f73-ad98-ba69d4fcf701', 'video_start_sec': 299.99869786666665, 'video_end_sec': 599.9653645333333}, {'text': [' C walks on the field ', ' C moves the wood ', ' C holds the woods '], 'start': 581.32364, 'end': 590.58847, 'clip_uid': '740202e6-5d33-45d3-a088-cbd310a14187', 'video_uid': '0b245a61-32d6-4b14-897c-724adad5b231', 

In [16]:
from tqdm import tqdm
# narrations: adapt format based on what we need
n = 50
TAKE_ALL = True # ignore the limit to n
list_formatted_narrations = [] # from a list of narrations to a string of concatenated narrations
PRINT_NARRATIONS = not TAKE_ALL
for i in tqdm(range(len(sample_dicts))):

    # join the list elements with '. ' as the separator
    formatted_narrations = ". ".join(sample_dicts[i]["text"])
    if (i<n or TAKE_ALL):        
        if PRINT_NARRATIONS:
            print(formatted_narrations)
            print()
            
        list_formatted_narrations.append(formatted_narrations)    
        
print(f"Number of groups of narrations: {len(list_formatted_narrations)}")

100%|██████████| 4639/4639 [00:00<00:00, 509342.07it/s]

Number of groups of narrations: 4639





## Generate new annotations

In [17]:
import re
from tqdm import tqdm

# LLM model

list_queries_plus_info = []

PRINT_RESULTS = False

BREAK_BEFORE_END = False
LIMIT_TO_N = 5 # for testing

NUM_QUESTIONS = 2 # number of queries requested

MODEL_TEMPLATE = "<start_of_turn>model\n"

# prompt: start user turn + user prompt + end user turn + start model turn
USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn>\n"+MODEL_TEMPLATE
print(USER_CHAT_TEMPLATE)

# "bad prompt": very detailed, poor quality queries observed with a manual inspection

# prompt= """You are trying to help humans to augment their memory. 
#                     To achieve this goal some annotators have watched several videos about people performing different actions in various scenarios, 
#                     writing down, step by step, short narrations describing in a few words what is happening in the video and the time at which these actions take place. 
#                     You are requested to generate 2 simple queries that a person could answer just by looking at the video segments corresponding to these narrations:"""
#                     + formatted_narrations[i] +
#                     """
#                     To do so follow these rules:
#                     - do not include the answer in the question
#                     - avoid introductions, report just the text of the queries one per line

# better prompt, but still too detailed: 
# prompt = "Generate two short simple queries that a person could answer looking at the video corresponding to these narrations: "+ list_formatted_narrations[i]+ "\nIn your answer report just the queries one per line"


tot = LIMIT_TO_N if BREAK_BEFORE_END else len(list_formatted_narrations)

# if LIMIT_TO_N is greater than the length of the list

tot = min(tot, len(list_formatted_narrations))

# iterate through the formatted grouped narrations and generate queries

for i in tqdm(range(tot)):
    if i>LIMIT_TO_N and BREAK_BEFORE_END:
        break
    
    # build the prompt with the specific formatted narration
    # best prompt, concise and simple
    prompt = (
        USER_CHAT_TEMPLATE.format(
            prompt = "Generate two questions mixing the information of these sentences: \""+ list_formatted_narrations[i]+ "\"\nIn your answer write only two lines with a question each"
        )
    )

    output = model.generate(
        USER_CHAT_TEMPLATE.format(prompt=prompt),
        device=device,
        output_len=40, # empirically we obtained good results setting length at 20*number_of_queries
    )
    if PRINT_RESULTS:
        print(f"***\nNarration: {list_formatted_narrations[i]}\n***")
        print("model:",output)
    
    # adjust the output of the LLM
    
    # split the output into individual queries
    output_lines = output.strip().split("\n")
    
    # filter queries based on ending with a question mark
    def extract_queries(input_list):
        # filter the input list to include only lines ending with a question mark
        queries = [item for item in input_list if item.strip().endswith('?')]

        # remove any leading numbering and formatting
        queries = [re.sub(r'(^\d+\.\s*|-+\s*|>+\s*)|(^\w+ \d+\:)', '', query) for query in queries]

        return queries

    # extract the queries
    queries = extract_queries(output_lines)
    
    if PRINT_RESULTS:
        # print the result
        print("extracted:")
        print(queries)
    
    if queries: # if list not empty
        # retrieve information from the sample dictionary
        start = sample_dicts[i]["start"]
        end = sample_dicts[i]["end"]
        clip_uid = sample_dicts[i]["clip_uid"]
        video_uid = sample_dicts[i]["video_uid"]
        video_start_sec = sample_dicts[i]["video_start_sec"]
        video_end_sec = sample_dicts[i]["video_end_sec"]

        # build the query entry with necessary information
        query_entry = {
            'queries': queries,
            'start': start,
            'end': end,
            'clip_uid': clip_uid,
            'video_uid': video_uid,
            'video_start_sec':video_start_sec,
            'video_end_sec':video_end_sec
        }
        # append the query entry to the list
        list_queries_plus_info.append(query_entry)
        
# flatten the list of queries with their metadata
flat_queries = []
for entry in list_queries_plus_info:
    for query in entry['queries']:
        flat_queries.append({
            'query': query,
            'metadata': entry
        })

# number of queries to select
NUM_QUERIES_TO_SELECT = 8500

# randomly select 8500 queries from the list
if len(flat_queries) >= NUM_QUERIES_TO_SELECT:
    selected_flat_queries = random.sample(flat_queries, NUM_QUERIES_TO_SELECT)
else:
    raise ValueError("The total number of queries is less than 8500")

# create a new structure to hold the selected queries grouped by their original metadata
selected_queries_plus_info = []

metadata_dict = {}

for item in selected_flat_queries:
    query = item['query']
    metadata = item['metadata']
    metadata_key = (
        metadata['start'], metadata['end'], metadata['clip_uid'], metadata['video_uid'],
        metadata['video_start_sec'], metadata['video_end_sec']
    )
    if metadata_key not in metadata_dict:
        metadata_dict[metadata_key] = {
            'queries': [],
            'start': metadata['start'],
            'end': metadata['end'],
            'clip_uid': metadata['clip_uid'],
            'video_uid': metadata['video_uid'],
            'video_start_sec': metadata['video_start_sec'],
            'video_end_sec': metadata['video_end_sec']
        }
    metadata_dict[metadata_key]['queries'].append(query)

selected_queries_plus_info = list(metadata_dict.values())

<start_of_turn>user
{prompt}<end_of_turn>
<start_of_turn>model



100%|██████████| 4639/4639 [1:25:45<00:00,  1.11s/it]


In [18]:
# visualize some narrations - queries examples
for i in range(10):
    print("Narration: "+list_formatted_narrations[i])
    print("Queries: ")
    print(list_queries_plus_info[i]["queries"])  

Narration:  C picks a turner spoon from the pan on the floor with her right hand..  C hits the turner spoon on the pan cover with her right hand..  C lifts the pan cover from the pan on the cooker with the turner spoon.
Queries: 
['What did C pick up with her right hand from the pan on the floor?', 'How did C hit the turner spoon on the pan cover?']
Narration:  C talks to man X.  Man X  taps the black piece.  Man X points the black piece
Queries: 
['What did C talk to man X about?', 'What did man X tap on the black piece?']
Narration:  C walks on the field .  C moves the wood .  C holds the woods 
Queries: 
[' Where does C walk?', ' What does C do with the wood?']
Narration:  person X holds a cup .  person X lifts up a cup from the refrigerator.  person X holds a coffee mate 
Queries: 
['What is person X holding?', 'In what way is person X raising the cup?']
Narration:  C passes the wooden spatula from his right hand to his left hand..  C scrapes some scrambled eggs from the wooden spa

## Generate the output JSON

In [19]:
# build the automatically-generated-queries annotation file for pretraining
import uuid

output_data = {        
    "version":"2",
    "date":"120624",
    "description":"gemma NLQ Annotations from narrations (train) 3->2",
    "manifest":"", 
    "videos":[]
}
    
for item in list_queries_plus_info: 
    
    # check if the video entry is already present: if not it creates the whole structure and adds the video to the list
    
    video_entry = next((video for video in output_data["videos"] if video["video_uid"] == item["video_uid"]), None)
    
    if not video_entry:
        video_entry = {
            "video_uid": item["video_uid"],
            "clips": [],
            "split": "train" # used for training
        }
        output_data["videos"].append(video_entry)
        
    # same of video for clip level
    clip_entry = next((clip for clip in video_entry["clips"] if clip["clip_uid"] == item["clip_uid"]), None)
    
    
    if not clip_entry:

        clip_entry = {
            "clip_uid": item["clip_uid"],
            "video_start_sec": item["video_start_sec"],
            "video_end_sec": item["video_end_sec"],
            "video_start_frame": None,  
            "video_end_frame": None,     
            "clip_start_sec": None,
            "clip_end_sec": None,
            "clip_start_frame": None,
            "clip_end_frame": None,
            "source_clip_uid": "",  
            "annotations": []
        }
        video_entry["clips"].append(clip_entry)
    
    # annotation level
    
    annotation_entry = {
        "language_queries": [
            {
                "clip_start_sec": float(item['start'] - item["video_start_sec"]), #clip_start_sec,
                "clip_end_sec": float(item['end'] - item["video_end_sec"]), #clip_end_sec
                "video_start_sec": item['start'], # not used 
                "video_end_sec": item['end'], # not used
                "video_start_frame": int(item['start'] * 30), # All the videos have 30 fps, not used
                "video_end_frame": int(item['end'] * 30), # All the videos have 30 fps, not used
                "template": "",
                "query": query,
                "slot_x": "",
                "verb_x": "",
                "slot_y": "",
                "verb_y": "",
                "raw_tags": [
                    "",
                    query,
                    "",
                    "",
                    "",
                    ""
                ]
            } for query in item['queries']
        ],
        "annotation_uid": str(uuid.uuid4()),  # Generate a unique ID for the annotation
    }
    
    # if annotations list of the clip is empty
    if not clip_entry["annotations"]:
        clip_entry["annotations"].append(annotation_entry)
    else:
        # extend the list with new annotations, [0]-> first (and only) annotator
        clip_entry["annotations"][0]["language_queries"].extend(annotation_entry["language_queries"])

# print(json.dumps(output_data, indent=4))

In [20]:
# define the output file path
output_file_path = "/kaggle/working/nlq_train_gen.json"

# write the data to the JSON file
with open(output_file_path, "w") as json_file:
    json.dump(output_data, json_file, indent=4)

# print the output file path to confirm the file has been written
print(f"Output written to {output_file_path}")

Output written to /kaggle/working/nlq_train_gen.json
