<a href="https://colab.research.google.com/github/johan-lindell/VSL-egocentric/blob/main/notebooks/extension.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Queries in Egocentric Videos Project Extension 2

## General Setup

Mount drive

In [None]:
from google.colab import drive, userdata
import numpy as np
import pandas as pd
import json
import os

In [None]:
# Attach Google Drive so the project folder under MyDrive becomes reachable
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Set relevant directories

In [None]:
# Root folder on Drive holding the inputs and outputs of this extension
EXTENSION_DIR = '/content/drive/MyDrive/vsl-egocentric/extension'
# Output directory handed to the Ego4D downloader
VIDEO_OUT = f'{EXTENSION_DIR}/uncut_videos'
# Folder the full-length clips are read from when cutting segments
UNCUT_VIDEO_DIR = f'{VIDEO_OUT}/v1/clips'
# Destination folder for the cut query segments
CUT_VIDEO_OUT = f'{EXTENSION_DIR}/cut_videos'

## Find 50 correct queries

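Find 50 correct NLQ queries by comparing our model's predictions with the validation data. A prediction counts as correct when both its start and end times lie within a manually chosen tolerance (in seconds) of the ground-truth times.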

In [None]:
# Read the ground-truth annotations and the model predictions from Drive
with open(EXTENSION_DIR + '/val.json') as val_file:
    val_data = json.load(val_file)

with open(EXTENSION_DIR + '/preds.json') as preds_file:
    pred_data = json.load(preds_file)

# Index the ground truth: annotation_uid -> exact times and query sentence.
# NOTE(review): entries sharing an annotation_uid overwrite each other, so only
# the last one is kept — confirm the uids are unique per query.
val_dict = {}
for clip_data in val_data.values():
    uids = clip_data["annotation_uids"]
    for position in range(len(uids)):
        val_dict[uids[position]] = dict(
            exact_times=clip_data["exact_times"][position],
            sentence=clip_data["sentences"][position],
        )

# Maximum allowed deviation (seconds) for both the start and the end time
tolerance = 1.704

# Matches found so far, plus the (annotation_uid, clip_uid) pairs already recorded
correct_queries = []
unique_entries = set()

for result in pred_data["results"]:
    annotation_uid = result["annotation_uid"]
    clip_uid = result["clip_uid"]
    predicted_times = result["predicted_times"]

    ground_truth = val_dict.get(annotation_uid)
    if ground_truth is not None:
        for pred_start, pred_end in predicted_times:
            exact_start, exact_end = ground_truth["exact_times"]
            within_tolerance = (
                abs(pred_start - exact_start) <= tolerance
                and abs(pred_end - exact_end) <= tolerance
            )
            if not within_tolerance:
                continue

            # Only the first matching prediction of a result is considered
            entry = (annotation_uid, clip_uid)
            if entry not in unique_entries:
                unique_entries.add(entry)
                correct_queries.append({
                    # Running position of the query among the recorded matches
                    "idx": len(correct_queries),
                    "annotation_uid": annotation_uid,
                    "clip_uid": clip_uid,
                    "sentence": ground_truth["sentence"],
                    "predicted_times": (pred_start, pred_end),
                    "exact_times": (exact_start, exact_end)
                })
            break

    # Stop scanning as soon as 50 matches have been gathered
    if len(correct_queries) >= 50:
        break

# Keep at most the first 50 correct queries
correct_queries = correct_queries[:50]

# Tabulate the matches for inspection and export
df = pd.DataFrame(correct_queries)
print(f'{df.shape[0]} correct queries found with a tolerance of {tolerance} seconds.')
df.head()

50 correct queries found with a tolerance of 1.704 seconds.


Unnamed: 0,idx,annotation_uid,clip_uid,sentence,predicted_times,exact_times
0,0,847f64a8-5335-4f1b-8248-73727dfe52ce,00d9a297-d967-4d28-8e5a-6b891814ec65,where did i put the knife?,"(146.25, 150.0)","(147.95371, 148.928)"
1,1,f9cd0c31-4e28-411e-b498-19db3e544030,9a13aee2-0dca-49f8-968f-8f53c5a62963,what vegetable did i cut?,"(75.0, 93.75)","(74.42368, 92.91834)"
2,2,21563a23-ca10-4165-8b6a-74c72d722f0a,2c1724ce-f438-4d63-a699-8a7f65e3cbd9,where is phone?,"(0.0, 3.75)","(0.283, 3.5)"
3,3,e3015a5a-3e3e-47f5-a6b9-b77d3648621e,679cfee6-7da1-4701-b75a-9e34abb9400a,where was can drink before i drank it?,"(15.0, 18.75)","(15.0, 18.086)"
4,4,763cc50c-edf6-4b99-98e2-5030a557784c,1138ced6-d580-4013-96bb-1e5c3fea62d7,how many cans were in the fridge?,"(315.0, 326.25)","(315.706, 325.888)"


In [None]:
# Persist the matched queries; the tolerance is embedded in the file name
correct_queries_csv = f'{EXTENSION_DIR}/correct_nlq_queries_tol{tolerance}s.csv'
df.to_csv(correct_queries_csv, index=False)

## Download videos


In [None]:
# Copy the AWS credentials stored as Colab secrets into the process environment
for env_name, secret_name in (
    ('AWS_ACCESS_KEY_ID', 'aws_access_key'),
    ('AWS_SECRET_ACCESS_KEY', 'aws_secret_key'),
):
    os.environ[env_name] = userdata.get(secret_name)

In [None]:
%%bash
# Install the AWS CLI and configure it with the credentials from the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables.
# (The %%bash cell magic must be the first line of the cell, so comments go below it.)

# Set up the AWS CLI
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip -o awscliv2.zip >/dev/null
sudo ./aws/install >/dev/null 2>&1
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" && aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY"
rm "awscliv2.zip"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 57.8M  100 57.8M    0     0  78.9M      0 --:--:-- --:--:-- --:--:-- 79.0M


In [None]:
!pip install ego4d



In [None]:
# Extract unique clip_uids
unique_clip_uids = df["clip_uid"].unique().tolist()

# Run the commands to download the videos, could not get list input to work so looping with single uid
for uid in unique_clip_uids:
    command = f"ego4d --version v1 --output_directory={VIDEO_OUT} --datasets clips --video_uids={uid} --yes"
    !{command}

Datasets to download: {'clips'}
Download Path: /content/drive/MyDrive/vsl-egocentric/extension/uncut_videos/v1
Ego4D Metadata: /content/drive/MyDrive/vsl-egocentric/extension/uncut_videos/ego4d.json
Checking requested datasets and versions...
Created download directory for version 'v1' of dataset: 'clips' at: /content/drive/MyDrive/vsl-egocentric/extension/uncut_videos/v1/clips
Only downloading a subset of the video files because the 'video_uids' flag has been set on the command line or in the config file. A total of 1 video files will be downloaded.

Retrieving object metadata from S3...
100% 1/1 [00:00<00:00, 878.57object/s]
Checking if latest file versions are already downloaded...
100% 1/1 [00:01<00:00,  1.03s/file]
No existing videos to filter.
Downloading 1 files..
 88% 65.8M/74.5M [00:02<00:00, 76.2MiB/s]Checking file integrity...
100% 74.5M/74.5M [00:03<00:00, 25.4MiB/s]
Datasets to download: {'clips'}
Download Path: /content/drive/MyDrive/vsl-egocentric/extension/uncut_videos/

### Cut videos using ffmpeg

Videos are cut and then stored in the specified folder **CUT_VIDEO_OUT**

In [None]:
# Define the function that cuts one segment out of a video with ffmpeg
def run_ffmpeg(input_path, output_path, start_time, duration):
    """Cut ``duration`` seconds starting at ``start_time`` out of ``input_path``.

    The segment is re-encoded instead of stream-copied: with ``-c copy`` ffmpeg
    can only cut on keyframes, which for short segments can leave the output
    without a usable video stream.

    Args:
        input_path: path of the source video.
        output_path: path of the segment to write (overwritten if it exists).
        start_time: segment start in seconds.
        duration: segment length in seconds.

    Returns:
        None. A failed ffmpeg run is reported on stdout instead of raising, so
        one bad clip does not abort the surrounding loop.
    """
    import subprocess  # local import so this cell does not depend on the import cell

    # An argument list (no shell) keeps paths with spaces or special characters intact
    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-y",                      # overwrite existing output so re-runs do not stall
        "-ss", str(start_time),    # seek on the input, before decoding
        "-i", str(input_path),
        "-t", str(duration),
        "-c:v", "libx264",         # re-encode video so the cut need not start on a keyframe
        "-c:a", "aac",
        str(output_path),
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        print(f"ffmpeg failed for {input_path} (exit code {completed.returncode}): {completed.stderr.strip()}")


# Make sure the destination folder for the cut segments exists
os.makedirs(CUT_VIDEO_OUT, exist_ok=True)

# Cut the predicted time window of every correct query out of its source clip
for query in correct_queries:
    segment_start, segment_end = query["predicted_times"]
    source_name = f'{query["clip_uid"]}.mp4'
    segment_name = f'{query["idx"]}_{query["clip_uid"]}_{query["annotation_uid"]}.mp4'

    run_ffmpeg(
        os.path.join(UNCUT_VIDEO_DIR, source_name),
        os.path.join(CUT_VIDEO_OUT, segment_name),
        segment_start,
        segment_end - segment_start,
    )


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

## Implement and train model

In [None]:
annotations_path = EXTENSION_DIR + '/manual_annotations.xlsx'
annotations_df = pd.read_excel(annotations_path)

# Give queries without a manual answer a placeholder answer (for testing).
# Only the manual_annotation column is filled, so missing values in any other
# column stay visible instead of being replaced by the placeholder text.
annotations_df = annotations_df.fillna(
    {"manual_annotation": "i don't know the video is unclear"}
)

annotations_df.head()

Unnamed: 0,idx,annotation_uid,clip_uid,sentence,predicted_times,exact_times,manual_annotation
0,0,847f64a8-5335-4f1b-8248-73727dfe52ce,00d9a297-d967-4d28-8e5a-6b891814ec65,where did i put the knife?,"(146.25, 150.0)","(147.95371, 148.928)",i don't know the video is unclear
1,1,f9cd0c31-4e28-411e-b498-19db3e544030,9a13aee2-0dca-49f8-968f-8f53c5a62963,what vegetable did i cut?,"(75.0, 93.75)","(74.42368, 92.91834)",cabbage
2,2,21563a23-ca10-4165-8b6a-74c72d722f0a,2c1724ce-f438-4d63-a699-8a7f65e3cbd9,where is phone?,"(0.0, 3.75)","(0.283, 3.5)",on the shelf
3,3,e3015a5a-3e3e-47f5-a6b9-b77d3648621e,679cfee6-7da1-4701-b75a-9e34abb9400a,where was can drink before i drank it?,"(15.0, 18.75)","(15.0, 18.086)",In my hand
4,4,763cc50c-edf6-4b99-98e2-5030a557784c,1138ced6-d580-4013-96bb-1e5c3fea62d7,how many cans were in the fridge?,"(315.0, 326.25)","(315.706, 325.888)",there are seven cans in the fridge


In [None]:
def prepare_video_qa_data(annotations_df, video_segments_dir):
    """Turn the annotation table into a list of video question-answer records.

    Args:
        annotations_df: table with the columns ``idx``, ``clip_uid``,
            ``annotation_uid``, ``sentence`` and ``manual_annotation``.
        video_segments_dir: folder containing the cut segments, named
            ``{idx}_{clip_uid}_{annotation_uid}.mp4``.

    Returns:
        One dict per row with the keys ``video_path``, ``question`` and
        ``ground_truth``, in row order.
    """
    def _to_record(row):
        # The segment file name mirrors the one used when the videos were cut
        segment_name = f"{row['idx']}_{row['clip_uid']}_{row['annotation_uid']}.mp4"
        return {
            'video_path': os.path.join(video_segments_dir, segment_name),
            'question': row['sentence'],
            'ground_truth': row['manual_annotation'],
        }

    return [_to_record(row) for _, row in annotations_df.iterrows()]


# Build the question-answer records for the cut segments and preview the first three
qa_data = prepare_video_qa_data(
    annotations_df=annotations_df,
    video_segments_dir=CUT_VIDEO_OUT,
)
qa_data[:3]

[{'video_path': '/content/drive/MyDrive/vsl-egocentric/extension/cut_videos/0_00d9a297-d967-4d28-8e5a-6b891814ec65_847f64a8-5335-4f1b-8248-73727dfe52ce.mp4',
  'question': 'where did i put the knife?',
  'ground_truth': "i don't know the video is unclear"},
 {'video_path': '/content/drive/MyDrive/vsl-egocentric/extension/cut_videos/1_9a13aee2-0dca-49f8-968f-8f53c5a62963_f9cd0c31-4e28-411e-b498-19db3e544030.mp4',
  'question': 'what vegetable did i cut?',
  'ground_truth': 'cabbage'},
 {'video_path': '/content/drive/MyDrive/vsl-egocentric/extension/cut_videos/2_2c1724ce-f438-4d63-a699-8a7f65e3cbd9_21563a23-ca10-4165-8b6a-74c72d722f0a.mp4',
  'question': 'where is phone?',
  'ground_truth': 'on the shelf'}]

### Install video-LLaVA

As per documentation https://github.com/PKU-YuanGroup/Video-LLaVA.

The model is also quantized, as described in https://huggingface.co/docs/transformers/main/en/model_doc/video_llava

In [None]:
%%bash

# Clone only once so that re-running the cell does not fail on an existing checkout
if [ ! -d Video-LLaVA ]; then
    git clone https://github.com/PKU-YuanGroup/Video-LLaVA
fi
cd Video-LLaVA
# The conda environment steps were dropped: conda is not available in this
# runtime ("conda: command not found"), so the packages are installed into the
# environment the cell runs in.
pip install --upgrade pip  # enable PEP 660 support
pip install -e .
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
pip install decord opencv-python git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
pip install -U transformers
pip install av
pip install bitsandbytes

Obtaining file:///content/Video-LLaVA
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Collecting transformers==4.31.0 (from videollava==1.0.0)
  Using cached transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
Collecting tokenizers<0.14,>=0.12.1 (from videollava==1.0.0)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Usi

fatal: destination path 'Video-LLaVA' already exists and is not an empty directory.
bash: line 4: conda: command not found
bash: line 5: conda: command not found
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/pytorchvideo.git /tmp/pip-req-build-4ljhbbtc
  Running command git rev-parse -q --verify 'sha^28fe037d212663c6a24f373b94cc5d478c8c1a1d'
  Running command git fetch -q https://github.com/facebookresearch/pytorchvideo.git 28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Running command git checkout -q 28fe037d212663c6a24f373b94cc5d478c8c1a1d
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
videollava 1.0.0 requires tokenizers<0.14,>=0.12.1, but you have tokenizers 0.19.1 which is incompatible.
videollava 1.0.0 requires transformers==4.31.0, but you have transformers 4.41.2 which is incompatible.


In [None]:
import av
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration, BitsAndBytesConfig
import gc
import torch

# Checkpoint used for both the model and its processor
MODEL_ID = "LanguageBind/Video-LLaVA-7B-hf"

# Quantization settings: 4-bit NF4 weights with float16 compute
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
quantization_config = BitsAndBytesConfig(**quantization_options)

# Read the manual annotations from Drive.
# NOTE(review): this re-read replaces the annotations_df built earlier, so the
# NaN placeholder applied there is not present here — confirm this is intended.
annotations_path = EXTENSION_DIR + '/manual_annotations.xlsx'
annotations_df = pd.read_excel(annotations_path)

# Load the quantized model and its processor a single time for all queries
model = VideoLlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
)
processor = VideoLlavaProcessor.from_pretrained(MODEL_ID)

def read_video_pyav(container, indices, num_frames=8):
    """Decode the frames at ``indices`` from an opened PyAV container.

    Args:
        container: opened PyAV container; decoding restarts from its beginning.
        indices: frame indices to keep (duplicates are harmless).
        num_frames: minimum number of frames to return. When fewer frames are
            decoded (too short video) the last decoded frame is repeated.

    Returns:
        np.ndarray stacking the selected frames, each converted with
        ``to_ndarray(format="rgb24")``.

    Raises:
        ValueError: if ``indices`` is empty or none of the requested frames
            could be decoded (previously an opaque IndexError).
    """
    # A set gives constant-time membership tests while scanning the stream
    wanted = {int(index) for index in indices}
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    last_index = max(wanted)

    container.seek(0)
    frames = []
    for position, frame in enumerate(container.decode(video=0)):
        if position > last_index:
            break
        if position in wanted:
            frames.append(frame)

    if not frames:
        raise ValueError("no frames could be decoded for the requested indices")

    # In case of a too short video: pad by repeating the last decoded frame
    frames.extend([frames[-1]] * (num_frames - len(frames)))

    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

# Prepare data for Video-LLaVA
def prepare_qa_data(row, video_segments_dir):
    """Build the processor inputs and prompt for one annotated query.

    Args:
        row: mapping with the keys ``clip_uid``, ``annotation_uid``,
            ``sentence`` and ``idx``.
        video_segments_dir: folder containing the cut segments, named
            ``{idx}_{clip_uid}_{annotation_uid}.mp4``.

    Returns:
        ``(inputs, prompt)`` on success. ``(None, None)`` when the segment is
        missing, has no video stream, reports no frames, or fails to process;
        the reason is printed in each case.
    """
    clip_uid = row['clip_uid']
    annotation_uid = row['annotation_uid']
    sentence = row['sentence']
    idx = row['idx']
    video_segment_path = os.path.join(video_segments_dir, f"{idx}_{clip_uid}_{annotation_uid}.mp4")

    # Check if the video file exists
    if not os.path.exists(video_segment_path):
        print(f"Video file {video_segment_path} not found.")
        return None, None

    try:
        # The context manager closes the container even on early return or error
        with av.open(video_segment_path) as container:
            if not container.streams.video:
                print(f"No video stream found in {video_segment_path}.")
                return None, None

            total_frames = container.streams.video[0].frames
            # A frame count of 0 would give np.arange a zero step below
            if total_frames <= 0:
                print(f"No frames reported for the video stream in {video_segment_path}.")
                return None, None

            # Sample 8 roughly evenly spaced frame indices across the segment
            indices = np.arange(0, total_frames, total_frames / 8).astype(int)
            clip = read_video_pyav(container, indices)

        prompt = f"USER: <video>{sentence} ASSISTANT:"
        inputs = processor(text=prompt, videos=clip, return_tensors="pt")

        return inputs, prompt
    except Exception as e:
        # Best effort: report the failing segment and let the caller skip it
        print(f"Error processing video {video_segment_path}: {e}")
        return None, None

# Answer every query of one batch with the model
def process_batch(batch_df):
    """Generate an answer for each row of ``batch_df``.

    Rows for which no model inputs could be prepared are skipped.

    Args:
        batch_df: slice of the annotation table; each row provides
            ``clip_uid``, ``annotation_uid``, ``sentence`` and
            ``manual_annotation``.

    Returns:
        List of dicts with the keys ``clip_uid``, ``annotation_uid``,
        ``question``, ``ground_truth`` and ``answer``.
    """
    collected = []
    for _, row in batch_df.iterrows():
        inputs, prompt = prepare_qa_data(row, CUT_VIDEO_OUT)
        if inputs is not None:
            generate_ids = model.generate(**inputs, max_length=80)

            decoded = processor.batch_decode(
                generate_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            answer = decoded[0]

            collected.append(dict(
                clip_uid=row['clip_uid'],
                annotation_uid=row['annotation_uid'],
                question=row['sentence'],
                ground_truth=row['manual_annotation'],
                answer=answer,
            ))

            # Drop the per-query objects before the next iteration to free memory
            del inputs, generate_ids, decoded, answer
            gc.collect()

    return collected

# Walk through the annotations in consecutive chunks of batch_size rows
batch_size = 25
answers = []

for start in range(0, len(annotations_df), batch_size):
    # iloc clips the upper bound, so the last chunk may be shorter
    answers += process_batch(annotations_df.iloc[start:start + batch_size])

# Tabulate the collected answers
answers_df = pd.DataFrame(answers)

# Store the answers next to the other extension outputs
answers_csv_path = EXTENSION_DIR + '/video_qa_answers.csv'
answers_df.to_csv(answers_csv_path, index=False)

# Preview the first few answers
answers_df.head()




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/0_00d9a297-d967-4d28-8e5a-6b891814ec65_847f64a8-5335-4f1b-8248-73727dfe52ce.mp4.




No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/2_2c1724ce-f438-4d63-a699-8a7f65e3cbd9_21563a23-ca10-4165-8b6a-74c72d722f0a.mp4.
No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/10_8a855547-3574-4e67-a7ac-41b072984e3b_1918d43a-17fc-43d8-bfac-13bbd6dd3356.mp4.
No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/13_89193dbc-0ffa-4ee2-9ea8-c434ce177310_294d2807-8bdd-4267-a4f1-a0ce18ff7969.mp4.
No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/17_f800514a-5fb0-4620-beb3-69d6c73ddb3f_b308eab3-64e5-451e-8c89-1713bd30b624.mp4.
No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/21_b01fdd44-357c-4566-b487-d9ff21e56c2e_7ae246f9-e526-454e-b655-ef0d9276fe22.mp4.
No video stream found in /content/drive/MyDrive/vsl-egocentric/extension/cut_videos/26_864371ec-e13b-44ad-8e24-034d7506065a_6cc64a46-22ed-43f4-bc40-67fc5dd630dd.mp4.
No vi

Unnamed: 0,clip_uid,annotation_uid,question,ground_truth,answer
0,9a13aee2-0dca-49f8-968f-8f53c5a62963,f9cd0c31-4e28-411e-b498-19db3e544030,what vegetable did i cut?,cabbage,USER: what vegetable did i cut? ASSISTANT: The...
1,679cfee6-7da1-4701-b75a-9e34abb9400a,e3015a5a-3e3e-47f5-a6b9-b77d3648621e,where was can drink before i drank it?,In my hand,USER: where was can drink before i drank it? A...
2,1138ced6-d580-4013-96bb-1e5c3fea62d7,763cc50c-edf6-4b99-98e2-5030a557784c,how many cans were in the fridge?,there are seven cans in the fridge,USER: how many cans were in the fridge? ASSIST...
3,43db99a3-61ce-4548-ba5a-faf4c91c72f1,547b4107-de69-416c-bf63-6f24d74b2a83,where was the cabbage before i picked it?,on the floor,USER: where was the cabbage before i picked it...
4,90602dc0-4c33-4b26-b4d5-c63105c40187,c6a76e69-c3c3-41d2-aa72-ccbc231fb56a,what color is the bowl on the shelf?,,USER: what color is the bowl on the shelf? ASS...
