In [1]:
# !pip install -U -q transformers accelerate bitsandbytes peft datasets
# !pip install -q decord protobuf sentencepiece pandas decord
# !pip install flash-attn --no-build-isolation
!pip freeze | grep -E "transformers|accelerate|bitsandbytes|peft|datasets"

accelerate==1.2.0
bitsandbytes==0.45.0
datasets==3.1.0
peft==0.14.0
transformers==4.47.0


In [3]:
# !cd /home && git clone https://github.com/egoschema/EgoSchema.git

Cloning into 'EgoSchema'...
remote: Enumerating objects: 259, done.[K
remote: Counting objects: 100% (259/259), done.[K
remote: Compressing objects: 100% (181/181), done.[K
remote: Total 259 (delta 138), reused 193 (delta 75), pack-reused 0 (from 0)[K
Receiving objects: 100% (259/259), 2.73 MiB | 12.01 MiB/s, done.
Resolving deltas: 100% (138/138), done.


In [2]:
import json
import requests
import random
import os
import requests
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
from decord import VideoReader, gpu, cpu
from datasets import load_dataset

In [3]:
Config = {
    "fps": 14,
    "vid_process_dir": "videos"
}

In [4]:
df = load_dataset("lmms-lab/egoschema", "Subset")
df

DatasetDict({
    test: Dataset({
        features: ['question_idx', 'question', 'video_idx', 'option', 'answer'],
        num_rows: 500
    })
})

In [5]:
with open("/home/EgoSchema/questions.json", "r") as f:
    all_questions = json.loads(f.read())

for k, v in all_questions[0].items():
    print(k, v)

q_uid 001934bb-81bd-4cd8-a574-0472ef3f6678
google_drive_id 15yGB0TI5zG5usdQZOXn0CrRzkJNB0gMl
question Although the video is predominantly focused on one recurring action, there is an interruption in c's activity. briefly describe this interruption and its significance within the video.
option 0 C stops scrolling to check notifications on their phone, indicating a possible distraction
option 1 C briefly stops scrolling to engage in conversation with others, showcasing the social aspect
option 2 C takes a break to drink from a cup, briefly shifting focus
option 3 C interrupts scrolling to type on the laptop, adding variety to the video's main focus
option 4 C briefly stops scrolling, showing a drifting focus.


In [6]:
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id , 'confirm': 1 }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

In [7]:
# Filter out video links in the Subset dataset
all_ids = {q["q_uid"]: q["google_drive_id"] for q in all_questions}
df = df.map(lambda e: {**e, "google_drive_id": all_ids[e["video_idx"]]})
df = df['test']
df

Dataset({
    features: ['question_idx', 'question', 'video_idx', 'option', 'answer', 'google_drive_id'],
    num_rows: 500
})

In [8]:
if not os.path.isdir(Config['vid_process_dir']):
    os.makedirs(Config['vid_process_dir'])
for i in range(0, 500):
    d = df[i]
    if os.path.isfile(f'videos/{d["video_idx"]}.mp4'):
        print(f"#{i+1}/500 - already downloaded {d['video_idx']}")
        continue
    print(f'#{i+1}/500 - downloading {d["video_idx"]} from drive id {d["google_drive_id"]}')
    download_file_from_google_drive(d["google_drive_id"], f'videos/{d["video_idx"]}.mp4')

#1/500 - already downloaded 0074f737-11cb-497d-8d07-77c3a8127391
#2/500 - already downloaded 00b9a0de-c59e-49cb-a127-6081e2fb8c8e
#3/500 - already downloaded 00f93e1e-cf4e-4835-88b4-4ad68216e86f
#4/500 - already downloaded 00faf954-74f7-4aa3-8b29-4a5dff4f9518
#5/500 - already downloaded 011b8b73-0ce4-4843-95ef-33b79610d212
#6/500 - already downloaded 01a144a5-24d2-4a5a-af01-1f318d674bed
#7/500 - already downloaded 026a2f15-c454-4c28-80e0-24c85d7f4ecf
#8/500 - already downloaded 02925d7a-a5db-4127-8c31-b232e78b684d
#9/500 - already downloaded 03657401-d4a4-40d0-9b03-d7e093ef93d1
#10/500 - already downloaded 0437cf5f-5014-47d6-b4b3-f299380aa688
#11/500 - already downloaded 049249dc-bdad-48c4-bdc0-511814c5781c
#12/500 - already downloaded 04c51dba-1dcb-4b8f-a62c-efc363561d7b
#13/500 - already downloaded 057f8774-15c2-4e2e-b9fd-75f26d4b3b83
#14/500 - already downloaded 05ad5736-88f5-42bb-ac9f-689e199c50de
#15/500 - already downloaded 05defeef-40bd-4b08-b341-72879a6cf63e
#16/500 - already d

In [9]:
def read_video_decord(video_path, num_frames=Config['fps']):
    '''
    Decode the video with Decord decoder.

    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample uniformly. Defaults to NUM_FRAMES

    Returns:
        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    vr = VideoReader(uri=video_path, ctx=cpu(0)) # you need to install from source to use gpu ctx
    step = len(vr) / num_frames
    indices = np.arange(0, len(vr) - step, len(vr) / (num_frames - 1)).astype(int)
    indices = np.append(indices, len(vr) - 1)
    frames = vr.get_batch(indices).asnumpy()
    return frames

In [None]:
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
processor.patch_size = 14
processor.vision_feature_select_strategy = "default"
processor

VideoLlavaProcessor:
- image_processor: VideoLlavaImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "VideoLlavaImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "VideoLlavaProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

- tokenizer: LlamaTokenizerFast(name_or_path='LanguageBind/Video-LLaVA-7B-hf', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normal

In [11]:
def generate_prompt(e):

    # prompt = "USER: <video> Instruction: Analyze the video and answer this multiple choice question. The letter c or C is the person doing the activities and recording. Answer only with the letters A, B, C, D, or E"
    options = "\n".join(e['option'])
    # prompt += f"\nQuestion: {e['question']}\n{options}\nASSISTANT:"
    prompt = f"USER: <video> Provide a single-letter answer (A, B, C, D, or E) to the multiple-choice question. Your answer must be one of the letters (A, B, C, D, or E)." 
    prompt += f" Do not provide any other response. You are going to answer a multiple-choice question based on the video.\nHere is the question:"
    prompt += f" {e['question']}.\nHere are the choices:\n{options}\nASSISTANT:"
    return prompt

In [12]:
print(generate_prompt(df[0]))

USER: <video> Provide a single-letter answer (A, B, C, D, or E) to the multiple-choice question. Your answer must be one of the letters (A, B, C, D, or E). Do not provide any other response. You are going to answer a multiple-choice question based on the video.
Here is the question: Taking into account all the actions performed by c, what can you deduce about the primary objective and focus within the video content?.
Here are the choices:
A. C is cooking.
B. C is doing laundry.
C. C is cleaning the kitchen.
D. C is cleaning dishes.
E. C is cleaning the bathroom.
ASSISTANT:


In [13]:
def collate_fn(e, device): 
    clip = read_video_decord(f"videos/{e['video_idx']}.mp4")
    prompt = generate_prompt(e)
    return processor(text=prompt, videos=clip, return_tensors="pt").to(device)

In [14]:
sample = collate_fn(df[4], "cpu")
for k, v in sample.items():
    print(k, ":", v.shape)
print(processor.batch_decode(sample['input_ids'], skip_special_tokens=True))
print(sample['pixel_values_videos'].dtype)

pixel_values_videos : torch.Size([1, 14, 3, 224, 224])
input_ids : torch.Size([1, 3880])
attention_mask : torch.Size([1, 3880])
["USER:  Provide a single-letter answer (A, B, C, D, or E) to the multiple-choice question. Your answer must be one of the letters (A, B, C, D, or E). Do not provide any other response. You are going to answer a multiple-choice question based on the video.\nHere is the question: What can be deduced about c's level of expertise in the task by observing the kind of adjustments made throughout the video?.\nHere are the choices:\nA. C is a novice woodworker. he was not able to cut the wood to size and install it on the wall without making several adjustments.\nB. C is an expert woodworker. he was able to cut the wood to size and install it on the wall without making any adjustments.\nC. C is a professional woodworker. he was able to cut the wood to size and install it on the wall in a timely and efficient manner.\nD. C is an experienced woodworker. he was able to 

In [15]:
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    torch_dtype=torch.float16,
    device_map="auto",
    # attn_implementation="flash_attention_2",
)
model

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

VideoLlavaForConditionalGeneration(
  (video_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(257, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn

In [16]:
sample = collate_fn(df[0], model.device)

In [17]:
generate_ids = model.generate(**sample, max_new_tokens=50)
print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

USER:  Provide a single-letter answer (A, B, C, D, or E) to the multiple-choice question. Your answer must be one of the letters (A, B, C, D, or E). Do not provide any other response. You are going to answer a multiple-choice question based on the video.
Here is the question: Taking into account all the actions performed by c, what can you deduce about the primary objective and focus within the video content?.
Here are the choices:
A. C is cooking.
B. C is doing laundry.
C. C is cleaning the kitchen.
D. C is cleaning dishes.
E. C is cleaning the bathroom.
ASSISTANT: D


In [18]:
# Model inference with the model is loaded in torch.float16 

answers = {}
right = 0 
with torch.inference_mode():
    for i in range(500):
        print(f"processing #{i} . . .", end="  ")
        inputs = collate_fn(df[i], model.device)
        output = model.generate(**inputs, max_new_tokens=50)
        answer = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        answer = answer.split('ASSISTANT: ')[1]
        answers[df[i]['question_idx']] = answer
        ground_truth = chr(65 + int(df[i]['answer']))
        if answer[0] == ground_truth:
            right += 1
        print(f"(answer, ground truth) : ({answer}, {ground_truth}). Correct is {right}/500")

processing #0 . . .  (answer, ground truth) : (D, D). Correct is 1/500
processing #1 . . .  (answer, ground truth) : (A, E). Correct is 1/500
processing #2 . . .  (answer, ground truth) : (A, B). Correct is 1/500
processing #3 . . .  

Token indices sequence length is longer than the specified maximum sequence length for this model (4176 > 4096). Running this sequence through the model will result in indexing errors


(answer, ground truth) : (A, E). Correct is 1/500
processing #4 . . .  (answer, ground truth) : (A, D). Correct is 1/500
processing #5 . . .  (answer, ground truth) : (A, E). Correct is 1/500
processing #6 . . .  (answer, ground truth) : (A, D). Correct is 1/500
processing #7 . . .  (answer, ground truth) : (B, B). Correct is 2/500
processing #8 . . .  (answer, ground truth) : (A, A). Correct is 3/500
processing #9 . . .  (answer, ground truth) : (B, A). Correct is 3/500
processing #10 . . .  (answer, ground truth) : (A, E). Correct is 3/500
processing #11 . . .  (answer, ground truth) : (B, E). Correct is 3/500
processing #12 . . .  (answer, ground truth) : (C, D). Correct is 3/500
processing #13 . . .  (answer, ground truth) : (A, D). Correct is 3/500
processing #14 . . .  (answer, ground truth) : (A, A). Correct is 4/500
processing #15 . . .  (answer, ground truth) : (D, D). Correct is 5/500
processing #16 . . .  (answer, ground truth) : (A, E). Correct is 5/500
processing #17 . . .

In [4]:
print(f"Final Score: {int(163/5):02d}%")

Final Score: 32%
