In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import os
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the OpenEQA question set. A forward-slash relative path works on both
# Windows and POSIX (a backslash is a literal filename character on POSIX).
open_eqa = pd.read_json("data/open-eqa-v0.json")

In [3]:
# Preview the first five rows of the loaded question table.
open_eqa.head(n=5)

Unnamed: 0,question,answer,category,question_id,episode_history,extra_answers
0,What is the white object on the wall above the...,Air conditioning unit,object recognition,f2e82760-5c3c-41b1-88b6-85921b9e7b32,hm3d-v0/000-hm3d-BFRyYbPCCPE,
1,What material is the ceiling in the living room?,Wood panel,attribute recognition,7447d782-d1a7-4c87-86dc-b5eafc5a0f76,hm3d-v0/000-hm3d-BFRyYbPCCPE,
2,What color is the staircase railing?,Brown,attribute recognition,e2ccf6f4-22a9-47d1-ab8d-a05a13435b82,hm3d-v0/000-hm3d-BFRyYbPCCPE,
3,What is in between the two picture frames on t...,The TV,spatial understanding,c841bb52-1cec-46d7-bb83-8c99b5c66fa8,hm3d-v0/000-hm3d-BFRyYbPCCPE,
4,Is there room on the dining table to eat?,Yes,spatial understanding,79344680-6b45-4531-8789-ad0f5ef85b3b,hm3d-v0/000-hm3d-BFRyYbPCCPE,


In [4]:
# Keep only the rows whose category is 'spatial understanding'.
is_spatial = open_eqa["category"] == 'spatial understanding'
spatial = open_eqa.loc[is_spatial]
print(spatial.shape)
spatial.head()

(220, 6)


Unnamed: 0,question,answer,category,question_id,episode_history,extra_answers
3,What is in between the two picture frames on t...,The TV,spatial understanding,c841bb52-1cec-46d7-bb83-8c99b5c66fa8,hm3d-v0/000-hm3d-BFRyYbPCCPE,
4,Is there room on the dining table to eat?,Yes,spatial understanding,79344680-6b45-4531-8789-ad0f5ef85b3b,hm3d-v0/000-hm3d-BFRyYbPCCPE,
13,What is to the left of the mirror?,A plant in a tall vase,spatial understanding,b6dcf043-30a6-4b4e-9787-9b29bd1b1703,hm3d-v0/001-hm3d-TPhiubUHKcP,
14,What is to the left of the staircase?,A storage closet,spatial understanding,5460114d-e885-4eae-8bdc-a273deb3df0a,hm3d-v0/001-hm3d-TPhiubUHKcP,
22,What is on the top shelf to the right side of ...,An ice cooler,spatial understanding,9b4a7fbb-680d-4e39-8d60-7b1e521f3108,hm3d-v0/002-hm3d-wcojb4TFT35,


In [5]:
# Keep the rows whose episode_history starts with the literal prefix "hm3d".
# A plain prefix test replaces the regex; missing values count as no match.
spatial_hm3d = spatial[spatial.episode_history.str.startswith('hm3d', na=False)]
print(spatial_hm3d.shape)
spatial_hm3d.head()

(69, 6)


Unnamed: 0,question,answer,category,question_id,episode_history,extra_answers
3,What is in between the two picture frames on t...,The TV,spatial understanding,c841bb52-1cec-46d7-bb83-8c99b5c66fa8,hm3d-v0/000-hm3d-BFRyYbPCCPE,
4,Is there room on the dining table to eat?,Yes,spatial understanding,79344680-6b45-4531-8789-ad0f5ef85b3b,hm3d-v0/000-hm3d-BFRyYbPCCPE,
13,What is to the left of the mirror?,A plant in a tall vase,spatial understanding,b6dcf043-30a6-4b4e-9787-9b29bd1b1703,hm3d-v0/001-hm3d-TPhiubUHKcP,
14,What is to the left of the staircase?,A storage closet,spatial understanding,5460114d-e885-4eae-8bdc-a273deb3df0a,hm3d-v0/001-hm3d-TPhiubUHKcP,
22,What is on the top shelf to the right side of ...,An ice cooler,spatial understanding,9b4a7fbb-680d-4e39-8d60-7b1e521f3108,hm3d-v0/002-hm3d-wcojb4TFT35,


### Load episode history frames as a video

In [15]:
def load_images_to_array(directory, num_frames=8):
    """Load evenly spaced PNG frames from a directory as one NumPy array.

    The PNG files in ``directory`` are ordered by filename, ``num_frames``
    positions are chosen evenly between the first and the last file
    (inclusive), and only those files are decoded. When the directory holds
    fewer files than ``num_frames``, some frames are repeated.

    Args:
        directory: Path of the folder containing the ``.png`` frames.
        num_frames: Number of frames to sample (default 8).

    Returns:
        Array of the sampled frames stacked along a new first axis, each
        frame converted to RGB.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1, if the directory
            contains no ``.png`` file, or if the sampled frames do not all
            have the same shape (raised by ``np.stack``).
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    # Sort the names so the frames are sampled in filename order
    files = sorted(f for f in os.listdir(directory) if f.endswith('.png'))
    if not files:
        raise ValueError(f"No .png frames found in {directory!r}")

    # Choose the frame positions first so that only the sampled files are
    # decoded, instead of loading the whole episode into memory
    indices = np.linspace(0, len(files) - 1, num=num_frames, dtype=int)

    frames = []
    for index in indices:
        file_path = os.path.join(directory, files[index])
        # The context manager closes the file handle once the pixels are copied
        with Image.open(file_path) as img:
            # Convert to RGB so every frame has the same channel layout
            frames.append(np.array(img.convert('RGB')))

    # Stack the sampled frames into a single numpy array
    return np.stack(frames)

### Load VideoLLaVA

In [7]:
# One checkpoint id shared by the model and its processor so the two cannot drift apart.
checkpoint = "LanguageBind/Video-LLaVA-7B-hf"
# device_map="auto" leaves weight placement to the loader.
model = VideoLlavaForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")
processor = VideoLlavaProcessor.from_pretrained(checkpoint)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.43s/it]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Take the question text of the first HM3D spatial-understanding row.
question = spatial_hm3d["question"].iloc[0]
question

'What is in between the two picture frames on the blue wall in the living room?'

In [9]:
# Video-LLaVA prompt format: the <video> placeholder is followed by a newline
# before the user's question (per the model's documented usage).
prompt = f"USER: <video>\n{question} ASSISTANT:"
prompt

'USER: <video>What is in between the two picture frames on the blue wall in the living room? ASSISTANT:'

In [16]:
# Episode folder of the first HM3D spatial row, then its sampled frames.
ep_history = spatial_hm3d["episode_history"].iloc[0]
video = load_images_to_array(directory=f'data/frames/{ep_history}')

In [17]:
# Show the dimensions of the sampled frame array.
np.shape(video)

(8, 1080, 1920, 3)

In [18]:
# Build the model inputs and move them to the model's device so generate()
# does not receive CPU tensors while the weights sit elsewhere.
inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device)

In [19]:
# Generate at most 40 new tokens, then decode the whole batch back to text.
out = model.generate(**inputs, max_new_tokens=40)
decoded_text = processor.batch_decode(
    out,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(decoded_text)