In [None]:
import os
import cv2
import torch
import pickle
from PIL import Image
from tqdm import tqdm
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda:3"

processor = Blip2Processor.from_pretrained(
    os.path.join(os.environ["SCRATCH"], "mq_libs/blip2")
)
model = Blip2ForConditionalGeneration.from_pretrained(
    os.path.join(os.environ["SCRATCH"], "mq_libs/blip2"),
    torch_dtype=torch.float16,
)
model.to(device)


: 

In [2]:
import pickle
import json


In [3]:
with open(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/analysis_data/ground_truth_labels/ground_truth_labels.pickle"), "rb") as reader:
    ground_truth_label_indices = pickle.load(reader)


In [4]:
unique_ground_truth_labels = set()


In [5]:
with open(os.path.join(os.environ["CODE"], "scripts/06_analyze_frame_features/02_map_label_dependency_parsing_features_and_blip2_answer_dependency_parsing_features", "label_verb_noun_tool_mapping.json"), "r") as reader:
    label_verb_noun_tools_mapping = json.load(reader)

label_verb_noun_tools_mapping_keys = sorted(list(label_verb_noun_tools_mapping.keys())) + ["background"]


In [6]:
clip_id = "0076e425-bdb6-48b3-b4d3-695089ac9800"

for frame_id in ground_truth_label_indices[clip_id].keys():
    for current_ground_truth_label_index in ground_truth_label_indices[clip_id][frame_id]:
        current_ground_truth_label = label_verb_noun_tools_mapping_keys[current_ground_truth_label_index]
        unique_ground_truth_labels.add(current_ground_truth_label)

unique_ground_truth_labels


{'background',
 'browse_through_accessories_on_rack_/_shelf',
 'browse_through_clothing_items_on_rack_/_shelf_/_hanger',
 'converse_/_interact_with_someone',
 'hang_clothes_in_closet_/_on_hangers',
 'read_a_book_/_magazine_/_shopping_list_etc.'}

In [16]:
label_question_mapping = {
    "browse_through_accessories_on_rack_/_shelf": "Browsing through accessories on rack or shelf?",
    "browse_through_clothing_items_on_rack_/_shelf_/_hanger": "Is the person in this image browsing through clothing items on rack or shelf or hanger?",
    "converse_/_interact_with_someone": "Is the person in this image conversing or interacting with someone?",
    "hang_clothes_in_closet_/_on_hangers": "Is the person in this image hanging clothes in closet or on hangers?",
    "read_a_book_/_magazine_/_shopping_list_etc.": "Is the person in this image reading a book, magazine, shopping list or similar?"
}


In [17]:
cap = cv2.VideoCapture(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/clips", clip_id + ".mp4"))

frame_id_label_answer_mapping = {}

success = True
frame_id = 0

with tqdm(total=int(cap.get(cv2.CAP_PROP_FRAME_COUNT))) as pbar:
    while success:
        frame_id_label_answer_mapping[frame_id] = dict()
        success, frame = cap.read()
        frame = Image.fromarray(frame[:, :, ::-1])
        for label, question in label_question_mapping.items():
            label_index = label_verb_noun_tools_mapping_keys.index(label)
            preprocessed_frames_batch_dict = processor(
                images=[
                    frame
                ],
                text=["Question: " + question + " Answer:"],
                return_tensors="pt",
            ).to(device, torch.float16)
            generated_ids = model.generate(**preprocessed_frames_batch_dict)
            generated_text = processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )[0].strip()
            frame_id_label_answer_mapping[frame_id][label_index] = generated_text
        pbar.update(1)
        frame_id += 1
        if frame_id == 1000:
            break


  7%|▋         | 1000/14400 [26:23<5:53:38,  1.58s/it]
