In [1]:
import os
import cv2
import torch
import json
import pickle
from PIL import Image
from tqdm import tqdm
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# GPU that BLIP-2 inference runs on.
device = "cuda:4"

# The processor and the model weights are loaded from the same local directory.
blip2_checkpoint_dir = os.path.join(os.environ["SCRATCH"], "mq_libs/blip2")

processor = Blip2Processor.from_pretrained(blip2_checkpoint_dir)
# Weights are loaded as float16.
model = Blip2ForConditionalGeneration.from_pretrained(
    blip2_checkpoint_dir, torch_dtype=torch.float16
)
# Last expression of the cell: moves the model to `device` and displays its repr.
model.to(device)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0): Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
        )
        (1): Blip2EncoderLayer(
          (self_attn): 

In [2]:
# Ground-truth label indices; later cells index this as [clip_id][frame_id].
ground_truth_labels_path = os.path.join(
    os.environ["SCRATCH"],
    "ego4d_data/v2/analysis_data/ground_truth_labels/ground_truth_labels.pickle",
)
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(ground_truth_labels_path, "rb") as pickle_file:
    ground_truth_label_indices = pickle.load(pickle_file)


In [11]:
# JSON file whose top-level keys are the ground-truth label names.
label_mapping_path = os.path.join(
    os.environ["CODE"],
    "scripts/06_analyze_frame_features/02_map_label_dependency_parsing_features_and_blip2_answer_dependency_parsing_features",
    "label_verb_noun_tool_mapping.json",
)
with open(label_mapping_path, "r") as json_file:
    label_verb_noun_tools_mapping = json.load(json_file)

# Alphabetically sorted label names, with "background" appended as the last entry.
distinct_ground_truth_labels = [
    *sorted(label_verb_noun_tools_mapping.keys()),
    "background",
]


In [6]:
clip_id = "0076e425-bdb6-48b3-b4d3-695089ac9800"

# Collect every label name that appears on at least one frame of this clip.
unique_ground_truth_labels = {
    distinct_ground_truth_labels[label_index]
    for frame_label_indices in ground_truth_label_indices[clip_id].values()
    for label_index in frame_label_indices
}

unique_ground_truth_labels


{'background',
 'browse_through_accessories_on_rack_/_shelf',
 'browse_through_clothing_items_on_rack_/_shelf_/_hanger',
 'converse_/_interact_with_someone',
 'hang_clothes_in_closet_/_on_hangers',
 'read_a_book_/_magazine_/_shopping_list_etc.'}

In [7]:
# One yes/no question per label name, used as the BLIP-2 prompt for that label.
label_question_mapping = {
    "browse_through_accessories_on_rack_/_shelf": (
        "Is the person in this image browsing through accessories on rack or shelf?"
    ),
    "browse_through_clothing_items_on_rack_/_shelf_/_hanger": (
        "Is the person in this image browsing through clothing items on rack or shelf or hanger?"
    ),
    "converse_/_interact_with_someone": (
        "Is the person in this image conversing or interacting with someone?"
    ),
    "hang_clothes_in_closet_/_on_hangers": (
        "Is the person in this image hanging clothes in closet or on hangers?"
    ),
    "read_a_book_/_magazine_/_shopping_list_etc.": (
        "Is the person in this image reading a book, magazine, shopping list or similar?"
    ),
}


In [8]:
# Ask BLIP-2 one question per label for each of the first MAX_FRAMES frames of
# the clip. The result is stored as
# frame_id_label_answer_mapping[frame_id][label_index] -> generated answer text.

# Upper bound on the number of frames processed.
MAX_FRAMES = 1000

clip_path = os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/clips", clip_id + ".mp4")
cap = cv2.VideoCapture(clip_path)
if not cap.isOpened():
    raise IOError("Could not open video file: " + clip_path)

# The label -> index lookup does not depend on the frame, so resolve it once
# instead of calling list.index for every question on every frame.
label_index_question_pairs = [
    (distinct_ground_truth_labels.index(label), question)
    for label, question in label_question_mapping.items()
]

frame_id_label_answer_mapping = {}

success = True
frame_id = 0

try:
    reported_frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Size the progress bar to the work actually done; fall back to an
    # open-ended bar when the container does not report a frame count.
    progress_total = (
        min(reported_frame_count, MAX_FRAMES) if reported_frame_count > 0 else None
    )
    with tqdm(total=progress_total) as pbar:
        while frame_id < MAX_FRAMES:
            success, frame = cap.read()
            if not success:
                # End of stream or decode failure: `frame` is None here, so stop
                # before indexing it and before creating an empty mapping entry.
                break
            # OpenCV decodes frames as BGR; reverse the channel axis to get RGB.
            frame = Image.fromarray(frame[:, :, ::-1])
            frame_answers = {}
            for label_index, question in label_index_question_pairs:
                preprocessed_frames_batch_dict = processor(
                    images=[frame],
                    text=["Question: " + question + " Answer:"],
                    return_tensors="pt",
                ).to(device, torch.float16)
                generated_ids = model.generate(**preprocessed_frames_batch_dict)
                generated_text = processor.batch_decode(
                    generated_ids, skip_special_tokens=True
                )[0].strip()
                frame_answers[label_index] = generated_text
            frame_id_label_answer_mapping[frame_id] = frame_answers
            pbar.update(1)
            frame_id += 1
finally:
    # Always release the video handle, even if inference raises midway.
    cap.release()


  7%|▋         | 1000/14400 [25:42<5:44:24,  1.54s/it]


In [28]:
# Show the generated answers (keyed by label index) for one sample frame.
inspected_frame_id = 700
frame_id_label_answer_mapping[inspected_frame_id]


{11: 'Yes',
 12: 'Yes',
 24: 'No, they are not.',
 52: 'Hanging clothes in closet',
 81: 'No, they are not. They are wearing a headband with a green ribbon on it.'}

In [29]:
# Name of the first ground-truth label recorded for frame 999 of this clip.
frame_999_label_indices = ground_truth_label_indices[clip_id][999]
distinct_ground_truth_labels[frame_999_label_indices[0]]


'background'