In [1]:
import os
import cv2
import json
import torch
import pickle
import random
from sklearn.utils import shuffle
import numpy as np

import sys
import ray
from PIL import Image
from tqdm import tqdm
sys.path.insert(
    0,
    os.path.join(
        os.environ["CODE"],
        "scripts/04_extract_frame_features/",
    ),
)

import torch.nn as nn

from constants import question_constant_mapping

from torch.nn import CrossEntropyLoss
from transformers import Blip2Processor
from transformers.models.blip_2.configuration_blip_2 import Blip2Config
from transformers.models.auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers.models.blip_2.modeling_blip_2 import Blip2PreTrainedModel, Blip2VisionModel, Blip2QFormerModel, Blip2ForConditionalGenerationModelOutput

from typing import List, Optional, Tuple, Union

# Seed every random number generator this notebook relies on (Python's
# `random`, NumPy's global RNG, and PyTorch on CPU and all CUDA devices).
seed = 1903
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)


KeyboardInterrupt: 

In [2]:
# Load the pickled ground-truth labels from $SCRATCH; the top-level keys of the
# loaded object are treated as clip ids.
# NOTE(review): pickle.load executes arbitrary code on untrusted files — only
# load pickles produced by this project.
ground_truth_labels_path = os.path.join(
    os.environ["SCRATCH"],
    "ego4d_data/v2/analysis_data/ground_truth_labels/ground_truth_labels.pickle",
)
with open(ground_truth_labels_path, "rb") as reader:
    ground_truth_label_indices = pickle.load(reader)

# `shuffle` is called without `random_state`, so the resulting order depends on
# the state of NumPy's global RNG at the time this cell runs.
clip_ids = shuffle(list(ground_truth_label_indices))


In [3]:
# 60 / 20 / 20 split of the shuffled clip ids into train / val / test.
number_of_clips = len(clip_ids)
train_size = int(number_of_clips * 0.6)
val_size = int(number_of_clips * 0.2)
# `test_size` is informational only: the test split takes every remaining clip,
# so its length can exceed `test_size` because of the int() truncation above.
test_size = int(number_of_clips * 0.2)

val_end = train_size + val_size
train_clip_ids, val_clip_ids, test_clip_ids = (
    clip_ids[:train_size],
    clip_ids[train_size:val_end],
    clip_ids[val_end:],
)

def get_random_frame_and_corresponding_label():
    """Sample one random frame from a random clip together with its label.

    A clip id is drawn uniformly from the module-level ``clip_ids`` and a frame
    id is drawn uniformly from ``[0, frame_count)`` of the matching ``.mp4``
    under ``$SCRATCH/ego4d_data/v2/clips``.

    Returns:
        tuple: ``(random_frame, corresponding_label)`` where ``random_frame`` is
        the image returned by ``cv2.VideoCapture.read`` and
        ``corresponding_label`` is
        ``ground_truth_label_indices[random_clip_id][random_frame_id]``.

    Raises:
        ValueError: if the clip reports no frames.
        RuntimeError: if the clip cannot be opened or the frame cannot be read.
    """
    random_clip_id_index = np.random.randint(low=0, high=len(clip_ids))
    random_clip_id = clip_ids[random_clip_id_index]
    clip_path = os.path.join(
        os.environ["SCRATCH"], "ego4d_data/v2/clips", random_clip_id + ".mp4"
    )

    random_cap = cv2.VideoCapture(clip_path)
    try:
        if not random_cap.isOpened():
            raise RuntimeError(f"Could not open clip: {clip_path}")

        # OpenCV reports the frame count as a float; randint needs an integer bound.
        number_of_frames = int(random_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if number_of_frames <= 0:
            raise ValueError(f"Clip reports no frames: {clip_path}")

        random_frame_id = np.random.randint(low=0, high=number_of_frames)

        # NOTE(review): the capture is positioned at `random_frame_id - 1` while
        # the label is looked up at `random_frame_id`. The offset is kept as in
        # the original; confirm whether the label table is shifted by one
        # relative to OpenCV's 0-based frame index. The position is clamped so
        # that frame id 0 no longer seeks to -1.
        random_cap.set(cv2.CAP_PROP_POS_FRAMES, max(random_frame_id - 1, 0))
        success, random_frame = random_cap.read()
    finally:
        # Always free the decoder handle, even when something above raised.
        random_cap.release()

    if not success:
        raise RuntimeError(
            f"Could not read frame {random_frame_id} from clip: {clip_path}"
        )

    corresponding_label = ground_truth_label_indices[random_clip_id][random_frame_id]
    return random_frame, corresponding_label


In [6]:
# Build the BLIP-2 VQA frame feature extractor; its `processor` and `model`
# attributes are what the inference cell uses. Construction loads checkpoint
# shards (about 1.5 minutes in the recorded run).
# NOTE(review): `BLIP2VQAFrameFeatureExtractor` is neither defined nor imported
# in the cells visible here — presumably it comes from a cell that is not
# shown; confirm the notebook survives Restart Kernel -> Run All.
blip2_vqa_frame_feature_extractor = BLIP2VQAFrameFeatureExtractor()


Loading checkpoint shards: 100%|██████████| 2/2 [01:29<00:00, 44.54s/it]


In [9]:
# Run one BLIP-2 VQA generation on a single randomly sampled frame, with
# gradient tracking disabled.
with torch.no_grad():
    random_frame, corresponding_label = get_random_frame_and_corresponding_label()

    # The frame comes from cv2 (BGR channel order); reversing the last axis
    # gives PIL the RGB layout it expects.
    frame_image = Image.fromarray(random_frame[:, :, ::-1])
    prompt = "Question: " + "What is the person in this image doing?" + " Answer:"

    processor_outputs = blip2_vqa_frame_feature_extractor.processor(
        images=[frame_image],
        text=[prompt],
        return_tensors="pt",
    )
    # NOTE(review): the target device is hard-coded; it has to match the device
    # the extractor's model lives on — confirm.
    preprocessed_frames_batch_dict = processor_outputs.to("cuda:3", torch.float16)

    generated_ids = blip2_vqa_frame_feature_extractor.model.generate(
        **preprocessed_frames_batch_dict
    )
    # Decoding and display of the answer are currently disabled:
    # blip2_answer = blip2_vqa_frame_feature_extractor.processor.batch_decode(
    #     generated_ids, skip_special_tokens=True
    # )[0].strip()
    # print(blip2_answer)
    # display(Image.fromarray(random_frame[:, :, ::-1]))




In [8]:
# Read label_verb_noun_tool_mapping.json from the $CODE checkout.
label_verb_noun_tool_mapping_path = os.path.join(
    os.environ["CODE"],
    "scripts/06_analyze_frame_features/02_map_label_dependency_parsing_features_and_blip2_answer_dependency_parsing_features",
    "label_verb_noun_tool_mapping.json",
)
with open(label_verb_noun_tool_mapping_path, "r") as reader:
    label_verb_noun_tools_mapping = json.load(reader)

# Alphabetically sorted mapping keys, with "background" appended as the last entry.
label_verb_noun_tools_mapping_keys = sorted(label_verb_noun_tools_mapping)
label_verb_noun_tools_mapping_keys.append("background")
