In [1]:
import os
from copy import deepcopy
import cv2
import json
import math
import spacy
import gensim
import numpy as np
import pandas as pd
import tensorflow_hub
from tqdm import tqdm
from pqdm.processes import pqdm
import torchtext.vocab as vocab
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed
from collections import defaultdict
from typing import Dict, List
from abc import ABC, abstractmethod

annotations_json_file_path = f"{os.environ['CODE']}/scripts/07_reproduce_baseline_results/data/ego4d/ego4d_clip_annotations_v3.json"

with open(annotations_json_file_path, "r") as reader:
    annotations = json.load(reader)

nlp = spacy.load('en_core_web_lg')

action_category_blip2_answer_word_tfs = dict()
blip2_answer_word_dfs = dict()

def fill_missing_cells(df_group):
    df_group_sorted = df_group.sort_values(by="frame_index").reset_index(drop=False)
    delete_first_row = False
    for index, row in df_group_sorted.iterrows():
        if pd.isnull(row["answer"]):
            if index == 0:
                delete_first_row = True
            else:
                df_group_sorted.at[index, "answer"] = df_group_sorted.at[index - 1, "answer"]
    if delete_first_row:
        df_group_sorted = df_group_sorted.iloc[1:, :]
    return df_group_sorted

def get_fill_value(blip2_vqa_answers_df: pd.DataFrame, frame_index: int, question: str):
    blip2_vqa_row = blip2_vqa_answers_df[(blip2_vqa_answers_df["frame_index"] == frame_index - 6) & (blip2_vqa_answers_df["question"] == question)]
    if len(blip2_vqa_row) == 0:
        if frame_index == 0:
            return "no_answer"
        else:
            return get_fill_value(blip2_vqa_answers_df=blip2_vqa_answers_df, frame_index=frame_index - 6, question=question)
    elif len(blip2_vqa_row) == 1:
        if pd.isnull(blip2_vqa_row["answer"].values[0]):
            return get_fill_value(blip2_vqa_answers_df=blip2_vqa_answers_df, frame_index=frame_index - 6, question=question)
        else:
            return blip2_vqa_row["answer"].values[0]
    else:
        raise Exception("Duplicate rows!")

clip_ids = set(os.listdir(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/frame_features"))).intersection(annotations.keys())
train_clip_ids = []
val_clip_ids = []
test_clip_ids = []
for clip_id in clip_ids:
    if annotations[clip_id]["subset"] == "train":
        train_clip_ids.append(clip_id)
    elif annotations[clip_id]["subset"] == "val":
        val_clip_ids.append(clip_id)
    else:
        test_clip_ids.append(clip_id)

def get_clip_info(clip_id: str):
    cap = cv2.VideoCapture(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/clips", clip_id + ".mp4"))
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    cap.release()
    return {"num_frames": num_frames, "fps": fps}

clip_infos = dict((clip_id, get_clip_info(clip_id)) for clip_id in clip_ids)

def get_blip2_answers(clip_id: str):
    clip_info = clip_infos[clip_id]
    num_frames = clip_info["num_frames"]
    blip2_vqa_answers_df = pd.read_csv(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/frame_features/", clip_id, "blip2_vqa_features.tsv"), sep="\t")

    for index, row in blip2_vqa_answers_df[pd.isnull(blip2_vqa_answers_df["answer"])].iterrows():
        frame_index = row["frame_index"]
        question = row["question"]
        fill_value = get_fill_value(blip2_vqa_answers_df=blip2_vqa_answers_df, frame_index=frame_index, question=question)
        blip2_vqa_answers_df.iat[index, 2] = fill_value

    frame_id_blip2_answers_mapping = {}
    for frame_id in range(0, num_frames, 6):
        current_blip2_answers = blip2_vqa_answers_df[blip2_vqa_answers_df["frame_index"] == frame_id]
        frame_id_blip2_answers_mapping[frame_id] = dict()
        for question in ["What is happening in this picture?", "What does the image describe?", "What does the image describe?"]:
            answer = current_blip2_answers[current_blip2_answers["question"] == question]["answer"].values[0]
            frame_id_blip2_answers_mapping[frame_id][question] = answer
    return frame_id_blip2_answers_mapping

def get_labels(clip_id: str):
    clip_info = clip_infos[clip_id]
    num_frames = clip_info["num_frames"]
    fps = clip_info["fps"]
    frame_id_labels_mapping = {}
    current_annotations = annotations[clip_id]["annotations"]
    for frame_id in range(num_frames):
        current_labels = set()
        if len(current_annotations) == 0:
            current_labels.add("no_annotation")
        else:
            for current_annotation in current_annotations:
                if frame_id / fps >= current_annotation["segment"][0] and frame_id / fps <= current_annotation["segment"][1]:
                    current_labels.add(current_annotation["label"])
            if len(current_labels) == 0:
                current_labels.add("background")
        frame_id_labels_mapping[frame_id] = current_labels
    return frame_id_labels_mapping

train_clip_id_frame_id_blip2_answers_mapping = dict((clip_id, get_blip2_answers(clip_id)) for clip_id in tqdm(train_clip_ids))
val_clip_id_frame_id_blip2_answers_mapping = dict((clip_id, get_blip2_answers(clip_id)) for clip_id in tqdm(val_clip_ids))
test_clip_id_frame_id_blip2_answers_mapping = dict((clip_id, get_blip2_answers(clip_id)) for clip_id in tqdm(test_clip_ids))


2023-09-16 10:20:18.643993: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 82/82 [04:30<00:00,  3.30s/it]
100%|██████████| 31/31 [01:47<00:00,  3.46s/it]
100%|██████████| 34/34 [01:56<00:00,  3.44s/it]


In [2]:
import gensim

class FrameEmbedder(ABC):
    def __init__(self, word_weights: Dict[str, float], unify_words: bool):
        self.vocabulary = sorted(list(word_weights.keys()))
        self.word_weights = word_weights
        self.embedding_dimension = len(self.vocabulary)
        self.unify_words = unify_words

    @classmethod
    def process_per_frame_blip2_answers(cls, blip2_answers: List[str]):
        words = []
        docs = [nlp(blip2_answer) for blip2_answer in blip2_answers]
        for doc in docs:
            words.extend([token.lemma_.lower() for token in doc if (token.lemma_.isalpha()) and (not token.is_stop) and (token.text != "no_answer")])
        return words

    @classmethod
    def process_per_clip_blip2_answers(cls, clip_id, frame_id_blip2_answers_mapping):
        frame_id_words_mapping = {}
        for frame_id, blip2_answers_mapping in frame_id_blip2_answers_mapping.items():
            frame_id_words_mapping[frame_id] = FrameEmbedder.process_per_frame_blip2_answers(blip2_answers_mapping.values())
        return clip_id, frame_id_words_mapping

    @classmethod
    def get_embedding_per_frame(cls, blip2_answers: List[str], blip2_words: List[str]):
        pass

    @classmethod
    def get_embedding_per_clip(cls, clip_id, frame_id_blip2_answers_mapping: Dict[str, List[str]], frame_id_blip2_words_mapping: Dict[str, List[str]]):
        frame_id_embedding_mapping = {}
        for frame_id, blip2_answers in frame_id_blip2_answers_mapping.items():
            blip2_words = frame_id_blip2_words_mapping[frame_id]
            current_embedding = cls.get_embedding_per_frame(blip2_answers=blip2_answers, blip2_words=blip2_words)
            frame_id_embedding_mapping[frame_id] = current_embedding
        return clip_id, frame_id_embedding_mapping

class OneHotFrameEmbedder(FrameEmbedder):
    def __init__(self, word_weights: Dict[str, float], unify_words: bool):
        super().__init__(word_weights=word_weights, unify_words=unify_words)

    @classmethod
    def get_embedding_per_frame(cls, blip2_answers: List[str], blip2_words: List[str]):
        words = cls.process_per_frame_blip2_answers(blip2_answers)
        frame_embedding = None
        if cls.unify_words:
            words = set(words)
        for word in words:
            if word not in cls.vocabulary:
                continue
            word_weight = cls.word_weights[word]
            word_embedding = np.zeros(len(cls.vocabulary))
            word_embedding[cls.vocabulary.index(word)] = 1
            if frame_embedding is None:
                frame_embedding = word_weight * word_embedding
            else:
                frame_embedding += word_weight * word_embedding
        return frame_embedding


class GloveFrameEmbedder(FrameEmbedder):
    def __init__(self, word_weights: Dict[str, float], unify_words: bool):
        super().__init__(word_weights=word_weights, unify_words=unify_words)
        self.glove_embeddings = vocab.GloVe(name='6B', dim=100)

    @classmethod
    def get_embedding_per_frame(cls, blip2_answers: List[str], blip2_words: List[str]):
        frame_embedding = None
        if cls.unify_words:
            words = set(words)
        for word in blip2_words:
            if word not in cls.vocabulary:
                continue
            word_weight = cls.word_weights[word]
            word_embedding = cls.glove_embeddings[word]
            if frame_embedding is None:
                frame_embedding = word_weight * word_embedding
            else:
                frame_embedding += word_weight * word_embedding
        return frame_embedding


class Word2VecFrameEmbedder(FrameEmbedder):
    def __init__(self, word_weights: Dict[str, float], unify_words: bool):
        super().__init__(word_weights=word_weights, unify_words=unify_words)
        self.word2vec_embeddings = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(os.environ["SCRATCH"], "mq_libs/word2vec", 'GoogleNews-vectors-negative300.bin'), binary=True)

    @classmethod
    def get_embedding_per_frame(cls, blip2_answers: List[str], blip2_words: List[str]):
        frame_embedding = None
        if cls.unify_words:
            blip2_words = set(blip2_words)
        for blip2_word in blip2_words:
            if blip2_word not in cls.vocabulary:
                continue
            word_weight = cls.word_weights[blip2_word]
            word_embedding = cls.word2vec_embeddings[blip2_word]
            if frame_embedding is None:
                frame_embedding = word_weight * word_embedding
            else:
                frame_embedding += word_weight * word_embedding
        return frame_embedding


class SentenceTransformerFrameEmbedder(FrameEmbedder):
    def __init__(self, word_weights: Dict[str, float], unify_words: bool):
        super().__init__(word_weights=word_weights, unify_words=unify_words)
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    
    @classmethod
    def get_embedding_per_frame(cls, blip2_answers: List[str], blip2_words: List[str]):
        frame_embedding = cls.model.encode(blip2_answers)
        frame_embedding = np.hstack([np.array(frame_embedding[0]), np.array(frame_embedding[1]), np.array(frame_embedding[2])])
        return frame_embedding


class UniversalSentenceEncoderFrameEmbedder(FrameEmbedder):
    def __init__(self, word_weights: Dict[str, float], unify_words: bool):
        super().__init__(word_weights=word_weights, unify_words=unify_words)
        self.model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    
    @classmethod
    def get_embedding_per_frame(cls, blip2_answers: List[str], blip2_words: List[str]):
        frame_embedding = cls.model(blip2_answers)
        frame_embedding = np.hstack([np.array(frame_embedding[0]), np.array(frame_embedding[1]), np.array(frame_embedding[2])])
        return frame_embedding


train_clip_id_frame_id_blip2_words_mapping = dict(pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": frame_id_blip2_answers_mapping} for clip_id, frame_id_blip2_answers_mapping in train_clip_id_frame_id_blip2_answers_mapping.items()],
                                                  function=FrameEmbedder.process_per_clip_blip2_answers,
                                                  n_jobs=8,
                                                  argument_type='kwargs'))
val_clip_id_frame_id_blip2_words_mapping = dict(pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": frame_id_blip2_answers_mapping} for clip_id, frame_id_blip2_answers_mapping in val_clip_id_frame_id_blip2_answers_mapping.items()],
                                                function=FrameEmbedder.process_per_clip_blip2_answers,
                                                n_jobs=8,
                                                argument_type='kwargs'))
test_clip_id_frame_id_blip2_words_mapping = dict(pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": frame_id_blip2_answers_mapping} for clip_id, frame_id_blip2_answers_mapping in test_clip_id_frame_id_blip2_answers_mapping.items()],
                                                 function=FrameEmbedder.process_per_clip_blip2_answers,
                                                 n_jobs=8,
                                                 argument_type='kwargs'))

train_clip_id_frame_id_labels_mapping = dict((clip_id, get_labels(clip_id)) for clip_id in train_clip_ids)
val_clip_id_frame_id_labels_mapping = dict((clip_id, get_labels(clip_id)) for clip_id in val_clip_ids)


QUEUEING TASKS | : 100%|██████████| 82/82 [00:00<00:00, 124.43it/s]
PROCESSING TASKS | : 100%|██████████| 82/82 [11:35<00:00,  8.48s/it]
COLLECTING RESULTS | : 100%|██████████| 82/82 [00:00<00:00, 241018.17it/s]
QUEUEING TASKS | : 100%|██████████| 31/31 [00:00<00:00, 48.81it/s]
PROCESSING TASKS | :  26%|██▌       | 8/31 [01:19<01:23,  3.62s/it]

In [5]:
train_clip_id_frame_id_labels_mapping

{'0970c868-03c3-4719-ae48-9842ed2bc0e1': {0: [],
  1: [],
  2: [],
  3: [],
  4: [],
  5: [],
  6: [],
  7: [],
  8: [],
  9: [],
  10: [],
  11: [],
  12: [],
  13: [],
  14: [],
  15: [],
  16: [],
  17: [],
  18: [],
  19: [],
  20: [],
  21: [],
  22: [],
  23: [],
  24: [],
  25: [],
  26: [],
  27: [],
  28: [],
  29: [],
  30: [],
  31: [],
  32: [],
  33: [],
  34: [],
  35: [],
  36: [],
  37: [],
  38: [],
  39: [],
  40: [],
  41: [],
  42: [],
  43: [],
  44: [],
  45: [],
  46: [],
  47: [],
  48: [],
  49: [],
  50: [],
  51: [],
  52: [],
  53: [],
  54: [],
  55: [],
  56: [],
  57: [],
  58: [],
  59: [],
  60: [],
  61: [],
  62: [],
  63: [],
  64: [],
  65: [],
  66: [],
  67: [],
  68: [],
  69: [],
  70: [],
  71: [],
  72: [],
  73: [],
  74: [],
  75: [],
  76: [],
  77: [],
  78: [],
  79: [],
  80: [],
  81: [],
  82: [],
  83: [],
  84: [],
  85: [],
  86: [],
  87: [],
  88: [],
  89: [],
  90: [],
  91: [],
  92: [],
  93: [],
  94: [],
  95: [],
  96: [],


In [3]:
train_blip2_answer_word_label_mapping = {}
for clip_id in train_clip_id_frame_id_blip2_words_mapping.keys():
    for frame_id in train_clip_id_frame_id_blip2_words_mapping[clip_id].keys():
        current_labels = train_clip_id_frame_id_labels_mapping[clip_id][frame_id]
        for word in train_clip_id_frame_id_blip2_words_mapping[clip_id][frame_id]:
            for label in current_labels:
                if word not in train_blip2_answer_word_label_mapping.keys():
                    train_blip2_answer_word_label_mapping[word] = set([label])
                else:
                    train_blip2_answer_word_label_mapping[word].add(label)

train_blip2_answer_word_idf_mapping = {}
min_idf = np.inf
max_idf = -np.inf
for train_blip2_answer_word in train_blip2_answer_word_label_mapping.keys():
    current_idf = 1 / float(len(train_blip2_answer_word_label_mapping[train_blip2_answer_word]))
    train_blip2_answer_word_idf_mapping[train_blip2_answer_word] = current_idf
    if current_idf > max_idf:
        max_idf = current_idf
    if current_idf < min_idf:
        min_idf = current_idf

for train_blip2_answer_word, idf in train_blip2_answer_word_idf_mapping.items():
    train_blip2_answer_word_idf_mapping[train_blip2_answer_word] = (idf - min_idf) / (max_idf - min_idf)


In [4]:
def get_frame_embedder(frame_embedder_name: str, word_weights: Dict[str, float], unify_words: bool):
    if frame_embedder_name == "word2vec":
        frame_embedder_class = Word2VecFrameEmbedder
    elif frame_embedder_name == "glove":
        frame_embedder_class = GloveFrameEmbedder
    elif frame_embedder_name == "one_hot":
        frame_embedder_class = OneHotFrameEmbedder
    elif frame_embedder_name == "universal_sentence_encoder":
        frame_embedder_class = UniversalSentenceEncoderFrameEmbedder
    elif frame_embedder_name == "sentence_transformer":
        frame_embedder_class = SentenceTransformerFrameEmbedder
    return frame_embedder_class(word_weights=word_weights, unify_words=unify_words)

frame_embedder = get_frame_embedder(frame_embedder_name="word2vec", word_weights=train_blip2_answer_word_idf_mapping, unify_words=False)


In [None]:
# train_clip_id_frame_id_embedding_mapping = dict({"clip_id": clip_id, frame_embedder.get_embedding_per_clip(clip_id=clip_id, frame_id_blip2_answers_mapping=frame_id_blip2_answers_mapping, frame_id_blip2_words_mapping=train_clip_id_frame_id_blip2_words_mapping[clip_id]) for clip_id, frame_id_blip2_answers_mapping in train_clip_id_frame_id_blip2_answers_mapping.items())
# val_clip_id_frame_id_embedding_mapping = dict(frame_embedder.get_embedding_per_clip(clip_id=clip_id, frame_id_blip2_answers_mapping=frame_id_blip2_answers_mapping, frame_id_blip2_words_mapping=val_clip_id_frame_id_blip2_words_mapping[clip_id]) for clip_id, frame_id_blip2_answers_mapping in val_clip_id_frame_id_blip2_answers_mapping.items())
# test_clip_id_frame_id_embedding_mapping = dict(frame_embedder.get_embedding_per_clip(clip_id=clip_id, frame_id_blip2_answers_mapping=frame_id_blip2_answers_mapping, frame_id_blip2_words_mapping=test_clip_id_frame_id_blip2_words_mapping[clip_id]) for clip_id, frame_id_blip2_answers_mapping in test_clip_id_frame_id_blip2_answers_mapping.items())

 = pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": train_clip_id_frame_id_blip2_answers_mapping[clip_id], "frame_id_blip2_words_mapping": train_clip_id_frame_id_blip2_words_mapping[clip_id]} for clip_id in  .keys()],
        function=FrameEmbedder.process_per_clip_blip2_answers,
        n_jobs=8,
        argument_type='kwargs')


In [None]:
# train_clip_id_frame_id_embedding_mapping = dict(pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": frame_id_blip2_answers_mapping} for clip_id, frame_id_blip2_answers_mapping in train_clip_id_frame_id_blip2_answers_mapping.items()],
#                                                       function=FrameEmbedder.process_per_clip_blip2_answers,
#                                                       n_jobs=8,
#                                                       argument_type='kwargs'))

# val_clip_id_frame_id_embedding_mapping = dict(pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": frame_id_blip2_answers_mapping} for clip_id, frame_id_blip2_answers_mapping in train_clip_id_frame_id_blip2_answers_mapping.items()],
#                                                    function=FrameEmbedder.process_per_clip_blip2_answers,
#                                                    n_jobs=8,
#                                                    argument_type='kwargs'))

# test_clip_id_frame_id_embedding_mapping = dict(pqdm([{"clip_id": clip_id, "frame_id_blip2_answers_mapping": frame_id_blip2_answers_mapping} for clip_id, frame_id_blip2_answers_mapping in train_clip_id_frame_id_blip2_answers_mapping.items()],
#                                                     function=FrameEmbedder.process_per_clip_blip2_answers,
#                                                     n_jobs=8,
#                                                     argument_type='kwargs'))

# train_clip_id_frame_id_blip2_answers_mapping = dict([(clip_id, frame_embedder.get_embedding_per_clip(clip_id_frame_id_blip2_answers_mapping=train_clip_id_frame_id_blip2_answers_mapping[clip_id], clip_id_frame_id_blip2_words_mapping=train_clip_id_frame_id_blip2_words_mapping)) for clip_id in train_clip_id_frame_id_blip2_answers_mapping.keys()])
# val_clip_id_frame_id_blip2_answers_mapping = dict([(clip_id, frame_embedder.get_embedding_per_clip(clip_id_frame_id_blip2_answers_mapping=val_clip_id_frame_id_blip2_answers_mapping, clip_id_frame_id_blip2_words_mapping=val_clip_id_frame_id_blip2_words_mapping)) for clip_id in val_clip_id_frame_id_blip2_answers_mapping.keys()])
# test_clip_id_frame_id_blip2_answers_mapping = dict([(clip_id, frame_embedder.get_embedding_per_clip(clip_id_frame_id_blip2_answers_mapping=test_clip_id_frame_id_blip2_answers_mapping, clip_id_frame_id_blip2_words_mapping=test_clip_id_frame_id_blip2_words_mapping)) for clip_id in test_clip_id_frame_id_blip2_answers_mapping.keys()])
# # get_embedding_per_clip(cls, clip_id_frame_id_blip2_answers_mapping: Dict[str, List[str]], clip_id_frame_id_blip2_words_mapping: Dict[str, List[str]])


In [5]:
blip2_vqa_row = blip2_vqa_answers_df[(blip2_vqa_answers_df["frame_index"] == frame_index - 6) & (blip2_vqa_answers_df["question"] == question)]

# if len(blip2_vqa_row) == 1:
#     return blip2_vqa_row["answer"]
# else:
#     if frame_index == 0:
#         return "no_answer"
#     else:
#         return get_fill_value(blip2_vqa_answers_df=blip2_vqa_answers_df, frame_index=frame_index - 6, question=question)

blip2_vqa_row

Unnamed: 0,frame_index,question,answer
3967,7938,What is the person in this picture doing?,washing dishes


In [12]:
action_category_blip2_word_tf_idfs['knead_/_shape_/_roll-out_dough']


[('meal', 108.00502822366532),
 ('pizza', 105.11668722947276),
 ('seed', 104.01005644733061),
 ('cook', 102.29967039099618),
 ('eat', 101.52026951527475),
 ('plate', 84.21922960491248),
 ('clay', 79.76330038044563),
 ('dough', 69.34451188995082),
 ('bread', 69.34451188995082),
 ('planting', 67.38467619331439),
 ('pot', 61.649207050620696),
 ('flour', 61.05764592703684),
 ('scoop', 60.646208573982946),
 ('grow', 58.47359333847099),
 ('tree', 54.609617318027176),
 ('spoon', 54.492377622311544),
 ('egg', 52.704355785013405),
 ('hand', 49.108096685540644),
 ('process', 48.90882803554846),
 ('kitchen', 45.94247604895656),
 ('family', 39.956608238014056),
 ('wash', 37.06157190632291),
 ('doughnut', 36.73300984801734),
 ('bird', 36.73300984801734),
 ('walk', 35.516985100456935),
 ('dirt', 33.330858627111354),
 ('background', 33.330858627111354),
 ('garden', 32.29699127869219),
 ('piece', 31.77577390560726),
 ('mix', 29.725177862321253),
 ('compost', 29.725177862321253),
 ('pile', 29.725177862

In [4]:
action_category_blip2_word_tf_idfs


{}

In [44]:
blip2_vqa_answers_df["frame_index", "question"].values


KeyError: ('frame_index', 'question')

In [1]:
import os
import cv2
import json

annotations_json_file_path = f"{os.environ['CODE']}/scripts/07_reproduce_baseline_results/data/ego4d/ego4d_clip_annotations_v3.json"
clip_id = "003c5ae8-3abd-4824-8efb-21a9a4f8eafe"

with open(annotations_json_file_path, "r") as reader:
    annotations = json.load(reader)[clip_id]["annotations"]


In [9]:
import os
import cv2
import json

annotations_json_file_path = f"{os.environ['CODE']}/scripts/07_reproduce_baseline_results/data/ego4d/ego4d_clip_annotations_v3.json"

with open(annotations_json_file_path, "r") as reader:
    annotations = json.load(reader)[clip_id]["annotations"]

cap = cv2.VideoCapture(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/clips", "003c5ae8-3abd-4824-8efb-21a9a4f8eafe.mp4"))
fps = cap.get(cv2.CAP_PROP_FPS)
num_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)

for frame_id in range(num_frames):
    for annotations_dict




In [4]:
import os
import cv2
import json
import argparse
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import random
from dash import Dash, html, dcc, Input, Output, no_update

import sys

sys.path.append("../")

from utils import extract_frames

random.seed(1903)

ground_truth_asl_predicted_action_category_match_color_mapping = {
    True: "rgba(0, 255, 0, 1.0)",
    False: "rgba(255, 0, 0, 1.0)",
}

unique_action_categories = set(["background", "no_annotations_for_the_clip"])


def generate_random_color():
    random_int = np.random.randint(low=0, high=256, size=(3,))
    random_color = f"rgba({random_int[0]}, {random_int[1]}, {random_int[2]}, 1.0)"
    return random_color


def get_blip2_answer(current_blip2_rows, blip2_question):
    answer = current_blip2_rows[current_blip2_rows["question"] == blip2_question][
        "answer"
    ]
    if len(answer) == 0:
        return "NaN"
    else:
        return answer.values[0]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Argument parser")
    parser.add_argument(
        "--clip_id",
        type=str,
        default="003c5ae8-3abd-4824-8efb-21a9a4f8eafe",
    )
    parser.add_argument(
        "--ground_truth_action_instances_file_path",
        type=str,
        default=f"{os.environ['CODE']}/scripts/07_reproduce_baseline_results/data/ego4d/ego4d_clip_annotations_v3.json",
    )
    parser.add_argument(
        "--asl_predicted_action_instances_file_path",
        type=str,
        default=f"{os.environ['CODE']}/scripts/07_reproduce_baseline_results/submission_final.json",
    )
    parser.add_argument(
        "--assets_path",
        type=str,
        default=f"{os.environ['SCRATCH']}/ego4d_data/v2/frames",
    )
    parser.add_argument("--frame_feature_extraction_stride", type=int, default=6)
    args = parser.parse_args()

    with open(
        os.path.join(
            os.environ["CODE"],
            "scripts/07_reproduce_baseline_results/data/ego4d/ego4d_clip_annotations_v3.json",
        ),
        "r",
    ) as reader:
        annotations_dict = json.load(reader)

    if (
        not os.path.exists(
            os.path.join(
                os.environ["SCRATCH"],
                "ego4d_data/v2/frames",
                args.clip_id,
                "end.txt",
            )
        )
        or not (len(annotations_dict[args.clip_id]["annotations"]) > 0)
        or not os.path.exists(
            os.path.join(
                os.environ["SCRATCH"],
                "ego4d_data/v2/frame_features",
                args.clip_id,
                "blip2_vqa_features.tsv",
            )
        )
    ):
        raise Exception("Please choose another clip.")

    ground_truth_action_instances = json.load(
        open(args.ground_truth_action_instances_file_path, "r")
    )[args.clip_id]["annotations"]
    asl_predicted_action_instances = json.load(
        open(args.asl_predicted_action_instances_file_path, "r")
    )["detect_results"][args.clip_id]
    blip2_answers_folder_path = os.path.join(
        os.environ["SCRATCH"], "ego4d_data/v2/frame_features", args.clip_id
    )
    blip2_answers_file_names = [
        file_name
        for file_name in os.listdir(blip2_answers_folder_path)
        if file_name.startswith("blip2_")
    ]
    blip2_answers_file_paths = [
        os.path.join(
            os.environ["SCRATCH"],
            "ego4d_data/v2/frame_features",
            args.clip_id,
            blip2_answers_file_name,
        )
        for blip2_answers_file_name in blip2_answers_file_names
    ]
    blip2_answers_dfs = pd.concat(
        [
            pd.read_csv(blip2_answers_file_path, sep="\t")
            for blip2_answers_file_path in blip2_answers_file_paths
        ],
        axis=0,
    )

    cap = cv2.VideoCapture(
        os.path.join(
            os.environ["SCRATCH"], "ego4d_data/v2/clips", args.clip_id + ".mp4"
        )
    )
    fps = cap.get(cv2.CAP_PROP_FPS)
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()

    extract_frames(clip_id=args.clip_id, output_folder_path=args.assets_path)

    frame_id_ground_truth_action_categories_mapping = {}
    frame_id_asl_predicted_action_categories_mapping = {}

    for current_frame_id in range(num_frames):
        frame_id_asl_predicted_action_categories_mapping[current_frame_id] = []
        current_frame_time = current_frame_id / fps
        assigned_to_an_action_category = False
        for asl_predicted_action_instance in asl_predicted_action_instances:
            if (
                current_frame_time >= asl_predicted_action_instance["segment"][0]
                and current_frame_time <= asl_predicted_action_instance["segment"][1]
            ):
                assigned_to_an_action_category = True
                frame_id_asl_predicted_action_categories_mapping[
                    current_frame_id
                ].append(
                    (
                        asl_predicted_action_instance["label"],
                        asl_predicted_action_instance["score"],
                    )
                )
                unique_action_categories.add(asl_predicted_action_instance["label"])
        if assigned_to_an_action_category:
            frame_id_asl_predicted_action_categories_mapping[current_frame_id] = sorted(
                frame_id_asl_predicted_action_categories_mapping[current_frame_id],
                key=lambda x: x[1],
            )[-1][0]
        else:
            frame_id_asl_predicted_action_categories_mapping[
                current_frame_id
            ] = "background"

        if len(ground_truth_action_instances) == 0:
            frame_id_ground_truth_action_categories_mapping[
                current_frame_id
            ] = "no_annotations_for_the_clip"
        else:
            assigned_to_an_action_category = False
            for ground_truth_action_instance in ground_truth_action_instances:
                if (
                    current_frame_time >= ground_truth_action_instance["segment"][0]
                    and current_frame_time <= ground_truth_action_instance["segment"][1]
                ):
                    assigned_to_an_action_category = True
                    frame_id_ground_truth_action_categories_mapping[
                        current_frame_id
                    ] = ground_truth_action_instance["label"]
                    unique_action_categories.add(ground_truth_action_instance["label"])
            if not assigned_to_an_action_category:
                frame_id_ground_truth_action_categories_mapping[
                    current_frame_id
                ] = "background"

    action_category_color_mapping = dict(
        (action_category, generate_random_color())
        for action_category in sorted(list(unique_action_categories))
    )

    sequences_dict = {
        "gt_colors": [],
        "asl_pred_colors": [],
        "match_colors": [],
        "gt_values": [],
        "asl_pred_values": [],
        "match_values": [],
        "frame_ids": [],
        "blip2_happen_answers": [],
        "blip2_do_answers": [],
        "blip2_describe_answers": [],
        "blip2_captioning_answers": [],
    }

    blip2_describe_question = "What does the image describe?"
    blip2_do_question = "What is the person in this picture doing?"
    blip2_happen_question = "What is happening in this picture?"
    blip2_captioning_question = "Image Caption"

    for frame_id in range(num_frames):
        current_blip2_rows = blip2_answers_dfs[
            blip2_answers_dfs["frame_index"]
            == (frame_id // args.frame_feature_extraction_stride)
            * args.frame_feature_extraction_stride
        ]
        current_blip2_describe_answer = get_blip2_answer(
            current_blip2_rows=current_blip2_rows,
            blip2_question=blip2_describe_question,
        )
        current_blip2_do_answer = get_blip2_answer(
            current_blip2_rows=current_blip2_rows, blip2_question=blip2_do_question
        )
        current_blip2_happen_answer = get_blip2_answer(
            current_blip2_rows=current_blip2_rows,
            blip2_question=blip2_happen_question,
        )
        current_blip2_captioning_answer = get_blip2_answer(
            current_blip2_rows=current_blip2_rows,
            blip2_question=blip2_captioning_question,
        )

        current_ground_truth_action_category = (
            frame_id_ground_truth_action_categories_mapping[frame_id]
        )
        sequences_dict["frame_ids"].append(frame_id)
        sequences_dict["gt_values"].append(current_ground_truth_action_category)
        current_ground_truth_action_category_color = action_category_color_mapping[
            current_ground_truth_action_category
        ]
        sequences_dict["gt_colors"].append(current_ground_truth_action_category_color)
        sequences_dict["blip2_happen_answers"].append(current_blip2_happen_answer)
        sequences_dict["blip2_do_answers"].append(current_blip2_do_answer)
        sequences_dict["blip2_describe_answers"].append(current_blip2_describe_answer)
        sequences_dict["blip2_captioning_answers"].append(
            current_blip2_captioning_answer
        )

        current_asl_predicted_action_category = (
            frame_id_asl_predicted_action_categories_mapping[frame_id]
        )
        current_asl_predicted_action_category_color = action_category_color_mapping[
            current_asl_predicted_action_category
        ]
        sequences_dict["asl_pred_values"].append(current_asl_predicted_action_category)
        sequences_dict["asl_pred_colors"].append(
            current_asl_predicted_action_category_color
        )

        current_ground_truth_asl_predicted_action_category_match = (
            current_ground_truth_action_category
            == current_asl_predicted_action_category
        )
        current_ground_truth_asl_predicted_action_category_match_color = (
            ground_truth_asl_predicted_action_category_match_color_mapping[
                current_ground_truth_asl_predicted_action_category_match
            ]
        )
        sequences_dict["match_values"].append(
            current_ground_truth_asl_predicted_action_category_match
        )
        sequences_dict["match_colors"].append(
            current_ground_truth_asl_predicted_action_category_match_color
        )

    sequences_dict["frame_file_paths"] = [
        os.path.join(
            args.clip_id,
            frame_file_name,
        )
        for frame_file_name in os.listdir(
            os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/frames", args.clip_id)
        )
    ]

    fig = go.Figure(
        data=[
            go.Bar(
                orientation="h",
                x=[1] * num_frames,
                y=[name] * num_frames,
                marker=dict(
                    color=sequences_dict[f"{name}_colors"],
                    line=dict(color="rgb(255, 255, 255)", width=0),
                ),
                customdata=list(
                    zip(
                        sequences_dict["frame_file_paths"],
                        sequences_dict["frame_ids"],
                        sequences_dict["gt_values"],
                        sequences_dict["asl_pred_values"],
                        sequences_dict["match_values"],
                        sequences_dict["blip2_describe_answers"],
                        sequences_dict["blip2_do_answers"],
                        sequences_dict["blip2_happen_answers"],
                        sequences_dict["blip2_captioning_answers"],
                    )
                ),
            )
            for name in ["match", "asl_pred", "gt"]
        ],
        layout=dict(
            title=f"Clip ID: {args.clip_id}",
            barmode="stack",
            barnorm="fraction",
            bargap=0.5,
            showlegend=False,
            xaxis=dict(range=[-0.02, 1.02], showticklabels=False, showgrid=False),
            height=max(600, 40 * len(sequences_dict.keys())),
            template=None,
            margin=dict(b=1),
        ),
    )

    fig.update_traces(hoverinfo="none", hovertemplate=None)

    app = Dash(__name__, assets_folder=args.assets_path)

    app.layout = html.Div(
        [
            dcc.Graph(id="graph-basic-2", figure=fig, clear_on_unhover=True),
            dcc.Tooltip(id="graph-tooltip"),
        ]
    )

    @app.callback(
        Output("graph-tooltip", "show"),
        Output("graph-tooltip", "bbox"),
        Output("graph-tooltip", "children"),
        Input("graph-basic-2", "hoverData"),
    )
    def display_hover(hoverData):
        if hoverData is None:
            return False, no_update, no_update

        bbox = hoverData["points"][0]["bbox"]

        children = [
            html.Div(
                [
                    html.Img(
                        src=app.get_asset_url(hoverData["points"][0]["customdata"][0]),
                        style={"width": "100%"},
                    ),
                    html.P(f"Frame ID: {hoverData['points'][0]['customdata'][1]}"),
                    html.P(
                        f"Ground Truth: {str(hoverData['points'][0]['customdata'][2]).replace('_', ' ')}"
                    ),
                    html.P(
                        f"ASL Prediction: {str(hoverData['points'][0]['customdata'][3]).replace('_', ' ')}"
                    ),
                    html.P(
                        f"Match: {str(hoverData['points'][0]['customdata'][4]).replace('_', ' ')}"
                    ),
                    html.P(
                        f"What does the image describe? (BLIP2): {hoverData['points'][0]['customdata'][5]}"
                    ),
                    html.P(
                        f"What is the person in this picture doing? (BLIP2): {hoverData['points'][0]['customdata'][6]}"
                    ),
                    html.P(
                        f"What is happening in this picture? (BLIP2): {hoverData['points'][0]['customdata'][7]}"
                    ),
                    html.P(
                        f"Image Caption (BLIP2): {hoverData['points'][0]['customdata'][8]}"
                    ),
                ],
                style={"width": "400px", "white-space": "normal"},
            )
        ]

        return True, bbox, children

    app.run_server(debug=True)


ModuleNotFoundError: No module named 'plotly'