In [1]:
import os
import json
import pickle
import numpy as np
from tqdm import tqdm
from pqdm.processes import pqdm
from typing import Dict, List, Tuple
# from sentence_transformers import SentenceTransformer


# Load the three inputs of this notebook. Paths are resolved from the CODE and
# SCRATCH environment variables; a missing variable raises KeyError here.

# JSON mapping keyed by label; the cells below iterate each value as a list of
# (verb, noun, tool) triples.
with open(os.path.join(os.environ["CODE"], "scripts/06_analyze_frame_features/label_verb_noun_tool_mapping.json"), "r") as reader:
    label_verb_noun_tools_mapping = json.load(reader)

# Pickled mapping keyed by clip id, then by frame id.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/analysis_data", "clip_id_frame_id_blip2_verb_noun_tool_pair_mapping.pickle"), "rb") as reader:
    clip_id_frame_id_blip2_verb_noun_tool_pair_mapping = pickle.load(reader)

# Only the "clip_id_frame_id_labels_mapping" entry of the analysis pickle is kept.
with open(os.path.join(os.environ["SCRATCH"], "ego4d_data/v2/analysis_data", "analysis_data.pickle"), "rb") as reader:
    clip_id_frame_id_ground_truth_labels_mapping = pickle.load(reader)["clip_id_frame_id_labels_mapping"]

# sbert = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Label vocabulary: "background" is pinned to index 0, followed by the mapping's
# label names in sorted order. Label indices used below are positions in this list.
distinct_ground_truth_labels = ["background", *sorted(label_verb_noun_tools_mapping)]


In [6]:
# Convert the ground-truth label names of every frame into their positions in
# `distinct_ground_truth_labels`, clip by clip.
clip_id_frame_id_ground_truth_label_indices_mapping = dict()
for clip_id, frame_id_ground_truth_labels_mapping in clip_id_frame_id_ground_truth_labels_mapping.items():
    # A clip is skipped when the first label of its frame 0 is "no_annotation";
    # no other frame is inspected for this decision.
    if list(frame_id_ground_truth_labels_mapping[0])[0] == "no_annotation":
        continue
    clip_id_frame_id_ground_truth_label_indices_mapping[clip_id] = {
        frame_id: [
            distinct_ground_truth_labels.index(ground_truth_label)
            for ground_truth_label in ground_truth_labels
        ]
        for frame_id, ground_truth_labels in frame_id_ground_truth_labels_mapping.items()
    }
# The name-based mapping is dropped once converted, so this cell cannot be
# re-run without re-running the loading cell first.
del clip_id_frame_id_ground_truth_labels_mapping


In [7]:
def calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings: List[np.ndarray], label_SBERT_embeddings: List[np.ndarray]) -> List[float]:
    """Return the cosine similarity of every (BLIP2 embedding, label embedding) pair.

    Args:
        blip2_SBERT_embeddings: Embedding vectors on the BLIP2 side.
        label_SBERT_embeddings: Embedding vectors on the label side.

    Returns:
        A flat list of NumPy scalars in BLIP2-major order: all label similarities
        for the first BLIP2 embedding, then for the second, and so on. The list
        is empty when either input is empty.

    Note:
        A zero-norm vector makes the denominator zero; the division is left to
        NumPy, exactly as a plain ``dot / (norm * norm)`` expression would behave.
    """
    label_embeddings = list(label_SBERT_embeddings)
    # Norms do not depend on the other side of the pair, so compute each once
    # instead of once per pair.
    label_norms = [np.linalg.norm(label_embedding) for label_embedding in label_embeddings]

    cosine_similarities = []
    for blip2_embedding in blip2_SBERT_embeddings:
        blip2_norm = np.linalg.norm(blip2_embedding)
        for label_embedding, label_norm in zip(label_embeddings, label_norms):
            cosine_similarities.append(np.dot(blip2_embedding, label_embedding) / (blip2_norm * label_norm))
    return cosine_similarities


def nontemporal_dictionary_matching_for_given_clip(clip_id: str, frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping: Dict[int, Dict[str, Tuple[str, List[Tuple[str, str, str]]]]], label_verb_noun_tools_mapping: Dict[str, List[Tuple[str, str, str]]]) -> Tuple[str, Dict[int, Dict[int, List[float]]]]:
    """Score every label against the BLIP2 (verb, noun, tool) triples of each frame of one clip.

    Triples are compared after removing spaces from each component. A BLIP2
    triple contributes a score to a label only when the verbs are equal:

    * 1.00 - verb, noun and tool all equal
    * 0.75 - verb and noun equal
    * 0.50 - verb and tool equal
    * 0.25 - only the verb equal

    Args:
        clip_id: Identifier of the clip; returned unchanged so results can be
            collected into a dict.
        frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping: For each
            frame id, a mapping from BLIP2 question to a pair whose element 0
            is the answer (unused here) and whose element 1 is the sequence of
            (verb, noun, tool) triples extracted from it.
        label_verb_noun_tools_mapping: For each label, its (verb, noun, tool) triples.

    Returns:
        ``(clip_id, scores)`` where ``scores[frame_id][label_index]`` is the list
        of match scores, ordered by label triple, then question, then BLIP2
        triple. Every label gets an entry, possibly an empty list.

    Raises:
        ValueError: If a label is missing from the module-level
            ``distinct_ground_truth_labels`` list that defines the label indices.
    """

    def _strip_spaces(verb_noun_tool):
        # Comparison is whitespace-insensitive on each component.
        return (
            verb_noun_tool[0].replace(" ", ""),
            verb_noun_tool[1].replace(" ", ""),
            verb_noun_tool[2].replace(" ", ""),
        )

    frame_id_predicted_label_indices_and_scores = dict()
    if not frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping:
        # Nothing to score; skip the label lookups below as well.
        return clip_id, frame_id_predicted_label_indices_and_scores

    # Label indices and normalised label triples are identical for every frame,
    # so resolve them once rather than inside the per-frame loops.
    label_indices_and_triples = [
        (distinct_ground_truth_labels.index(label), [_strip_spaces(triple) for triple in label_verb_noun_tools])
        for label, label_verb_noun_tools in label_verb_noun_tools_mapping.items()
    ]

    for frame_id, blip2_question_answer_verb_noun_tool_pairs_mapping in frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping.items():
        # Flatten the frame's triples in question order, normalised once per frame.
        blip2_triples = [
            _strip_spaces(blip2_verb_noun_tool_pair)
            for blip2_answer_verb_noun_tool_pairs in blip2_question_answer_verb_noun_tool_pairs_mapping.values()
            for blip2_verb_noun_tool_pair in blip2_answer_verb_noun_tool_pairs[1]
        ]

        scores_per_label_index = dict()
        for label_index, label_triples in label_indices_and_triples:
            scores = []
            for label_verb, label_noun, label_tool in label_triples:
                for blip2_verb, blip2_noun, blip2_tool in blip2_triples:
                    if label_verb != blip2_verb:
                        continue  # no score without a verb match
                    noun_matches = label_noun == blip2_noun
                    tool_matches = label_tool == blip2_tool
                    if noun_matches and tool_matches:
                        scores.append(1.00)
                    elif noun_matches:
                        scores.append(0.75)
                    elif tool_matches:
                        scores.append(0.50)
                    else:
                        scores.append(0.25)
            # Keyed by the integer label index, not by the label name.
            scores_per_label_index[label_index] = scores
        frame_id_predicted_label_indices_and_scores[frame_id] = scores_per_label_index
    return clip_id, frame_id_predicted_label_indices_and_scores


# def nontemporal_SBERT_embedding_for_given_clip(clip_id: str, frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping: Dict[str, List[Tuple[str, str, str]]], label_verb_noun_tools_mapping: Dict[str, List[Tuple[str, str, str]]]):
#     label_verb_noun_tool_SBERT_embeddings_mapping = dict()
#     label_verb_noun_SBERT_embeddings_mapping = dict()
#     label_verb_tool_SBERT_embeddings_mapping = dict()
#     label_verb_SBERT_embeddings_mapping = dict()

#     for label, label_verb_noun_tools in label_verb_noun_tools_mapping.items():
#         if label not in label_verb_noun_tool_SBERT_embeddings_mapping.keys():
#             label_verb_noun_tool_SBERT_embeddings_mapping[label] = []
#         if label not in label_verb_noun_SBERT_embeddings_mapping.keys():
#             label_verb_noun_SBERT_embeddings_mapping[label] = []
#         if label not in label_verb_tool_SBERT_embeddings_mapping.keys():
#             label_verb_tool_SBERT_embeddings_mapping[label] = []
#         if label not in label_verb_SBERT_embeddings_mapping.keys():
#             label_verb_SBERT_embeddings_mapping[label] = []

#         for label_verb_noun_tool in label_verb_noun_tools:
#             label_verb = label_verb_noun_tool[0]
#             label_noun = label_verb_noun_tool[1]
#             label_tool = label_verb_noun_tool[2]
#             if label_noun == "NaN":
#                 label_noun = "something"
#             if label_tool == "NaN":
#                 label_tool = "a tool"
#             label_verb_noun_tool_SBERT_embeddings_mapping[label].append(sbert.encode([f"{label_verb} {label_noun} using {label_tool}"])[0])
#             label_verb_noun_SBERT_embeddings_mapping[label].append(sbert.encode([f"{label_verb} {label_noun}"])[0])
#             label_verb_tool_SBERT_embeddings_mapping[label].append(sbert.encode([f"{label_verb} using {label_tool}"])[0])
#             label_verb_SBERT_embeddings_mapping[label].append(sbert.encode([f"{label_verb}"])[0])

#     frame_id_blip2_answer_SBERT_embeddings_mapping = dict()
#     frame_id_blip2_verb_noun_tool_SBERT_embeddings_mapping = dict()
#     frame_id_blip2_verb_noun_SBERT_embeddings_mapping = dict()
#     frame_id_blip2_verb_tool_SBERT_embeddings_mapping = dict()
#     frame_id_blip2_verb_SBERT_embeddings_mapping = dict()

#     for frame_id, blip2_question_answer_verb_noun_tool_pairs_mapping in frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping.items():
#         frame_id_blip2_answer_SBERT_embeddings_mapping[frame_id] = []
#         frame_id_blip2_verb_noun_tool_SBERT_embeddings_mapping[frame_id] = []
#         frame_id_blip2_verb_noun_SBERT_embeddings_mapping[frame_id] = []
#         frame_id_blip2_verb_tool_SBERT_embeddings_mapping[frame_id] = []
#         frame_id_blip2_verb_SBERT_embeddings_mapping[frame_id] = []
#         for blip2_question, blip2_answer_verb_noun_tool_pairs in blip2_question_answer_verb_noun_tool_pairs_mapping.items():
#             blip2_answer = blip2_answer_verb_noun_tool_pairs[0]
#             frame_id_blip2_answer_SBERT_embeddings_mapping[frame_id].append(sbert.encode([f"{blip2_answer}"])[0])
#             for blip2_answer_verb_noun_tool_pair in blip2_answer_verb_noun_tool_pairs[1]:
#                 blip2_verb = blip2_answer_verb_noun_tool_pair[0]
#                 blip2_noun = blip2_answer_verb_noun_tool_pair[1]
#                 blip2_tool = blip2_answer_verb_noun_tool_pair[2]
#                 if blip2_noun == "NaN":
#                     blip2_noun = "something"
#                 if blip2_tool == "NaN":
#                     blip2_tool = "a tool"
#                 frame_id_blip2_verb_noun_tool_SBERT_embeddings_mapping[frame_id].append(sbert.encode([f"{blip2_verb} {blip2_noun} using {blip2_tool}"])[0])
#                 frame_id_blip2_verb_noun_SBERT_embeddings_mapping[frame_id].append(sbert.encode([f"{blip2_verb} {blip2_noun}"])[0])
#                 frame_id_blip2_verb_tool_SBERT_embeddings_mapping[frame_id].append(sbert.encode([f"{blip2_verb} using {blip2_tool}"])[0])
#                 frame_id_blip2_verb_SBERT_embeddings_mapping[frame_id].append(sbert.encode([f"{blip2_verb}"])[0])
    
#     frame_id_predicted_label_indices_and_scores = dict()
#     for frame_id in frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping.keys():
#         frame_id_predicted_label_indices_and_scores[frame_id] = dict()

#         blip2_answer_SBERT_embeddings = frame_id_blip2_answer_SBERT_embeddings_mapping[frame_id]
#         blip2_verb_noun_tool_SBERT_embeddings = frame_id_blip2_verb_noun_tool_SBERT_embeddings_mapping[frame_id]
#         blip2_verb_noun_SBERT_embeddings = frame_id_blip2_verb_noun_SBERT_embeddings_mapping[frame_id]
#         blip2_verb_tool_SBERT_embeddings = frame_id_blip2_verb_tool_SBERT_embeddings_mapping[frame_id]
#         blip2_verb_SBERT_embeddings = frame_id_blip2_verb_SBERT_embeddings_mapping[frame_id]

#         for label in label_verb_noun_tools_mapping.keys():
#             if label not in frame_id_predicted_label_indices_and_scores[frame_id].keys():
#                 frame_id_predicted_label_indices_and_scores[frame_id][label] = []

#             label_verb_noun_tool_SBERT_embeddings = label_verb_noun_tool_SBERT_embeddings_mapping[label]
#             label_verb_noun_SBERT_embeddings = label_verb_noun_SBERT_embeddings_mapping[label]
#             label_verb_tool_SBERT_embeddings = label_verb_tool_SBERT_embeddings_mapping[label]
#             label_verb_SBERT_embeddings = label_verb_SBERT_embeddings_mapping[label]

#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_answer_SBERT_embeddings, label_SBERT_embeddings=label_verb_noun_tool_SBERT_embeddings))
#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_answer_SBERT_embeddings, label_SBERT_embeddings=label_verb_noun_SBERT_embeddings))
#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_answer_SBERT_embeddings, label_SBERT_embeddings=label_verb_tool_SBERT_embeddings))
#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_answer_SBERT_embeddings, label_SBERT_embeddings=label_verb_SBERT_embeddings))

#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_verb_noun_tool_SBERT_embeddings, label_SBERT_embeddings=label_verb_noun_tool_SBERT_embeddings))
#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_verb_noun_SBERT_embeddings, label_SBERT_embeddings=label_verb_noun_SBERT_embeddings))
#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_verb_tool_SBERT_embeddings, label_SBERT_embeddings=label_verb_tool_SBERT_embeddings))
#             frame_id_predicted_label_indices_and_scores[frame_id][label].extend(calculate_SBERT_cosine_similarities(blip2_SBERT_embeddings=blip2_verb_SBERT_embeddings, label_SBERT_embeddings=label_verb_SBERT_embeddings))

#     return clip_id, frame_id_predicted_label_indices_and_scores


In [8]:
# Run the dictionary matcher once per clip across 4 worker processes. Each task
# is a kwargs dict, and each result is a (clip_id, per-frame scores) pair, so the
# collected results are turned directly into a dict keyed by clip id.
nontemporal_dictionary_matching_clip_id_frame_id_predicted_label_indices_and_scores = dict(
    pqdm(
        [
            dict(
                clip_id=clip_id,
                frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping=frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping,
                label_verb_noun_tools_mapping=label_verb_noun_tools_mapping,
            )
            for clip_id, frame_id_blip2_question_answer_verb_noun_tool_pairs_mapping
            in clip_id_frame_id_blip2_verb_noun_tool_pair_mapping.items()
        ],
        function=nontemporal_dictionary_matching_for_given_clip,
        n_jobs=4,
        exception_behaviour="immediate",
        argument_type="kwargs",
    )
)


QUEUEING TASKS | : 100%|██████████| 623/623 [00:00<00:00, 1704.22it/s]
PROCESSING TASKS | : 100%|██████████| 623/623 [08:08<00:00,  1.28it/s]
COLLECTING RESULTS | : 100%|██████████| 623/623 [00:00<00:00, 174692.57it/s]


In [19]:
def nontemporal_select_labels_with_individual_score_higher_than_threshold(nontemporal_clip_id_frame_id_predicted_label_indices_and_scores: Dict[str, Dict[str, List[float]]], threshold: float):
    """Keep, per frame, the label indices that have at least one score >= threshold.

    The input is walked as clip id -> frame id -> label index -> list of scores.
    The comparison is inclusive, and a label with an empty score list is never
    selected.

    Returns:
        A dict of clip id -> frame id -> list of selected label indices, in the
        iteration order of the input. Frames with no selected label map to ``[]``.
    """
    selected_label_indices_per_clip = {}
    # tqdm reports progress per clip.
    for clip_id, scores_per_frame in tqdm(nontemporal_clip_id_frame_id_predicted_label_indices_and_scores.items()):
        selected_label_indices_per_clip[clip_id] = {
            frame_id: [
                label_index
                for label_index, label_scores in scores_per_label.items()
                if any(single_score >= threshold for single_score in label_scores)
            ]
            for frame_id, scores_per_label in scores_per_frame.items()
        }
    return selected_label_indices_per_clip


def nontemporal_select_labels_with_total_score_higher_than_threshold(nontemporal_clip_id_frame_id_predicted_label_indices_and_scores: Dict[str, Dict[str, List[float]]], threshold: float):
    """Keep, per frame, the label indices whose summed scores are >= threshold.

    The input is walked as clip id -> frame id -> label index -> list of scores.
    The comparison is inclusive, and an empty score list sums to 0.

    Returns:
        A dict of clip id -> frame id -> list of selected label indices, in the
        iteration order of the input. Frames with no selected label map to ``[]``.
    """
    selected_label_indices_per_clip = {}
    # tqdm reports progress per clip.
    for clip_id, scores_per_frame in tqdm(nontemporal_clip_id_frame_id_predicted_label_indices_and_scores.items()):
        selected_label_indices_per_clip[clip_id] = {
            frame_id: [
                label_index
                for label_index, label_scores in scores_per_label.items()
                if sum(label_scores) >= threshold
            ]
            for frame_id, scores_per_label in scores_per_frame.items()
        }
    return selected_label_indices_per_clip


In [42]:
from sklearn.metrics import f1_score

def evaluate_predictions(clip_id_frame_id_predicted_label_indices_mapping: Dict[str, Dict[int, List[int]]], clip_id_frame_id_ground_truth_label_indices_mapping: Dict[str, Dict[int, List[int]]], frame_stride: int = 6) -> float:
    """Average the per-frame weighted F1 score of predicted vs. ground-truth label indices.

    Each ground-truth frame is compared with the prediction stored for the
    nearest lower multiple of ``frame_stride``. A frame with no predicted label
    is treated as predicting index 0.

    Both label-index lists are converted into 0/1 indicator rows of equal length
    before calling ``f1_score``. Passing the raw index lists would make
    scikit-learn read each index as a separate output value, which is rejected
    or mis-typed whenever the two lists differ in length.

    Args:
        clip_id_frame_id_predicted_label_indices_mapping: clip id -> sampled
            frame id -> predicted label indices.
        clip_id_frame_id_ground_truth_label_indices_mapping: clip id -> frame id
            -> ground-truth label indices.
        frame_stride: Spacing of the frame ids that carry predictions.

    Returns:
        The mean per-frame score, or 0.0 when there is no ground-truth frame.

    Raises:
        KeyError: If a clip or sampled frame id has no prediction entry.

    NOTE(review): with a single frame per ``f1_score`` call and
    ``average="weighted"``, labels absent from the ground truth carry zero
    weight, so extra predicted labels do not appear to lower the score. Confirm
    this is the intended metric.
    """
    sum_average_f1_scores = 0.0
    count_average_f1_scores = 0
    for clip_id, frame_id_ground_truth_label_indices_mapping in clip_id_frame_id_ground_truth_label_indices_mapping.items():
        for frame_id, ground_truth_label_indices in frame_id_ground_truth_label_indices_mapping.items():
            sampled_frame_id = int(frame_id // frame_stride * frame_stride)
            predicted_label_indices = clip_id_frame_id_predicted_label_indices_mapping[clip_id][sampled_frame_id]
            if len(predicted_label_indices) == 0:
                predicted_label_indices = [0]

            ground_truth_label_set = set(ground_truth_label_indices)
            predicted_label_set = set(predicted_label_indices)
            # Only labels present on either side need a column: any other label
            # has zero support and therefore zero weight.
            label_columns = sorted(ground_truth_label_set | predicted_label_set)
            # The trailing all-zero column keeps the rows at least two wide, so
            # they are always interpreted as a multilabel indicator matrix.
            y_true_row = [int(label_index in ground_truth_label_set) for label_index in label_columns] + [0]
            y_pred_row = [int(label_index in predicted_label_set) for label_index in label_columns] + [0]

            sum_average_f1_scores += f1_score(y_true=[y_true_row], y_pred=[y_pred_row], average="weighted", zero_division=0)
            count_average_f1_scores += 1

    if count_average_f1_scores == 0:
        # No frame to evaluate: avoid dividing by zero.
        return 0.0
    return float(sum_average_f1_scores / count_average_f1_scores)


In [43]:
# Sweep the per-score threshold: a label is predicted for a frame as soon as one
# of its match scores reaches the threshold.
for threshold in (0.25, 0.5, 0.75, 1.0):
    nontemporal_selected_labels_with_individual_score_higher_than_threshold = (
        nontemporal_select_labels_with_individual_score_higher_than_threshold(
            nontemporal_clip_id_frame_id_predicted_label_indices_and_scores=nontemporal_dictionary_matching_clip_id_frame_id_predicted_label_indices_and_scores,
            threshold=threshold,
        )
    )
    average_weighted_f1_score = evaluate_predictions(
        clip_id_frame_id_predicted_label_indices_mapping=nontemporal_selected_labels_with_individual_score_higher_than_threshold,
        clip_id_frame_id_ground_truth_label_indices_mapping=clip_id_frame_id_ground_truth_label_indices_mapping,
    )
    print(f"Individual Score | Threshold: {np.round(threshold, 2)} | Average weighted F1 Score: {np.round(average_weighted_f1_score, 2)}")


In [None]:
# Sweep the summed-score threshold: a label is predicted for a frame when the
# sum of all its match scores reaches the threshold.
for threshold in (0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 7.5, 10.0):
    nontemporal_selected_labels_with_total_score_higher_than_threshold = (
        nontemporal_select_labels_with_total_score_higher_than_threshold(
            nontemporal_clip_id_frame_id_predicted_label_indices_and_scores=nontemporal_dictionary_matching_clip_id_frame_id_predicted_label_indices_and_scores,
            threshold=threshold,
        )
    )
    average_weighted_f1_score = evaluate_predictions(
        clip_id_frame_id_predicted_label_indices_mapping=nontemporal_selected_labels_with_total_score_higher_than_threshold,
        clip_id_frame_id_ground_truth_label_indices_mapping=clip_id_frame_id_ground_truth_label_indices_mapping,
    )
    print(f"Total Score      | Threshold: {np.round(threshold, 2)} | Average weighted F1 Score: {np.round(average_weighted_f1_score, 2)}")


100%|██████████| 623/623 [00:29<00:00, 21.41it/s]


Total Score      | Threshold: 0.25 | Average cosine similarity: 0.18


100%|██████████| 623/623 [00:51<00:00, 12.06it/s]


Total Score      | Threshold: 0.5 | Average cosine similarity: 0.18


100%|██████████| 623/623 [00:28<00:00, 21.63it/s]


Total Score      | Threshold: 0.75 | Average cosine similarity: 0.24


100%|██████████| 623/623 [00:28<00:00, 21.64it/s]


Total Score      | Threshold: 1.0 | Average cosine similarity: 0.25


100%|██████████| 623/623 [00:28<00:00, 21.70it/s]


Total Score      | Threshold: 1.5 | Average cosine similarity: 0.32


100%|██████████| 623/623 [00:28<00:00, 21.70it/s]


Total Score      | Threshold: 2.0 | Average cosine similarity: 0.36


100%|██████████| 623/623 [00:28<00:00, 21.78it/s]


Total Score      | Threshold: 2.5 | Average cosine similarity: 0.4


100%|██████████| 623/623 [00:28<00:00, 21.78it/s]


Total Score      | Threshold: 3.0 | Average cosine similarity: 0.41


100%|██████████| 623/623 [00:28<00:00, 21.81it/s]


Total Score      | Threshold: 3.5 | Average cosine similarity: 0.46


100%|██████████| 623/623 [00:28<00:00, 21.79it/s]


Total Score      | Threshold: 4.0 | Average cosine similarity: 0.46


100%|██████████| 623/623 [00:28<00:00, 21.81it/s]


Total Score      | Threshold: 4.5 | Average cosine similarity: 0.46


100%|██████████| 623/623 [00:28<00:00, 21.82it/s]


KeyboardInterrupt: 

In [6]:
# Pair the (verb, noun, tool) triples of each annotated frame's labels with the
# BLIP2 triples stored for the nearest lower multiple-of-6 frame.
# NOTE(review): the recorded output of this cell is "IndexError: list index out
# of range"; it looks like a leftover from an earlier revision of the notebook.
# NOTE(review): `clip_id_frame_id_labels_mapping` and `label_verb_noun_tool_mapping`
# are not defined in the cells visible above; the loaded names are
# `clip_id_frame_id_ground_truth_labels_mapping` (deleted after conversion to
# indices) and `label_verb_noun_tools_mapping`. Confirm which objects are meant.
clip_id_frame_id_blip2_verb_noun_tool_label_mapping = dict()
for clip_id in clip_id_frame_id_labels_mapping.keys():
    for frame_id in clip_id_frame_id_labels_mapping[clip_id].keys():
        labels = clip_id_frame_id_labels_mapping[clip_id][frame_id]
        # NOTE(review): the dictionary-matching function above reads this per-frame
        # value as question -> (answer, triples); here it is iterated as a flat
        # sequence of triples. Presumably an older data layout; verify.
        blip2_verb_noun_tool_pairs = clip_id_frame_id_blip2_verb_noun_tool_pair_mapping[clip_id][(frame_id // 6) * 6]
        for label in labels:
            # Labels without an entry in the label mapping are skipped.
            if label not in label_verb_noun_tool_mapping.keys():
                continue
            label_verb_noun_tools = label_verb_noun_tool_mapping[label]
            for label_verb_noun_tool in label_verb_noun_tools:
                label_verb = label_verb_noun_tool[0]
                label_noun = label_verb_noun_tool[1]
                label_tool = label_verb_noun_tool[2]
                for blip2_verb_noun_tool_pair in blip2_verb_noun_tool_pairs:
                    blip2_verb = blip2_verb_noun_tool_pair[0]
                    blip2_noun = blip2_verb_noun_tool_pair[1]
                    blip2_tool = blip2_verb_noun_tool_pair[2]
                    # NOTE(review): no inner dict is ever created for `clip_id`, so this
                    # assignment raises KeyError on first use. Each iteration also
                    # overwrites the same (clip_id, frame_id) entry, so only the last
                    # combination would be kept.
                    clip_id_frame_id_blip2_verb_noun_tool_label_mapping[clip_id][frame_id] = (blip2_verb, blip2_noun, blip2_tool, label_verb, label_noun, label_tool)
                    


IndexError: list index out of range

In [None]:
for clip_id, frame_id_blip2_verb_noun_tool_pair in clip_id_frame_id_blip2_verb_noun_tool_pair_mapping.items():
    for frame_id, blip2_verb_noun_tool_pair in frame_id_blip2_verb_noun_tool_pair.items():
        

