In [None]:
#@title Load CLIP
%%capture
!pip -q install ftfy regex tqdm
!pip -q install git+https://github.com/openai/CLIP.git
#!pip -q install tensorflow_addons torchmetrics

import torch
import clip
import numpy as np
from PIL import Image
#from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "RN101" CLIP variant onto the selected device. `preprocess` is returned
# alongside the model (presumably the matching image transform; it is not used in
# the cells shown here).
clip_model, preprocess = clip.load("RN101", device=device)
clip_model.eval()

In [None]:
#@title Load Caption Dictionary
# The .npy file holds a pickled dict, hence allow_pickle=True and the .item() call
# that unwraps it. Unpickling can execute arbitrary code: only load trusted files.
captions = np.load(
    ".../LLaVa_fixed_captions.npy",
    allow_pickle=True,
).item()
len(captions)

13732

In [None]:
#@title Generate CLIP Text Embeddings
from tqdm import tqdm

# Maps each caption key to (speaker, label, CLIP text embedding, frame count):
# the same tuple layout as `captions`, with the caption string replaced by its embedding.
text_emb = {}

for key, (speaker, current_label, caption, original_frames_count) in tqdm(captions.items()):

    # Send the tokens to the device the model was loaded on. Calling .cuda()
    # unconditionally fails on CPU-only runtimes even though `device` falls back to "cpu".
    text_tokens = clip.tokenize(caption).to(device)

    # No gradients are needed for inference, so the result carries no autograd graph.
    with torch.no_grad():
        text_embeddings = clip_model.encode_text(text_tokens)

    # Store the embedding on the CPU so the dictionary does not pin GPU memory
    # and can be saved/loaded on machines without a GPU.
    text_embeddings = text_embeddings.cpu()

    text_emb[key] = (speaker, current_label, text_embeddings, original_frames_count)

print(len(text_emb))

100%|██████████| 13732/13732 [02:21<00:00, 96.77it/s] 

13732





In [None]:
# Persist the embedding dictionary. It is a dict rather than an array, so NumPy
# pickles it; reading it back requires np.load(..., allow_pickle=True).item().
np.save(file='.../ResNet_fixed_captions_emb.npy', arr=text_emb)

## TEST SIMILARITY

Example caption pairs and the similarity of their embeddings:

- 'No, the person is not talking. He is sitting in front of a microphone, wearing a suit and tie, and looking at the camera.' vs. "Yes, the person is talking. He is wearing a suit and tie, and he is speaking into a microphone." — 92% similarity
- "the person is engaged in a conversation" vs. "no one is talking" — 75% similarity

In [None]:
def cosine_similarity(images_z: torch.Tensor, texts_z: torch.Tensor):
  """Return the cosine similarity between two sets of feature vectors.

  Both arguments are L2-normalized along their last dimension and then
  multiplied as ``texts @ images^T``. The inputs themselves are not modified.

  Args:
    images_z: A single feature vector of shape (d,) or a batch of shape
      (n_images, d).
    texts_z: A single feature vector of shape (d,) or a batch of shape
      (n_texts, d).

  Returns:
    A CPU tensor: a scalar for two 1-D inputs, or a matrix of shape
    (n_texts, n_images) for two 2-D inputs.
  """
  # Normalize out of place. The in-place `/=` form rescaled the caller's tensors,
  # including the parent tensor when a row view such as `features[0]` was passed.
  images_unit = images_z / images_z.norm(dim=-1, keepdim=True)
  texts_unit = texts_z / texts_z.norm(dim=-1, keepdim=True)

  # Only swap the last two axes when there is a batch axis to swap with;
  # `.T` on a 1-D tensor is deprecated and emits a UserWarning.
  if images_unit.dim() >= 2:
    images_unit = images_unit.transpose(-2, -1)

  # Evaluate the cosine similarity between the two feature sets
  similarity = texts_unit @ images_unit
  return similarity.cpu()

In [None]:
# Compare the CLIP text embeddings of the bare answers "yes" and "no".
text = clip.tokenize(["yes", "no"]).to(device)

with torch.no_grad():
    text_features = clip_model.encode_text(text)
    text_features = text_features.float()

# Row 0 is the embedding of the first prompt, row 1 that of the second.
similarity = cosine_similarity(images_z=text_features[0], texts_z=text_features[1])

print("Similarity: {}".format(similarity))

Similarity: 0.9536035060882568


  similarity = (texts_z @ images_z.T)


In [None]:
# Prepare the text: tokenize the two candidate fixed captions
text = clip.tokenize(["the person is engaged in a conversation", "no one is talking"]).to(device)

with torch.no_grad():
    text_features = clip_model.encode_text(text)
    text_features = text_features.float()

# Row 0 is the embedding of the first caption, row 1 that of the second.
similarity = cosine_similarity(images_z=text_features[0], texts_z=text_features[1])

print("Similarity: {}".format(similarity))

Similarity: 0.7569506764411926


## CREATE FIXED CAPTIONS

After checking the similarity of different caption pairs, we decided to try a pair of fixed captions that minimizes this similarity:


In [None]:
# Load the long LLaVa captions. The file holds a pickled dict, hence
# allow_pickle=True and .item(); only load pickled files from a trusted source.
captions = np.load(
    "/content/drive/MyDrive/TESI/CODICE/CLIP_Embeddings/LLaVa_long_captions.npy",
    allow_pickle=True,
).item()
len(captions)

13732

In [None]:
# Peek at one entry (position 7000 in the dict's iteration order) to check the tuple layout.
k = list(captions)[7000]
captions[k]

('bell', 1, 'Yes', 10)

In [None]:
# Initialize a new dictionary for updated captions
updated_captions = {}

# Last fixed caption that was assigned; reused for answers that are neither "yes" nor "no"
prev_updated_caption = None

for key, (speaker, label, caption, original_frames_count) in captions.items():
    # Classify on a normalized copy: ignore letter case and leading whitespace so
    # an answer such as " Yes, ..." is not treated as unclassifiable.
    answer = caption.lstrip().lower()

    # If caption starts with "yes", change it to "the person is engaged in a conversation"
    if answer.startswith("yes"):
        caption = "the person is engaged in a conversation"
        prev_updated_caption = caption

    # If caption starts with "no", change it to "no one is talking"
    # NOTE(review): this is a plain prefix test, so words like "not" or "now" also
    # match; confirm that is acceptable for the captions in this dataset.
    elif answer.startswith("no"):
        caption = "no one is talking"
        prev_updated_caption = caption

    # If caption doesn't start with "yes" or "no", use the previous updated caption
    elif prev_updated_caption is not None:
        caption = prev_updated_caption

    # Otherwise nothing has been classified yet and the original caption is kept as-is.

    # Store the updated caption in the new dictionary
    updated_captions[key] = (speaker, label, caption, original_frames_count)

len(updated_captions)

13732

In [None]:
# Re-check the same position after the rewrite to confirm the caption was replaced.
k = list(updated_captions)[7000]
updated_captions[k]

('bell', 1, 'the person is engaged in a conversation', 10)

In [None]:
# Persist the fixed-caption dictionary. NumPy pickles the dict, so it must be
# read back with np.load(..., allow_pickle=True).item().
np.save(file='/content/drive/MyDrive/TESI/CODICE/CLIP_Embeddings/LLaVa_fixed_captions.npy', arr=updated_captions)

These captions are also encoded into embeddings, as we did earlier in this notebook.