In [None]:
#@title Load CLIP
%%capture
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
import pandas as pd
import numpy as np
import cv2
import os
from tqdm import tqdm
from PIL import Image
from IPython.display import Image as ImagePy, display
import torch
import warnings
warnings.filterwarnings('ignore')
import h5py
import clip
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP "RN101" model together with the preprocessing callable that is
# later applied to every PIL image before it is fed to the encoder.
clip_model, preprocess = clip.load("RN101", device=device)
clip_model.eval() # switch to eval mode explicitly, just to be safe

In [None]:
#@title Load Finetuned CLIP Visual Encoder
%%capture
from torch.nn import functional as F
class MLP(torch.nn.Module):
    """Small classification head: 512 -> 64 -> 64 -> 1.

    Each of the two hidden stages is Linear -> BatchNorm1d -> ReLU; the last
    layer is a plain Linear that produces one raw (un-activated) value per
    input row.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys of saved checkpoints and the order in which the
        # random initialisation is drawn.
        self.fc1 = torch.nn.Linear(512, 64)
        self.bn1 = torch.nn.BatchNorm1d(64)
        self.fc2 = torch.nn.Linear(64, 64)
        self.bn2 = torch.nn.BatchNorm1d(64)
        self.fc3 = torch.nn.Linear(64, 1)

    def forward(self, x):
        """Run ``x`` through both hidden stages and the output layer."""
        hidden_stages = ((self.fc1, self.bn1), (self.fc2, self.bn2))
        for linear, batch_norm in hidden_stages:
            x = F.relu(batch_norm(linear(x)))
        return self.fc3(x)

class CustomCLIP(torch.nn.Module):
  """CLIP visual encoder followed by the MLP classifier head.

  The encoder is an independent float32 copy of ``clip_model.visual``; the
  classifier is a freshly initialised ``MLP``. ``forward`` returns the
  classifier output computed on the encoder's image features.
  """

  def __init__(self):
    super().__init__()
    import copy  # local import so this cell does not depend on edits to the import cell

    # nn.Module.float() casts the module in place and returns that same module.
    # Using clip_model.visual directly would therefore make self.encoder and
    # clip_model.visual one shared object, and loading a fine-tuned checkpoint
    # into this model would also overwrite the weights used by
    # clip_model.encode_image. Deep-copying first keeps the original CLIP
    # model untouched.
    self.encoder = copy.deepcopy(clip_model.visual).float()

    self.classifier = MLP() # Classifier

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Encode a batch of preprocessed images and classify the resulting features."""
    x = self.encoder(x)
    x = self.classifier(x)
    return x

# Build the encoder + classifier model on the selected device.
net = CustomCLIP().to(device)
# map_location remaps the stored tensors onto `device`, so the checkpoint loads
# regardless of which device it was saved from (e.g. on a CPU-only runtime).
checkpoint = torch.load("/content/drive/MyDrive/TESI/CODICE/CLIP_Models/no_long_8_epochs", map_location=device)
# The checkpoint is a dict; the weights live under 'model_state_dict'.
net.load_state_dict(checkpoint['model_state_dict'])
net.eval()

In [None]:
#@title Generate CLIP embeddings from frames

def get_clip_embeddings(frames, finetuned_flag):
    """Embed a segment of frames and pool them into one normalized vector.

    Args:
        frames: iterable of image arrays accepted by ``Image.fromarray``
            (10 frames are expected; any other count only triggers a warning).
        finetuned_flag: if truthy the frames go through ``net.encoder``,
            otherwise through ``clip_model.encode_image``.

    Returns:
        The mean of the per-frame embeddings, L2-normalized, moved to the CPU.
    """
    n_frames = len(frames)
    if n_frames != 10:
        print(f"WARNING: get_clip_embeddings received {n_frames} frames!")

    # Turn every frame into an RGB PIL image and run CLIP's preprocessing on it.
    preprocessed = []
    for frame in frames:
        pil_image = Image.fromarray(frame)
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')
        preprocessed.append(preprocess(pil_image))
    batch = torch.stack(preprocessed).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        if finetuned_flag:
            per_frame = net.encoder(batch)
        else:
            per_frame = clip_model.encode_image(batch)

    # Average over the frame axis, then L2-normalize to help the MLP classifier later.
    pooled = per_frame.mean(dim=0)
    return torch.nn.functional.normalize(pooled, p=2, dim=0).cpu()

In [None]:
#@title Generate CLIP patch embeddings from frames

# The first 9 embeddings are the patches, the 10th is the full image
def get_patch_embeddings(frames, finetuned_flag):
    """Embed a segment of frames as 9 patch embeddings plus 1 full-image embedding.

    Every frame is cut into a 3x3 grid of patches. For each grid position the
    patch embeddings are averaged over the frames of the segment; the full
    frames are averaged the same way and appended as the last row.

    Args:
        frames: iterable of image arrays accepted by ``Image.fromarray``
            (10 frames are expected; any other count only triggers a warning).
        finetuned_flag: if truthy the images go through ``net.encoder``,
            otherwise through ``clip_model.encode_image``.

    Returns:
        A CPU tensor with 10 rows (9 patch positions, then the full image),
        each row L2-normalized. Patch row ``3 * i + j`` is the patch whose crop
        box starts at ``left = i * patch_size`` and ``upper = j * patch_size``.
    """
    if len(frames) != 10:
        # Message now names this function (it used to say get_clip_embeddings).
        print(f"WARNING: get_patch_embeddings received {len(frames)} frames!")

    # Convert the numpy array of frames to a list of PIL Images and convert to RGB if necessary
    images = [Image.fromarray(frame) for frame in frames]
    images = [image.convert('RGB') if image.mode != 'RGB' else image for image in images]

    # Divide each image into 9 non-overlapping patches.
    # NOTE(review): the patch size is derived from 224, so this assumes the frames
    # are 224x224 — TODO confirm. 224 // 3 == 74, so the bottom- and right-most
    # 2 pixels are not covered by any patch.
    patch_size = 224 // 3
    boxes = [
        (i * patch_size, j * patch_size, (i + 1) * patch_size, (j + 1) * patch_size)
        for i in range(3)
        for j in range(3)
    ]

    # Position-major order: all frames for patch position 0, then position 1, ...
    # followed by the full frames, so the later view() can split them apart.
    patches_flat = [image.crop(box) for box in boxes for image in images]
    patches_flat += images

    image_inputs = torch.stack([preprocess(image) for image in patches_flat]).to(device)

    with torch.no_grad():
        if finetuned_flag: embeddings = net.encoder(image_inputs)
        else: embeddings = clip_model.encode_image(image_inputs)

    # Reshape to (position, frame, feature) and average over the frames
    n_positions = len(boxes) + 1  # 9 patches + the full image
    embeddings = embeddings.view(n_positions, len(frames), -1)
    video_segment_embedding = embeddings.mean(dim=1)
    video_segment_embedding = torch.nn.functional.normalize(video_segment_embedding, p=2, dim=1) # normalize to help the MLP classifier later

    return video_segment_embedding.cpu()

## Generate Embeddings from HDF5 dataset

In [None]:
# Load dataset: copy the HDF5 database file to /content/dataset.h5
!cp ".../Columbia Dataset/Frames_labels_dataset/database.h5" "/content/dataset.h5"

In [None]:
# Must point at the destination of the `cp` command in the previous cell
# (/content/dataset.h5); the old value /content/database.h5 is never created
# by this notebook, so a fresh run could not open the file.
dataset_path = "/content/dataset.h5"

In [None]:
#@title Check Data
# Sanity check: for every top-level group (speaker) print the 'data_length'
# attribute of its 'frames' and 'labels' datasets, plus the overall totals.
print("----------------------- FRAMES ---- LABELS")
with h5py.File(dataset_path, 'r') as f:
    tot_frames, tot_labels = 0, 0
    for p in f:
        person = f[p]
        group_f, group_l = (
            person[field].attrs.get('data_length') for field in ('frames', 'labels')
        )

        tot_frames = tot_frames + group_f
        tot_labels = tot_labels + group_l

        print(f'Group {p}:      \t{group_f} \t\t{group_l}')

    print(f"TOTAL DATA------------- {tot_frames} --------- {tot_labels}")

----------------------- FRAMES ---- LABELS
Group bell:      	37422 		37422
Group bollinger:      	15080 		15080
Group lieberman:      	16400 		16400
Group long:      	29391 		29391
Group sick:      	38526 		38526
TOTAL DATA------------- 136819 --------- 136819


In [None]:
# Store resulting embeddings.
# The generation cells below fill it with
#   "<speaker>_<start>_<end>" -> (speaker, label, embedding, original_frames_count)
# Re-running this cell discards everything collected so far.
embeddings_dict = {} # Store resulting embeddings

In [None]:
#@title Generate Embeddings from dataset

with h5py.File(dataset_path, 'r') as f:
  # For each speaker, walk the frames in order and cut them into segments of at
  # most 10 consecutive frames that share the same label. One embedding is
  # computed per segment and stored in embeddings_dict.
  #for speaker in ['bell','sick','long','lieberman','bollinger']:
  for speaker in ['bell']:
    print(speaker.upper())
    start_len = len(embeddings_dict)
    person = f[speaker]
    frames = person['frames']
    labels = person['labels']
    # NOTE(review): the loop is bounded by len(frames), while the "Check Data"
    # cell reads attrs['data_length'] — confirm the two always agree.
    frame_index = 0
    progress_bar = tqdm(total=len(frames), position=0, leave=False)

    # Process frames through 'Frame Interpolation' to create segments of fixed length=10
    while frame_index < len(frames):
        current_label = labels[frame_index]
        current_frames = []
        original_frames_count = 0
        # Collect up to 10 consecutive frames carrying current_label.
        while len(current_frames) < 10 and frame_index < len(frames):
            if labels[frame_index] == current_label:
                current_frames.append(frames[frame_index])
                original_frames_count += 1
                frame_index += 1
            else:
                break
        # Pad short segments to 10 frames by repeating their own frames from the start.
        while len(current_frames) < 10:
            current_frames += current_frames[:10 - len(current_frames)]
        embedding = get_clip_embeddings(np.array(current_frames), True)
        # The key encodes the range of ORIGINAL frame indices in the segment.
        # len(current_frames) is always 10 after padding, so it must not be used
        # here: it gave a wrong (possibly negative) start for short segments.
        key = f"{speaker}_{frame_index - original_frames_count}_{frame_index - 1}"
        embeddings_dict[key] = (speaker, current_label, embedding, original_frames_count)
        # Advance by the frames actually consumed so the bar ends exactly at total.
        progress_bar.update(original_frames_count)
    progress_bar.close()
    end_len = len(embeddings_dict)

    print(f"New Items: {end_len-start_len} \t TOTAL: {end_len} \n------------------------------")

BELL


                           

Nuovi aggiunti: 3751 	 TOTALE: 13732 
------------------------------




In [None]:
#@title Generate PATCH Embeddings

with h5py.File(dataset_path, 'r') as f:
  # For each speaker, walk the frames in order and cut them into segments of at
  # most 10 consecutive frames that share the same label. One flattened patch
  # embedding is computed per segment and stored in embeddings_dict.
  # NOTE(review): this cell writes into the same embeddings_dict, with the same
  # key format, as the non-patch generation cell; running both for the same
  # speaker overwrites entries and mixes embedding sizes — confirm intended use.
  #for speaker in ['bell','sick','long','lieberman','bollinger']:
  for speaker in ['lieberman','bollinger']:
    print(speaker.upper())
    start_len = len(embeddings_dict)
    person = f[speaker]
    frames = person['frames']
    labels = person['labels']
    frame_index = 0
    progress_bar = tqdm(total=len(frames), position=0, leave=False)

    # Process frames through 'Frame Interpolation' to create segments of fixed length=10
    while frame_index < len(frames):
        current_label = labels[frame_index]
        current_frames = []
        original_frames_count = 0
        # Collect up to 10 consecutive frames carrying current_label.
        while len(current_frames) < 10 and frame_index < len(frames):
            if labels[frame_index] == current_label:
                current_frames.append(frames[frame_index])
                original_frames_count += 1
                frame_index += 1
            else:
                break
        # Pad short segments to 10 frames by repeating their own frames from the start.
        while len(current_frames) < 10:
            current_frames += current_frames[:10 - len(current_frames)]
        embedding = get_patch_embeddings(np.array(current_frames), True)

        # Concatenate the 10 rows (9 patches + full image) into one 1-D vector.
        embedding = embedding.view(-1)  # flatten the tensor

        # The key encodes the range of ORIGINAL frame indices in the segment.
        # len(current_frames) is always 10 after padding, so it must not be used
        # here: it gave a wrong (possibly negative) start for short segments.
        key = f"{speaker}_{frame_index - original_frames_count}_{frame_index - 1}"
        embeddings_dict[key] = (speaker, current_label, embedding, original_frames_count)
        # Advance by the frames actually consumed so the bar ends exactly at total.
        progress_bar.update(original_frames_count)
    progress_bar.close()
    end_len = len(embeddings_dict)
    print(f"New Items: {end_len-start_len} \t TOTAL: {end_len} \n------------------------------")

LIEBERMAN




Nuovi aggiunti: 1643 	 TOTALE: 12193 
------------------------------
BOLLINGER


                          

Nuovi aggiunti: 1539 	 TOTALE: 13732 
------------------------------




In [None]:
# Save Result
# embeddings_dict is a plain Python dict, so np.save has to pickle it.
# NOTE(review): presumably it must be read back with
# np.load(path, allow_pickle=True).item() — confirm against the loading code.
np.save('.../CLIP_Embeddings/Finetuned_no_long_patches.npy', embeddings_dict)
# Display the number of stored segments as the cell output.
len(embeddings_dict)

13732