
# Notebook para probar ASIF - Retrieval




# Cargamos los embeddings de AUDIO y NLP

In [2]:
import os
import json
import numpy as np
import torch
import torchaudio

folder_path = '../experiments/layers'


def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Audio embeddings: JSON -> numpy array -> torch tensor.
file_path = os.path.join(folder_path, 'embeddings_layer6_wav2vec2.json')
audio = torch.from_numpy(np.array(_read_json(file_path)))

# Text embeddings, loaded the same way.
file_path = os.path.join(folder_path, 'embeddings_layer3_bert-base-uncased.json')
nlp = torch.from_numpy(np.array(_read_json(file_path)))

# `keys` is indexed later with row positions of the similarity results.
# presumably keys[i] is the word for row i of both tensors; verify against the export script
keys = _read_json('words_in_order.json')

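Calculamos la matriz de similitud coseno entre los embeddings de NLP, conservando en cada fila solo las k similitudes más altas (elevadas a la potencia p) y dejando el resto en cero.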

In [22]:
from torchmetrics.functional.pairwise import pairwise_cosine_similarity
import gc

# Build the sparsified NLP-to-NLP cosine-similarity matrix, one batch of rows
# at a time, so that only a (batch_size, n) block is materialised per step.
k = 46000  # number of largest similarities kept per row
p = 1      # exponent applied to the similarities that are kept
n = nlp.shape[0]

# torch.topk raises if k is larger than the dimension it selects from,
# so never ask for more entries than there are columns.
top_k = min(k, n)

large_matrix = torch.zeros(n, n, dtype=torch.float32, device='cpu')
batch_size = 5000

for i in range(0, n, batch_size):
    end = min(i + batch_size, n)
    batch_embeddings = nlp[i:end]
    output = pairwise_cosine_similarity(batch_embeddings, nlp)
    values, indices = torch.topk(output, k=top_k, dim=1)

    # Put the kept values (raised to p) back at their column positions in a
    # single vectorised call; every other entry of the block stays zero.
    zero_matrix = torch.zeros_like(output).scatter_(1, indices, values ** p)

    # large_matrix is float32, so the block is cast on assignment.
    large_matrix[i:end, :] = zero_matrix

    # Release the per-batch blocks before the next iteration allocates new ones.
    del output, zero_matrix
    gc.collect()


Como verificación, comprobamos que dentro de un mismo espacio (NLP) los vecinos más similares de una palabra sean coherentes.

In [23]:
# Sanity check: show the words for the 20 largest entries in row 292 of the
# NLP similarity matrix.
print(f'Para la siguiente palabra: {keys[292]}, se obtienen las siguientes palabras como retrievals:')
values, indices = large_matrix[292].topk(k=20)
[keys[position] for position in indices.tolist()]

['by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by']

# Cargamos audio para testear

In [46]:
from alignments.utils_alignments import parse_textgrid, match_words_to_frames
# Parse the alignment TextGrid of one test utterance and map each word to
# frame spans.
textgrid_path = '../datasets/alignments/test-clean/237/126133/237-126133-0001.TextGrid'
alignments = parse_textgrid(textgrid_path)
# NOTE(review): 25 and 20 are passed positionally; presumably frame window and
# stride in milliseconds -- confirm against match_words_to_frames.
words = match_words_to_frames(alignments, 25, 20, line_content='')
words


{'every': [(23, 39)],
 'chance': [(39, 58)],
 'she': [(58, 66)],
 'could': [(66, 76)],
 'steal': [(76, 93)],
 'after': [(93, 107), (185, 200)],
 'practice': [(107, 129)],
 'hours': [(129, 143)],
 'were': [(143, 150), (334, 341)],
 'over': [(150, 175)],
 'and': [(175, 185)],
 'the': [(200, 204), (265, 269), (478, 484), (505, 509), (541, 546)],
 'clamorous': [(204, 235)],
 'demands': [(235, 260)],
 'of': [(260, 265), (499, 505)],
 'boys': [(269, 291)],
 'upon': [(291, 306)],
 'her': [(306, 312)],
 'time': [(312, 332)],
 'fully': [(341, 357)],
 'satisfied': [(357, 394)],
 'was': [(413, 428)],
 'seized': [(428, 449)],
 'to': [(449, 457), (535, 541)],
 'fly': [(457, 474)],
 'on': [(474, 478)],
 'wings': [(484, 499)],
 'wind': [(509, 531)],
 'flowers': [(546, 579)]}

Seleccionamos alguna de las palabras que queramos probar.

In [77]:
# Get the representation of the utterance through the wav2vec2 encoder.
import s3prl.hub as hub
from embeddings.utils_embeddings import get_embeddings_speech

model = getattr(hub, 'wav2vec2')()
model = model.to('cpu')
# Put the model in inference mode so dropout-style layers cannot perturb the
# extracted features. get_embeddings_speech may already do this; the call is
# idempotent either way.
model.eval()
audio_path = '../datasets/librispeech-raw/test-clean/237/126133/237-126133-0001.flac'
reps = get_embeddings_speech(audio_path, model, 'cpu')
# Mean over frames 23..39 (inclusive), the span listed for 'every' in the
# alignment output of this utterance.
# NOTE(review): the next word ('chance') starts at frame 39, so the +1 may pull
# in one frame of it -- confirm whether the spans are inclusive.
# NOTE(review): the anchors were loaded from 'embeddings_layer6_wav2vec2.json'
# while this indexes reps[7] -- confirm both refer to the same layer.
averaged_rep = reps[7][0][23:39+1].mean(dim=0)

Obtenemos la representación relativa de ese embedding

In [78]:
# Add a leading batch axis and convert to float64 before comparing with `audio`
# (presumably `audio` is float64 as well -- TODO confirm).
averaged_rep_2d = averaged_rep[None].to(torch.float64)
# Relative representation: cosine similarity of the query against every row of `audio`.
relative_rep = pairwise_cosine_similarity(averaged_rep_2d, audio)

Como verificación de que el procedimiento es correcto, confirmamos que con esa representación relativa podemos recuperar los embeddings de audio más similares.

In [79]:
# Sanity check: words for the 20 largest entries of the query's relative representation.
values, indices = relative_rep[0].topk(k=20)
[keys[position] for position in indices.tolist()]

['every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'everything',
 'every',
 'every',
 'everything',
 'every',
 'everyday',
 'whichever',
 'everywhere',
 'every',
 'every']

Calculamos los embeddings de nlp más similares a la representación relativa del embedding de audio.

In [80]:
# Compare the query's relative representation with every row of large_matrix,
# in row batches so that only (batch_size, n) rows are processed at a time.
batch_size = 1000
similarity_results = []

# Cast the query once instead of on every iteration.
query = relative_rep.float()
n_rows = large_matrix.size(0)

# Both sides must have the same number of columns for the comparison to be defined.
if query.shape[1] != large_matrix.shape[1]:
    raise ValueError(
        f'relative_rep has {query.shape[1]} columns but large_matrix has '
        f'{large_matrix.shape[1]}; both must be built on the same anchors'
    )

# NOTE(review): the rows of large_matrix are top-k sparsified and raised to p,
# while the query is used dense -- confirm this asymmetry is intended.
for i in range(0, n_rows, batch_size):
    end = min(i + batch_size, n_rows)
    batch = large_matrix[i:end, :]
    similarity = pairwise_cosine_similarity(query, batch)
    # `similarity` has one row (the single query); keep it as a 1-D tensor.
    similarity_results.append(similarity[0].cpu())

# 1-D tensor with one score per row of large_matrix.
full_similarity = torch.cat(similarity_results, dim=0)

In [82]:
# Retrieve the words for the 20 highest-scoring entries of full_similarity.
values, indices = full_similarity.topk(k=20)
[keys[position] for position in indices.tolist()]