
# Cargamos los embeddings de AUDIO y NLP




In [1]:
import sys
# Show which Python interpreter backs this kernel, to confirm that the
# expected environment is the one in use.
print(sys.executable)


/home/cbolanos/miniconda3/envs/tesis/bin/python


In [2]:
import os
import json
import numpy as np
import torch
import torchaudio

# Directory that holds the per-layer embedding dumps.
folder_path = '../experiments/layers'


def _load_json(path):
    """Parse and return the JSON document stored at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Speech embeddings (the file name indicates wav2vec2, layer 6).
file_path = os.path.join(folder_path, 'embeddings_layer6_wav2vec2.json')
audio = np.array(_load_json(file_path))

# Text embeddings (the file name indicates bert-base-uncased, layer 3).
file_path = os.path.join(folder_path, 'embeddings_layer3_bert-base-uncased.json')
nlp = np.array(_load_json(file_path))

# Labels used further down to turn row indices back into words.
keys = _load_json('words_in_order.json')


In [3]:
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run safely. torch.from_numpy raises a TypeError on a second run because
# `audio` and `nlp` already hold tensors by then.
audio = torch.as_tensor(audio)
nlp = torch.as_tensor(nlp)

In [22]:
from torchmetrics.functional.pairwise import pairwise_cosine_similarity
import gc

# Build the text-side similarity matrix: row r holds the cosine similarity of
# word r to every word, keeping only the k largest entries of each row and
# leaving every other entry at zero.
k = 46000           # number of strongest similarities kept per row
p = 1               # exponent applied to the kept similarities
batch_size = 5000   # rows of `nlp` processed per iteration

# Size everything from the data instead of a hard-coded row count, so the cell
# keeps working when the vocabulary changes.
num_words = nlp.shape[0]
# torch.topk raises when k exceeds the number of columns, so clamp it.
top_k = min(k, num_words)

large_matrix = torch.zeros(num_words, num_words, dtype=torch.float32, device='cpu')

for i in range(0, num_words, batch_size):
    end = min(i + batch_size, num_words)
    batch_embeddings = nlp[i:end]
    output = pairwise_cosine_similarity(batch_embeddings, nlp)
    values, indices = torch.topk(output, k=top_k, dim=1)

    # Write the kept values straight into this batch's rows. The slice is a
    # view of large_matrix, so the in-place scatter updates it directly and no
    # batch-sized temporary or per-row Python loop is needed. scatter_ needs
    # the source to have the destination dtype, hence the cast.
    large_matrix[i:end].scatter_(1, indices, (values ** p).to(large_matrix.dtype))

    # The full similarity block is large; release it before the next batch.
    del output
    gc.collect()


In [23]:
# Take the 20 highest-scoring columns of row 292 and show the words they map to.
values, indices = large_matrix[292].topk(20)
[keys[idx] for idx in indices.tolist()]

['by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by',
 'by']

In [24]:
# Word stored at index 292, for comparison with the neighbour list above.
keys[292]

'by'

In [25]:
# Dimensions of the text embedding tensor.
nlp.shape

torch.Size([46906, 768])

In [26]:
# Dimensions of the similarity matrix built from the text embeddings.
large_matrix.shape

torch.Size([46906, 46906])

# Cargamos audio para testear

In [27]:
import os
import torchaudio

def get_embeddings_speech(audio_path, model, device):
    """
    Get all the hidden_states for a specific audio.

    Args:
        audio_path: Path of the audio file; it is read with torchaudio.load.
        model: Callable module whose output can be indexed with
            "hidden_states".
        device: Device the waveform is moved to before the forward pass.

    Returns:
        Whatever the model stores under "hidden_states" for this waveform.
    """
    # The sample rate returned by torchaudio is not used here.
    # NOTE(review): the model presumably expects a specific sample rate --
    # confirm the input files already match it.
    waveform, _ = torchaudio.load(audio_path)
    waveform = waveform.to(device)

    # Feature extraction must not depend on training-only behaviour such as
    # dropout, so run the forward pass in evaluation mode and afterwards put
    # the model back in the mode the caller left it in.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            reps = model(waveform)["hidden_states"]
    finally:
        if was_training:
            model.train()

    return reps

def parse_textgrid(textgrid_path):
    """
    Parses a TextGrid file and returns alignments for words and phones.

    The file is read line by line as "key = value" pairs. Only the tiers
    named "words" and "phones" are collected; intervals of any other tier,
    or intervals that appear before a tier name, are skipped.

    Args:
        textgrid_path: Path of the TextGrid file.

    Returns:
        dict with the keys 'words' and 'phones'. Each maps to a list of
        (xmin, xmax, text) tuples, in file order, for the intervals whose
        text is not empty.
    """
    alignments = {'words': [], 'phones': []}
    current_section = None
    # Initialised so a 'text =' line that precedes any 'xmin ='/'xmax =' line
    # cannot hit an unbound local.
    xmin, xmax = None, None

    with open(textgrid_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('name ='):
                # Any tier header switches section; unknown tiers switch it
                # off so their intervals are not filed under the previous tier.
                tier_name = line.split('=', 1)[1].strip().strip('"')
                current_section = tier_name if tier_name in alignments else None
            elif line.startswith('intervals ['):
                # New interval: forget the previous boundaries.
                xmin, xmax = None, None
            elif line.startswith('xmin ='):
                xmin = float(line.split('=', 1)[1].strip())
            elif line.startswith('xmax ='):
                xmax = float(line.split('=', 1)[1].strip())
            elif line.startswith('text ='):
                # Split on the first '=' only, so text that itself contains
                # '=' is kept whole.
                text = line.split('=', 1)[1].strip().strip('"')
                if text and current_section is not None:  # Ignore empty intervals
                    alignments[current_section].append((xmin, xmax, text))

    return alignments

def remove_first_directory(path, depth=3):
    """
    Drop the leading components of a path.

    Despite the function name, the default removes the first three
    components (split on os.path.sep), which is what the original
    hard-coded slice did.

    Args:
        path: Path string to shorten.
        depth: Number of leading components to remove (non-negative).

    Returns:
        The remaining components joined with os.path.join, or `path`
        unchanged when it has `depth` components or fewer. Returning the path
        in that case avoids calling os.path.join with no arguments, which
        raises TypeError.
    """
    parts = path.split(os.path.sep)
    if len(parts) > depth:
        return os.path.join(*parts[depth:])
    return path


def read_line_by_identifier(directory_path, identifier):
    """
    Return the text that follows `identifier` in the directory's .txt file.

    Args:
        directory_path: Directory searched for a file ending in '.txt'.
        identifier: Prefix that marks the wanted line.

    Returns:
        The first line starting with `identifier`, with that leading
        identifier and surrounding whitespace removed; None when no line
        matches; the string "Text file not found in the directory." when the
        directory contains no .txt file.
    """
    # Find the .txt file in the given directory. Sorting makes the choice
    # deterministic when several .txt files exist (os.listdir order is
    # arbitrary).
    for file_name in sorted(os.listdir(directory_path)):
        if file_name.endswith('.txt'):
            file_path = os.path.join(directory_path, file_name)
            break
    else:
        return "Text file not found in the directory."

    # Read the specified line from the found text file
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith(identifier):
                # Cut off only the leading identifier; str.replace would also
                # delete any later occurrence inside the text itself.
                return line[len(identifier):].strip()

    return None


def match_words_to_frames(alignments, frame_length, stride):
    """
    Matches frame indices to word from alignments, correctly calculating
    start and end frames based on the provided alignments.

    Args:
        alignments: Mapping whose 'words' entry is a list of
            (xmin, xmax, text) tuples with times in seconds. Other entries
            are ignored.
        frame_length: Frame length in milliseconds. Not used in the
            computation; kept so existing calls keep working.
        stride: Frame stride in milliseconds.

    Returns:
        dict mapping each word to a list of (start_frame, end_frame) tuples,
        one per occurrence, in the order they appear in alignments['words'].
        end_frame is 0 when xmax is not positive.
    """
    # Convert stride from milliseconds to seconds for consistency
    stride_sec = stride / 1000.0
    # Tolerance, in frames. A time that sits exactly on a frame boundary can
    # divide to a value a hair below the integer (0.58 / 0.02 is
    # 28.999999999999996), and plain truncation would then land one frame
    # early.
    tolerance = 1e-6

    def to_frame(seconds):
        # A new frame starts every stride_sec seconds.
        return int(seconds / stride_sec + tolerance)

    words = {}
    for xmin, xmax, text in alignments.get('words', []):
        start_frame = to_frame(xmin)
        end_frame = to_frame(xmax) if xmax > 0 else 0
        words.setdefault(text, []).append((start_frame, end_frame))
    return words



In [46]:
# Parse the alignment of utterance 237-126133-0001 and convert its word
# intervals into frame spans.
textgrid_path = '../datasets/alignments/test-clean/237/126133/237-126133-0001.TextGrid'
alignments = parse_textgrid(textgrid_path)
# Both values are in milliseconds.
words = match_words_to_frames(alignments, frame_length=25, stride=20)
words


{'every': [(23, 39)],
 'chance': [(39, 58)],
 'she': [(58, 66)],
 'could': [(66, 76)],
 'steal': [(76, 93)],
 'after': [(93, 107), (185, 200)],
 'practice': [(107, 129)],
 'hours': [(129, 143)],
 'were': [(143, 150), (334, 341)],
 'over': [(150, 175)],
 'and': [(175, 185)],
 'the': [(200, 204), (265, 269), (478, 484), (505, 509), (541, 546)],
 'clamorous': [(204, 235)],
 'demands': [(235, 260)],
 'of': [(260, 265), (499, 505)],
 'boys': [(269, 291)],
 'upon': [(291, 306)],
 'her': [(306, 312)],
 'time': [(312, 332)],
 'fully': [(341, 357)],
 'satisfied': [(357, 394)],
 'was': [(413, 428)],
 'seized': [(428, 449)],
 'to': [(449, 457), (535, 541)],
 'fly': [(457, 474)],
 'on': [(474, 478)],
 'wings': [(484, 499)],
 'wind': [(509, 531)],
 'flowers': [(546, 579)]}

In [77]:
# Get the representation through the wav2vec2 encoder and average the frames
# that belong to one word.
import s3prl.hub as hub

model = hub.wav2vec2()
model = model.to('cpu')
# Evaluation mode, so training-only behaviour such as dropout does not
# perturb the extracted features.
model.eval()
audio_path = '../datasets/librispeech-raw/test-clean/237/126133/237-126133-0001.flac'
reps = get_embeddings_speech(audio_path, model, 'cpu')

# Frame span of the first occurrence of "every", read from the alignment
# computed above instead of repeating its values (23 and 39) by hand.
start_frame, end_frame = words['every'][0]
# The end frame is included in the average.
# NOTE(review): this reads hidden state index 7 while the reference audio
# embeddings come from a file named "layer6" -- confirm both use the same
# layer under the model's indexing convention.
averaged_rep = reps[7][0][start_frame:end_frame + 1].mean(dim=0)

In [78]:
# Add a leading axis so the pooled vector is a one-row matrix, cast it to
# float64, and score it against every row of `audio`.
averaged_rep_2d = averaged_rep[None].to(torch.float64)
relative_rep = pairwise_cosine_similarity(averaged_rep_2d, audio)

In [79]:
# Take the 20 highest-scoring entries of the single query row and show the
# words they map to.
values, indices = relative_rep[0].topk(20)
[keys[idx] for idx in indices.tolist()]

['every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'every',
 'everything',
 'every',
 'every',
 'everything',
 'every',
 'everyday',
 'whichever',
 'everywhere',
 'every',
 'every']

In [80]:
batch_size = 1000  # rows of large_matrix compared per iteration

# The query is the same for every batch, so cast it to float32 once up front.
query = relative_rep.float()

# One 1-D score tensor is collected per batch.
similarity_results = []

for start in range(0, large_matrix.size(0), batch_size):
    stop = min(start + batch_size, large_matrix.size(0))
    batch = large_matrix[start:stop, :]
    similarity = pairwise_cosine_similarity(query, batch)
    # extend() walks the first axis of the result, so each query row is added
    # as its own 1-D tensor.
    similarity_results.extend(similarity.cpu())


In [81]:
# Join the per-batch score tensors end to end into one tensor.
full_similarity = torch.cat(similarity_results, dim=0)

In [82]:
# Get the k most similar entries (k = 20)
values, indices = torch.topk(full_similarity, k=20)

In [83]:
# Words that correspond to the selected indices.
[keys[idx] for idx in indices.tolist()]

['surely',
 'occasionally',
 'occasionally',
 'curiously',
 'accustomed',
 'next',
 'scarcely',
 'anyway',
 'fortunately',
 'whence',
 'fortunately',
 'wise',
 'yes',
 'every',
 'often',
 'occasionally',
 'perhaps',
 'sufficient',
 'fortunately',
 'probably']