In [None]:
from IPython.display import clear_output

In [None]:
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install -U sentence-transformers
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install hazm
!pip install yake
!pip install multi_rake
!pip install fasttext
!pip install pickel
clear_output()

In [None]:
import librosa
import os
import gc
import json
import torch
import hazm
import yake
import numpy as np
import IPython.display as ipd
from multi_rake import Rake
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from sentence_transformers import models, SentenceTransformer, util

## ASR & Keyword Extraction Section


In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor and the CTC model are both loaded from the same English wav2vec2 checkpoint.
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
# Moves the weights in place; a no-op when `device` is the CPU.
model.to(device)

# Drop the download/progress output from the cell.
clear_output()

In [None]:
def transcribe_dataset(dataset):
    """Add a ``"transcription"`` to every chunk of every audio file in ``dataset``.

    Args:
        dataset: Mapping from an audio file path to a list of chunk dicts. Each
            chunk must provide ``"start_time"`` and ``"end_time"``; both are
            multiplied by the sample rate to obtain sample offsets, i.e. they
            are interpreted as seconds.

    Returns:
        The same ``dataset`` object. It is mutated in place: each chunk gains a
        ``"transcription"`` key (an empty string when the chunk selects no
        samples).

    Uses the notebook-level ``processor``, ``model`` and ``device``.
    """
    for utterance, chunks in dataset.items():
        # librosa resamples to 16 kHz on load, so sample_rate is always 16000 here.
        waveform, sample_rate = librosa.load(utterance, sr=16000)
        for chunk in chunks:
            start_sample = int(chunk["start_time"] * sample_rate)
            end_sample = int(chunk["end_time"] * sample_rate)
            audio_segment = waveform[start_sample:end_sample]

            # A window outside the waveform (or with start >= end) yields no samples;
            # record an empty transcription instead of feeding an empty array to the model.
            if len(audio_segment) == 0:
                chunk["transcription"] = ""
                continue

            input_values = processor(audio_segment, sampling_rate=sample_rate, return_tensors="pt").input_values
            input_values = input_values.to(device)

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                logits = model(input_values).logits

            # Greedy CTC decoding: most likely token per frame, first (only) batch item.
            predicted_ids = torch.argmax(logits, dim=-1)[0].cpu().numpy()
            chunk["transcription"] = processor.decode(predicted_ids)

    return dataset

In [None]:
def extract_keywords_yake(transcription):
    """Extract keywords from ``transcription`` with YAKE after hazm preprocessing.

    The text is normalized and tokenized with hazm, hazm stopwords are removed,
    and the remaining tokens are re-joined with spaces and passed to a default
    ``yake.KeywordExtractor``.

    Args:
        transcription: Text to extract keywords from.

    Returns:
        List of keyword strings in the order YAKE returns them, with
        zero-width non-joiner characters (``\\u200c``) removed.
    """
    # NOTE(review): hazm targets Persian text, while the ASR checkpoint loaded in this
    # notebook is an English one. Confirm this preprocessing is intended.
    normalizer = hazm.Normalizer()
    tokenizer = hazm.WordTokenizer()

    # Load the stopword list once (not once per token) and use a set for fast membership tests.
    stopwords = set(hazm.stopwords_list())
    tokens = [
        token
        for token in tokenizer.tokenize(normalizer.normalize(transcription))
        if token not in stopwords
    ]

    # Each YAKE result is a sequence whose first element is the keyword text.
    extractor = yake.KeywordExtractor()
    scored_keywords = extractor.extract_keywords(' '.join(tokens))

    return [item[0].replace('\u200c', '') for item in scored_keywords]


def extract_keywords_rake(transcription):
    """Extract keywords from ``transcription`` with multi_rake.

    Args:
        transcription: Text to extract keywords from.

    Returns:
        List of keyword strings in the order ``Rake.apply`` returns them, with
        zero-width non-joiner characters (``\\u200c``) removed.
    """
    # No explicit language or stopword list is passed (both are None), exactly as before.
    # The hazm stopword list that used to be loaded here was never handed to Rake, so the
    # unused load has been removed.
    # NOTE(review): if custom stopwords were intended, pass them via `stopwords=`.
    rake = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        language_code=None,
        stopwords=None,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    keywords = rake.apply(transcription)

    # Each result's first element is the keyword text.
    return [kw[0].replace('\u200c', '') for kw in keywords]

In [None]:
def generate_dataset_keywords(dataset_file):
    """Transcribe a dataset, attach YAKE keywords to each chunk, and save the result.

    Args:
        dataset_file: Path of the JSON dataset to read. The output path is built
            as ``'transcribed_' + dataset_file``, so a bare file name is expected.

    Returns:
        The dataset dict after ``transcribe_dataset`` and with a ``"keywords"``
        list added to every chunk.

    Side effects:
        Writes the enriched dataset as UTF-8 JSON to ``'transcribed_' + dataset_file``.
    """
    # Read as UTF-8 explicitly so reading and writing agree regardless of the locale default.
    with open(dataset_file, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    dataset = transcribe_dataset(dataset)
    for chunks in dataset.values():
        for chunk in chunks:
            chunk["keywords"] = extract_keywords_yake(chunk["transcription"])

    transcribed_dataset_file = 'transcribed_' + dataset_file
    with open(transcribed_dataset_file, "w", encoding='utf-8') as outFile:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(dataset, outFile, ensure_ascii=False, indent=1)

    return dataset

## Query and Dataset Comparison Section

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length 1-D vectors as a float.

    Args:
        vec1: Sequence or array of numbers.
        vec2: Sequence or array of numbers with the same number of elements.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a Python float, or ``0.0``
        when either vector has zero norm (the similarity is undefined there).

    Raises:
        ValueError: If the two vectors do not have the same number of elements.
    """
    a = np.asarray(vec1, dtype=np.float64).ravel()
    b = np.asarray(vec2, dtype=np.float64).ravel()
    if a.shape != b.shape:
        raise ValueError(
            "cosine_similarity expects vectors of equal length, got %d and %d" % (a.size, b.size)
        )

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Avoid dividing by zero for all-zero (or empty) vectors.
    if denominator == 0.0:
        return 0.0
    return float(np.dot(a, b) / denominator)

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length 1-D vectors as a float.

    NOTE(review): a function with this name is also defined in the previous cell;
    this later definition is the one bound at run time.

    Args:
        vec1: Sequence or array of numbers.
        vec2: Sequence or array of numbers with the same number of elements.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a Python float, or ``0.0``
        when either vector has zero norm (the similarity is undefined there).

    Raises:
        ValueError: If the two vectors do not have the same number of elements.
    """
    a = np.asarray(vec1, dtype=np.float64).ravel()
    b = np.asarray(vec2, dtype=np.float64).ravel()
    if a.shape != b.shape:
        raise ValueError(
            "cosine_similarity expects vectors of equal length, got %d and %d" % (a.size, b.size)
        )

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Avoid dividing by zero for all-zero (or empty) vectors.
    if denominator == 0.0:
        return 0.0
    return float(np.dot(a, b) / denominator)


def get_similar_parts(dataset_file, query, N):
    """Return the ``N`` chunks whose keywords are most similar to ``query``.

    Args:
        dataset_file: Path to a JSON file mapping each utterance to a list of
            chunk dicts with ``"keywords"``, ``"start_time"``, ``"end_time"``
            and ``"transcription"`` keys.
        query: Text to compare against each chunk's space-joined keywords.
        N: Maximum number of results to return.

    Returns:
        List of ``(utterance, start_time, end_time, transcription, similarity)``
        tuples sorted by similarity, highest first, truncated to ``N`` entries.
    """
    # Read as UTF-8 explicitly; the transcribed dataset is written as UTF-8 with
    # non-ASCII characters left unescaped.
    with open(dataset_file, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Flatten to (utterance, chunk) pairs so all chunk texts can be encoded in one call.
    records = [
        (utterance, chunk)
        for utterance, chunks in dataset.items()
        for chunk in chunks
    ]
    if not records:
        return []

    st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    query_embedding = st_model.encode(query)
    # Passing a list lets the encoder batch the work; one embedding row per chunk.
    chunk_embeddings = st_model.encode([' '.join(chunk['keywords']) for _, chunk in records])

    similarity_scores = [
        (
            utterance,
            chunk['start_time'],
            chunk['end_time'],
            chunk['transcription'],
            cosine_similarity(query_embedding, chunk_embedding),
        )
        for (utterance, chunk), chunk_embedding in zip(records, chunk_embeddings)
    ]

    # Highest similarity first.
    similarity_scores.sort(key=lambda x: x[4], reverse=True)

    return similarity_scores[:N]

In [None]:
def usage(dataset, query, N):
    """Return the ``N`` dataset chunks most similar to ``query``, building the keyword file if needed.

    Args:
        dataset: File name of the JSON dataset. The transcribed/keyword version
            is looked up (and, if missing, generated) as ``'transcribed_' + dataset``.
        query: Text to search for.
        N: Maximum number of results to return.

    Returns:
        Whatever ``get_similar_parts`` returns for the transcribed dataset file.
    """
    transcribed_file = 'transcribed_' + dataset

    # Check the same relative path that generate_dataset_keywords writes to. The previous
    # check concatenated os.getcwd() and the file name without a separator, so it never
    # matched and the dataset was regenerated on every call.
    if not os.path.isfile(transcribed_file):
        print("Generating Dataset Keywords...")
        # Generate from the dataset that was actually requested, not a fixed file name.
        generate_dataset_keywords(dataset)

    return get_similar_parts(transcribed_file, query, N)

## Evaluation

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tabulate import tabulate


class TestDataset(Dataset):
    """Dataset of retrieval test instances.

    ``data`` is an indexable collection of sample dicts. Each sample provides
    ``'id'``, ``'keywords'``, ``'candidates'`` (ids of other samples),
    ``'label'`` and ``'audio_embedding'``.
    """

    def __init__(self, data):
        """Store ``data`` and index every sample position by its ``'id'``."""
        self.data = data
        # Lets candidate ids be resolved to positions in `data` without scanning.
        self.id_to_idx = {self.data[i]['id']: i for i in range(len(self.data))}

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(keywords, stacked candidate audio embeddings, label)`` for one sample.

        Raises:
            KeyError: If a candidate id is not the ``'id'`` of any sample.
        """
        sample = self.data[index]
        text = sample['keywords']
        candidate_idxs = sample['candidates']
        label = sample['label']

        # Get the audio embeddings of the candidate instances
        candidates_audio_embeddings = []
        for d_id in candidate_idxs:
            embedding = self.data[self.id_to_idx[d_id]]['audio_embedding']
            if torch.is_tensor(embedding):
                # Already a tensor: torch.tensor() would warn about copy-construction.
                # detach() is enough here because torch.stack below copies the data.
                embedding = embedding.detach()
            else:
                embedding = torch.tensor(embedding)
            candidates_audio_embeddings.append(embedding)
        candidates_audio_embeddings = torch.stack(candidates_audio_embeddings)

        return text, candidates_audio_embeddings, label

In [None]:
# Colab-specific: mount Google Drive at /content/drive so files under it can be opened
# with ordinary file paths in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle

# Load the pickled evaluation data from the mounted Drive.
# SECURITY NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load this pickle if it comes from a source you trust.
with open('/content/drive/MyDrive/test_dataset_with_negative_samples.pkl', 'rb') as f:
    test_data = pickle.load(f)

In [None]:
# Wrap the unpickled samples and iterate them one instance per batch, in stored order.
test_dataset = TestDataset(test_data)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1)

In [None]:
from tqdm import tqdm


def _get_sentence_encoder(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, constructing it only on first use."""
    cache = _get_sentence_encoder.__dict__.setdefault('_cache', {})
    if model_name not in cache:
        cache[model_name] = SentenceTransformer(model_name)
    return cache[model_name]


def retrieve_relevant_audios(audios, query, model, processor, threshold):
    """Transcribe each audio and score the transcription against ``query``.

    Args:
        audios: Iterable of audio inputs accepted by ``processor``. Each is
            processed with ``sampling_rate=16000``. Torch tensors are moved to
            the CPU and converted to numpy first.
        query: Text the transcriptions are compared with.
        model: CTC model whose ``.logits`` are decoded greedily.
        processor: Processor used to featurize audio and decode token ids.
        threshold: Similarity cut-off used by the returned callables.

    Returns:
        A pair ``(similarities, checks)``. ``similarities`` holds one cosine
        similarity per audio, in input order. ``checks`` holds one callable per
        audio, not the decisions themselves: calling ``checks[i](x)`` evaluates
        ``(x >= threshold).int()``, so ``x`` must be a tensor-like value.

    Uses the notebook-level ``device`` for the model inputs.
    """
    # Reuse one sentence encoder across calls instead of reloading it every time.
    st_model = _get_sentence_encoder()

    transcriptions = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for audio in tqdm(audios):
            # The processor works on CPU arrays; a tensor (possibly on the GPU) is converted first.
            if torch.is_tensor(audio):
                audio = audio.detach().cpu().numpy()
            input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
            input_values = input_values.to(device)
            logits = model(input_values).logits
            # Greedy CTC decoding of the first (only) batch item.
            predicted_ids = torch.argmax(logits, dim=-1)[0].cpu().numpy()
            transcriptions.append(processor.decode(predicted_ids))

    if not transcriptions:
        return [], []

    query_emb = st_model.encode(query)
    # Encode all transcriptions in one call; one embedding row per audio.
    audio_embs = st_model.encode(transcriptions)
    output = [cosine_similarity(emb, query_emb) for emb in audio_embs]

    def is_relevant(x):
        return (x >= threshold).int()

    return output, [is_relevant for _ in output]


In [None]:
def evaluate(model, processor, dataloader):
    """Compute retrieval metrics for text-to-audio matching over ``dataloader``.

    NOTE(review): a later cell defines ``evaluate`` again; that later definition
    is the one bound at run time.

    Args:
        model: ASR model passed through to ``retrieve_relevant_audios``.
        processor: ASR processor passed through to ``retrieve_relevant_audios``.
        dataloader: Yields ``(text, candidates, label)`` batches holding exactly
            one instance each. ``label`` is the index of the correct candidate
            within ``candidates``.

    Returns:
        Dict with ``'Hits@1'`` and ``'MRR'`` averaged over instances, and
        ``'Precision'``, ``'Recall'``, ``'F1'`` (macro-averaged) and
        ``'Accuracy'`` computed over per-candidate binary decisions: the target
        is 1 for the labelled candidate and 0 otherwise, and the prediction is
        1 when the candidate's similarity reaches the threshold.

    Raises:
        ValueError: If ``dataloader`` yields no instances.
    """
    threshold = 0.6  # similarity cut-off for calling a candidate relevant

    total_hits_1 = 0
    total_mrr = 0.0
    total_instances = 0
    total_labels = []
    total_predictions = []

    with torch.no_grad():
        for text, candidates, label in tqdm(dataloader):
            query = text[0]
            # Drop only the batch dimension so a single-candidate instance keeps its shape.
            candidates = candidates.squeeze(0)
            label_idx = int(label.item())

            # Compute text-to-candidates similarities
            similarities, _ = retrieve_relevant_audios(candidates, query, model, processor, threshold)
            similarities = torch.tensor(similarities)

            # Compute Hits@1
            _, predicted_idx = torch.max(similarities, dim=0)
            total_hits_1 += int(int(predicted_idx) == label_idx)

            # Compute MRR
            candidate_ranks = torch.argsort(similarities, descending=True).tolist()
            label_rank = candidate_ranks.index(label_idx)
            total_mrr += 1 / (label_rank + 1)

            # Record one binary target and one binary prediction per candidate so the
            # two lists stay aligned for the sklearn metrics.
            total_labels.extend(int(i == label_idx) for i in range(len(candidate_ranks)))
            total_predictions.extend((similarities >= threshold).int().tolist())

            total_instances += 1

    if total_instances == 0:
        raise ValueError("evaluate() received a dataloader with no instances")

    # Compute average metrics over all instances
    avg_hits_1 = total_hits_1 / total_instances
    avg_mrr = total_mrr / total_instances
    # zero_division=0 reports 0 (instead of warning) when a class is never predicted.
    precision = precision_score(total_labels, total_predictions, average='macro', zero_division=0)
    recall = recall_score(total_labels, total_predictions, average='macro', zero_division=0)
    f1 = f1_score(total_labels, total_predictions, average='macro', zero_division=0)
    accuracy = accuracy_score(total_labels, total_predictions)

    return {
        'Hits@1': avg_hits_1,
        'MRR': avg_mrr,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Accuracy': accuracy
    }

In [None]:
def evaluate(model, processor, dataloader):
    """Compute retrieval metrics for text-to-audio matching over ``dataloader``.

    Args:
        model: ASR model passed through to ``retrieve_relevant_audios``.
        processor: ASR processor passed through to ``retrieve_relevant_audios``.
        dataloader: Yields ``(text, candidates, label)`` batches holding exactly
            one instance each. ``label`` is the index of the correct candidate
            within ``candidates``.

    Returns:
        Dict with ``'Hits@1'`` and ``'MRR'`` averaged over instances, and
        ``'Precision'``, ``'Recall'``, ``'F1'`` (macro-averaged) and
        ``'Accuracy'`` computed over per-candidate binary decisions: the target
        is 1 for the labelled candidate and 0 otherwise, and the prediction is
        1 when the candidate's similarity reaches the threshold.

    Raises:
        ValueError: If ``dataloader`` yields no instances.
    """
    threshold = 0.6  # similarity cut-off for calling a candidate relevant

    total_hits_1 = 0
    total_mrr = 0.0
    total_instances = 0
    total_labels = []
    total_predictions = []

    with torch.no_grad():
        for text, candidates, label in tqdm(dataloader):
            query = text[0]
            # Drop only the batch dimension so a single-candidate instance keeps its shape.
            candidates = candidates.squeeze(0)
            label_idx = int(label.item())

            # Compute text-to-candidates similarities
            similarities, _ = retrieve_relevant_audios(candidates, query, model, processor, threshold)
            similarities = torch.tensor(similarities)

            # Compute Hits@1
            _, predicted_idx = torch.max(similarities, dim=0)
            total_hits_1 += int(int(predicted_idx) == label_idx)

            # Compute MRR
            candidate_ranks = torch.argsort(similarities, descending=True).tolist()
            label_rank = candidate_ranks.index(label_idx)
            total_mrr += 1 / (label_rank + 1)

            # Record one binary target and one binary prediction per candidate. Previously
            # the labels were candidate indices (one per instance) and the predictions were
            # per-candidate tensors, so the two lists differed in length and meaning.
            total_labels.extend(int(i == label_idx) for i in range(len(candidate_ranks)))
            total_predictions.extend((similarities >= threshold).int().tolist())

            total_instances += 1

    if total_instances == 0:
        raise ValueError("evaluate() received a dataloader with no instances")

    # Compute average metrics over all instances
    avg_hits_1 = total_hits_1 / total_instances
    avg_mrr = total_mrr / total_instances
    # zero_division=0 reports 0 (instead of warning) when a class is never predicted.
    precision = precision_score(total_labels, total_predictions, average='macro', zero_division=0)
    recall = recall_score(total_labels, total_predictions, average='macro', zero_division=0)
    f1 = f1_score(total_labels, total_predictions, average='macro', zero_division=0)
    accuracy = accuracy_score(total_labels, total_predictions)

    return {
        'Hits@1': avg_hits_1,
        'MRR': avg_mrr,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Accuracy': accuracy
    }

In [None]:
# Run the evaluation and print one grid row per metric, in the order the dict lists them.
results = evaluate(model, processor, test_loader)
table = [[metric, value] for metric, value in results.items()]
print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))

  candidate_audio_embedding = torch.tensor(candidate['audio_embedding'])

  0%|          | 0/6 [00:00<?, ?it/s][A
 33%|███▎      | 2/6 [00:00<00:00, 16.84it/s][A
 67%|██████▋   | 4/6 [00:00<00:00, 16.85it/s][A
100%|██████████| 6/6 [00:00<00:00, 16.83it/s]
  0%|          | 1/300 [00:00<04:10,  1.19it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 33%|███▎      | 2/6 [00:00<00:00, 15.07it/s][A
100%|██████████| 6/6 [00:00<00:00, 18.42it/s]
  1%|          | 2/300 [00:01<04:03,  1.22it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 33%|███▎      | 2/6 [00:00<00:00, 14.35it/s][A
 67%|██████▋   | 4/6 [00:00<00:00, 16.28it/s][A
100%|██████████| 6/6 [00:00<00:00, 16.59it/s]
  1%|          | 3/300 [00:02<04:00,  1.24it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
100%|██████████| 6/6 [00:00<00:00, 29.01it/s]
  1%|▏         | 4/300 [00:02<03:26,  1.44it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
100%|██████████| 6/6 [00:00<00:00, 30.59it/s]
  2%|▏         | 5/300 [00:03<03:01,  1.62it/s]
  0%| 