## Requirements

In [1]:
# Install the third-party packages this notebook imports.
# NOTE(review): none of these installs is version-pinned, and transformers is
# built straight from the GitHub repository, so a later re-run may install
# different code than the recorded run did (its output shows it resolved to
# commit 2f4cdd97f5b837858f33d7d1095fba4b90871f57, i.e. transformers 4.27.0.dev0).
# Consider pinning versions for reproducibility.
!pip install git+https://github.com/huggingface/transformers
!pip install datasets
!pip install pytorch_metric_learning
!pip install opendatasets
!pip install pydub
!pip install gdown
!pip install tabulate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-44u66l4z
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-44u66l4z
  Resolved https://github.com/huggingface/transformers to commit 2f4cdd97f5b837858f33d7d1095fba4b90871f57
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.27.0.dev0-py3-none-any.whl size=6708213 sha256=921dd4af552a3bfc1bf59dd9fd3e9c0c2017720d4a71c132842bcd0a1045fde8
  Stored in directory: /tmp/pip-ephem-wheel-cache-xzdyn2y_/wheels/35/2e/a7/d819e3310040329f0f47e57c9e3e7a7338aa5e74c49acfe522
Successfully built transformers
Insta

In [2]:
import torch
from transformers import AutoProcessor, Wav2Vec2Model
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from time import time
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import os
from pydub import AudioSegment
import opendatasets as od
import pandas as pd
import gc
import random
import pickle as pkl
import gdown
from tabulate import tabulate

## Load Files

In [3]:
# Download the pickled test set from Google Drive to `output` in the current
# working directory.
url = "https://drive.google.com/file/d/1-390QxYWgKkhxQLttpEPfQMk0xr8Lb2Y/view?usp=share_link"
output = "test_dataset.pkl"
# `url` is a share/view link rather than a direct download link; fuzzy=True is
# presumably what lets gdown extract the file id from it — confirm in gdown docs.
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1-390QxYWgKkhxQLttpEPfQMk0xr8Lb2Y
To: /kaggle/working/test_dataset.pkl
100%|██████████| 476M/476M [00:01<00:00, 305MB/s] 


'test_dataset.pkl'

## Preparing Data

In [3]:
class TestDataset(Dataset):
    """Dataset of text queries, each paired with a set of candidate audio embeddings.

    `data` is an indexable collection of records. Every record is read through
    the keys 'id', 'keywords', 'candidates', 'label' and 'audio_embedding'.
    The entries of 'candidates' are record ids, resolved to positions in
    `data` through `id_to_idx`.
    """

    def __init__(self, data):
        self.data = data
        # Record id -> position in `data`; if an id repeats, the last position wins.
        self.id_to_idx = {
            self.data[position]['id']: position
            for position in range(len(self.data))
        }

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return `(keywords, stacked candidate embeddings, label)` for one record.

        The embeddings are stacked along a new leading axis, in the order the
        candidate ids appear in the record's 'candidates' entry.
        """
        record = self.data[index]
        keywords = record['keywords']
        candidate_ids = record['candidates']
        target = record['label']

        # Resolve each candidate id to its record and wrap its embedding in a tensor.
        embeddings = [
            torch.tensor(self.data[self.id_to_idx[candidate_id]]['audio_embedding'])
            for candidate_id in candidate_ids
        ]
        stacked_embeddings = torch.stack(embeddings)

        return keywords, stacked_embeddings, target

In [4]:
# Load the test records from the pickle file on disk.
# SECURITY NOTE(review): unpickling can execute arbitrary code. Only load this
# file if its source is trusted; consider a safer serialization format.
with open('test_dataset.pkl', 'rb') as f:
    test_data = pkl.load(f)

In [5]:
# Wrap the loaded records in the dataset and iterate them in their stored order.
test_dataset = TestDataset(test_data)
# batch_size=1: the evaluation loop reads `text[0]` and `label.item()`, i.e. it
# handles a single instance per batch.
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

## Model

### CLAP

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the cell displays the selected device.
device

device(type='cuda')

In [7]:
from datasets import load_dataset
from transformers import AutoProcessor, ClapModel

# Load the pretrained CLAP checkpoint and move the model to the selected device.
model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)
# Matching processor for the same checkpoint; it is called with both `text=` and
# `audios=` when building the model inputs.
processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

Downloading (…)lve/main/config.json:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/615M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [8]:
def text_audio_similarity(input_text, audio_sample, model=model, processor=processor, sampling_rate=None):
    """Score audio candidates against a text query with CLAP.

    Args:
        input_text: Text passed to the processor as `text=`.
        audio_sample: Audio input(s) passed to the processor as `audios=`.
        model: CLAP model to run; defaults to the notebook-level `model`.
        processor: Processor that builds the model inputs; defaults to the
            notebook-level `processor`.
        sampling_rate: Optional sampling rate of `audio_sample`. It is
            forwarded to the processor only when given, so the default call is
            unchanged. Passing it avoids the processor's "strongly recommended
            to pass the `sampling_rate` argument" warning.

    Returns:
        The softmax over the last dimension of `outputs.logits_per_text`, so
        these are normalized probabilities rather than raw similarities.
        Presumably shaped (num_texts, num_audios) — confirm against the
        ClapModel output documentation.
    """
    processor_kwargs = {
        "text": input_text,
        "audios": audio_sample,
        "return_tensors": "pt",
        "padding": True,
    }
    if sampling_rate is not None:
        processor_kwargs["sampling_rate"] = sampling_rate

    inputs = processor(**processor_kwargs).to(device)
    outputs = model(**inputs)
    # Text-to-audio logits: normalizing over the last axis gives, for each
    # text, a distribution over the audio inputs.
    logits_per_text = outputs.logits_per_text
    return logits_per_text.softmax(dim=-1)

## Test and Evaluation

In [9]:
def evaluate(model, dataloader):
    """Evaluate text-to-audio retrieval over `dataloader`.

    Each batch must hold exactly one instance: `(text, candidates, label)`,
    where `label` is the index of the correct candidate within `candidates`.

    Args:
        model: Model forwarded to `text_audio_similarity` for scoring.
        dataloader: Iterable yielding `(text, candidates, label)` batches of
            size 1.

    Returns:
        dict with 'Hits@1', 'MRR', 'Precision', 'Recall', 'F1' and 'Accuracy'.
        Precision, recall and F1 are macro-averaged over candidate indices.

    Raises:
        ValueError: If a batch holds more than one label, or if the dataloader
            yields no instances.
    """
    total_hits_1 = 0
    total_mrr = 0.0
    total_instances = 0
    total_labels = []
    total_predictions = []

    with torch.no_grad():
        for text, candidates, label in tqdm(dataloader):
            gc.collect()
            if label.numel() != 1:
                raise ValueError(
                    "evaluate() expects one instance per batch (batch_size=1); "
                    f"got {label.numel()} labels"
                )
            text = text[0]
            # Drop only the batch axis; a bare squeeze() would also collapse the
            # candidate axis when there is a single candidate.
            candidates = candidates.cpu().squeeze(0)
            target_idx = int(label.item())

            # Score every candidate against the text and flatten to one score
            # per candidate (the helper returns a text-by-audio matrix).
            scores = text_audio_similarity(text, candidates, model=model).reshape(-1)

            # Compute Hits@1
            predicted_idx = int(torch.argmax(scores).item())
            total_hits_1 += int(predicted_idx == target_idx)

            # Compute MRR: position of the correct candidate in the ranking
            ranking = torch.argsort(scores, descending=True).tolist()
            label_rank = ranking.index(target_idx)
            total_mrr += 1 / (label_rank + 1)

            # Record plain Python ints so the sklearn metrics can consume them
            total_labels.append(target_idx)
            total_predictions.append(predicted_idx)

            total_instances += 1

    if total_instances == 0:
        raise ValueError("evaluate() received a dataloader with no instances")

    # Compute average metrics over all instances
    avg_hits_1 = total_hits_1 / total_instances
    avg_mrr = total_mrr / total_instances
    precision = precision_score(total_labels, total_predictions, average='macro')
    recall = recall_score(total_labels, total_predictions, average='macro')
    f1 = f1_score(total_labels, total_predictions, average='macro')
    accuracy = accuracy_score(total_labels, total_predictions)

    return {
        'Hits@1': avg_hits_1,
        'MRR': avg_mrr,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Accuracy': accuracy
    }

In [None]:
# Run the evaluation and print the metrics as a two-column grid table.
results = evaluate(model, test_loader)
# One [metric, value] row per entry, in the dict's own order.
table = [[metric_name, metric_value] for metric_name, metric_value in results.items()]
print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
