## Requirements

In [1]:
# Install the extra packages this notebook needs.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
%pip install gdown transformers

[0m

## Libraries

In [2]:
import gdown
import torch
import pickle
import numpy as np
from transformers import AutoProcessor, Wav2Vec2Model
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import gc
from sklearn.model_selection import train_test_split
import random
import os

## Loading Data

In [None]:
# Fetch the audio waveforms from Google Drive into the working directory.
# fuzzy=True asks gdown to extract the file id from the full share link.
url, output = (
    "https://drive.google.com/file/d/1klayW8tzpkmhS6n6Zb3ljvDG40SakMjR/view?usp=sharing",
    "audios_dataset.npy",
)
gdown.download(url=url, output=output, quiet=False, fuzzy=True)

# Fetch the matching keywords file the same way.
url, output = (
    "https://drive.google.com/file/d/1-1HvdclgtF-geqbVJC8sOxJ0Q2oLS6ex/view?usp=share_link",
    "keywords_dataset.pkl",
)
gdown.download(url=url, output=output, quiet=False, fuzzy=True)

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [4]:
# Read the two downloaded dataset files from the working directory.
# NOTE(review): both reads unpickle arbitrary objects (pickle.load and
# allow_pickle=True) -- only run this on files from a trusted source.
with open('keywords_dataset.pkl', 'rb') as keywords_file:
    keywords_dataset = pickle.load(keywords_file)
audios_dataset = np.load('audios_dataset.npy', allow_pickle=True)

# The loop further down zips the two collections, so their lengths should match.
n_keywords, n_audios = len(keywords_dataset), len(audios_dataset)
print(n_keywords, n_audios)

3000 3000


## Loading Models

In [None]:
# The processor and the encoder are loaded from the same pretrained checkpoint.
audio_checkpoint = "facebook/wav2vec2-base-960h"
audio_processor = AutoProcessor.from_pretrained(audio_checkpoint)
audio_model = Wav2Vec2Model.from_pretrained(audio_checkpoint)
audio_model = audio_model.to(device)

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
text_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(text_checkpoint)
text_model = AutoModel.from_pretrained(text_checkpoint)
text_model = text_model.to(device)

## Preprocess

In [7]:
# Release cached memory before the embedding loop.
# The CUDA calls are skipped when no GPU is available: torch.cuda.ipc_collect()
# initialises CUDA and fails on a CPU-only machine, which would break the
# CPU fallback selected for `device`.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
gc.collect()

39

In [8]:
# Build one record per (keywords, waveform) pair. Each record keeps the raw
# inputs plus two embeddings: the BERT last hidden state and the wav2vec2 last
# hidden state, each averaged over dim 0 after squeeze(0).
dataset = list()
with tqdm(enumerate(zip(keywords_dataset, audios_dataset)), total=len(keywords_dataset)) as pbar:
    for i, (keywords, audio_waveform) in pbar:
        # Periodically release cached memory. The CUDA calls are guarded so
        # the loop also runs when `device` is the CPU.
        if i % 10 == 0:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()
            gc.collect()

        # Inference only: without no_grad every stored BERT embedding would
        # keep its autograd graph alive for the lifetime of `dataset`.
        with torch.no_grad():
            text_inputs = tokenizer(keywords, return_tensors="pt").to(device)
            text_outputs = text_model(**text_inputs)
            # squeeze(0) drops the leading size-1 dimension (presumably the
            # batch of one -- TODO confirm `keywords` is a single string).
            pooled_embeddings = text_outputs.last_hidden_state.squeeze(0).mean(dim=0)

            audio = torch.from_numpy(audio_waveform)
            audio_inputs = audio_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
            audio_outputs = audio_model(**audio_inputs)
            embeddings = audio_outputs.last_hidden_state.squeeze(0).mean(dim=0)

        # Ids are 1-based: position in the source data plus one.
        # NOTE(review): the embeddings stay on `device`, so the pickles written
        # from these records will hold tensors from that device.
        data = {
            'id': i + 1,
            'audio_waveform': audio_waveform,
            'keywords': keywords,
            'bert_embedding': pooled_embeddings,
            'audio_embedding': embeddings,
        }

        dataset.append(data)
len(dataset)

100%|██████████| 3000/3000 [04:26<00:00, 11.28it/s]


3000

In [9]:
# 80/10/10 split: hold out 20% of the records, then cut the held-out part in
# half to get the validation and test sets. Both splits use the same seed.
train_dataset, holdout_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
validation_dataset, test_dataset = train_test_split(holdout_dataset, test_size=0.5, random_state=42)

In [17]:
# Drop the references to the models, their preprocessors and the raw datasets
# so the garbage collector can reclaim them.
audio_processor = audio_model = None
tokenizer = text_model = None
keywords_dataset = audios_dataset = None

In [25]:
# Release cached memory now that the model references have been dropped.
# The CUDA calls are skipped when no GPU is available: torch.cuda.ipc_collect()
# initialises CUDA and fails on a CPU-only machine.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

gc.collect()

23

In [13]:
# Keep the pickles from a previous run as `old_*` files before they are
# overwritten by the cells below.
previous_outputs = [
    'train_dataset.pkl',
    'test_dataset.pkl',
    'validation_dataset.pkl',
    'test_dataset_with_negative_samples.pkl',
]
for filename in previous_outputs:
    source_path = '/kaggle/working/' + filename
    # On a fresh session there is nothing to back up; an unconditional
    # os.rename would raise FileNotFoundError and stop the notebook.
    if os.path.exists(source_path):
        os.rename(source_path, 'old_' + filename)

In [19]:
# Persist the training split to the working directory.
with open('train_dataset.pkl', 'wb') as train_file:
    pickle.dump(train_dataset, train_file)

In [23]:
# Persist the validation split to the working directory.
with open('validation_dataset.pkl', 'wb') as validation_file:
    pickle.dump(validation_dataset, validation_file)

In [24]:
# Persist the test split to the working directory.
with open('test_dataset.pkl', 'wb') as test_file:
    pickle.dump(test_dataset, test_file)

In [26]:
# Turn every test record into a multiple-choice item: its own id plus
# `number_of_negative_samples` other test ids, shuffled, with `label` giving the
# position of the true id inside `candidates`.
test_dataset_with_negative_samples = []
total_ids = [data['id'] for data in test_dataset]
number_of_negative_samples = 5

# Dedicated, seeded generator so the candidate lists are the same on every run
# and the global `random` state is left untouched.
negative_sampler = random.Random(42)

for data in test_dataset:
    index = data['id']
    # Every test id except the record's own is a possible negative.
    other_ids = [candidate_id for candidate_id in total_ids if candidate_id != index]
    sampled_list = negative_sampler.sample(other_ids, number_of_negative_samples)
    sampled_list.append(index)
    negative_sampler.shuffle(sampled_list)

    # Work on a shallow copy so the records in `test_dataset` are not mutated.
    sample = dict(data)
    sample['candidates'] = sampled_list
    sample['label'] = sampled_list.index(index)

    test_dataset_with_negative_samples.append(sample)

In [27]:
# Persist the test records that carry candidate ids and labels.
with open('test_dataset_with_negative_samples.pkl', 'wb') as negatives_file:
    pickle.dump(test_dataset_with_negative_samples, negatives_file)

## Load and Verify Saved Splits

In [3]:
# Read the training split back from disk.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted pickles.
with open('train_dataset.pkl', 'rb') as train_file:
    training_dataset = pickle.load(train_file)

In [19]:
# Show which fields a training record carries.
first_training_record = training_dataset[0]
first_training_record.keys()

dict_keys(['id', 'audio_waveform', 'keywords', 'bert_embedding', 'audio_embedding'])

In [20]:
# Read the test records with candidates and labels back from disk.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted pickles.
with open('test_dataset_with_negative_samples.pkl', 'rb') as negatives_file:
    test_dataset_with_negative_samples = pickle.load(negatives_file)

In [22]:
# Show which fields a record with negative samples carries.
first_negative_sample_record = test_dataset_with_negative_samples[0]
first_negative_sample_record.keys()

dict_keys(['id', 'audio_waveform', 'keywords', 'bert_embedding', 'audio_embedding', 'candidates', 'label'])

In [4]:
# Read the validation split back from disk.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted pickles.
with open('validation_dataset.pkl', 'rb') as validation_file:
    validation_dataset = pickle.load(validation_file)

In [5]:
# Read the test split back from disk.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted pickles.
with open('test_dataset.pkl', 'rb') as test_file:
    test_dataset = pickle.load(test_file)

In [6]:
# Collect the record ids of each split as sets.
validation_ids = {record['id'] for record in validation_dataset}
test_ids = {record['id'] for record in test_dataset}
train_ids = {record['id'] for record in training_dataset}

# Ids present in all three splits at once. This is empty as soon as any two
# splits are disjoint, so it does not prove the splits are pairwise disjoint.
test_ids.intersection(validation_ids, train_ids)

set()

In [7]:
# Ids shared by the validation and training splits (expected to be empty).
validation_ids.intersection(train_ids)

set()

In [9]:
# Ids shared by the test and validation splits (expected to be empty).
test_ids.intersection(validation_ids)

set()

In [10]:
# Ids shared by the training and test splits (expected to be empty).
# The train/validation pair is already checked in an earlier cell; repeating it
# here would leave train/test leakage unchecked.
train_ids & test_ids

set()

In [8]:
# Number of distinct ids across the three splits.
all_split_ids = set().union(test_ids, validation_ids, train_ids)
len(all_split_ids)

3000

In [None]:
# Ids run from 1 to 3000 inclusive; whatever is not in the test or validation
# split is treated as training.
total_ids = set(range(1, 3000 + 1))
training_ids = total_ids.difference(test_ids, validation_ids)
len(training_ids)

In [None]:
# Persist the set of training ids to the working directory.
with open('training_ids.pkl', 'wb') as ids_file:
    pickle.dump(training_ids, ids_file)

In [None]:
# Read the set of training ids back from disk and report its size.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted pickles.
with open('training_ids.pkl', 'rb') as ids_file:
    training_ids = pickle.load(ids_file)
len(training_ids)

In [None]:
# Rebuild the embedding records for the training ids only. Each record keeps
# the raw inputs plus the BERT and wav2vec2 last hidden states, each averaged
# over dim 0 after squeeze(0).
# NOTE(review): this cell needs the raw datasets, tokenizer, processor and
# models to be loaded; an earlier cell sets them to None.
dataset = list()
with tqdm(enumerate(zip(keywords_dataset, audios_dataset)), total=len(keywords_dataset)) as pbar:
    for i, (keywords, audio_waveform) in pbar:
        # Periodically release cached memory. The CUDA calls are guarded so
        # the loop also runs when `device` is the CPU.
        if i % 10 == 0:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()
            gc.collect()

        # Record ids are 1-based (position + 1), and `training_ids` holds those
        # ids. Filtering on the 0-based position would select the wrong records.
        record_id = i + 1
        if record_id not in training_ids:
            continue

        # Inference only: without no_grad every stored BERT embedding would
        # keep its autograd graph alive for the lifetime of `dataset`.
        with torch.no_grad():
            text_inputs = tokenizer(keywords, return_tensors="pt").to(device)
            text_outputs = text_model(**text_inputs)
            # squeeze(0) drops the leading size-1 dimension (presumably the
            # batch of one -- TODO confirm `keywords` is a single string).
            pooled_embeddings = text_outputs.last_hidden_state.squeeze(0).mean(dim=0)

            audio = torch.from_numpy(audio_waveform)
            audio_inputs = audio_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
            audio_outputs = audio_model(**audio_inputs)
            embeddings = audio_outputs.last_hidden_state.squeeze(0).mean(dim=0)

        data = {
            'id': record_id,
            'audio_waveform': audio_waveform,
            'keywords': keywords,
            'bert_embedding': pooled_embeddings,
            'audio_embedding': embeddings,
        }

        dataset.append(data)
len(dataset)

In [None]:
# Write the rebuilt training records over train_dataset.pkl.
with open('train_dataset.pkl', 'wb') as train_file:
    pickle.dump(dataset, train_file)

## Download Links (for Moving Files to Drive)

In [15]:
from IPython.display import FileLink

# Render a download link for the training split.
train_link = FileLink('train_dataset.pkl')
train_link

In [16]:
# Render a download link for the test split.
test_link = FileLink('test_dataset.pkl')
test_link

In [17]:
# Render a download link for the test records with negative samples.
negative_samples_link = FileLink('test_dataset_with_negative_samples.pkl')
negative_samples_link

In [18]:
# Render a download link for the validation split.
validation_link = FileLink('validation_dataset.pkl')
validation_link