In [13]:
BASE_DIR = "/tmp/akshett.jindal"
DATA_DIR = "/tmp/semeval24_task3"
BATCH_SIZE = 1

In [14]:
from os import path
from glob import glob
import re

audio_files_glob = "/home2/suyash.mathur/audios/*/*.wav"
# path.join(
#     DATA_DIR,
#     "train",
#     "SemEval-2024_Task3",
#     "Evaluation_Data",
#     "audios",
#     "*.wav",
# )

wav_files = sorted(
    glob(audio_files_glob, recursive=True),
    key=lambda fname: tuple([int(num) for num in re.findall(r"\d+", fname)]),
)
wav_files[:5], len(wav_files)

(['/home2/suyash.mathur/audios/train/dia1utt1.wav',
  '/home2/suyash.mathur/audios/train/dia1utt2.wav',
  '/home2/suyash.mathur/audios/train/dia1utt3.wav',
  '/home2/suyash.mathur/audios/train/dia1utt4.wav',
  '/home2/suyash.mathur/audios/train/dia1utt5.wav'],
 19920)

In [15]:
import transformers

MODEL_ID = "facebook/wav2vec2-large-960h"
CONFIG_CLASS = transformers.AutoConfig
MODEL_CLASS = transformers.AutoModel
PROCESSOR_CLASS = transformers.AutoFeatureExtractor

In [16]:
MODEL_NAME = MODEL_ID.replace("/", "_").replace(" ", "_")

In [17]:
from os import path

HUGGINGFACE_CACHE_DIR = path.join(BASE_DIR, ".huggingface_cache")
OUTPUT_FILE = path.join(DATA_DIR, "audio_embeddings", f"audio_embeddings_{MODEL_NAME}.pkl")

In [18]:
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

device = torch.device(device)
device

device(type='cuda')

In [19]:
processor = PROCESSOR_CLASS.from_pretrained(
    MODEL_ID,
    cache_dir=HUGGINGFACE_CACHE_DIR,
)

preprocessor_config.json: 100%|██████████| 159/159 [00:00<00:00, 46.1kB/s]
config.json: 100%|██████████| 843/843 [00:00<00:00, 2.21MB/s]


In [20]:
model = MODEL_CLASS.from_pretrained(
    MODEL_ID,
    cache_dir=HUGGINGFACE_CACHE_DIR,
).to(device)

pytorch_model.bin: 100%|██████████| 1.26G/1.26G [00:24<00:00, 50.6MB/s]
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from datasets import Dataset
import numpy
from os import path
import soundfile
from tqdm.auto import tqdm

def data_generator():
    for wav_file in tqdm(wav_files):
        with open(wav_file, "rb") as f:
            audio_data, _ = soundfile.read(f)
        audio_id = path.basename(wav_file).replace(".wav", "")
        yield { "id": audio_id, "audio": numpy.average(audio_data, axis=1) }

dataset = Dataset.from_generator(data_generator, cache_dir=path.join(HUGGINGFACE_CACHE_DIR, "datasets"))
dataset

Dataset({
    features: ['id', 'audio'],
    num_rows: 19920
})

In [22]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx+n, l)]

In [23]:
from os import path
import pickle

if path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "rb") as f:
        OUTPUTS = pickle.load(f)
else:
    OUTPUTS = {}

list(OUTPUTS.keys())[:5]

[]

In [24]:
import json
import numpy
from os import path
import pickle
import torch
from tqdm.auto import tqdm

model.eval()

with torch.inference_mode():
    for batch_num, d in tqdm(enumerate(batch(dataset, n=BATCH_SIZE)), total=len(dataset) // BATCH_SIZE):

        if all(audio_id in OUTPUTS for audio_id in d["id"]):
            continue

        try:
            inputs = processor(
                raw_speech=d["audio"],
                padding=BATCH_SIZE > 1,
                sampling_rate=16000,
                return_tensors="pt",
            )
            for k in inputs.keys():
                inputs[k] = inputs[k].to(device)

            outputs = model(**inputs)
            for k in outputs.keys():
                outputs[k] = numpy.array(outputs[k].cpu())

            last_hidden_states = numpy.mean(outputs["last_hidden_state"], axis=1)

            for audio_id, hs in zip(d["id"], last_hidden_states):
                OUTPUTS[audio_id] = last_hidden_states

        except Exception as ex:
            print(f"Failed on {d['id']= }: {ex}")

with open(OUTPUT_FILE, "wb") as f:
    pickle.dump(OUTPUTS, f)

  0%|          | 0/19920 [00:00<?, ?it/s]

 99%|█████████▉| 19745/19920 [20:17<00:10, 17.29it/s]

Failed on d['id']= ['dia2020utt6']: Calculated padded input size per channel: (1). Kernel size: (2). Kernel size can't be greater than actual input size


100%|██████████| 19920/19920 [20:26<00:00, 16.24it/s]


## Post Processing

In [25]:
BASE_DIR = "/tmp/akshett.jindal"

In [26]:
from os import path
from glob import glob
import re

batch_files_glob = path.join(
    BASE_DIR,
    "shared_task",
    "task03",
    "audio_embeddings",
    "microsoft_wavlm-base-sd",
    "batch_*.pkl",
)

batch_files = sorted(
    glob(batch_files_glob, recursive=True),
    key=lambda fname: tuple([int(num) for num in re.findall(r"\d+", fname)]),
)
batch_files[:5], len(batch_files)

([], 0)

In [27]:
import pickle

embeddings = {}

for batch_file in batch_files:
    with open(batch_file, "rb") as f:
        batches = pickle.load(f)
    for batch in batches:
        for audio_id, hidden_state in zip(batch["ids"], batch["last_hidden_state"]):
            embeddings[audio_id] = hidden_state

len(embeddings)

0

In [28]:
list(embeddings.keys())[:10]

[]

In [29]:
import pickle
import os.path

with open(os.path.join(BASE_DIR, "shared_task", "task03", "audio_embeddings", "microsoft_wavlm-base-sd_last_layer_embeddings.pkl"), "wb") as f:
    pickle.dump(embeddings, f)

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/akshett.jindal/shared_task/task03/audio_embeddings/microsoft_wavlm-base-sd_last_layer_embeddings.pkl'