# Audio Embeddings

- Uses [pyannote/embedding](https://huggingface.co/pyannote/embedding) to generate x-vector embeddings for audio waveforms.

In [None]:
# Load IPython's autoreload extension; mode 2 reloads all modules before
# executing code, so edits to imported modules are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#|default_exp audio.embeddings
#|export
import cgnai
from pathlib import Path
import sys
from cgnai.logging import cgnai_logger
import numpy as np
from cgnai.fileio import ls, load

# Logger for this module, obtained from cgnai_logger with the argument
# "embeddings" (presumably the logger name -- confirm against cgnai.logging).
logger = cgnai_logger("embeddings")
# Shorthand so call sites can write log(...) instead of logger.info(...).
log = logger.info

In [None]:
#|export
from pyannote.audio import Inference

# Cache of already constructed Inference objects, keyed by
# (duration, step, device).
embedding_models = {}

def get_embedding_model(duration:float = 4.0, step:float = 1.0, device="cuda"):
    """Return the sliding-window "pyannote/embedding" inference object.

    Objects are created lazily and memoised in ``embedding_models``, so
    repeated calls with the same ``(duration, step, device)`` triple return
    the same instance instead of building the model again.

    Args:
        duration: Forwarded to ``Inference`` as ``duration`` (presumably the
            sliding-window length in seconds -- confirm against pyannote).
        step: Forwarded to ``Inference`` as ``step`` (presumably the hop
            between consecutive windows in seconds -- confirm against pyannote).
        device: Forwarded to ``Inference`` as ``device``.

    Returns:
        The cached or newly created ``Inference`` object.
    """
    cache_key = (duration, step, device)
    if cache_key in embedding_models:
        return embedding_models[cache_key]

    # Cache miss: build the model once and remember it for later calls.
    inference = Inference("pyannote/embedding",
                          window="sliding",
                          duration=duration,
                          step=step,
                          device=device)
    embedding_models[cache_key] = inference
    return inference

In [None]:
#|export
def get_embedding(wav, rate, duration:float = 4.0, step:float = 1.0, device="cuda"):
    """Compute embeddings for an in-memory waveform.

    The model is obtained from ``get_embedding_model(duration, step, device)``
    and is called with a mapping holding the waveform and its sample rate.

    Args:
        wav: Waveform passed to the model under the ``'waveform'`` key
            (presumably a (channel, time) tensor -- confirm against pyannote).
        rate: Sample rate passed to the model under the ``"sample_rate"`` key.
        duration: Forwarded to ``get_embedding_model``.
        step: Forwarded to ``get_embedding_model``.
        device: Forwarded to ``get_embedding_model``.

    Returns:
        The ``.data`` attribute of the model's output.
    """
    inference = get_embedding_model(duration, step, device)
    audio = {'waveform': wav, "sample_rate": rate}
    return inference(audio).data

In [None]:
#|export
def load_embedding(mp3_path):
    """Load the stored embeddings for an mp3 file and normalise each row.

    The embeddings are read with ``load`` from the file whose path is
    ``str(mp3_path)`` with ``"_emb.npy"`` appended.

    Args:
        mp3_path: Path (or string) of the mp3 file the embeddings belong to.

    Returns:
        The loaded array divided by its L2 norm along axis 1 (the norm is
        taken with ``keepdims=True``, so the result keeps the loaded shape).
        The loaded array must therefore have at least two dimensions.
    """
    emb_file = str(mp3_path) + "_emb.npy"
    vectors = load(emb_file)
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / row_norms

In [None]:
from cgnai.utils import cgnai_home
from cgnai.fileio import ls, load
import torchaudio

# Build the model with the default settings (4.0, 1.0, "cuda"). get_embedding
# below uses the same defaults, so it reuses this cached instance.
model = get_embedding_model()
# Collect the entries under data_path whose name ends in ".mp3".
data_path = cgnai_home() / "shared/podverse/data/dlf_politik_podcast/"
files = [f for f in ls(data_path).files if str(f).endswith(".mp3")]
# Select the "sox_io" torchaudio backend before loading the audio.
torchaudio.set_audio_backend("sox_io")
# NOTE(review): joining data_path with files[0] assumes ls(...).files yields
# names relative to data_path (or absolute paths) -- confirm against cgnai.fileio.
wav, sr = torchaudio.load(data_path / files[0], format="mp3")
# wav[:] is a full slice, i.e. the complete waveform is embedded.
emb=get_embedding(wav[:], sr)
# Trailing expression: presumably displayed as the notebook cell's output.
emb.shape