Configurations

In [4]:
# --- Notebook-wide settings; edit these instead of hard-coding values below ---

# Sampling rate in Hz passed to the audio loader.
SAMPLE_RATE = 16_000

# Length of one segment in seconds (segment size in samples = DURATION * SAMPLE_RATE).
DURATION = 3

# Directory that is scanned for input audio files.
AUDIO_DIR_PATH = "./data/test"

# Hop between analysis frames, in samples (512 / 16000 s = 32 ms per frame).
HOP_LENGTH = 512

# Number of speaker labels. The author's sample data has only 3 speakers;
# per the original note, set this to 200 for the full dataset.
LABELS = 3

Load audio and split it into raw waveform segments

In [5]:
import os
import librosa

# Number of samples in one fixed-length segment. It is the same for every file,
# so it is computed once instead of on every loop iteration.
total_samples = DURATION * SAMPLE_RATE

# Collect every full-length, non-overlapping segment from every audio file.
testsegments = []
# os.listdir() returns entries in arbitrary order; sorting makes the segment
# order (and therefore any later index-based sampling) reproducible.
for audio_file_name in sorted(os.listdir(AUDIO_DIR_PATH)):
    audio_path = os.path.join(AUDIO_DIR_PATH, audio_file_name)
    if not os.path.isfile(audio_path):
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be loaded as audio.
        continue
    # Load the file at SAMPLE_RATE. `sr` is kept as a name on purpose:
    # other cells may still read it.
    waveform, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    # Step through the waveform one segment at a time; the range stops early enough
    # that a trailing, incomplete segment is never produced.
    for i in range(0, len(waveform) - total_samples + 1, total_samples):
        testsegments.append(waveform[i:i + total_samples])

In [6]:
# Show how many full-length segments were collected.
len(testsegments)

2

In [7]:
import random

# Pick one of the collected segments at a uniformly random index to run through the pipeline.
segment = testsegments[random.randrange(len(testsegments))]

Phonetic and Acoustic Features

In [8]:
from transformers import ASTFeatureExtractor, ASTForAudioClassification

# Both the feature extractor and the model come from the same checkpoint and
# share one local cache directory, so name them once.
AST_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
AST_CACHE_DIR = "./model/ast"

# Prepares model inputs from a raw waveform.
feature_extractor = ASTFeatureExtractor.from_pretrained(AST_CHECKPOINT, cache_dir=AST_CACHE_DIR)

# Audio Spectrogram Transformer with its classification head.
ast_model = ASTForAudioClassification.from_pretrained(AST_CHECKPOINT, cache_dir=AST_CACHE_DIR)

# Switch to inference mode; as the last expression this also displays the model summary.
ast_model.eval()

  from .autonotebook import tqdm as notebook_tqdm


ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=T

In [9]:
import torch

def get_phonetic_acoustic_vector(segment, sampling_rate=None):
    """Run one audio segment through the AST model and return its output vector.

    Note that the returned vector is the classification head's logits
    (`outputs.logits`), not a hidden-state embedding; the rest of the notebook
    uses these logits as the phonetic/acoustic feature vector.

    Args:
        segment: 1-D waveform, as a NumPy array or a torch.Tensor.
        sampling_rate: Sampling rate of `segment` in Hz. Defaults to the
            notebook-wide SAMPLE_RATE instead of a hard-coded literal.

    Returns:
        torch.Tensor: the logits with the leading batch dimension squeezed out.
    """
    if sampling_rate is None:
        sampling_rate = SAMPLE_RATE

    # The feature extractor is fed a NumPy array. detach()/cpu() make the
    # conversion safe for tensors that require grad or live on a GPU, where a
    # bare .numpy() would raise.
    if isinstance(segment, torch.Tensor):
        segment = segment.detach().cpu().numpy()

    # Turn the raw waveform into model inputs (returned as PyTorch tensors).
    inputs = feature_extractor(segment, sampling_rate=sampling_rate, return_tensors="pt")

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = ast_model(**inputs)

    # A single segment was passed in, so drop the batch dimension.
    return outputs.logits.squeeze(0)

Prosodic Features

In [10]:
import numpy as np
import torch.nn as nn

class ProsodyRNN(nn.Module):
    """Bidirectional GRU that summarizes a frame sequence as one fixed-size vector.

    Args:
        input_dim: Number of features per frame.
        hidden_dim: GRU hidden size per direction.
        output_dim: Size of the projected output vector.
    """

    def __init__(self, input_dim=2, hidden_dim=64, output_dim=128):
        super().__init__()
        # The attribute names `rnn` and `proj` are the state_dict key prefixes,
        # so saved weights depend on them staying unchanged.
        self.rnn = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward final states are concatenated, hence 2 * hidden_dim inputs.
        self.proj = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Encode a batch of sequences `x` ([B, T, input_dim]) into [B, output_dim]."""
        # Only the final hidden states are used; the per-step outputs are discarded.
        _, final_hidden = self.rnn(x)  # [2, B, H]: one slice per direction
        forward_state, backward_state = final_hidden.unbind(dim=0)
        summary = torch.cat((forward_state, backward_state), dim=1)  # [B, 2H]
        return self.proj(summary)

def get_prosodic_features(segment, sr=None):
    """Compute frame-wise pitch and energy for one audio segment.

    Each row of the result is one time frame (HOP_LENGTH samples apart, i.e.
    512 / 16000 s = 32 ms with the notebook defaults) and each column is one
    feature: column 0 is pitch, column 1 is energy.

    Args:
        segment: 1-D waveform as a NumPy array.
        sr: Sampling rate of `segment` in Hz. Defaults to the notebook-wide
            SAMPLE_RATE. Previously this function read a global `sr` that only
            existed as a leftover of the audio-loading loop.

    Returns:
        np.ndarray of shape [T, 2].
    """
    if sr is None:
        sr = SAMPLE_RATE

    # Fundamental-frequency estimate per frame, searched between fmin and fmax.
    pitch = librosa.yin(segment, fmin=50, fmax=300, sr=sr, hop_length=HOP_LENGTH)
    # Root-mean-square energy per frame; [0] takes the first row of the result.
    energy = librosa.feature.rms(y=segment, hop_length=HOP_LENGTH)[0]

    # Frame-align pitch and energy by truncating both to the shorter length.
    length = min(len(pitch), len(energy))
    pitch = pitch[:length]
    energy = energy[:length]

    # Combine frame-wise: one (pitch, energy) pair per row.
    features = np.stack([pitch, energy], axis=1)  # [T x 2]

    return features

# Loaded ProsodyRNN models keyed by weights path, so that repeated calls do not
# rebuild the network and re-read the checkpoint from disk every time.
# Re-running this cell clears the cache.
_PROSODY_MODEL_CACHE = {}


def _load_prosody_model(weights_path):
    """Return an eval-mode ProsodyRNN for `weights_path`, reading the file only once."""
    model = _PROSODY_MODEL_CACHE.get(weights_path)
    if model is None:
        model = ProsodyRNN()
        # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
        # map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only machine.
        state_dict = torch.load(weights_path, map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        _PROSODY_MODEL_CACHE[weights_path] = model
    return model


def get_prosodic_vector(segment, weights_path='./model/weights/prosody_rnn.pt'):
    """Encode the prosody (pitch and energy contour) of one segment as a vector.

    Args:
        segment: 1-D waveform as a NumPy array.
        weights_path: Path to the saved ProsodyRNN state dict.

    Returns:
        np.ndarray of shape [D], where D is the ProsodyRNN output size.
    """
    features = get_prosodic_features(segment)  # shape: [T, 2]

    # Convert to a float tensor and add the batch dimension the model expects.
    x = torch.tensor(features, dtype=torch.float32).unsqueeze(0)  # shape: [1, T, 2]

    prosody_model = _load_prosody_model(weights_path)

    # Inference only: no gradients needed.
    with torch.no_grad():
        prosody_vector = prosody_model(x)  # shape: [1, D]

    # Drop the batch dimension and hand back a NumPy array.
    return prosody_vector.squeeze(0).numpy()  # shape: [D]

Feature Extraction

In [11]:
def l2_normalize(v):
    """Scale `v` to unit L2 (Euclidean) norm.

    Args:
        v: Array-like vector.

    Returns:
        np.ndarray: `v` divided by its L2 norm. A vector whose norm is zero is
        returned as zeros instead of being divided by zero, which would fill
        the result with NaNs and poison everything computed from it.
    """
    v = np.asarray(v)
    norm = np.linalg.norm(v)
    if norm == 0:
        # Keep floating dtypes as they are; promote integer input to float,
        # matching what true division would have produced.
        return np.zeros(v.shape, dtype=np.result_type(v.dtype, np.float32))
    return v / norm

# Phonetic/acoustic branch: AST model output for the sampled segment (torch tensor)
phonetic_acoustic_vector = get_phonetic_acoustic_vector(segment)

# Prosodic branch: pitch/energy encoding of the same segment (NumPy array)
prosody_vector = get_prosodic_vector(segment)

# Bring each vector to unit L2 norm before joining them
a_vec = l2_normalize(phonetic_acoustic_vector.numpy())  # [D2]
p_vec = l2_normalize(prosody_vector)                    # [D1]

# Final query vector: prosodic part first, then the acoustic part
query_vector = np.concatenate((p_vec, a_vec))  # shape: [D1 + D2]

Classification Model

In [12]:
import joblib

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load classifier files from a trusted source.
clf = joblib.load('./model/classifier.pkl')

# The query vector is passed as a one-element batch; take the single prediction back out.
batch_predictions = clf.predict([query_vector])
predicted_cluster = batch_predictions[0]

In [13]:
# Display the label the classifier predicted for the sampled segment.
predicted_cluster

np.int32(0)