# Compare Feature Extraction Type (MFCCs and WavLM) and compare Template Type (Child, Adult, VC Adult)

## Setup of Functions

In [2]:
from __future__ import division
from __future__ import print_function
from os import path
import os, glob, torch, torchaudio, re
from python_speech_features import delta
from python_speech_features import mfcc
import numpy as np
import sys
from pathlib import Path
import speech_dtw.qbe as qbe

from transformers import WavLMModel
from sklearn.decomposition import PCA

sys.path.append("..")
sys.path.append(path.join("..", "utils"))

SAMPLE_RATE = 16000 
WAVLM_LAYER_INDEX = 6
device = "cpu"
model = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device).eval()

def cmvn(X):
    # X: [T, D] NumPy
    mu = X.mean(axis=0, keepdims=True)
    sd = X.std(axis=0, keepdims=True)
    return (X - mu) / (sd + 1e-8)

def getMFCCsFeatures(file): #A function which extracts MFCCs features from a given audio file
    sig, rate = torchaudio.load(file) #Reads the audio file, extracting the sample rate and signal data (as an array)
    if rate != SAMPLE_RATE: #Check if sampled as correct sampling rate, if not - resample
        print("Resampling", file ,"at 16kHz.\n")
        sig = torchaudio.functional.resample(sig, rate, SAMPLE_RATE)
    sig = sig.squeeze(0).numpy()
    MFCC_static = mfcc(sig, SAMPLE_RATE) #Extracts MFCCs features given
    MFCC_deltas = delta(MFCC_static, 2) #Calculates delta (first derivative) of MFCCs features
    MFCC_delta_deltas = delta(MFCC_deltas, 2) #Calculates delta-delta (second derivative) of MFCCs features
    features = np.hstack((MFCC_static, MFCC_deltas, MFCC_delta_deltas)) #Combine static, delta, and delta-delta features into a single feature vector
    features = cmvn(features) #Applies cepstral mean and variance normalization to features
    return features

def getWavLMFeatures(file): #A function which extracts MFCCs features from a given audio file
    sig, rate = torchaudio.load(file) #Reads the audio file, extracting the sample rate and signal data (as an array)
    if rate != SAMPLE_RATE: #Check if sampled as correct sampling rate, if not - resample
        print("Resampling", file ,"at 16kHz.\n")
        sig = torchaudio.functional.resample(sig, rate, SAMPLE_RATE)
    sig = sig.to(device)
    with torch.inference_mode(): #Extracts layer 6 features
        out = model(sig, output_hidden_states=True)
        features = out.hidden_states[WAVLM_LAYER_INDEX].squeeze(0)  # [T, D] torch
    features = features.numpy() #Convert to numpy
    features = cmvn(features) #Applies cepstral mean and variance normalization to features
    return features

def getFeatures(file, feature_type="wavlm"):
    if feature_type == "mfcc":
        return getMFCCsFeatures(file)
    else:
        return getWavLMFeatures(file)

def getMinimumCost(queryFile, templateFile, feature_type="wavlm"): #Loading the features
    queryFeatures = getFeatures(queryFile, feature_type) #Extract features for query data
    templateFeatures = torch.load(templateFile)["features"].numpy() #Load the template's feature file
    queryFeatures = np.ascontiguousarray(queryFeatures, dtype=np.float64) #Make both feature sets 2D, float64, contiguous
    templateFeatures = np.ascontiguousarray(templateFeatures, dtype=np.float64)
    distance = qbe.dtw_sweep_min(queryFeatures, templateFeatures) #Calculate the minimum sweeping DTW distance between the two feature sets
    return distance

def predict(queryFile, templateFolder, feature_type="wavlm"):
    distances = [] #This will store tuples of (distance, label)
    for templateFile in Path(templateFolder).rglob("*.pt"): #Loop through all template feature files
        distance = getMinimumCost(queryFile, str(templateFile), feature_type) #Get the minimum cost between the query and template
        label = torch.load(templateFile)["label"] #Extract the label of the template
        distances.append((distance, label)) #Append the distance and label as a tuple to the distances list
    distances.sort(key=lambda x: x[0]) #Sort the distances list by distance (first element of tuple) (using a lambda function)
    predicted_label = distances[0][1] #Minimum cost prediction (k=1)
    return predicted_label

def getAccuracy(testFolder, templateFolder, feature_type="wavlm"):
    correct = 0
    total = 0
    for testFile in Path(testFolder).rglob("*.wav"): #Loop through all test audio files
        if not re.match(r"^\d{2}_\d{2}\.wav$", testFile.name): # skip files not matching NN_NN.wav
            continue
        true_label = int(testFile.stem.split("_")[0]) #Extract the true label from the filename
        predicted_label = predict(str(testFile), templateFolder, feature_type) #Predict the label using minimum cost
        if predicted_label == true_label:
            correct += 1
        total += 1
    accuracy = (correct / total) * 100
    return accuracy

  from .autonotebook import tqdm as notebook_tqdm


## Evaluation for Template Types and Feature Types

WavLM Accuracies:

In [6]:
accuracyWavLMAdult = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/WavLM/Adult", feature_type="wavlm")
print(f"WavLM Adult Template Accuracy: {accuracyWavLMAdult:.2f}%")
accuracyWavLMAdultVC = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/WavLM/AdultVC", feature_type="wavlm")
print(f"WavLM AdultVC Template Accuracy: {accuracyWavLMAdultVC:.2f}%")
accuracyWavLMChild = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/WavLM/Child", feature_type="wavlm")
print(f"WavLM Child Template Accuracy: {accuracyWavLMChild:.2f}%")

WavLM Adult Template Accuracy: 58.82%
WavLM AdultVC Template Accuracy: 60.78%
WavLM Child Template Accuracy: 65.69%


MFCCs Accuracies:

In [4]:
accuracyMFCCsAdult = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/MFCCs/Adult", feature_type="mfcc")
print(f"MFCCs Adult Template Accuracy: {accuracyMFCCsAdult:.2f}%")
accuracyMFCCsAdultVC = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/MFCCs/AdultVC", feature_type="mfcc")
print(f"MFCCs AdultVC Template Accuracy: {accuracyMFCCsAdultVC:.2f}%")
accuracyMFCCsChild = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/MFCCs/Child", feature_type="mfcc")
print(f"MFCCs Child Template Accuracy: {accuracyMFCCsChild:.2f}%")

MFCCs Adult Template Accuracy: 23.53%
MFCCs AdultVC Template Accuracy: 39.22%
MFCCs Child Template Accuracy: 59.80%


## Evaluation for Different WavLM Models

In [9]:
accuracyWavLMLarge = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/WavLM/Child", feature_type="wavlm")
print(f"WavLM Large Accuracy: {accuracyWavLMLarge:.2f}%")

WavLM Large Accuracy: 65.69%


The large model took 20min 57.9 seconds to run, giving it a prediction time of 12.33 seconds per query

In [10]:
model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus").to(device).eval()
accuracyWavLMBasePlus = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/WavLMBase+/English", feature_type="wavlm")
print(f"WavLM Base+ Template Accuracy: {accuracyWavLMBasePlus:.2f}%")

WavLM Base+ Template Accuracy: 75.49%


The Base+ model took 7min 7.3 seconds to run, giving it a prediction time of 4.20 seconds per query

In [11]:
model = WavLMModel.from_pretrained("microsoft/wavlm-base").to(device).eval()
accuracyWavLMBase = getAccuracy("ValidationData/OnlyNumbers/", "TrainingData/TrainingFeatures/WavLMBase/English", feature_type="wavlm")
print(f"WavLM Base Template Accuracy: {accuracyWavLMBase:.2f}%")

WavLM Base Template Accuracy: 57.84%


The base model took 6min 45.8 seconds to run, giving it a prediction time of 3.98 seconds per query