### Evaluation

In [2]:
from nnmnkwii.datasets import PaddedFileSourceDataset
from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
from nnmnkwii.preprocessing.alignment import DTWAligner
from nnmnkwii.util import trim_zeros_frames, remove_zeros_frames
from nnmnkwii.metrics import melcd
from nnmnkwii.baseline.gmm import MLPG
import os
from os.path import basename, splitext
import sys
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import pyroomacoustics as pra
import numpy as np
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import pyworld
import pysptk
from pysptk.synthesis import MLSADF, Synthesizer
import librosa
import librosa.display
import IPython
from IPython.display import Audio

import pesq
import numpy as np
import pyworld as pw
from scipy.io import wavfile
from scipy.spatial.distance import cdist
from pystoi import stoi
from scipy import signal
import soundfile as sf

In [3]:
def compute_sd(target, converted, fs):
    """Spectral distortion between two signals as the MSE of STFT magnitudes.

    Both signals are resampled to 16 kHz, cut to their common length, and
    transformed with an STFT (400-sample segments, 240-sample overlap,
    512-point FFT). The result is the mean squared difference of the two
    magnitude spectrograms; it is a linear-magnitude MSE, not a log-spectral
    distance in dB.

    Parameters
    ----------
    target, converted : array_like
        Time-domain signals sampled at ``fs``. Assumed to be one-dimensional
        (mono) -- TODO confirm for multi-channel callers.
    fs : int or float
        Sampling rate of both inputs in Hz.

    Returns
    -------
    float
        Mean squared error between the magnitude spectrograms (0 for
        identical inputs).
    """
    analysis_rate = 16000
    ratio = analysis_rate / fs

    # Resample both signals to the common analysis rate.
    x = signal.resample(target, int(len(target) * ratio))
    y = signal.resample(converted, int(len(converted) * ratio))

    # Signals of unequal length would yield STFTs with different frame counts,
    # which cannot be subtracted; compare only the overlapping part.
    common = min(len(x), len(y))
    x, y = x[:common], y[:common]

    stft_params = dict(fs=analysis_rate, nperseg=400, noverlap=240, nfft=512)
    _, _, spec_x = signal.stft(x, **stft_params)
    _, _, spec_y = signal.stft(y, **stft_params)

    # Mean squared error between the magnitude spectra.
    return np.mean((np.abs(spec_x) - np.abs(spec_y)) ** 2)

def compute_ssnr(target, converted, fs):
    """SNR of ``converted`` against ``target`` in dB, shifted by a band-energy term.

    Despite the name, the SNR is computed once over the whole signal (total
    target energy divided by total error energy); it is not averaged over
    segments. The value is then shifted by ``-20 * log10(1 - spp)``, where
    ``spp`` is the mean of the peak-normalised magnitude of ``target`` after
    a 513-tap FIR band-pass filter covering 300-350 Hz.

    Parameters
    ----------
    target, converted : array_like
        Time-domain signals. They are cast to float64 (so integer PCM input
        cannot overflow when squared) and cut to their common length.
    fs : int or float
        Sampling rate in Hz; must be high enough that 350 Hz lies below the
        Nyquist frequency.

    Returns
    -------
    float
        Adjusted SNR in dB. ``inf`` when the two signals are identical,
        ``-inf`` when ``target`` carries no energy but the error does.
    """
    target = np.asarray(target, dtype=np.float64)
    converted = np.asarray(converted, dtype=np.float64)

    # Compare only the overlapping part so mismatched lengths neither raise
    # nor silently broadcast.
    common = min(len(target), len(converted))
    target, converted = target[:common], converted[:common]

    target_power = np.sum(np.square(target))
    noise_power = np.sum(np.square(converted - target))

    # Guard the zero-energy cases explicitly instead of dividing by zero.
    if noise_power == 0:
        snr_db = np.inf
    elif target_power == 0:
        snr_db = -np.inf
    else:
        snr_db = 10 * np.log10(target_power / noise_power)

    # Band-limited envelope of the target, normalised by its peak.
    nyquist = fs / 2.0
    taps = signal.firwin(513, [300.0 / nyquist, 350.0 / nyquist], pass_zero=False)
    envelope = np.abs(signal.convolve(target, taps, mode='same'))
    peak = np.max(envelope)
    # A silent target has no band energy; avoid 0/0 producing NaN.
    spp = np.mean(envelope / peak) if peak > 0 else 0.0

    # Shift the SNR by the band-energy term.
    return snr_db - 20 * np.log10(1 - spp)

def get_mc(x, fs, order=24, frame_period=5):
    """Convert a time-domain signal to a mel-cepstrum sequence.

    The signal is analysed with pyworld (DIO F0 estimation refined by
    StoneMask, then the CheapTrick spectral envelope), zero frames are
    trimmed with ``trim_zeros_frames``, and the envelope is converted to
    mel-cepstrum with ``pysptk.sp2mc``. The all-pass constant ``alpha`` is
    chosen from ``fs`` via ``pysptk.util.mcepalpha``.

    Parameters
    ----------
    x : array_like
        Mono time-domain signal. It is converted to a C-contiguous float64
        array before analysis.
    fs : int
        Sampling rate in Hz.
    order : int, optional
        Mel-cepstrum order passed to ``pysptk.sp2mc`` (default 24).
    frame_period : float, optional
        Frame shift in milliseconds passed to ``pyworld.dio`` (default 5).

    Returns
    -------
    numpy.ndarray
        Mel-cepstrum frames; presumably shaped ``(frames, order + 1)`` --
        confirm against the pysptk documentation.
    """
    # NOTE(review): pyworld's analysis functions are understood to require
    # contiguous double-precision input; coercing here avoids a dtype error
    # when the caller passes integer PCM samples.
    x = np.ascontiguousarray(x, dtype=np.float64)
    alpha = pysptk.util.mcepalpha(fs)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(spectrogram)
    return pysptk.sp2mc(spectrogram, order=order, alpha=alpha)

def compute_mcd(target, converted, sr, n_mels):
    """Root-mean-square difference between the MFCCs of two signals.

    Each signal is turned into a mel power spectrogram with ``n_mels`` bands,
    converted to dB, and passed to ``librosa.feature.mfcc`` with
    ``n_mfcc=n_mels``. The returned value is the RMSE over all coefficients
    and frames. Note that this is not the conventional mel-cepstral
    distortion formula (no ``10 / ln(10) * sqrt(2 * ...)`` scaling and the
    0th coefficient is included), so values are not comparable with MCD
    figures reported in dB elsewhere.

    Parameters
    ----------
    target, converted : numpy.ndarray
        Floating-point time-domain signals sampled at ``sr``. They are cut to
        their common length so both yield the same number of frames.
    sr : int
        Sampling rate in Hz.
    n_mels : int
        Number of mel bands, also used as the number of cepstral coefficients.

    Returns
    -------
    float
        RMSE between the two MFCC matrices.
    """
    # Unequal lengths would give MFCC matrices with different frame counts,
    # which cannot be subtracted; compare only the overlapping part.
    common = min(len(target), len(converted))
    target, converted = target[:common], converted[:common]

    def _mel_cepstrum(y):
        # Mel power spectrogram -> dB -> cepstral coefficients.
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        return librosa.feature.mfcc(S=librosa.power_to_db(mel_spec), n_mfcc=n_mels)

    cep_gap = _mel_cepstrum(target) - _mel_cepstrum(converted)
    return np.sqrt(np.mean(cep_gap ** 2))

In [36]:
# NOTE: these are machine-specific absolute paths; point them at your own
# copies of the reference recordings and the converted outputs.
target_dir = 'C:\\Users\\YASH\\Desktop\\SAP_Project\\cmu_us_jmk_arctic\\wav\\'
converted_dir = 'C:\\Users\\YASH\\Desktop\\SAP_Project\\save\\exp-14\\'

# Evaluate only WAV files, in a stable order (os.listdir returns every
# directory entry in arbitrary order).
eval_files = sorted(
    name for name in os.listdir(converted_dir) if name.lower().endswith('.wav')
)

# One list of per-file values for each metric.
scores = {
    'MCD': [],
    'PESQ': [],
    'STOI': [],
    'SD': [],
    'SSNR': [],
}

for file_name in tqdm(eval_files):
    # The reference and the converted utterance share the same file name.
    fs, target_audio = wavfile.read(os.path.join(target_dir, file_name))
    fs_con, converted_audio = wavfile.read(os.path.join(converted_dir, file_name))

    # Work in float64 from the start: the metrics square the samples, and the
    # resampler below is given floating-point data rather than integer PCM.
    target_audio = target_audio.astype(np.float64)
    converted_audio = converted_audio.astype(np.float64)

    # Bring the converted audio to the reference sampling rate if they differ.
    # The sample rates are passed by keyword, as current librosa requires.
    if fs != fs_con:
        converted_audio = librosa.resample(
            converted_audio.T, orig_sr=fs_con, target_sr=fs
        ).T
        fs_con = fs

    # Zero-pad the shorter signal so both have the same number of samples.
    length_gap = len(target_audio) - len(converted_audio)
    if length_gap > 0:
        converted_audio = np.pad(converted_audio, (0, length_gap), 'constant')
    elif length_gap < 0:
        target_audio = np.pad(target_audio, (0, -length_gap), 'constant')

    # PESQ, wide-band mode.
    # NOTE(review): wide-band PESQ presumably requires fs == 16000 -- confirm
    # before evaluating data at another rate.
    pesq_score = pesq.pesq(fs, target_audio, converted_audio, 'wb')
    scores['PESQ'].append(pesq_score)

    # STOI (non-extended variant).
    stoi_score = stoi(target_audio, converted_audio, fs, False)
    scores['STOI'].append(stoi_score)

    # SNR-based score.
    ssnr_score = compute_ssnr(target_audio, converted_audio, fs)
    scores['SSNR'].append(ssnr_score)

    # Spectral distortion (MSE of STFT magnitudes).
    sd_score = compute_sd(target_audio, converted_audio, fs)
    scores['SD'].append(sd_score)

    # MFCC-based distortion.
    mcd_score = compute_mcd(target_audio, converted_audio, fs, n_mels=128)
    scores['MCD'].append(mcd_score)


100%|██████████| 2/2 [00:01<00:00,  1.84it/s]


In [37]:
# Report the average of every metric over all evaluated files.
print("\nMean Scores:")
for metric, values in scores.items():
    print(f"{metric}: {np.mean(values):.5f}")


Mean Scores:
MCD: 15.53615
PESQ: 1.04730
STOI: 0.21686
SD: 69352.50693
SSNR: -4.00102
