In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import eigh, LinAlgError
from IPython.display import Audio
import wave
import librosa
from scipy.io import wavfile
import scipy
from chainer import Variable
import torchaudio.transforms
import torch
import torchaudio.functional as F
#Evaluation stuff
from pesq import pesq
from pystoi import stoi
import mir_eval
from numpy.fft import rfft, irfft
from scipy import signal
from scipy.io.wavfile import write as wav_write
import string
import threading

### Signal Processing

In [2]:
def istft_reconstruction_from_complex(real, imag, hop_length=160, win_length=400, length=65535):
    spec = real + 1j*imag
    wav = librosa.istft(spec, hop_length=hop_length, win_length=win_length, length=length)
    return np.clip(wav, -1., 1.)
def _samples_to_stft_frames(samples, size, shift):
    """
    Calculates STFT frames from samples in time domain.
    :param samples: Number of samples in time domain.
    :param size: FFT size.
    :param shift: Hop in samples.
    :return: Number of STFT frames.
    """

    return np.ceil((samples - size + shift) / shift).astype(np.int64)


def segment_axis(a, length, overlap=0, axis=None, end='cut', endvalue=0):
    """Generate a new array that chops the given array along the given axis into overlapping frames.

    example:
    >>> segment_axis(np.arange(10), 4, 2)
    array([[0, 1, 2, 3],
           [2, 3, 4, 5],
           [4, 5, 6, 7],
           [6, 7, 8, 9]])

    arguments:
    a       The array to segment
    length  The length of each frame
    overlap The number of array elements by which the frames should overlap
    axis    The axis to operate on; if None, act on the flattened array
    end     What to do with the last frame, if the array is not evenly
            divisible into pieces. Options are:

            'cut'   Simply discard the extra values
            'wrap'  Copy values from the beginning of the array
            'pad'   Pad with a constant value

    endvalue    The value to use for end='pad'

    The array is not copied unless necessary (either because it is
    unevenly strided and being flattened or because end is set to
    'pad' or 'wrap').
    """

    if axis is None:
        a = np.ravel(a)  # may copy
        axis = 0

    l = a.shape[axis]

    if overlap >= length: raise ValueError(
            "frames cannot overlap by more than 100%")
    if overlap < 0 or length <= 0: raise ValueError(
            "overlap must be nonnegative and length must be positive")

    if l < length or (l - length) % (length - overlap):
        if l > length:
            roundup = length + (1 + (l - length) // (length - overlap)) * (
                length - overlap)
            rounddown = length + ((l - length) // (length - overlap)) * (
                length - overlap)
        else:
            roundup = length
            rounddown = 0
        assert rounddown < l < roundup
        assert roundup == rounddown + (length - overlap) or (
            roundup == length and rounddown == 0)
        a = a.swapaxes(-1, axis)

        if end == 'cut':
            a = a[..., :rounddown]
        elif end in ['pad', 'wrap']:  # copying will be necessary
            s = list(a.shape)
            s[-1] = roundup
            b = np.empty(s, dtype=a.dtype)
            b[..., :l] = a
            if end == 'pad':
                b[..., l:] = endvalue
            elif end == 'wrap':
                b[..., l:] = a[..., :roundup - l]
            a = b

        a = a.swapaxes(-1, axis)

    l = a.shape[axis]
    if l == 0: raise ValueError(
            "Not enough data points to segment array in 'cut' mode; "
            "try 'pad' or 'wrap'")
    assert l >= length
    assert (l - length) % (length - overlap) == 0
    n = 1 + (l - length) // (length - overlap)
    s = a.strides[axis]
    newshape = a.shape[:axis] + (n, length) + a.shape[axis + 1:]
    newstrides = a.strides[:axis] + ((length - overlap) * s, s) + a.strides[
                                                                  axis + 1:]

    if not a.flags.contiguous:
        a = a.copy()
        newstrides = a.strides[:axis] + ((length - overlap) * s, s) + a.strides[
                                                                      axis + 1:]
        return np.ndarray.__new__(np.ndarray, strides=newstrides,
                                  shape=newshape, buffer=a, dtype=a.dtype)

    try:
        return np.ndarray.__new__(np.ndarray, strides=newstrides,
                                  shape=newshape, buffer=a, dtype=a.dtype)
    except TypeError or ValueError:
        warnings.warn("Problem with ndarray creation forces copy.")
        a = a.copy()
        # Shape doesn't change but strides does
        newstrides = a.strides[:axis] + ((length - overlap) * s, s) + a.strides[
                                                                      axis + 1:]
        return np.ndarray.__new__(np.ndarray, strides=newstrides,
                                  shape=newshape, buffer=a, dtype=a.dtype)


def _stft_frames_to_samples(frames, size, shift):
    """
    Calculates samples in time domain from STFT frames
    :param frames: Number of STFT frames.
    :param size: FFT size.
    :param shift: Hop in samples.
    :return: Number of samples in time domain.
    """
    return frames * shift + size - shift


def stft(time_signal, time_dim=None, size=512, shift=256,
         window=signal.blackman, fading=True, window_length=None):
    """
    Calculates the short time Fourier transformation of a multi channel multi
    speaker time signal. It is able to add additional zeros for fade-in and
    fade out and should yield an STFT signal which allows perfect
    reconstruction.

    :param time_signal: multi channel time signal.
    :param time_dim: Scalar dim of time.
        Default: None means the biggest dimension
    :param size: Scalar FFT-size.
    :param shift: Scalar FFT-shift. Typically shift is a fraction of size.
    :param window: Window function handle.
    :param fading: Pads the signal with zeros for better reconstruction.
    :param window_length: Sometimes one desires to use a shorter window than
        the fft size. In that case, the window is padded with zeros.
        The default is to use the fft-size as a window size.
    :return: Single channel complex STFT signal
        with dimensions frames times size/2+1.
    """
    if time_dim is None:
        time_dim = np.argmax(time_signal.shape)

    # Pad with zeros to have enough samples for the window function to fade.
    if fading:
        pad = [(0, 0)] * time_signal.ndim
        pad[time_dim] = [size - shift, size - shift]
        time_signal = np.pad(time_signal, pad, mode='constant')

    # Pad with trailing zeros, to have an integral number of frames.
    frames = _samples_to_stft_frames(time_signal.shape[time_dim], size, shift)
    samples = _stft_frames_to_samples(frames, size, shift)
    pad = [(0, 0)] * time_signal.ndim
    pad[time_dim] = [0, samples - time_signal.shape[time_dim]]
    time_signal = np.pad(time_signal, pad, mode='constant')

    if window_length is None:
        window = window(size)
    else:
        window = window(window_length)
        window = np.pad(window, (0, size - window_length), mode='constant')

    time_signal_seg = segment_axis(time_signal, size,
                                   size - shift, axis=time_dim)

    letters = string.ascii_lowercase
    mapping = letters[:time_signal_seg.ndim] + ',' + letters[time_dim + 1] \
              + '->' + letters[:time_signal_seg.ndim]

    return rfft(np.einsum(mapping, time_signal_seg, window),
                axis=time_dim + 1)

def _biorthogonal_window_loopy(analysis_window, shift):
    """
    This version of the synthesis calculation is as close as possible to the
    Matlab impelementation in terms of variable names.

    The results are equal.

    The implementation follows equation A.92 in
    Krueger, A. Modellbasierte Merkmalsverbesserung zur robusten automatischen
    Spracherkennung in Gegenwart von Nachhall und Hintergrundstoerungen
    Paderborn, Universitaet Paderborn, Diss., 2011, 2011
    """
    fft_size = len(analysis_window)
    assert np.mod(fft_size, shift) == 0
    number_of_shifts = len(analysis_window) // shift

    sum_of_squares = np.zeros(shift)
    for synthesis_index in range(0, shift):
        for sample_index in range(0, number_of_shifts + 1):
            analysis_index = synthesis_index + sample_index * shift

            if analysis_index + 1 < fft_size:
                sum_of_squares[synthesis_index] \
                    += analysis_window[analysis_index] ** 2

    sum_of_squares = np.kron(np.ones(number_of_shifts), sum_of_squares)
    synthesis_window = analysis_window / sum_of_squares / fft_size
    return synthesis_window

def istft(stft_signal, size=512, shift=256,
          window=signal.blackman, fading=True, window_length=None):
    """
    Calculated the inverse short time Fourier transform to exactly reconstruct
    the time signal.

    :param stft_signal: Single channel complex STFT signal
        with dimensions frames times size/2+1.
    :param size: Scalar FFT-size.
    :param shift: Scalar FFT-shift. Typically shift is a fraction of size.
    :param window: Window function handle.
    :param fading: Removes the additional padding, if done during STFT.
    :param window_length: Sometimes one desires to use a shorter window than
        the fft size. In that case, the window is padded with zeros.
        The default is to use the fft-size as a window size.
    :return: Single channel complex STFT signal
    :return: Single channel time signal.
    """
    print(f'Shape of stft_signal is:{stft_signal.shape[1]}')
    a = size // 2 + 1
    print(f'size // 2 + 1 is: ={a}')
    assert stft_signal.shape[1] == size // 2 +1

    if window_length is None:
        window = window(size)
    else:
        window = window(window_length)
        window = np.pad(window, (0, size - window_length), mode='constant')

    window = _biorthogonal_window_loopy(window, shift)

    # Why? Line created by Hai, Lukas does not know, why it exists.
    window *= size

    time_signal = scipy.zeros(stft_signal.shape[0] * shift + size - shift)

    for j, i in enumerate(range(0, len(time_signal) - size + shift, shift)):
        time_signal[i:i + size] += window * np.real(irfft(stft_signal[j]))

    # Compensate fade-in and fade-out
    if fading:
        time_signal = time_signal[
                      size - shift:len(time_signal) - (size - shift)]

    return time_signal

### Class Beamformer

In [3]:
def get_power_spectral_density_matrix(observation, mask=None, normalize=True):
    """
    Calculates the weighted power spectral density matrix.

    This does not yet work with more than one target mask.

    :param observation: Complex observations with shape (bins, sensors, frames)
    :param mask: Masks with shape (bins, frames) or (bins, 1, frames)
    :return: PSD matrix with shape (bins, sensors, sensors)
    """
    bins, sensors, frames = observation.shape
    print(observation.shape)
    print(mask.shape)

    if mask is None:
        mask = np.ones((bins, frames))
    if mask.ndim == 2:
        #mask = mask[:, np.newaxis, :]
        mask = mask[:,:,  np.newaxis]
        print(f'Mask shape is: {mask.shape}')
        #mask = mask.reshape((mask.shape[0],mask.shape[1],1))
        print(f'Mask shape is: {mask.shape}')
    
    print(f'Spectrogram shape is: {observation.shape}')

    psd = np.einsum('...dt,...et->...de', mask * observation,
                    observation.conj())
    print(f'shape of PSD is: {psd.shape}')
    if normalize:
        normalization = np.sum(mask, axis=-1, keepdims=True)
        psd /= normalization
    return psd


def condition_covariance(x, gamma):
    """see https://stt.msu.edu/users/mauryaas/Ashwini_JPEN.pdf (2.3)"""
    scale = gamma * np.trace(x) / x.shape[-1]
    scaled_eye = np.eye(x.shape[-1]) * scale
    return (x + scaled_eye) / (1 + gamma)

def get_gev_vector(target_psd_matrix, noise_psd_matrix):
    """
    Returns the GEV beamforming vector.
    :param target_psd_matrix: Target PSD matrix
        with shape (bins, sensors, sensors)
    :param noise_psd_matrix: Noise PSD matrix
        with shape (bins, sensors, sensors)
    :return: Set of beamforming vectors with shape (bins, sensors)
    """
    bins, sensors, _ = target_psd_matrix.shape
    beamforming_vector = np.empty((bins, sensors), dtype=np.complex128)
    for f in range(bins):
        try:
            eigenvals, eigenvecs = eigh(target_psd_matrix[f, :, :],
                                        noise_psd_matrix[f, :, :])
            beamforming_vector[f, :] = eigenvecs[:, -1]
        except np.linalg.LinAlgError:
            print('LinAlg error for frequency {}'.format(f))
            beamforming_vector[f, :] = (
                np.ones((sensors,)) / np.trace(noise_psd_matrix[f]) * sensors
            )
    return beamforming_vector

def gev_wrapper_on_masks(mix, noise_mask=None, target_mask=None,
                         normalization=False):
    #if noise_mask is None and target_mask is None:
     #   raise ValueError('At least one mask needs to be present.')

    org_dtype = mix.dtype
    mix = mix.astype(np.complex128)
    mix = mix.T
    print(mix.shape)
    #if noise_mask is not None:
     #   noise_mask = noise_mask.T
    #if target_mask is not None:
     #   target_mask = target_mask.T

    target_psd_matrix = get_power_spectral_density_matrix(
        mix, target_mask, normalize=False)
    noise_psd_matrix = get_power_spectral_density_matrix(
        mix, noise_mask, normalize=True)
    noise_psd_matrix = condition_covariance(noise_psd_matrix, 1e-6)
    noise_psd_matrix /= np.trace(
        noise_psd_matrix, axis1=-2, axis2=-1)[..., None, None]
    W_gev = get_gev_vector(target_psd_matrix, noise_psd_matrix)
    W_gev = phase_correction(W_gev)

    if normalization:
        W_gev = blind_analytic_normalization(W_gev, noise_psd_matrix)
        
    output = apply_beamforming_vector(W_gev, mix)
    output = output.astype(org_dtype)
    target = np.abs(target_psd_matrix).astype(np.float32)
    #print(target.shape)
    #target = apply_beamforming_vector(target, mix)

    return output.T

def phase_correction(vector):
        """Phase correction to reduce distortions due to phase inconsistencies
        Args:
        vector: Beamforming vector with shape (..., bins, sensors).
        Returns: Phase corrected beamforming vectors. Lengths remain.
        """
        
        w = vector.copy()
        F, D = w.shape
        for f in range(1, F):
            w[f, :] *= np.exp(-1j*np.angle(
                np.sum(w[f, :] * w[f-1, :].conj(), axis=-1, keepdims=True)))
        return w
    
def apply_beamforming_vector(vector, mix):
    return np.einsum('...a,...at->...t', vector.conj(), mix)

### Metric Function

In [4]:
def getSeparationMetrics(audio1, audio2, audio1_gt, audio2_gt):
    reference_sources = np.concatenate((np.expand_dims(audio1_gt, axis=0), np.expand_dims(audio2_gt, axis=0)), axis=0)
    estimated_sources = np.concatenate((np.expand_dims(audio1, axis=0), np.expand_dims(audio2, axis=0)), axis=0)
    (sdr, sir, sar, perm) = mir_eval.separation.bss_eval_sources(reference_sources, estimated_sources, False)
    return np.mean(sdr), np.mean(sir), np.mean(sar)
def compute_snr(reference_signal, estimated_signal):
    signal_power = np.sum(reference_signal ** 2)
    noise_power = np.sum((reference_signal - estimated_signal) ** 2)
    snr = 10 * np.log10(signal_power / noise_power)
    return snr

### Loading audio files

In [5]:
target_0, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic0_voice0.wav")
target_1, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic1_voice0.wav")
target_2, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic2_voice0.wav")
target_3, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic3_voice0.wav")

noise_0, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic0_voice1.wav")
noise_1, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic1_voice1.wav")
noise_2, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic2_voice1.wav")
noise_3, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic3_voice1.wav")

mix_0, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic0_voice0_and_00085_mic0_voice1_mixed.wav")
mix_1, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic1_voice0_and_00085_mic1_voice1_mixed.wav")
mix_2, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic2_voice0_and_00085_mic2_voice1_mixed.wav")
mix_3, sr = librosa.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic3_voice0_and_00085_mic3_voice1_mixed.wav")

### Ideal Ratio Masks

In [6]:
def cirm(y, s, K=10, C=0.1, flat=True):
    y = y.astype('complex128')
    s = s.astype('complex128')
    mr = (np.real(y) * np.real(s) + np.imag(y) * np.imag(s))/(np.real(y)**2 + np.imag(y)**2)
    mi = (np.real(y) * np.imag(s) - np.imag(y) * np.real(s))/(np.real(y)**2 + np.imag(y)**2)
    m = mr + 1j * mi
    if flat:
        return m
    else:
        return K * ((1 - np.exp(-C * m))/(1 + np.exp(-C * m)))

### Multichannel 

In [7]:
multi_channel_mix = np.stack((mix_0, mix_1, mix_2, mix_3), axis=0)

Y = stft(multi_channel_mix, time_dim=1).transpose((1, 0, 2))

display(Audio(multi_channel_mix, rate=16000)) 


  window = window(size)


### loading masks

In [8]:
target_Mask0 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic0_voice0_mask.npy")
target_Mask1 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic1_voice0_mask.npy")
target_Mask2 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic2_voice0_mask.npy")
target_Mask3 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00069_mic3_voice0_mask.npy")

noise_Mask0 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic0_voice1_mask.npy")
noise_Mask1 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic1_voice1_mask.npy")
noise_Mask2 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic2_voice1_mask.npy")
noise_Mask3 = np.load("E:/AV-speech-separation/data/VoxCeleb2/results/simulated/id04366_8L6xm1OyG0w_00069VSid05594_RcwKkgqgPSA_00085/00085_mic3_voice1_mask.npy")


### Stacking Masks

In [9]:
noise_mask = np.stack((noise_Mask0[0], noise_Mask1[0], noise_Mask2[0], noise_Mask3[0]), axis=-1)
target_mask = np.stack((target_Mask0[0], target_Mask1[0], target_Mask2[0], target_Mask3[0]), axis=-1)
print('Shape of target_mask:', target_mask.shape)
noise_mask_real = noise_mask[0,0, :, :,:]
noise_mask_imag = noise_mask[0,1, :, :,:]
target_mask_real = target_mask[0,0, :, :,:]
target_mask_imag = target_mask[0,1, :, :,:]

# Compute the magnitude of the masks
noise_mask_magnitude = np.sqrt(noise_mask_real**2 + noise_mask_imag**2)
target_mask_magnitude = np.sqrt(target_mask_real**2 + target_mask_imag**2)


Shape of target_mask: (1, 2, 256, 256, 4)


In [10]:
N_mask = np.median(noise_mask_magnitude, axis=1)
X_mask = np.median(target_mask_magnitude, axis=1)

N_mask = np.pad(N_mask, ((0, 1), (0, 0)), mode='constant', constant_values=1e-16)  # Shape (257, 2)
X_mask = np.pad(X_mask, ((0, 1), (0, 0)), mode='constant', constant_values=1e-16) 

print(N_mask.shape)
print(X_mask.shape)

(257, 4)
(257, 4)


In [11]:
Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask )
Y_noise = gev_wrapper_on_masks(Y, X_mask, N_mask )

(257, 4, 454)
(257, 4, 454)
(257, 4)
Mask shape is: (257, 4, 1)
Mask shape is: (257, 4, 1)
Spectrogram shape is: (257, 4, 454)
shape of PSD is: (257, 4, 4)
(257, 4, 454)
(257, 4)
Mask shape is: (257, 4, 1)
Mask shape is: (257, 4, 1)
Spectrogram shape is: (257, 4, 454)
shape of PSD is: (257, 4, 4)
(257, 4, 454)
(257, 4, 454)
(257, 4)
Mask shape is: (257, 4, 1)
Mask shape is: (257, 4, 1)
Spectrogram shape is: (257, 4, 454)
shape of PSD is: (257, 4, 4)
(257, 4, 454)
(257, 4)
Mask shape is: (257, 4, 1)
Mask shape is: (257, 4, 1)
Spectrogram shape is: (257, 4, 454)
shape of PSD is: (257, 4, 4)


In [12]:
X_hat=istft(Y_hat,512)
X_noise=istft(Y_noise,512)

Shape of stft_signal is:257
size // 2 + 1 is: =257
Shape of stft_signal is:257
size // 2 + 1 is: =257


  window = window(size)
  time_signal = scipy.zeros(stft_signal.shape[0] * shift + size - shift)


In [13]:
X_avg = (target_0+target_1+target_2+target_3)/4
X_noise = (noise_0+noise_1+noise_2+noise_3)/4

### Averaging over mics

In [14]:
fs = 16000
X_target_hat = X_hat[:115719].flatten()
X_noise_hat =X_noise[:115719].flatten()
target_segment = X_avg.flatten()
noise_segment = X_noise.flatten()

In [15]:
X_target_hat = X_hat[:115719].flatten()
X_noise_hat =X_noise[:115719].flatten()
targets = [target_0, target_1, target_2, target_3]
noises = [noise_0, noise_1, noise_2, noise_3]
SDR = []
SIR = []
SAR = []
PESQ_SCORE = []
for i in range(4):
    target_segment = targets[i].flatten()
    noise_segment = noises[i].flatten()
    sdr, sir , sar = getSeparationMetrics(X_target_hat, X_noise_hat, target_segment, noise_segment)
    pesq_score1 = pesq(fs, target_segment, X_target_hat, 'wb')
    snr = compute_snr(target_segment, X_target_hat)
    SDR.append(sdr)
    SIR.append(sir)
    SAR.append(sar)
    PESQ_SCORE.append(pesq_score1)


print("SDR:", np.mean(SDR))
print("SIR:", np.mean(SIR))
print("SAR:", np.mean(SAR))
print("PESQ Score:", np.mean(PESQ_SCORE))


SDR: 0.5350253097015285
SIR: 13.926089683377487
SAR: 1.8786379393627781
PESQ Score: 1.206543505191803
