In [33]:
import mir_eval as me
import os
import numpy as np
import soundfile as sf
import sounddevice as sd
from sklearn.model_selection import KFold
import IPython.display as ipd

## 1 Load data

In [72]:
ref_src_mixes = []
ref_src_vocals = []
est_src_vocals = []
ref_src_acc = []
est_src_acc = []

for filename in os.listdir('../../wav_files/'):
    if 'mix' in filename:
        f, _ = sf.read('../../wav_files/' + filename)
        ref_src_mixes.append(f)
    elif 'vocal_sample' in filename:
        f, _ = sf.read('../../wav_files/' + filename)
        est_src_vocals.append(f)
    elif 'vocals' in filename:
        f, _ = sf.read('../../wav_files/' + filename)
        ref_src_vocals.append(f)


In [73]:
for filename in os.listdir('../../wav_files/predicted_acc'):
    f, _ = sf.read('../../wav_files/predicted_acc/' + filename)
    est_src_acc.append(f)
    
for filename in os.listdir('../../wav_files/truth_acc'):
    f, _ = sf.read('../../wav_files/truth_acc/' + filename)
    ref_src_acc.append(f)

In [80]:
ref_src_mixes = np.array(ref_src_mixes)
ref_src_vocals = np.array(ref_src_vocals)
est_src_vocals = np.array(est_src_vocals)
ref_src_acc = np.array(ref_src_acc)
est_src_acc = np.array(est_src_acc)

## 2 Metrics

https://craffel.github.io/mir_eval/#module-mir_eval.separation

### 2.1 Evaluate
Computes ```bss_eval_sources``` (for fewer than 3 dimensions) and ```bss_eval_images```.

``` python
mir_eval.separation.evaluate(reference_sources, estimated_sources, **kwargs)
```

#### Returns

	
- **scores:** ```dict```, dictionary of scores, where the key is the metric name (str) and the value is the (float) score achieved.

### 2.2 Blind Source Separation (BSS)
``` python
mir_eval.separation.bss_eval_sources(reference_sources, estimated_sources, compute_permutation=True)
```

#### Use

From "WAVE-U-NET: A MULTI-SCALE NEURAL NETWORK FOR END-TO-END AUDIO SOURCE SEPARATION"

_Since the collection of segment-wise vocal SDR values across the dataset is not normally distributed (compare Fig- ure 3 for vocals), the mean and standard deviation are not sufficient to adequately summarise it. As a workaround, **we take the median over segments**, as it is robust against outliers and intuitively describes the minimum performance that is achieved 50% of the time. To describe the spread of the distribution, we use the median absolute deviation (MAD) as a rank-based equivalent to the standard deviation (SD). It is defined as the median of the absolute deviations from the overall median and is easily interpretable, since a value of x means that 50% of values have an absolute difference from the median that is lower than x._

#### Returns


- **sdr:** ```np.ndarray, shape=(nsrc,)```, vector of Signal to Distortion Ratios (SDR)


- **sir:** ```np.ndarray, shape=(nsrc,)```, vector of Source to Interference Ratios (SIR)


- **sar:** ```np.ndarray, shape=(nsrc,)```, vector of Sources to Artifacts Ratios (SAR)


- perm: not used

#### Normalized SDR (NSDR)

$NSDR(S_e, S_r, S_m) = SDR(S_e, S_r) - SDR(S_m, S_r)$

where $S_e$ is the estimated isolated signal, $S_r$ is the reference isolated signal, and $S_m$ is the mixed signal.

In [81]:
def calculate_nsdr(reference_sources, estimated_sources, mix_sources):
    sdr1, _, _, _ = me.separation.bss_eval_sources(reference_sources, 
                                                   estimated_sources, 
                                                   compute_permutation=False)
    sdr2, _, _, _ = me.separation.bss_eval_sources(reference_sources, 
                                                   mix_sources, 
                                                   compute_permutation=False)
    nsdr = sdr1 - sdr2
    return nsdr

## 3 Evaluate

### 3.1 Accompaniment

#### 3.1.1 Evaluate

In [121]:
def evaluate(ref_src, est_src, mix_src):
    sdrs = []
    sirs = []
    sars = []
    nsdrs = []
    for ref, est, mix in zip(ref_src, est_src, mix_src):
        _, sir, sar, _ = me.separation.bss_eval_sources(ref, est, compute_permutation=False)
        nsdr = calculate_nsdr(ref, est, mix)

        sirs.extend(sir)
        sars.extend(sar)
        nsdrs.append(nsdr)
        
    return sirs, sars, nsdrs

In [6]:
def evaluate2(ref_src, est_src, mix_src):
    print('Calculating sir and sar...')
    _, sir, sar, _ = me.separation.bss_eval_sources(ref_src, est_src, compute_permutation=False)
    print('Calculating sir and nsdr...')
    nsdr = calculate_nsdr(ref_src, est_src, mix_src)
    return sir, sar, nsdr

In [247]:
k = 26

In [248]:
sirs_acc, sars_acc, nsdrs_acc = evaluate(ref_src_acc[0:k], 
                                         est_src_acc[0:k], 
                                         ref_src_mixes[0:k])

In [249]:
means = {
    'NSDR Instrumental': np.mean(nsdrs_acc),
    'SIR Instrumental': np.mean(sirs_acc),
    'SAR Instrumental': np.mean(sars_acc)
}
means

{'NSDR Instrumental': 33.350531612200825,
 'SIR Instrumental': 19.1520445083575,
 'SAR Instrumental': 11.656918441385333}

#### 3.1.2 MIREX

In [None]:
mirex_results = {
    'NSDR Instrumental': 7.945,
    'SIR Instrumental': 21.975,
    'SAR Instrumental': 15.462
    }

### 3.2 Vocals

#### 3.2.1 Remove tracks that don't contain vocals

In [81]:
ref_src_mixes_clean = []
ref_src_vocals_clean = []
est_src_vocals_clean = []

for i in range(est_src_vocals.shape[0]):
    if np.count_nonzero(ref_src_vocals[i]):
        ref_src_mixes_clean.append(ref_src_mixes[i])
        ref_src_vocals_clean.append(ref_src_vocals[i])
        est_src_vocals_clean.append(est_src_vocals[i])
        
ref_src_mixes_clean = np.array(ref_src_mixes_clean)
ref_src_vocals_clean = np.array(ref_src_vocals_clean)
est_src_vocals_clean = np.array(est_src_vocals_clean)

#### 3.2.2 Evaluate

In [213]:
k = 25

In [214]:
sirs_voc, sars_voc, nsdrs_voc = evaluate2(ref_src_vocals_clean[0:k], 
                                         est_src_vocals_clean[0:k], 
                                         ref_src_mixes[0:k])

In [215]:
means = {
    'NSDR Vocal': np.mean(nsdrs_voc),
    'SIR Vocal': np.mean(sirs_voc),
    'SAR Vocal': np.mean(sars_voc)
}
means

{'NSDR Vocal': -0.635265668437744,
 'SIR Vocal': -15.354027118129455,
 'SAR Vocal': -4.8972753524836}

#### 3.2.3 MIREX

In [None]:
mirex_results = {
    'NSDR Vocal': 8.681,
    'SIR Vocal': 15.308,
    'SAR Vocal': 11.301
}

### 3.3 Results

In [223]:
means = {
    'NSDR Vocal': np.mean(nsdrs_voc),
    'NSDR Instrumental': np.mean(nsdrs_acc),
    'SIR Vocal': np.mean(sirs_voc),
    'SIR Instrumental': np.mean(sirs_acc),
    'SAR Vocal': np.mean(sars_voc),
    'SAR Instrumental': np.mean(sars_acc)
}
means

{'NSDR Vocal': -0.635265668437744,
 'NSDR Instrumental': 34.38182717694601,
 'SIR Vocal': -15.354027118129455,
 'SIR Instrumental': 20.99578091076507,
 'SAR Vocal': -4.8972753524836,
 'SAR Instrumental': 11.372903548211967}

## 4 Comparison with MIREX results

In [None]:
mirex_results = {
    'NSDR Vocal': 8.681,
    'NSDR Instrumental': 7.945,
    'SIR Vocal': 15.308,
    'SIR Instrumental': 21.975,
    'SAR Vocal': 11.301,
    'SAR Instrumental': 15.462
    }

## 5 Compare with self-evaluation
Run a file containing the original vocals/accompaniment against itself.

In [237]:
file1, _ = sf.read("./mir_eval test/bass.wav")
file2, _ = sf.read("./mir_eval test/drums.wav")
file3, _ = sf.read("./mir_eval test/vocals.wav")
file4, _ = sf.read("./mir_eval test/other.wav")
file0 = file1 + file2 + file3 + file4

In [242]:
vocal = np.array([file3.T[0], file3.T[0]])
mix = np.array([file0.T[0], file0.T[0]])

In [243]:
sirs_test, sars_test, nsdrs_test = evaluate2(vocal, 
                                             vocal, 
                                             mix)

In [244]:
sirs_test

array([247.76600046, 247.76600046])

In [245]:
sars_test

array([247.78332681, 247.78332681])

In [246]:
nsdrs_test

array([268.42704394, 268.42704394])

## Compare with Spleeter

#### Vocals

In [108]:
ref_sources_voc = []
est_sources_spleeter_voc = []
est_sources_our_voc = []

spleeter_voc, sr1 = sf.read("./spleeter test/good 2/021 - James May - On The Line_voc_spleeter.wav")
truth_voc, sr2 = sf.read("./spleeter test/good 2/021 - James May - On The Line_voc_truth.wav")
our_voc, _ = sf.read("./spleeter test/good 2/021 - James May - On The Line_voc.wav")
spleeter_voc = librosa.core.resample(spleeter_voc.T, sr1, sr2).T
spleeter_voc = np.mean(spleeter_voc, axis=1)
spleeter_voc = spleeter_voc[:-1]
ref_sources_voc.append(truth_voc)
est_sources_spleeter_voc.append(spleeter_voc)
est_sources_our_voc.append(our_voc)

spleeter_voc, sr1 = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_voc_spleeter.wav")
truth_voc, sr2 = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_voc_truth.wav")
our_voc, _ = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_voc.wav")
spleeter_voc = librosa.core.resample(spleeter_voc.T, sr1, sr2).T
spleeter_voc = np.mean(spleeter_voc, axis=1)
spleeter_voc = spleeter_voc[:-1]
ref_sources_voc.append(truth_voc)
est_sources_spleeter_voc.append(spleeter_voc)
est_sources_our_voc.append(our_voc)

ref_sources_voc = np.array(ref_sources_voc)
est_sources_spleeter_voc = np.array(est_sources_spleeter_voc)
est_sources_our_voc = np.array(est_sources_our_voc)

In [120]:
est_sources_spleeter_voc.shape
ref_sources_voc.shape

(2, 97536)

In [109]:
ipd.Audio(spleeter_voc, rate=sr2)

#### Accompaniment

In [140]:
ref_sources_acc = []
est_sources_spleeter_acc = []
est_sources_our_acc = []

spleeter_acc, _ = sf.read("./spleeter test/good 2/021 - James May - On The Line_acc_spleeter.wav")
truth_acc, _ = sf.read("./spleeter test/good 2/021 - James May - On The Line_acc_truth.wav")
our_acc, _ = sf.read("./spleeter test/good 2/021 - James May - On The Line_acc.wav")

spleeter_acc = librosa.core.resample(spleeter_acc.T, sr1, sr2).T
spleeter_acc = np.mean(spleeter_acc, axis=1)
spleeter_acc = spleeter_acc[:-1]

ref_sources_acc.append(truth_acc)
est_sources_spleeter_acc.append(spleeter_acc)
est_sources_our_acc.append(our_acc)



spleeter_acc, _ = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_acc_spleeter.wav")
truth_acc, _ = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_acc_truth.wav")
our_acc, _ = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_acc.wav")

spleeter_acc = librosa.core.resample(spleeter_acc.T, sr1, sr2).T
spleeter_acc = np.mean(spleeter_acc, axis=1)
spleeter_acc = spleeter_acc[:-1]

ref_sources_acc.append(truth_acc)
est_sources_spleeter_acc.append(spleeter_acc)
est_sources_our_acc.append(our_acc)

ref_sources_acc = np.array(ref_sources_acc)
est_sources_spleeter_acc = np.array(est_sources_spleeter_acc)
est_sources_our_acc = np.array(est_sources_our_acc)

#### Mixes

In [134]:
mixes = []
mix, _ = sf.read("./spleeter test/good 2/021 - James May - On The Line_mix.wav")
mixes.append(mix)
mix, _ = sf.read("./spleeter test/good 1/019 - James Elder _ Mark M Thompson - The English Actor_mix.wav")
mixes.append(mix)
mixes = np.array(mixes)

In [135]:
def evaluate2(ref_src, est_src, mix_src):
    print('Calculating sir and sar...')
    _, sir, sar, _ = me.separation.bss_eval_sources(ref_src, est_src, compute_permutation=False)
    print('Calculating sir and nsdr...')
    nsdr = calculate_nsdr(ref_src, est_src, mix_src)
    return sir, sar, nsdr

#### Evaluate vocals 

Spleeter vocals

In [137]:
evaluate2(ref_sources_voc, est_sources_spleeter_voc, mixes)

Calculating sir and sar...
Calculating sir and nsdr...


(array([35.11521907, 34.88561246]),
 array([12.22519728, 11.19333657]),
 array([15.77550488, 10.45640842]))

Our vocals

In [138]:
evaluate2(ref_sources_voc, est_sources_our_voc, mixes)

Calculating sir and sar...
Calculating sir and nsdr...


(array([28.8420368 , 32.57053358]),
 array([9.15634944, 9.00715685]),
 array([12.67820877,  8.26868412]))

#### Evaluate accopmaniment

Spleeter accompaniment

In [141]:
evaluate2(ref_sources_acc, est_sources_spleeter_acc, mixes)

Calculating sir and sar...
Calculating sir and nsdr...


(array([35.646741  , 27.78558257]),
 array([14.57222219,  9.09279021]),
 array([10.8622637 ,  9.38202169]))

Our accompaniment

In [142]:
evaluate2(ref_sources_acc, est_sources_our_acc, mixes)

Calculating sir and sar...
Calculating sir and nsdr...


(array([34.98132275, 27.22426646]),
 array([11.84482897,  8.59722024]),
 array([8.14740898, 8.88458756]))