In [3]:
import mir_eval as me
import numpy as np
import soundfile as sf
import sounddevice as sd
from sklearn.model_selection import KFold

## 1 Load data

In [4]:
# Disabled earlier experiment, kept as a bare triple-quoted string rather than
# as live code. Because the string is the cell's last expression, the notebook
# echoes it back as the cell output instead of loading any audio.
# NOTE(review): the disabled code pairs mixture+bass as references with
# drums+vocals as estimates, which do not look like matching stems -- confirm
# the intended pairing before re-enabling it.
'''file0, sr = sf.read("./mir_eval test/mixture.wav" )
file1, _ = sf.read("./mir_eval test/bass.wav")
file2, _ = sf.read("./mir_eval test/drums.wav")
file3, _ = sf.read("./mir_eval test/vocals.wav")
file4, _ = sf.read("./mir_eval test/other.wav")

reference_sources = np.array([file0, file1])
estimated_sources = np.array([file2, file3])
reference_sources'''

'file0, sr = sf.read("./mir_eval test/mixture.wav" )\nfile1, _ = sf.read("./mir_eval test/bass.wav")\nfile2, _ = sf.read("./mir_eval test/drums.wav")\nfile3, _ = sf.read("./mir_eval test/vocals.wav")\nfile4, _ = sf.read("./mir_eval test/other.wav")\n\nreference_sources = np.array([file0, file1])\nestimated_sources = np.array([file2, file3])\nreference_sources'

In [7]:
# Read the reference vocal, the separated vocal estimate and the mixture.
# The sample rate returned by sf.read is kept (it used to be discarded) so that
# a mismatch between the three files is caught here rather than silently
# producing meaningless scores.
vocals_ref, sr_ref = sf.read("./mir_eval test/from mek/vocals_downsampled.wav")
vocals_est, sr_est = sf.read("./mir_eval test/from mek/vocal_05.wav")
mix, sr_mix = sf.read("./mir_eval test/from mek/mix_05.wav")

# The metrics below compare the signals element-wise, so all three files must
# share one sample rate and one array shape.
assert sr_ref == sr_est == sr_mix, (
    "sample rates differ: reference={}, estimate={}, mix={}".format(sr_ref, sr_est, sr_mix)
)
assert vocals_ref.shape == vocals_est.shape == mix.shape, (
    "signal shapes differ: reference={}, estimate={}, mix={}".format(
        vocals_ref.shape, vocals_est.shape, mix.shape
    )
)

# Stack each signal with itself so every array has a leading "source" axis of
# length 2, as the rest of the notebook expects.
# NOTE(review): both rows of each array are the same signal; the very large SIR
# reported further down may be an artifact of evaluating two identical
# sources -- TODO confirm, or replace the second row with a real second stem.
reference_sources = np.array([vocals_ref, vocals_ref])
estimated_sources = np.array([vocals_est, vocals_est])
mix_sources = np.array([mix, mix])

## 2 Metrics

https://craffel.github.io/mir_eval/#module-mir_eval.separation

### 2.1 Evaluate
Always computes ```bss_eval_images```, and additionally computes ```bss_eval_sources``` when the input has fewer than 3 dimensions.

``` python
mir_eval.separation.evaluate(reference_sources, estimated_sources, **kwargs)
```

#### Returns

	
- **scores:** ```dict```, dictionary of scores, where the key is the metric name (str) and the value is the (float) score achieved.

### 2.2 Blind Source Separation (BSS)
``` python
mir_eval.separation.bss_eval_sources(reference_sources, estimated_sources, compute_permutation=True)
```

#### Use

From "WAVE-U-NET: A MULTI-SCALE NEURAL NETWORK FOR END-TO-END AUDIO SOURCE SEPARATION"

_Since the collection of segment-wise vocal SDR values across the dataset is not normally distributed (compare Figure 3 for vocals), the mean and standard deviation are not sufficient to adequately summarise it. As a workaround, **we take the median over segments**, as it is robust against outliers and intuitively describes the minimum performance that is achieved 50% of the time. To describe the spread of the distribution, we use the median absolute deviation (MAD) as a rank-based equivalent to the standard deviation (SD). It is defined as the median of the absolute deviations from the overall median and is easily interpretable, since a value of x means that 50% of values have an absolute difference from the median that is lower than x._

#### Returns


- **sdr:** ```np.ndarray, shape=(nsrc,)```, vector of Signal to Distortion Ratios (SDR)


- **sir:** ```np.ndarray, shape=(nsrc,)```, vector of Source to Interference Ratios (SIR)


- **sar:** ```np.ndarray, shape=(nsrc,)```, vector of Sources to Artifacts Ratios (SAR)


- perm: not used

#### Normalized SDR (NSDR)

$NSDR(S_e, S_r, S_m) = SDR(S_e, S_r) - SDR(S_m, S_r)$

where $S_e$ is the estimated isolated signal, $S_r$ is the reference isolated signal, and $S_m$ is the mixed signal.

In [8]:
def nsdr(reference_sources, estimated_sources, mix_sources, compute_permutation=True):
    """Normalized SDR: SDR of the estimate minus SDR of the unprocessed mix.

    Implements ``NSDR(S_e, S_r, S_m) = SDR(S_e, S_r) - SDR(S_m, S_r)``, taking
    both SDR terms from ``mir_eval.separation.bss_eval_sources``.

    Parameters
    ----------
    reference_sources : np.ndarray
        Reference isolated sources (S_r).
    estimated_sources : np.ndarray
        Estimated isolated sources (S_e).
    mix_sources : np.ndarray
        Mixture signal (S_m), evaluated against the references exactly as if
        it were an estimate, to obtain the baseline SDR.
    compute_permutation : bool, optional
        Forwarded unchanged to both ``bss_eval_sources`` calls. The default
        ``True`` reproduces the previous hard-coded behavior; pass ``False``
        when the estimates are already in the same order as the references.

    Returns
    -------
    np.ndarray
        Element-wise difference between the estimate's SDR vector and the
        mixture's SDR vector.
    """
    # Only the first return value (SDR) is needed from each evaluation.
    sdr_estimate = me.separation.bss_eval_sources(
        reference_sources, estimated_sources, compute_permutation=compute_permutation
    )[0]
    sdr_mix = me.separation.bss_eval_sources(
        reference_sources, mix_sources, compute_permutation=compute_permutation
    )[0]

    return sdr_estimate - sdr_mix

In [9]:
# Whole-signal BSS Eval on the stacked sources. The fourth return value (the
# permutation) is not used in this notebook and is discarded into `_`.
sdr, sir, sar, _ = me.separation.bss_eval_sources(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
    compute_permutation=True,
)

In [10]:
# Display the SIR vector returned by the bss_eval_sources call in the previous cell.
sir

array([227.98471633, 227.98471633])

In [11]:
# Display the SAR vector returned by the same bss_eval_sources call.
sar

array([4.10238682, 4.10238682])

In [12]:
# NSDR of the estimate relative to the raw mixture; the bare name on the last
# line displays the result.
nsdr_out = nsdr(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
    mix_sources=mix_sources,
)
nsdr_out

array([8.18227768, 8.18227768])

### 2.3 Image to spatial distortion

``` python
mir_eval.separation.bss_eval_images(reference_sources, estimated_sources, compute_permutation=True)
```

#### Returns

- **sdr:** ```np.ndarray, shape=(nsrc,)```, vector of Signal to Distortion Ratios (SDR)


- **isr:** ```np.ndarray, shape=(nsrc,)```, vector of source Image to Spatial distortion Ratios (ISR)


- **sir:** ```np.ndarray, shape=(nsrc,)```, vector of Source to Interference Ratios (SIR)


- **sar:** ```np.ndarray, shape=(nsrc,)```, vector of Sources to Artifacts Ratios (SAR)


- perm: not used

In [13]:
# Image-based BSS Eval: same inputs as before, but this variant also returns
# ISR. The trailing permutation is discarded into `_`.
sdr, isr, sir, sar, _ = me.separation.bss_eval_images(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
    compute_permutation=True,
)

In [14]:
# Display the SIR vector returned by the bss_eval_images call in the previous cell.
sir

array([227.98471633, 227.98471633])

In [15]:
# Display the SAR vector returned by the same bss_eval_images call.
sar

array([4.10238682, 4.10238682])

In [16]:
# NOTE(review): nsdr() is built on bss_eval_sources, not bss_eval_images, so
# this cell repeats the source-based NSDR rather than an image-based one --
# confirm whether an image-based variant was intended here.
nsdr_out = nsdr(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
    mix_sources=mix_sources,
)
nsdr_out

array([8.18227768, 8.18227768])

### 2.4 Image to spatial distortion framewise

``` python
mir_eval.separation.bss_eval_images_framewise(reference_sources, estimated_sources, window=1323000, hop=661500, compute_permutation=False)
```

#### Returns

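- **sdr:** ```np.ndarray, shape=(nsrc, nframes)```, Signal to Distortion Ratios (SDR) per source and frame


- **isr:** ```np.ndarray, shape=(nsrc, nframes)```, source Image to Spatial distortion Ratios (ISR) per source and frame


- **sir:** ```np.ndarray, shape=(nsrc, nframes)```, Source to Interference Ratios (SIR) per source and frame


- **sar:** ```np.ndarray, shape=(nsrc, nframes)```, Sources to Artifacts Ratios (SAR) per source and frame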


- perm: not used

In [17]:
# Framewise image-based BSS Eval. window and hop are given in samples:
# 30 * 44100 = 1323000 and 15 * 44100 = 661500, i.e. 30 s and 15 s at 44.1 kHz.
# NOTE(review): the reference file is named "*_downsampled", so these sample
# counts may not correspond to 30 s / 15 s for this data -- TODO confirm the
# sample rate. The single-column output suggests the signal yields only one
# frame with this window.
sdr, isr, sir, sar, _ = me.separation.bss_eval_images_framewise(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
    window=30 * 44100,
    hop=15 * 44100,
    compute_permutation=False,
)

In [18]:
# Display the framewise SIR from the previous cell (a 2-D array here, unlike
# the 1-D vectors returned by the whole-signal metrics).
sir

array([[227.98471633],
       [227.98471633]])

In [19]:
# Display the framewise SAR from the same bss_eval_images_framewise call.
sar

array([[4.10238682],
       [4.10238682]])

In [20]:
# NOTE(review): nsdr() evaluates the whole signal with bss_eval_sources, so
# this is not a framewise NSDR; it repeats the whole-signal value -- confirm
# whether a per-frame variant was intended here.
nsdr_out = nsdr(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
    mix_sources=mix_sources,
)
nsdr_out

array([8.18227768, 8.18227768])

### Helper functions

#### Validate input data

Checks that the input data to a metric are valid, and throws helpful errors if not.

``` python
mir_eval.separation.validate(reference_sources, estimated_sources)
```

In [21]:
# Sanity-check the inputs; produces no output when the check passes.
me.separation.validate(
    reference_sources=reference_sources,
    estimated_sources=estimated_sources,
)

## 3 Evaluate all

In [26]:
# Vocal NSDR (estimate SDR minus mixture SDR), one value per stacked source.
nsdrs = nsdr(reference_sources, estimated_sources, mix_sources)

# SIR and SAR come from a plain bss_eval_sources run; its SDR and permutation
# outputs are not needed here and are discarded into `_`.
_, sirs, sars, _ = me.separation.bss_eval_sources(
    reference_sources, estimated_sources, compute_permutation=True
)

# Start with every metric set to None, then fill in the vocal entries with the
# mean over sources. The instrumental entries stay None because nothing in
# this cell evaluates an instrumental estimate.
means = dict.fromkeys([
    'NSDR Vocal',
    'NSDR Instrumental',
    'SIR Vocal',
    'SIR Instrumental',
    'SAR Vocal',
    'SAR Instrumental',
])
means['NSDR Vocal'] = np.mean(nsdrs)
means['SIR Vocal'] = np.mean(sirs)
means['SAR Vocal'] = np.mean(sars)
means

{'NSDR Vocal': 8.182277682555021,
 'NSDR Instrumental': None,
 'SIR Vocal': 227.98471632599598,
 'SIR Instrumental': None,
 'SAR Vocal': 4.10238682484913,
 'SAR Instrumental': None}

## 4 Comparison with MIREX results

In [None]:
# Hard-coded MIREX scores to compare against the `means` computed in this
# notebook; the keys match the keys of `means` one-to-one.
# NOTE(review): the origin of these numbers is not recorded here -- add a
# citation or link.
mirex_results = {
    'NSDR Vocal': 8.681,
    'NSDR Instrumental': 7.945,
    'SIR Vocal': 15.308,
    'SIR Instrumental': 21.975,
    'SAR Vocal': 11.301,
    'SAR Instrumental': 15.462
    }

# Backward-compatible alias: the dict was originally bound to the misspelled
# name `mired_results`, which other cells may still reference.
mired_results = mirex_results