In [38]:
import os
import subprocess
import random
import numpy as np
from IPython.display import Audio

# MSoS

In [3]:
# Download the MSoS challenge 2018 development set into ./data
url = "https://salford.figshare.com/ndownloader/articles/6901475/versions/4"
download_path = os.path.join(os.getcwd(), 'data')

# An existing data directory is taken to mean the download already happened,
# so re-running this cell is a no-op.
if not os.path.exists(download_path):
    # Argument lists (no shell) keep paths containing spaces intact, and
    # check=True stops at the first failing step instead of moving/deleting
    # files after a broken download.
    subprocess.run(['wget', url], check=True)  # saved under the name "4"
    subprocess.run(['unzip', '4'], check=True)
    subprocess.run(['unzip', 'MSoS_challenge_2018_Development_v1-00.zip'], check=True)
    subprocess.run(['mv', 'Development', download_path], check=True)
    # Cleanup is best-effort; the shell is only needed here to expand globs.
    subprocess.run('rm MSoS*.zip', shell=True)
    subprocess.run('rm *.csv', shell=True)
    subprocess.run(['rm', '4'])

In [4]:
# Index the audio files of every MSoS category by category name
sources = ['Effects', 'Human', 'Music', 'Urban', 'Nature']
AUDIOFILES = {
    source: [
        os.path.join(download_path, source, filename)
        for filename in os.listdir(os.path.join(download_path, source))
    ]
    for source in sources
}
# Report how many files were found per category
for source, files in AUDIOFILES.items():
    print ('Source : {}, \t Number of files: {}'.format(source, len(files)))

Source : Effects, 	 Number of files: 300
Source : Human, 	 Number of files: 300
Source : Music, 	 Number of files: 300
Source : Urban, 	 Number of files: 300
Source : Nature, 	 Number of files: 300


# Musdb

In [6]:
import musdb
# Open the MUSDB database; download=True fetches the 7 s sample dataset
# (see the log printed below this cell).
mus = musdb.DB(download=True)
# Display the audio array of the first track as the cell output.
mus[0].audio

Downloading MUSDB 7s Sample Dataset to /home/jovyan/MUSDB18/MUSDB18-7...
Done!


array([[-8.84704590e-02, -3.79333496e-02],
       [-1.07421875e-01, -5.22460938e-02],
       [-8.42590332e-02, -5.48400879e-02],
       ...,
       [-8.23974609e-04,  3.05175781e-05],
       [-8.54492188e-04,  7.01904297e-04],
       [-8.54492188e-04,  1.31225586e-03]])

In [11]:
# Draw a random track and restrict it to a random 5-second excerpt.
track = random.choice(mus.tracks)
track.chunk_duration = 5.0
# Latest possible start keeps the whole excerpt inside the track.
track.chunk_start = random.uniform(0, track.duration - track.chunk_duration)
# x: the mixture, y: the 'vocals' target; both transposed from musdb's layout.
x = track.audio.T
y = track.targets['vocals'].audio.T

In [35]:
# Show how chunking changes the audio that musdb loads for a track
track = random.choice(mus.tracks)
print (track.audio.shape)  # shape before any chunking is configured
track.chunk_duration = 5.0
# The start must be assigned on the track itself (track.chunk_start); binding a
# separate variable leaves the excerpt start untouched.
track.chunk_start = random.uniform(0, track.duration - track.chunk_duration)
print (track.audio.shape)  # shape of the 5-second excerpt

(300032, 2)
(220500, 2)


In [39]:
import museval
# Stand-in separation result: random arrays shaped like the track audio
estimates = {
    target_name: np.random.random(track.audio.shape)
    for target_name in ('vocals', 'accompaniment')
}

# Score the estimates with BSSEval v4; results are also written to `./eval`
scores = museval.eval_mus_track(track, estimates, output_dir="./eval")
print(scores)

vocals          ==> SDR: -17.506  SIR:   7.700  ISR: -12.018  SAR:  -4.098  
accompaniment   ==> SDR: -14.066  SIR: -16.224  ISR:  -0.706  SAR:  -4.105  



# MUSDB18 to DataLoader

In [None]:
# mix sources on the fly

# Important parameters of the dataset defined below:
#     root              : dataset directory, e.g. ~/MUSDB18/MUSDB18-7
#     split             : 'train' or 'valid'
#     samples_per_track : number of excerpts drawn from each track (song)
#     seq_duration      : length of each excerpt, in seconds
#     target            : name of the source to isolate from the mix
#     dtype             : tensor dtype, e.g. torch.float32

In [58]:
import torch
import tqdm

In [69]:
class MUSDBDataset(torch.utils.data.Dataset):
    """MUSDB tracks served as ``(mixture, target)`` tensor pairs.

    When ``split == 'train'`` and ``seq_duration`` is truthy, every item is a
    freshly assembled mix: each source is read from its own random excerpt of
    the track and the excerpts are summed. Otherwise the item is the complete
    pre-mixed track together with the requested target.

    Args:
        root: dataset directory, passed to ``musdb.DB``.
        split: split name, passed to ``musdb.DB``; only ``'train'`` enables
            the random mixing.
        target: name of the source to isolate. During training a name that is
            not one of the musdb sources is treated as "everything except
            vocals".
        samples_per_track: number of items produced per track.
        seq_duration: excerpt length in seconds; a falsy value yields full
            tracks.
        dtype: torch dtype of the returned tensors.
    """

    def __init__(self, root, split, target, samples_per_track, seq_duration, dtype):
        self.mus = musdb.DB(root = root, split = split)
        self.split = split
        self.samples_per_track = samples_per_track
        self.seq_duration = seq_duration
        self.dtype = dtype
        self.target = target

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair (mixture, target) for ``index``."""
        # consecutive indices map onto the same track, samples_per_track times
        track = self.mus.tracks[index // self.samples_per_track]
        # at training time we assemble a custom mix
        if self.split == 'train' and self.seq_duration:
            return self._random_mix(track)
        # for validation and test, we deterministically yield the full
        # pre-mixed musdb track
        return self._full_track(track)

    def _random_mix(self, track):
        """Sum independently positioned random excerpts of every source."""
        source_names = list(self.mus.setup['sources'])
        audio_sources = []
        for source in source_names:
            # set the excerpt duration
            track.chunk_duration = self.seq_duration
            # set random start position; clamp the upper bound so a track
            # shorter than seq_duration never gets a negative start
            latest_start = max(0.0, track.duration - self.seq_duration)
            track.chunk_start = random.uniform(0, latest_start)
            # load the source excerpt, transposed from musdb's layout
            audio = torch.tensor(track.sources[source].audio.T, dtype = self.dtype)
            audio_sources.append(audio)
        # create stem tensor of shape (source, channel, samples)
        stems = torch.stack(audio_sources, dim=0)
        # apply linear mix over source index=0
        x = stems.sum(0)

        if self.target in source_names:
            y = stems[source_names.index(self.target)]
        # assuming vocal/accompaniment scenario if target is not a source
        else:
            # apply time domain subtraction
            y = x - stems[source_names.index('vocals')]
        return x, y

    def _full_track(self, track):
        """Load the pre-mixed track and the requested target in full."""
        # get the non-linear source mix straight from musdb
        x = torch.tensor(track.audio.T, dtype = self.dtype)
        y = torch.tensor(track.targets[self.target].audio.T, dtype = self.dtype)
        return x, y

    def __len__(self):
        """Number of items: one per track, repeated ``samples_per_track`` times."""
        return len(self.mus.tracks) * self.samples_per_track

    
def load_dataset(root, samples_per_track, target, seq_duration, dtype):
    """Build the training and validation ``MUSDBDataset`` pair.

    The training dataset receives ``samples_per_track`` and ``seq_duration``
    as given. The validation dataset is always built with one sample per
    track and ``seq_duration=None``.

    Returns:
        tuple: ``(train_dataset, valid_dataset)``.
    """
    # arguments common to both splits
    shared = dict(root = root, target = target, dtype = dtype)

    train_dataset = MUSDBDataset(
        split='train',
        samples_per_track = samples_per_track,
        seq_duration = seq_duration,
        **shared
    )
    valid_dataset = MUSDBDataset(
        split='valid',
        samples_per_track=1,
        seq_duration = None,
        **shared
    )
    return train_dataset, valid_dataset

In [74]:
# Two 5-second excerpts per track, with the vocals as the separation target.
# NOTE(review): load_dataset returns (train, valid); the second value is built
# with split='valid' even though it is bound to `test_dataset` here.
train_dataset, test_dataset = load_dataset(root = "~/MUSDB18/MUSDB18-7",
                                           samples_per_track = 2,
                                           seq_duration = 5,
                                           dtype = torch.float32,
                                           target = "vocals")

In [76]:
# Add up the length, in seconds, of every training excerpt.
# The dataset class defines no sample_rate attribute, so the rate is hard-coded.
SAMPLE_RATE = 44100
total_training_duration = 0
for k in tqdm.tqdm(range(len(train_dataset))):
    x, y = train_dataset[k]
    n_samples = x.shape[1]  # x is (channels, samples)
    total_training_duration += n_samples / SAMPLE_RATE

100%|██████████| 260/260 [05:50<00:00,  1.35s/it]


In [77]:
# Total seconds of audio drawn by the loop above (260 excerpts of 5 s each).
total_training_duration

1300.0