In [1]:
import os
import yaml
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
import IPython.display as ipd

%matplotlib inline

In [2]:
root_dir = '/media/apelykh/bottomless-pit/datasets/mixing/MedleyDB/Audio'

---
# MedleyDB data surgery

To prepare the MedleyDB dataset for feeding into the mixing model, we have to:
1. Sum the stems to produce unmixed tracks (that will be fed into a source separation model);
2. Group stems into categories such as `drums`, `bass`, `vocals`, `other`. The list of categories should correspond to the source separation model output to enable the full pipeline;

### 1. Sum stems to pruduce unmixed tracks

In [3]:
for song_name in os.listdir(root_dir):
    print(song_name)
    
    mix_path = os.path.join(root_dir, song_name, '{}_MIX.wav'.format(song_name))
    mix, sr = librosa.load(mix_path, sr=44100)
    
    stems_path = os.path.join(root_dir, song_name, '{}_STEMS'.format(song_name))
    sum_track = np.zeros_like(mix)

    for track_name in os.listdir(stems_path):
        track_path = os.path.join(stems_path, track_name)
        track, sr = librosa.load(track_path, sr=44100)
        sum_track += track
    
    sum_track_path = os.path.join(root_dir, song_name, '{}_SUM.wav'.format(song_name))
    librosa.output.write_wav(sum_track_path, sum_track, sr=44100, norm=True)

AClassicEducation_NightOwl
AimeeNorwich_Child
AimeeNorwich_Flying
AlexanderRoss_GoodbyeBolero
AlexanderRoss_VelvetCurtain
AmarLal_Rest
AmarLal_SpringDay1
Auctioneer_OurFutureFaces
AvaLuna_Waterduct
BigTroubles_Phantom
BrandonWebster_DontHearAThing
BrandonWebster_YesSirICanFly
CelestialShore_DieForUs
ChrisJacoby_BoothShotLincoln
ChrisJacoby_PigsFoot
ClaraBerryAndWooldog_AirTraffic
ClaraBerryAndWooldog_Boys
ClaraBerryAndWooldog_Stella
ClaraBerryAndWooldog_TheBadGuys
MusicDelta_ChineseDrama
MusicDelta_ChineseHenan
MusicDelta_ChineseJiangNan
MusicDelta_ChineseXinJing
MusicDelta_ChineseYaoZu
MusicDelta_CoolJazz
MusicDelta_Country1
MusicDelta_Country2
MusicDelta_Disco
MusicDelta_FreeJazz
MusicDelta_FunkJazz
MusicDelta_FusionJazz
MusicDelta_Gospel
MusicDelta_GriegTrolltog
MusicDelta_Grunge
MusicDelta_Hendrix
MusicDelta_InTheHalloftheMountainKing
MusicDelta_LatinJazz
MusicDelta_ModalJazz
MusicDelta_Pachelbel
MusicDelta_Punk
MusicDelta_Reggae
MusicDelta_Rock
MusicDelta_Rockabilly
MusicDelta_Sha



NoBackendError: 

### 2. Group stems into categories

Show the full set of instruments to split them into `drums`, `bass`, `vocals`, `other` subsetsets

In [100]:
instruments = set()

for song_name in os.listdir(root_dir):
    info_file = os.path.join(root_dir, song_name, '{}_METADATA.yaml'.format(song_name))
    
    with open(info_file) as f:
        info = yaml.full_load(f)

        for stem in info['stems']:
            instrument = info['stems'][stem]['instrument']
            instruments.add(instrument)

print(instruments)

{'viola section', 'soprano saxophone', 'drum set', 'tabla', 'bamboo flute', 'tuba', 'piccolo', 'bongo', 'fx/processed sound', 'snare drum', 'trombone section', 'dizi', 'french horn section', 'bassoon', 'toms', 'harmonica', 'female singer', 'lap steel guitar', 'cymbal', 'vocalists', 'brass section', 'sampler', 'gu', 'clarinet section', 'cello section', 'liuqin', 'male speaker', 'violin section', 'viola', 'bass drum', 'gong', 'clean electric guitar', 'horn section', 'doumbek', 'mandolin', 'drum machine', 'tack piano', 'chimes', 'cello', 'zhongruan', 'distorted electric guitar', 'electric bass', 'erhu', 'double bass', 'glockenspiel', 'timpani', 'clarinet', 'yangqin', 'trombone', 'electric piano', 'accordion', 'oboe', 'flute section', 'oud', 'acoustic guitar', 'baritone saxophone', 'french horn', 'kick drum', 'trumpet', 'Main System', 'vibraphone', 'male singer', 'guzheng', 'auxiliary percussion', 'tenor saxophone', 'trumpet section', 'scratches', 'darbuka', 'banjo', 'violin', 'melodica', 

In [68]:
drum_instruments = set([
    'drum set', 'kick drum', 'bass drum', 'snare drum', 'toms', 'cymbal', 'gong',
    'tabla', 'darbuka', 'bongo', 'doumbek', 'tambourine', 'drum machine', 'timpani',
    'auxiliary percussion', 'shaker', 'claps'
])

bass_instruments = set(['electric bass', 'double bass'])

vocal_instruments = set([
    'male singer', 'male rapper', 'male speaker', 'female singer', 'vocalists'
])

In [101]:
def group_stem_ids(song_path):
    _, song_name = os.path.split(song_path)
    info_file = os.path.join(song_path, '{}_METADATA.yaml'.format(song_name))
    
    with open(info_file) as f:
        info = yaml.full_load(f)

    drum_stems = []
    bass_stems = []
    vocal_stems = []
    other_stems = []
        
    for stem in info['stems']:
        stem_id = stem[1:]
        instrument = info['stems'][stem]['instrument']
        component = info['stems'][stem]['component']
        
        if instrument in drum_instruments:
            drum_stems.append(stem_id)
        elif instrument in bass_instruments or component == 'bass':
            bass_stems.append(stem_id)
        elif instrument in vocal_instruments:
            vocal_stems.append(stem_id)
        else:
            other_stems.append(stem_id)
            
    return drum_stems, bass_stems, vocal_stems, other_stems


def sum_stems(song_path, stem_ids: list, category: str, sr=44100):
    _, song_name = os.path.split(song_path)    
    stems_dir = os.path.join(song_path, '{}_STEMS'.format(song_name))
    joined_stems_dir = os.path.join(song_path, '{}_STEMS_JOINED'.format(song_name))

    if not os.path.exists(joined_stems_dir):
        os.makedirs(joined_stems_dir)
    
    # skipping processed folders on the second run
    if len(os.listdir(joined_stems_dir)) == 4:
        print('[.] skipping ...')
        return
    
    mix_path = os.path.join(song_path, '{}_MIX.wav'.format(song_name))
    mix, _ = librosa.load(mix_path, sr)
    
    summed_track = np.zeros_like(mix)
    for stem_id in stem_ids:
        stem_name = '{}_STEM_{}.wav'.format(song_name, stem_id)
        stem_path = os.path.join(stems_dir, stem_name)
        track, _ = librosa.load(stem_path, sr)
        summed_track += track

    joined_stem_path = os.path.join(
        joined_stems_dir, '{}_STEM_{}.wav'.format(song_name, category.upper()))
    sf.write(joined_stem_path, summed_track, sr)

    print('[+] {} saved'.format(category))

In [79]:
for song_name in os.listdir(root_dir):
    print(song_name)
    
    song_path = os.path.join(root_dir, song_name)

    drum_stems, bass_stems, vocal_stems, other_stems = group_stem_ids(song_path)
    print(drum_stems, bass_stems, vocal_stems, other_stems)

    sum_stems(song_path, drum_stems, category='drums')
    sum_stems(song_path, bass_stems, category='bass')
    sum_stems(song_path, vocal_stems, category='vocals')
    sum_stems(song_path, other_stems, category='other')
    
    print('-' * 40)

AClassicEducation_NightOwl
['02', '11'] ['01'] ['08', '10', '13'] ['03', '04', '05', '06', '07', '09', '12']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
AimeeNorwich_Child
['01'] ['02'] ['04', '05'] ['03', '06', '07']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
AimeeNorwich_Flying
['01'] ['03'] [] ['02', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
AlexanderRoss_GoodbyeBolero
['05'] ['04'] ['06'] ['01', '02', '03', '07', '08', '09']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
AlexanderRoss_VelvetCurtain
['05'] ['04'] ['06', '09'] ['01', '02', '03', '07', '08', '10']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------

['01', '05'] ['02'] ['03', '10'] ['04', '06', '07', '08', '09', '11']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
TheSoSoGlos_Emergency
['03', '06'] ['01'] ['02', '05'] ['04', '07', '08', '09', '10']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
Creepoid_OldTree
['04', '10'] ['02'] ['01', '07', '08', '09'] ['03', '05', '06']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
CroqueMadame_Oil
['04'] ['03'] [] ['01', '02']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
CroqueMadame_Pilot
['04'] ['03'] [] ['01', '02']
[.] skipping ...
[.] skipping ...
[.] skipping ...
[.] skipping ...
----------------------------------------
DreamersOfTheGhetto_HeavyLove
['04', '05'] ['01'] ['02', '06', '08'] ['03', '07', '09']
[.] skipping ...
[.] skipping ...


[+] drums saved
[+] bass saved
[+] vocals saved
[+] other saved
----------------------------------------
_MatthewEntwistle_ImpressionsOfSaturn
[] [] [] ['01', '02', '03', '04', '05', '06', '07', '08', '09']
[+] drums saved
[+] bass saved
[+] vocals saved
[+] other saved
----------------------------------------
_MatthewEntwistle_Lontano
[] [] ['02'] ['01']
[+] drums saved
[+] bass saved
[+] vocals saved
[+] other saved
----------------------------------------
_MatthewEntwistle_TheArch
['04', '19'] [] [] ['01', '02', '03', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18']
[+] drums saved
[+] bass saved
[+] vocals saved
[+] other saved
----------------------------------------
_MatthewEntwistle_TheFlaxenField
[] [] [] ['01', '02', '03', '04', '05']
[+] drums saved
[+] bass saved
[+] vocals saved
[+] other saved
----------------------------------------
_MichaelKropf_AllGoodThings
[] [] [] ['01', '02']
[+] drums saved
[+] bass saved
[+] vocals saved
[+] othe

### Manual correction

During the first run I've missed synthesizers that were playing the bass part.
Fixing it manually for:

[Lushlife_ToynbeeSuite,
TheSoSoGlos_Emergency,
EthanHein_HarmonicaFigure,
HeladoNegro_MitadDelMundo,
MusicDelta_InTheHalloftheMountainKing]

In [86]:
song_name = 'TheSoSoGlos_Emergency'
song_path = os.path.join(root_dir, song_name)

drum_stems = ['03', '06']
bass_stems = ['01', '08']
vocal_stems = ['02', '05']
other_stems = ['04', '07', '09', '10']

sum_stems(song_path, drum_stems, category='drums')
sum_stems(song_path, bass_stems, category='bass')
sum_stems(song_path, vocal_stems, category='vocals')
sum_stems(song_path, other_stems, category='other')

[+] drums saved
[+] bass saved
[+] vocals saved
[+] other saved


### Checking tracks origin

In [4]:
num_songs = 0

for song_name in os.listdir(root_dir):
    info_file = os.path.join(root_dir, song_name, '{}_METADATA.yaml'.format(song_name))
    
    if not os.path.exists(info_file):
        continue
    
    with open(info_file) as f:
        info = yaml.full_load(f)

        if info['origin'] != 'Weathervane Music':
            print(song_name)
            num_songs += 1
            
print(str(num_songs) + ' total')

AimeeNorwich_Child
AimeeNorwich_Flying
AlexanderRoss_GoodbyeBolero
AlexanderRoss_VelvetCurtain
AmarLal_Rest
AmarLal_SpringDay1
MusicDelta_ChineseDrama
MusicDelta_ChineseHenan
MusicDelta_ChineseJiangNan
MusicDelta_ChineseXinJing
MusicDelta_ChineseYaoZu
MusicDelta_CoolJazz
MusicDelta_Country1
MusicDelta_Country2
MusicDelta_Disco
MusicDelta_FreeJazz
MusicDelta_FunkJazz
MusicDelta_FusionJazz
MusicDelta_Gospel
MusicDelta_GriegTrolltog
MusicDelta_Grunge
MusicDelta_Hendrix
MusicDelta_InTheHalloftheMountainKing
MusicDelta_LatinJazz
MusicDelta_ModalJazz
MusicDelta_Pachelbel
MusicDelta_Punk
MusicDelta_Reggae
MusicDelta_Rock
MusicDelta_Rockabilly
MusicDelta_Shadows
KarimDouaidy_Yatora
LizNelson_Coldwar
LizNelson_ImComingHome
LizNelson_Rainfall
MatthewEntwistle_AnEveningWithOliver
MatthewEntwistle_DontYouEver
MatthewEntwistle_FairerHopes
MatthewEntwistle_ImpressionsOfSaturn
MatthewEntwistle_Lontano
MatthewEntwistle_TheArch
MatthewEntwistle_TheFlaxenField
Meaxic_TakeAStep
Meaxic_YouListen
MichaelKr