In [2]:
import librosa
import os
import gzip
import numpy as np
from subprocess import call
import medleydb as mdb
import glob
import time

import scipy
from scipy import interpolate
# Report the versions of the numerical/audio libraries used by this notebook
# so the run can be reproduced later.
for version_line in (
    'librosa version: {:s}'.format(librosa.__version__),
    'scipy version: {:s}'.format(scipy.__version__),
    'numpy version: {:s}'.format(np.__version__),
):
    print(version_line)

librosa version: 0.5.0
scipy version: 0.18.1
numpy version: 1.11.3


In [3]:
# Instrument labels that count as vocal sources; a stem is considered vocal
# when its first instrument label is a member of this set.
vocalSet = {
    'female singer',
    'male rapper',
    'male singer',
    'male speaker',
    'vocalists',
}

In [4]:
# Constant-Q transform settings. The whole mapping is unpacked as keyword
# arguments into librosa.cqt, and 'sr' / 'hop_length' are also reused for
# audio loading and for converting frame indices to seconds.
params_CQT = dict(
    sr=44100,
    hop_length=1024,
    fmin=None,           # Minimum frequency. Defaults to C1 ~= 32.70 Hz
    n_bins=288,
    bins_per_octave=36,  # together with n_bins this gives 288 // 36 = 8 octaves
    tuning=None,         # Tuning offset in fractions of a bin (cents).
    filter_scale=1,      # Filter scale factor. Small values (<1) use shorter windows for improved time resolution.
    norm=1,              # Type of norm to use for basis function normalization.
    sparsity=0.01,       # Sparsify the CQT basis by discarding up to sparsity fraction of the energy in each basis.
)

In [5]:
# Display the audio root configured in medleydb; the feature-extraction cell
# lists this directory to obtain the track names it iterates over.
mdb.AUDIO_PATH

'/scratch/js7561/datasets/MedleyDB/Audio'

In [6]:
# Feature extraction for one deformation of the dataset.
#
# For every track found under mdb.AUDIO_PATH, each matching deformed audio file
# in `audio_folder` gets:
#   * a log-amplitude CQT, saved as <name>_cqt.npy.gz
#   * a per-CQT-frame vocal activation curve, saved as <name>_vocalactivation.npy.gz
# Both are written below `output_folder`, in sub-folders whose names encode the
# CQT settings used.

# parameters
deformer = 'stretch1'
audio_folder = '/scratch/js7561/datasets/MedleyDB_output/{:s}/audio/'.format(deformer)
output_folder = '/scratch/js7561/datasets/MedleyDB_output/{:s}/features/'.format(deformer)

compute_cqt = True         # False: reuse the CQT files already on disk
compute_activation = True  # False: skip the vocal activation labels

assert os.path.isdir(audio_folder), 'audio folder not found: {:s}'.format(audio_folder)
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

# Output sub-folders depend only on the CQT settings, so they are resolved
# (and created) once instead of once per audio file.
# CQT folder name: cqt<sr>_<hop>_<number of octaves>_<bins per octave>
cqtfolder = 'cqt{:d}_{:d}_{:d}_{:d}'.format(
    params_CQT['sr'], params_CQT['hop_length'],
    params_CQT['n_bins']//params_CQT['bins_per_octave'],
    params_CQT['bins_per_octave'])
actfolder = 'vocal_activation{:d}_{:d}'.format(params_CQT['sr'], params_CQT['hop_length'])
for _subfolder in (cqtfolder, actfolder):
    if not os.path.isdir(os.path.join(output_folder, _subfolder)):
        os.makedirs(os.path.join(output_folder, _subfolder))

# multitrack generator
trackList = os.listdir(mdb.AUDIO_PATH)
mtrack_generator = mdb.load_multitracks(trackList)

# `counter` counts tracks (not files); it is printed next to every file name
# purely as a progress indicator.
counter = 0
for track in mtrack_generator:

    audiofiles = glob.glob(os.path.join(audio_folder, '{:s}_*.wav'.format(track.track_id)))
    for audiofile in audiofiles:

        # strip only the trailing extension, not every '.wav' substring
        filename = os.path.splitext(os.path.basename(audiofile))[0]
        print(counter, filename)

        cqtfilename = '{:s}_cqt.npy.gz'.format(filename)
        cqtfullpath = os.path.join(output_folder, cqtfolder, cqtfilename)

        actfilename = '{:s}_vocalactivation.npy.gz'.format(filename)
        actfullpath = os.path.join(output_folder, actfolder, actfilename)

        if compute_cqt:

            # load audio
            x, sr = librosa.load(audiofile, sr=params_CQT['sr'])

            # compute and save log-CQT
            logcqt = librosa.logamplitude(np.abs(librosa.cqt(y=x, **params_CQT)), ref=1.0)
            # The context manager guarantees the gzip stream is flushed and
            # closed; an unclosed GzipFile can leave a truncated archive.
            with gzip.open(cqtfullpath, 'wb') as cqt_file:
                np.save(cqt_file, logcqt)
        else:
            # for activation only: the number of CQT frames is needed below
            with gzip.open(cqtfullpath, 'rb') as cqt_file:
                logcqt = np.load(cqt_file)

        # compute vocal activation curve (corrected version)
        if compute_activation:
            # Columns of the activation table (after the leading time column)
            # that belong to vocal stems; stem_idx is shifted by one to obtain
            # a 0-based column index.
            vocalIdx = ([(track.stems[k].stem_idx - 1) for k in track.stems.keys() if
                         track.stems[k].instrument[0] in vocalSet])

            try:
                if len(vocalIdx) != 0:
                    # Column 0 holds time stamps, the remaining columns hold
                    # one activation curve per stem.
                    activation_table = np.array(track.stem_activations)
                    stem_activations = activation_table[:, 1:]
                    # Deliberately NOT named `time`: that would shadow the
                    # imported `time` module for the rest of the notebook.
                    activation_times = activation_table[:, 0]
                    vocal_activation = np.max(stem_activations[:, vocalIdx], 1)

                    # interpolation function for vocal activation curve
                    # some issue with end point (off by a few ms) so fill with last value
                    f = interpolate.interp1d(activation_times, vocal_activation, kind='linear',
                                             bounds_error=False, fill_value=vocal_activation[-1])

                    # vocal activation labels per frame: frame i starts at
                    # i * hop_length / sr seconds; evaluated in one vectorised call
                    frame_times = (np.arange(logcqt.shape[1]) * params_CQT['hop_length']
                                   / float(params_CQT['sr']))
                    specLabel = np.asarray(f(frame_times))
                else:
                    # no vocal stem in this track: all frames are non-vocal
                    specLabel = np.zeros(logcqt.shape[1])

                # save label
                with gzip.open(actfullpath, 'wb') as act_file:
                    np.save(act_file, specLabel)
            except IndexError as err:
                # Raised when the activation table cannot be indexed as a 2-D
                # array or lacks a vocal column; report and move on.
                print('MISSING ACTIVATION: {:s}'.format(track.track_id))
                print(err)

    counter += 1

0 MusicDelta_FreeJazz_stretch1_1
0 MusicDelta_FreeJazz_stretch1_2
0 MusicDelta_FreeJazz_stretch1_3
0 MusicDelta_FreeJazz_stretch1_0
1 TablaBreakbeatScience_CaptainSky_stretch1_0
1 TablaBreakbeatScience_CaptainSky_stretch1_1
1 TablaBreakbeatScience_CaptainSky_stretch1_3
1 TablaBreakbeatScience_CaptainSky_stretch1_2
2 TablaBreakbeatScience_PhaseTransition_stretch1_0
2 TablaBreakbeatScience_PhaseTransition_stretch1_1
2 TablaBreakbeatScience_PhaseTransition_stretch1_2
2 TablaBreakbeatScience_PhaseTransition_stretch1_3
3 MusicDelta_GriegTrolltog_stretch1_3
3 MusicDelta_GriegTrolltog_stretch1_0
3 MusicDelta_GriegTrolltog_stretch1_2
3 MusicDelta_GriegTrolltog_stretch1_1
4 CroqueMadame_Oil_stretch1_3
4 CroqueMadame_Oil_stretch1_0
4 CroqueMadame_Oil_stretch1_2
4 CroqueMadame_Oil_stretch1_1
5 TablaBreakbeatScience_RockSteady_stretch1_3
5 TablaBreakbeatScience_RockSteady_stretch1_1
5 TablaBreakbeatScience_RockSteady_stretch1_0
5 TablaBreakbeatScience_RockSteady_stretch1_2
6 KarimDouaidy_Yatora_st