In [1]:
import numpy as np
import pandas as pd
import os
import glob

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("C:/Work/IFS/rp_extract")

import librosa 

os.environ['PATH'] += os.pathsep + "D:/Research/Tools/ffmpeg/bin"

import audiofile_read as ar

In [2]:
DATA_DIR    = "E:/Data/MIR/EU_SOUNDS"
FEATURE_DIR = "E:/Data/MIR/EU_SOUNDS_FEATURES_LIBROSA"

In [3]:
feature_extractor = {}

#### Chroma

In [4]:
feature_extractor["chroma"] = lambda S, sr : \
\
        librosa.feature.chroma_stft(S          = S   ,         # power spectrogram
                                    sr         = sr,           # samplerate
                                    norm       = np.inf,       # Column-wise normalization
                                    tuning     = None)         # Deviation from A440 tuning in fractional bins (cents). 
                                                               #    If None, it is automatically estimated.

#### MFCC

In [5]:
feature_extractor["mfcc"] = lambda S, sr : \
\
        librosa.feature.mfcc(S      = S,                       # power spectrogram
                             sr     = sr,                      # samplerate
                             n_mfcc = 13)                      # number of MFCCs to return

#### RMSE

In [6]:
feature_extractor["rmse"] = lambda S, sr : \
\
        librosa.feature.rmse(S  = S)                           # power spectrogram

#### Spectral Centroid

In [7]:
feature_extractor["spectral_centroid"] = lambda S, sr : \
\
        librosa.feature.spectral_centroid(S  = S,              # power spectrogram
                                          sr = sr)             # samplerate

#### Spectral Bandwidth

In [8]:
feature_extractor["spectral_bandwidth"] = lambda S, sr : \
\
        librosa.feature.spectral_bandwidth(S  = S,             # power spectrogram
                                           sr = sr)            # samplerate

#### Spectral Contrast

In [9]:
feature_extractor["spectral_contrast"] = lambda S, sr : \
\
        librosa.feature.spectral_contrast(sr         = sr,     # sampling rate of y
                                          S          = S,      # power spectrogram
                                          freq       = None,   # Center frequencies for spectrogram bins. If None, then 
                                                               #   FFT bin center frequencies are used. Otherwise, it 
                                                               #   can be a single array of d center frequencies, or 
                                                               #   a matrix of center frequencies as constructed by 
                                                               #   librosa.core.ifgram, centroid=None, norm=True, p=2)
                                          fmin       = 200.0,  # Frequency cutoff for the first bin [0, fmin] Subsequent
                                                               #   bins will cover [fmin, 2*fmin], [2*fmin, 4*fmin], etc.
                                          n_bands    = 6,      # number of frequency bands
                                          quantile   = 0.02,   # quantile for determining peaks and valleys
                                          linear     = False)  # If True, return the linear difference of magnitudes: 
                                                               #   peaks - valleys.
                                                               # If False, return the logarithmic difference: 
                                                               #   log(peaks) - log(valleys).

#### Spectral Rolloff

In [10]:
feature_extractor["spectral_rolloff"] = lambda S, sr : \
\
        librosa.feature.spectral_rolloff(sr           = sr,     # sampling rate of y
                                         S            = S,      # power spectrogram
                                         freq         = None,   # Center frequencies for spectrogram bins. If None, then 
                                                                #   FFT bin center frequencies are used. Otherwise, it 
                                                                #   can be a single array of d center frequencies, or 
                                                                #   a matrix of center frequencies as constructed by 
                                                                #   librosa.core.ifgram, centroid=None, norm=True, p=2)
                                         roll_percent = 0.85)

#### Tonnetz

In [11]:
feature_extractor["tonnetz"] = lambda chroma, sr : \
\
        librosa.feature.tonnetz(sr     = sr,                    # sampling rate of y
                                chroma = chroma)                # Normalized energy for each chroma bin at each frame.
                                                                #   If None, a cqt chromagram is performed.

#### Zero Crossing Rate

In [12]:
feature_extractor["zero_crossing_rate"] = lambda y : \
\
        librosa.feature.zero_crossing_rate(y            = y,    # audio time series
                                           frame_length = 2048, # Length of the frame over which to compute 
                                                                # zero crossing rates
                                           hop_length   = 512,  # hop length if provided y, sr instead of S
                                           center       = True) # If True, frames are centered by padding the edges of y. 
                                                                #   This is similar to the padding in librosa.core.stft, 
                                                                #   but uses edge-value copies instead of reflection.

In [13]:
feature_extractor.keys()

['spectral_bandwidth',
 'rmse',
 'spectral_rolloff',
 'mfcc',
 'chroma',
 'spectral_contrast',
 'tonnetz',
 'spectral_centroid',
 'zero_crossing_rate']

#### Beats per Minute

In [14]:
def calc_bpm(S, sr, audio_length):
             
    onset_env = librosa.onset.onset_strength(S         = S,     # pre-computed (log-power) spectrogram
                                             detrend   = False, # Filter the onset strength to remove the DC component
                                             centering = True,  # Shift the onset function by n_fft / (2 * hop_length) frames
                                             feature   = None,  # Function for computing time-series features, eg, scaled
                                                                #   spectrograms. By default, uses 
                                                                #   librosa.feature.melspectrogram with fmax=8000.0
                                             aggregate = None)  # Aggregation function to use when combining onsets at 
                                                                #   different frequency bins.Default: np.mean
    
    
    if audio_length > 60:
        duration = 40.0
        offset   = 10.0
    else:
        duration = audio_length
        offset   = 0.0
                          
             
    bpm = librosa.beat.estimate_tempo(onset_env,                # onset_envelope : onset strength envelope
                                      sr         = sr,          # sampling rate of the time series
                                      start_bpm  = 120,         # initial guess of the BPM
                                      std_bpm    = 1.0,         # standard deviation of tempo distribution
                                      ac_size    = 4.0,         # length (in seconds) of the auto-correlation window
                                      duration   = duration,    # length of signal (in seconds) to use in estimating tempo
                                      offset     = offset)      # offset (in seconds) of signal sample to use in estimating
                                                                # tempo
             
    return bpm

In [15]:
def extract_librosa_features(wavedata, samplerate, dest_path):
    
    try:
        
        # merge audio channels
        wavedata           = wavedata.mean(axis=1)
        
        # calculate spectrogram
        spectrogram, phase = librosa.magphase(librosa.stft(wavedata, n_fft = 2048))

        # extract features
        chroma             = feature_extractor["chroma"](spectrogram, samplerate)
        mfcc               = feature_extractor["mfcc"](spectrogram, samplerate)
        rmse               = feature_extractor["rmse"](spectrogram, samplerate)
        spectral_centroid  = feature_extractor["spectral_centroid"](spectrogram, samplerate)
        spectral_bandwidth = feature_extractor["spectral_bandwidth"](spectrogram, samplerate)
        spectral_contrast  = feature_extractor["spectral_contrast"](spectrogram, samplerate)
        spectral_rolloff   = feature_extractor["spectral_rolloff"](spectrogram, samplerate)
        tonnetz            = feature_extractor["tonnetz"](chroma, samplerate)
        zero_crossing_rate = feature_extractor["zero_crossing_rate"](wavedata)
        
        audio_length       = wavedata.shape[0] / float(samplerate)
        bpm                = calc_bpm(spectrogram, samplerate, audio_length)
        
        # store data
        np.savez(dest_path,
                 chroma             = chroma,
                 mfcc               = mfcc,
                 rmse               = rmse,
                 spectral_centroid  = spectral_centroid,
                 spectral_bandwidth = spectral_bandwidth,
                 spectral_contrast  = spectral_contrast,
                 spectral_rolloff   = spectral_rolloff,
                 tonnetz            = tonnetz,
                 zero_crossing_rate = zero_crossing_rate,
                 bpm                = bpm)
        
    except Exception as e:
        print "**", e

In [16]:
audio_files_todo = []

for dir_name in glob.glob(os.path.join(DATA_DIR, "*")):
    
    src_path = dir_name.replace("\\","/")
    dst_path = src_path.replace(DATA_DIR, FEATURE_DIR)
    
    if not os.path.exists(dst_path):
        os.makedirs(dst_path)
    
    for mp3_name in glob.glob(os.path.join(src_path, "*.mp3")):
        
        audio_files_todo.append(mp3_name.replace("\\","/"))
        

In [17]:
len(audio_files_todo)

391529

In [20]:
i = 1

num_audio_files = len(audio_files_todo)

for mp3_name in audio_files_todo:

    src_filename = mp3_name.replace("\\","/")
    dst_filename = "%s" % (src_filename.replace(DATA_DIR, FEATURE_DIR))

    


    if os.path.exists("%s.npz" % (dst_filename)) or \
       os.path.exists("%s.DecoderError" % (dst_filename)):
        i += 1
        continue

    print "> [%d of %d (%.2f)]: %s" % (i, num_audio_files, (i / float(num_audio_files)), src_filename)
    
    try:

        (samplerate, samplewidth, wavedata) = ar.mp3_read(src_filename)

        extract_librosa_features(wavedata, samplerate, dst_filename)
        print "   Stored to: %s.npz" % (dst_filename)

        del samplerate, samplewidth, wavedata


    except ar.DecoderException as e:
        print "*** DecoderException", e
        f = open("%s.DecoderError" % (dst_filename), 'w')
        f.write(str(e))
        f.close()
        pass

    except Exception as e:
        print "*** Exception", e

    if i % 1000 == 0:
        print i
    
    i += 1

> [25150 of 391529 (0.06)]: E:/Data/MIR/EU_SOUNDS/15601/5FBC756C6C029AD667640367E835FB24D9A0BFC1.mp3
** index 0 is out of bounds for axis 0 with size 0
   Stored to: E:/Data/MIR/EU_SOUNDS_FEATURES_LIBROSA/15601/5FBC756C6C029AD667640367E835FB24D9A0BFC1.mp3.npz
> [26131 of 391529 (0.07)]: E:/Data/MIR/EU_SOUNDS/2022420/cjcpctcluj_MUP_033_mp3.mp3


KeyboardInterrupt: 