In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import librosa
import os

In [4]:
# 1. Get the file path to an included audio example
#    (the clip is downloaded into librosa's local cache on first use)
filename = librosa.example('nutcracker')

# 2. Load the audio as a waveform `y`
#    Store the sampling rate as `sr`
y, sr = librosa.load(filename)

# 3. Run the default beat tracker
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

# Depending on the librosa version, `tempo` may be returned as a NumPy array
# instead of a plain float, and a non-scalar array cannot be formatted with
# `:.2f`. Reduce it to a scalar for display only; `tempo` itself is unchanged.
tempo_bpm = float(np.atleast_1d(tempo)[0])
print('Estimated tempo: {:.2f} beats per minute'.format(tempo_bpm))

# 4. Convert the frame indices of beat events into timestamps
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

Downloading file 'Kevin_MacLeod_-_P_I_Tchaikovsky_Dance_of_the_Sugar_Plum_Fairy.ogg' from 'https://librosa.org/data/audio/Kevin_MacLeod_-_P_I_Tchaikovsky_Dance_of_the_Sugar_Plum_Fairy.ogg' to '/Users/adam/Library/Caches/librosa'.


Estimated tempo: 107.67 beats per minute


**NOTE:** Download, install, or update `PySoundFile` (the `soundfile` package) so that `librosa` can read the audio file.

**BELOW:** Separate the audio into its harmonic and percussive components, then estimate the tempo from the percussive component.

In [5]:
# Load the example clip
y, sr = librosa.load(librosa.ex('nutcracker'))

# Set the hop length; at 22050 Hz, 512 samples ~= 23ms
# Every frame-based call below receives this same value so that all frame
# indices refer to one shared frame grid.
hop_length = 512

# ---------------------
# Separate harmonics and percussives into two waveforms
y_harmonic, y_percussive = librosa.effects.hpss(y)

# Beat track on the percussive signal.
# hop_length is passed explicitly so that `beat_frames` indexes the same
# frames as the MFCC and chroma matrices it is used to aggregate below.
tempo, beat_frames = librosa.beat.beat_track(y=y_percussive,
                                             sr=sr,
                                             hop_length=hop_length)
# ---------------------

# Compute MFCC features from the raw signal
mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)

# And the first-order differences (delta features)
mfcc_delta = librosa.feature.delta(mfcc)

# Stack MFCCs and their deltas, then synchronize between beat events.
# No `aggregate` is given here, so the default (mean) is used.
beat_mfcc_delta = librosa.util.sync(np.vstack([mfcc, mfcc_delta]),
                                    beat_frames)

# Compute chroma features from the harmonic signal, on the same frame grid
# NOTE(review): chroma_cqt may restrict which hop_length values are valid;
# confirm against the librosa docs before changing hop_length from 512.
chromagram = librosa.feature.chroma_cqt(y=y_harmonic,
                                        sr=sr,
                                        hop_length=hop_length)

# Aggregate chroma features between beat events
# We'll use the median value of each feature between beat frames
beat_chroma = librosa.util.sync(chromagram,
                                beat_frames,
                                aggregate=np.median)

# Finally, stack all beat-synchronous features together
# (chroma rows first, followed by the MFCC and MFCC-delta rows)
beat_features = np.vstack([beat_chroma, beat_mfcc_delta])



In [6]:
# Inspect the chroma features computed from the harmonic signal
chromagram

array([[0.624562  , 0.6000365 , 0.3891103 , ..., 0.12988181, 0.26408905,
        0.46952796],
       [0.45346972, 0.31484362, 0.29985076, ..., 0.14454272, 0.13794322,
        0.20688398],
       [0.4300288 , 0.4736917 , 0.3885005 , ..., 0.5249095 , 0.4183366 ,
        0.35342664],
       ...,
       [0.6775574 , 0.46899557, 0.31649804, ..., 0.15098049, 0.20618644,
        0.41384566],
       [0.4585575 , 0.43789858, 0.31573945, ..., 0.27923036, 0.28339976,
        0.5822882 ],
       [1.        , 1.        , 1.        , ..., 0.96677846, 0.8960364 ,
        1.        ]], dtype=float32)

In [7]:
# Inspect the stacked beat-synchronous features
# (beat_chroma rows first, then the MFCC + delta rows)
beat_features

array([[ 0.32793146,  0.03347357,  0.10583013, ...,  0.15275124,
         0.07555592,  0.08132137],
       [ 0.31918573,  0.04075494,  0.0842821 , ...,  0.09635098,
         0.12809394,  0.09471031],
       [ 0.4736917 ,  0.05757335,  0.09427972, ...,  0.44614288,
         0.18659325,  0.13297296],
       ...,
       [-0.06533479, -0.14681545,  0.05347487, ..., -0.17101115,
         0.62330383,  0.12903129],
       [-0.02285836, -0.13743223, -0.01169371, ..., -0.25085154,
         0.25030592,  0.00825023],
       [ 0.02850422, -0.20077428, -0.05553005, ...,  0.07408718,
        -0.6145776 ,  0.07621999]], dtype=float32)

In [8]:
# Inspect the beat-synchronous chroma (median-aggregated between beat frames)
beat_chroma

array([[0.32793146, 0.03347357, 0.10583013, ..., 0.15275124, 0.07555592,
        0.08132137],
       [0.31918573, 0.04075494, 0.0842821 , ..., 0.09635098, 0.12809394,
        0.09471031],
       [0.4736917 , 0.05757335, 0.09427972, ..., 0.44614288, 0.18659325,
        0.13297296],
       ...,
       [0.26491377, 0.01814758, 0.03974865, ..., 0.18357868, 0.9031806 ,
        0.04013696],
       [0.20277342, 0.01895499, 0.05991492, ..., 0.15096286, 0.18061754,
        0.05557969],
       [0.2608637 , 0.04809818, 0.35861766, ..., 0.40117043, 0.29769996,
        0.35132593]], dtype=float32)