In [None]:
# A demo notebook that demonstrates how to use the trained network to predict the drum hits of a song,

# going from a YouTube music link (or a local music audio file)
# to the drum-hit predictions as a dataframe.

In [1]:
# This package is not pip-installed, so the parent directory is added to sys.path manually to make its modules importable
import os
import sys
# Absolute path of the parent directory; registered on sys.path only once
# so that re-running the cell does not add duplicate entries.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from tensorflow import keras
import librosa

from input_transform import drum_extraction, drum_to_frame, get_yt_audio

In [2]:
# Load the trained Keras network from its .h5 file.
# The path is relative, so it is resolved against the current working directory.
model = keras.models.load_model('Trained Network/complete_network.h5')
model

<keras.engine.sequential.Sequential at 0x7f6f2c629610>

In [3]:
# Download the audio of the YouTube video;
# get_yt_audio returns the path of the downloaded audio file.

path = get_yt_audio('https://www.youtube.com/watch?v=XPpTgCho5ZA')
path

'/content/Maroon 5 - This Love (Official Music Video).webm'

In [4]:
# Separate the drum part from the full audio input.
# drum_extraction returns the drum audio as a numpy ndarray together with its sampling rate.

drum_track, sr = drum_extraction(path, kernel='demucs')

# A local audio file can be passed straight to drum_extraction instead of a YouTube download.
# If the input is already a drum-only recording, skip the extraction step and load the file directly:
# drum_track, sr = librosa.load('filepath')

df, bpm = drum_to_frame(
    drum_track,
    sample_rate=sr,
    hop_length=1024,
    backtrack=False,
    estimated_bpm=96,
    fixed_clip_length=False,
    resolution=16,
)

# Each row is a drum-hit clip identified by the onset-detection algorithm.
df.head()

100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [01:00<00:00,  4.16seconds/s]
100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [00:54<00:00,  4.66seconds/s]
100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [00:50<00:00,  4.95seconds/s]
100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [00:52<00:00,  4.79seconds/s]


-----------------------------
resolution = 16. 
16 note duration is set, this means the duration of the sliced audio clip will have the same duration as an 16 note in the song
It is recommended to set the resolution value either 8 or 16, if not familiar with song structure
-----------------------------


Unnamed: 0,audio_clip,sample_start,sample_end,sampling_rate,peak_sample
0,"[-0.00066857255, -0.0007356501, -0.00059176853...",47264,55040,50020,48128
1,"[-0.0007406838, -0.00020681047, 0.00082117674,...",61600,69376,50020,62464
2,"[0.0014672545, 0.0016850219, 0.0016810744, 0.0...",74912,82688,50020,75776
3,"[0.00035065282, 0.0006129599, 0.0007161393, 0....",116896,124672,50020,117760
4,"[0.33516127, 0.34582675, 0.28047448, 0.3394826...",131232,139008,50020,132096


In [5]:
# Tempo value returned by drum_to_frame above
# (presumably its BPM estimate for the song; confirm against input_transform).
bpm

95.703125

In [6]:
# Compute one mel-frequency spectrogram per drum-hit clip (one per dataframe row).

pred_x = [
    librosa.feature.melspectrogram(y=clip, sr=rate, n_mels=128, fmax=8000)
    for clip, rate in zip(df.audio_clip, df.sampling_rate)
]

# Stack the spectrograms and append a trailing single-channel axis.
X = np.array(pred_x)
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)

# For the demo song: 823 instances of 128*18 mel spectrograms (1 channel).
X.shape

(823, 128, 18, 1)

In [7]:
# Predict every drum-hit instance (one row per clip, one column per drum-hit class).

pred_raw = model.predict(X)
# np.round rounds half to even, so a score of exactly 0.5 becomes 0 (negative).
pred = np.round(pred_raw)

# Sometimes the prediction is "empty", namely a row with no positive labels.
# In that case the class with the highest estimated probability score is labelled positive.
# This is done for all rows at once and works for any number of classes,
# instead of looping row by row with a hard-coded class count of 6.

result = pred.copy()  # keep the plain thresholded labels in `pred` untouched
empty_rows = result.sum(axis=1) == 0
# Pair each empty row with the column index of its highest raw score.
result[empty_rows, pred_raw[empty_rows].argmax(axis=1)] = 1

result.shape

(823, 6)

In [8]:
# The six drum-hit types the network was trained on.
# Presumably: SD = snare drum, HH = hi-hat, KD = kick drum, RC = ride cymbal,
# TT = tom-tom, CC = crash cymbal (TODO confirm against the training labels).
# These names become the column names of the prediction array,
# so their order is assumed to match the order of the model's outputs (TODO confirm).

drum_hits = ['SD','HH','KD','RC','TT','CC']

In [9]:
# Label the prediction array with the drum-hit class names.
# Each row holds the predicted result for one drum-hit instance; drums are
# sometimes struck together, so some rows contain multiple positive labels.
prediction = pd.DataFrame(data=result, columns=drum_hits)
prediction.head(10)

Unnamed: 0,SD,HH,KD,RC,TT,CC
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,1.0,0.0
7,0.0,0.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Merge the original dataframe with the prediction, row by row.
# Both frames are re-indexed to 0..n-1 on copies and merged on that positional index.
# Neither `df` nor `prediction` is modified, so the cell can be re-run safely
# (resetting the indexes in place added extra index columns on every re-run).

result = df.reset_index(drop=True).merge(
    prediction.reset_index(drop=True),
    left_index=True,
    right_index=True,
)
result.head()

Unnamed: 0,audio_clip,sample_start,sample_end,sampling_rate,peak_sample,SD,HH,KD,RC,TT,CC
0,"[-0.00066857255, -0.0007356501, -0.00059176853...",47264,55040,50020,48128,0.0,0.0,0.0,0.0,1.0,0.0
1,"[-0.0007406838, -0.00020681047, 0.00082117674,...",61600,69376,50020,62464,0.0,0.0,1.0,0.0,0.0,0.0
2,"[0.0014672545, 0.0016850219, 0.0016810744, 0.0...",74912,82688,50020,75776,1.0,0.0,0.0,0.0,0.0,0.0
3,"[0.00035065282, 0.0006129599, 0.0007161393, 0....",116896,124672,50020,117760,0.0,1.0,0.0,0.0,0.0,0.0
4,"[0.33516127, 0.34582675, 0.28047448, 0.3394826...",131232,139008,50020,132096,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
## Done