In [None]:
# A demo notebook that demonstrates how to use the trained network to predict the drum hits in a song,

# from inputting a YouTube music link to getting the predictions as a dataframe.

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
import librosa

from input_transform import drum_extraction, drum_to_frame, get_yt_audio

  _resample_loop_p(x, t_out, interp_win, interp_delta, num_table, scale, y)


In [2]:
# Load the trained network from its saved .h5 file.
MODEL_PATH = 'Trained Network/complete_network.h5'
model = keras.models.load_model(MODEL_PATH)

model

<keras.engine.sequential.Sequential at 0x7fe9ec5de490>

In [3]:
# Download the audio of the YouTube video; get_yt_audio returns the path of the saved file.

song_url = 'https://www.youtube.com/watch?v=XPpTgCho5ZA'
path = get_yt_audio(song_url)
path

'/content/Maroon 5 - This Love (Official Music Video).webm'

In [5]:
# Pull the drum part out of the full audio input.
# drum_extraction returns the drum audio as a numpy ndarray
# together with its sampling rate.

drum_track, sr = drum_extraction(path, kernel='demucs')

# A local audio file can be passed to drum_extraction directly; the YouTube link is optional.
# If the input is already a drum-only recording, skip drum_extraction and load the file instead:
# drum_track, sr = librosa.load('filepath')

# Slice the drum track into one row per detected hit; the second return value is kept as bpm.
df, bpm = drum_to_frame(
    drum_track,
    sample_rate=sr,
    hop_length=1024,
    backtrack=False,
    estimated_bpm=80,
    fixed_clip_length=False,
    resolution=8,
)

df.head()

100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [01:18<00:00,  3.20seconds/s]
100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [00:54<00:00,  4.60seconds/s]
100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [00:52<00:00,  4.78seconds/s]
100%|████████████████████████████████████████████████████████████████████████| 252.0/252.0 [00:52<00:00,  4.78seconds/s]


-----------------------------
resolution = 8. 
8 note duration is set, this means the duration of the sliced audio clip will have the same duration as an 8 note in the song
It is recommended to set the resolution value either 8 or 16, if not familiar with song structure
-----------------------------


Unnamed: 0,audio_clip,sample_start,sample_end,sampling_rate,peak_sample
0,"[-0.00029830355, -0.00043199328, -0.0002039886...",47264,61952,26481,48128
1,"[-8.706457e-05, 0.00086787745, 0.001929407, 0....",61600,76288,26481,62464
2,"[0.00089226663, 0.00088984886, 0.00039579926, ...",74912,89600,26481,75776
3,"[0.0014335099, 0.0022009485, 0.0020388144, 0.0...",116896,131584,26481,117760
4,"[0.2672331, 0.33943364, 0.3489445, 0.52502334,...",131232,145920,26481,132096


In [6]:
# Second value returned by drum_to_frame above (presumably the tempo it settled on,
# given that the call was seeded with estimated_bpm=80 -- confirm in input_transform).
bpm

95.703125

In [7]:
# Build one mel-frequency spectrogram per drum-hit instance (one per row of df).

pred_x = [
    librosa.feature.melspectrogram(y=clip, sr=rate, n_mels=128, fmax=8000)
    for clip, rate in zip(df.audio_clip, df.sampling_rate)
]

# Stack the spectrograms and append a trailing single-channel axis for the network input.
X = np.expand_dims(np.array(pred_x), axis=-1)
X.shape

# 818 instances of 128*18 size mel-spectrogram (1 channel)

(818, 128, 18, 1)

In [8]:
# Score every drum-hit instance (row) with the network and round each output
# to the nearest integer, which gives the 0.0 / 1.0 labels shown below.

pred = np.round(model.predict(X))
pred.shape

(818, 10)

In [9]:
# Column labels applied, in order, to the 10 prediction columns.
drum_hits = [
    'SD', 'HH_close', 'KD', 'RC', 'FT',
    'HT', 'HH_open', 'SD_xstick', 'MT', 'CC',
]

prediction = pd.DataFrame(data=pred, columns=drum_hits)
prediction.head(10)

# Each row holds the predicted labels for one drum-hit instance.
# Drum playing sometimes involves several simultaneous hits, so a row can contain multiple positive labels.

Unnamed: 0,SD,HH_close,KD,RC,FT,HT,HH_open,SD_xstick,MT,CC
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Combine the original dataframe with the predictions, row by row.
#
# Row i of `prediction` was computed from row i of `df` (the spectrograms were
# built positionally), so the two frames are aligned by position rather than
# joined on index labels. reset_index(drop=True) returns new frames, so neither
# `df` nor `prediction` is modified and this cell can be re-run safely. The
# earlier in-place reset_index added an extra `index` column on every run, which
# polluted the result on the second run and raised ValueError on the third.

assert len(df) == len(prediction), "df and prediction must have one row per drum hit"

result = pd.concat(
    [df.reset_index(drop=True), prediction.reset_index(drop=True)],
    axis=1,
)
result.head()

Unnamed: 0,audio_clip,sample_start,sample_end,sampling_rate,peak_sample,SD,HH_close,KD,RC,FT,HT,HH_open,SD_xstick,MT,CC
0,"[-0.00029830355, -0.00043199328, -0.0002039886...",47264,61952,26481,48128,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[-8.706457e-05, 0.00086787745, 0.001929407, 0....",61600,76288,26481,62464,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[0.00089226663, 0.00088984886, 0.00039579926, ...",74912,89600,26481,75776,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[0.0014335099, 0.0022009485, 0.0020388144, 0.0...",116896,131584,26481,117760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[0.2672331, 0.33943364, 0.3489445, 0.52502334,...",131232,145920,26481,132096,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
## Done