In [1]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split

In [2]:
# Load the processed drum-hit table produced by the "data_preparation" step.
#
# This notebook demonstrates how that DataFrame is turned into 4-D numpy
# arrays that can be fed directly to a Keras network for training.
# The pickle used here is a small sample holding only 100 instances.
#
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.

SAMPLE_PICKLE = 'gmd_sample.pkl'

df = pd.read_pickle(SAMPLE_PICKLE)
df.head()

Unnamed: 0,label,start,end,track_id,audio_wav,sampling_rate
0,"[HH, KD]",0.0,0.08,534,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",44100
1,HH,0.14,0.24,534,"[0.011169434, 0.0058898926, 0.0050964355, 0.00...",44100
2,HH,0.29,0.39,534,"[-0.0013122559, 0.00091552734, 0.00091552734, ...",44100
3,HH,0.46,0.56,534,"[-0.0011901855, -0.0011291504, -0.0010375977, ...",44100
4,KD,0.47,0.57,534,"[0.00015258789, -6.1035156e-05, 0.0, 0.0001831...",44100


In [3]:
# Label encoding
# --------------
# Some instances carry several positive labels at once (row 0 of the sample
# is tagged with both HH and KD), so the target is a table with one 0/1
# column per drum hit type rather than a single class id.

# the six unique drum hit types
drum_hits = ['SD', 'HH', 'KD', 'RC', 'TT', 'CC']

for label in drum_hits:
    # Only the 'label' column is needed, so map over that column instead of
    # running a function over every full row with df.apply(..., axis=1),
    # which is considerably slower. `in` keeps the same membership check on
    # the stored label value; the boolean result is stored as 0/1 integers.
    df[label] = df['label'].map(lambda hits: label in hits).astype(int)

y = df[drum_hits]
y.shape

(100, 6)

In [4]:
# preview the first rows of the label table (one 0/1 column per drum hit type)
y.head()

Unnamed: 0,SD,HH,KD,RC,TT,CC
0,0,1,1,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,0,1,0,0,0


In [5]:
# Represent every audio_wav clip as a mel-frequency spectrogram.

mel_train = [
    librosa.feature.melspectrogram(y=clip, sr=rate, n_mels=128, fmax=8000)
    for clip, rate in zip(df.audio_wav, df.sampling_rate)
]

# Collect the per-clip spectrograms into one array and append a trailing
# channel axis. The result is a 4-D array in the format
# (number of instances, y axis shape, x axis shape, 1 channel).
# NOTE: np.array only yields a regular 3-D array here when every clip
# produces a spectrogram of the same shape.
X = np.array(mel_train)
n_instances, n_rows, n_cols = X.shape
X = X.reshape(n_instances, n_rows, n_cols, 1)
X.shape

(100, 128, 18, 1)

In [6]:
# X holds 100 mel-spectrogram representations, one per drum-hit instance; each has shape (128, 18) and a single channel.
# (In typical image classification tasks the instances have multiple channels.)
# Our task is audio classification, where every representation format has only one channel.

In [7]:
# Persist the ndarray data as .npy files with np.save(filename, array) so it
# can be fed directly to a Keras network for training later on.
# np.load(filename) reads the arrays back whenever they are needed.

np.save(file='gmd_mel_sample.npy', arr=X)

# the label table is converted to a plain ndarray before it is written out
y = np.array(y)
np.save(file='gmd_label_sample.npy', arr=y)

In [8]:
# Hold out 20% of the instances as a test set; the fixed random_state keeps
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, x_test.shape)

((80, 128, 18, 1), (20, 128, 18, 1))

In [None]:
###      illustration only: needs a Keras `model` built elsewhere      ###

# After a network structure has been designed in Keras, the arrays can be
# passed directly to the model for training as shown below.
# This notebook never defines `model`, so the call is guarded: without the
# guard, "Restart & Run All" would stop at this cell with a NameError.

if 'model' in globals():
    # x_test / y_test serve as the validation set during training
    history = model.fit(x_train, y_train,
                        validation_data=(x_test, y_test),
                        epochs=15,
                        batch_size=32)
else:
    print('No Keras `model` is defined; skipping the example training call.')