In [1]:
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split

In [2]:
# load the processed data produced by "data_preparation"

# this notebook demonstrates how the resulting df is transformed into
# 4d numpy arrays that can be fed directly to a Keras network for training

# the pickle loaded here is a sample of only 100 instances, used for this demonstration

# NOTE(review): read_pickle unpickles arbitrary Python objects - only load files from a trusted source
df = pd.read_pickle('gmd_sample100.pkl')
df.head()

Unnamed: 0,start,end,track_id,audio_wav,label
0,0.0,0.08,804,"[0.040863037, 0.03353882, 0.052520752, 0.09002...",SD
1,0.02,0.12,804,"[0.040863037, 0.03353882, 0.052520752, 0.09002...",SD
2,0.05,0.15,804,"[0.040863037, 0.03353882, 0.052520752, 0.09002...",HH_close
3,0.26,0.36,804,"[0.040863037, 0.03353882, 0.052520752, 0.09002...",KD
4,0.81,0.81,804,"[0.040863037, 0.03353882, 0.052520752, 0.09002...",HT


In [3]:
# label encoding: one 0/1 indicator column per drum hit
# some instances have multiple positive labels, so a row can have several 1s

drum_hits = ['SD','HH_close','KD','RC','FT','HT','HH_open','SD_xstick','MT','CC']


def _hits_in(raw, names):
    """Return the drum-hit names present in one raw `label` cell.

    raw   -- the value of the 'label' column for one row
    names -- candidate hit names, ordered longest first

    For a string cell the names are matched longest first and every match is
    blanked out before shorter names are tried. A plain substring test would
    report 'SD' for every 'SD_xstick' row, because 'SD' is a prefix of it.
    No assumption is made about how multiple labels are separated.

    A non-string cell (e.g. a list of names) is returned unchanged, so the
    caller's `in` test behaves exactly as it did before.
    """
    if not isinstance(raw, str):
        return raw
    found = set()
    for name in names:
        if name in raw:
            found.add(name)
            # blank the match so a shorter name cannot match inside it
            raw = raw.replace(name, ' ')
    return found


longest_first = sorted(drum_hits, key=len, reverse=True)
label_hits = df['label'].apply(lambda raw: _hits_in(raw, longest_first))

for label in drum_hits:
    df[label] = label_hits.apply(lambda hits: label in hits).astype(int)

y = df[drum_hits]
y.shape

(100, 10)

In [4]:
# preview the first rows of the 0/1 label matrix (one column per entry in drum_hits)
y.head()

Unnamed: 0,SD,HH_close,KD,RC,FT,HT,HH_open,SD_xstick,MT,CC
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0


In [5]:
# represent every audio_wav clip by the magnitude of its short-time Fourier transform (STFT)

stft_train = [np.abs(librosa.stft(clip)) for clip in df.audio_wav]

# stack the per-clip spectrograms and append a trailing channel axis, giving
# a 4d array of (number of instances, y axis shape, x axis shape, 1 channel)
X1 = np.array(stft_train)
n_clips, n_rows, n_cols = X1.shape
X1 = X1.reshape(n_clips, n_rows, n_cols, 1)
X1.shape

(100, 1025, 8, 1)

In [6]:
# 100 STFT representations, one per instance; each has the shape (1025, 8) and 1 channel

# (in typical image classification tasks the instances have multiple channels)
# the task here is audio classification, where every representation format has only 1 channel

In [7]:
# represent every audio_wav clip as a mel-frequency spectrogram
# (128 mel bands, fmax of 8000, sample rate of 44100)
# this usually takes the longest time to process

mel_train = [
    librosa.feature.melspectrogram(y=clip, sr=44100, n_mels=128, fmax=8000)
    for clip in df.audio_wav
]

# stack into one array and append the single channel axis expected by the network
X2 = np.array(mel_train)
n_clips, n_rows, n_cols = X2.shape
X2 = X2.reshape(n_clips, n_rows, n_cols, 1)
X2.shape

(100, 128, 8, 1)

In [8]:
# represent every audio_wav clip by its Mel-Frequency Cepstral Coefficients (MFCC)

mfcc_train = [librosa.feature.mfcc(y=clip, sr=44100) for clip in df.audio_wav]

# stack into one array and append the single channel axis expected by the network
X3 = np.array(mfcc_train)
n_clips, n_rows, n_cols = X3.shape
X3 = X3.reshape(n_clips, n_rows, n_cols, 1)

X3.shape

(100, 20, 8, 1)

### Save transformed data into numpy array

#### These arrays can be fed directly into an untrained Keras network for training

In [9]:
# np.save(filename, array) writes an ndarray to disk as a .npy file

y = np.array(y)  # convert the labels to a plain ndarray before saving
for npy_name, npy_data in (('gmd_stft_sample100.npy', X1),
                           ('gmd_label_sample100.npy', y)):
    np.save(npy_name, npy_data)

# np.load(filename) reads an array back whenever it is needed

In [10]:
# hold out 20% of the instances for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, x_test.shape)

((80, 1025, 8, 1), (20, 1025, 8, 1))

In [None]:
###      don't run this cell      ###

# illustration only: `model` is not defined in the cells above
# once a network structure has been designed in Keras, the arrays can be passed
# straight to fit() for training, like this

# validation_split is deliberately omitted: Keras ignores it whenever
# validation_data is provided, so passing both only obscured which data is used
history = model.fit(x_train, y_train,
                    validation_data=(x_test, y_test),
                    epochs=15,
                    batch_size=64)