In [2]:
import pdb
import time
import numpy as np
import librosa

import tensorflow as tf

from networks import *

In [3]:
def flatten_sequence(sequence, factor):
    
    seq_length = sequence.shape[-1]
    length = seq_length//factor
    seq_length_diff = seq_length - length

    sequence_flat = np.zeros(tf.size(sequence).numpy()*factor)
    for n in range(len(sequence)):
        point = n*length
        if n==0:
            sequence_flat[:seq_length] = sequence[n]
        else:
            sequence_flat[point:point+seq_length_diff] = sequence_flat[point:point+seq_length_diff] + sequence[n][:-length]
            sequence_flat[point+seq_length_diff:point+seq_length_diff+length] = sequence[n][-length:]

    sequence_flat = sequence_flat[:point+seq_length]
    
    for n in range(factor-1):
        point = n*length
        sequence_flat[point:point+length] = sequence_flat[point:point+length]/(n+1)
        if n==0:
            sequence_flat[-point-length:] = sequence_flat[-point-length:]/(n+1)
        else:
            sequence_flat[-point-length:-point] = sequence_flat[-point-length:-point]/(n+1)
        
    sequence_flat[(factor-1)*length:-(factor-1)*length] = sequence_flat[(factor-1)*length:-(factor-1)*length]/factor
    
    return sequence_flat

In [4]:
# Onset Detection

num_iter = 10
num_stages = 6

sequence_length = 16
factor_div = 4

#Models_OD = ['CNN','BRNN','RNN','RNN_Stateful','CRNN_1_Timing']
Models_OD = ['CNN','BRNN','RNN']
Times_OD = np.zeros((num_iter,num_stages,len(Models_OD)))

# Preprocess Parameters

hop_size = 128
frame_size = 256

for a in range(len(Models_OD)):

    for N in range(num_iter):

        Model_OD = Models_OD[a]

        print('\n')
        print([N,a])
        print('\n')

        # Onset Detection Parameters

        if Model_OD=='BRNN':
            sequence_length = 16
            model = BRNN_1(sequence_length,0)
            hop = 4
        elif Model_OD=='CNN':
            sequence_length = 16
            model = CNN_T_1(sequence_length,0)
            hop = 4
        elif Model_OD=='RNN':
            sequence_length = 8
            model = RNN_1(sequence_length,0)
            hop = 1
        elif Model_OD=='RNN_Stateful':
            sequence_length = 16
            model = RNN_Stateful_1(sequence_length,0)
            hop = 1
        elif Model_OD=='CRNN_Timing':
            sequence_length = 10
            model = CRNN_1_Timing(13,[16,32,64],sequence_length,0)
            hop = 1

        # Onset Detection

        audio = np.random.rand(120*22050)

        start = time.time()

        Tensor_All_0 = np.abs(librosa.stft(audio, n_fft=frame_size, hop_length=hop_size, win_length=frame_size, window='hann')).T
        print(Tensor_All_0.shape)
        Tensor_All_0 = np.flipud(Tensor_All_0[:,1:])
        print(Tensor_All_0.shape)

        Time_1 = time.time()-start
        print('Spectrogram Time: {:.4f}'.format(Time_1))
        print('Spectrogram Time per second: {:.4f}'.format(Time_1/(len(audio)/22050)))
        print('\n')
        Times_OD[N,0,a] = Time_1/(len(audio)/22050)
        start = time.time()

        length = Tensor_All_0.shape[0]-sequence_length+1
        Tensor_All = np.zeros(shape=(length,sequence_length,Tensor_All_0.shape[1]))
        for n in range(sequence_length):
            Tensor_All[:,n] = Tensor_All_0[n:length+n]
        if Model_OD!='RNN':
            Tensor_All = Tensor_All[::factor_div]

        Time_2 = time.time()-start
        print('Preprocess Time: {:.4f}'.format(Time_2))
        print('Preprocess Time per second: {:.4f}'.format(Time_2/(len(audio)/22050)))
        print('\n')
        Times_OD[N,1,a] = Time_2/(len(audio)/22050)
        start = time.time()

        Tensor_All = np.log(Tensor_All+1e-4)
        Tensor_All = (Tensor_All-0)/(1-0+1e-16)
        if Model_OD=='CNN' or Model_OD=='CRNN_Timing':
            Tensor_All = np.expand_dims(Tensor_All,axis=-1)

        Time_3 = time.time()-start
        print('Prediction Time: {:.4f}'.format(Time_3))
        print('Prediction Time per second: {:.4f}'.format(Time_3/(len(audio)/22050)))
        print('\n')
        Times_OD[N,2,a] = Time_3/(len(audio)/22050)
        start = time.time()

        predictions = model.predict(Tensor_All)

        Time_4 = time.time()-start
        print('Prediction Time: {:.4f}'.format(Time_4))
        print('Prediction Time per second: {:.4f}'.format(Time_4/(len(audio)/22050)))
        print('\n')
        Times_OD[N,3,a] = Time_4/(len(audio)/22050)
        start = time.time()

        #predictions = model(Tensor_All)
        if Model_OD!='RNN' and Model_OD!='CRNN_Timing':
            Prediction = flatten_sequence(tf.math.sigmoid(predictions), factor_div)

        Time_5 = time.time()-start
        print('Prediction Time: {:.4f}'.format(Time_5))
        print('Prediction Time per second: {:.4f}'.format(Time_5/(len(audio)/22050)))
        print('\n')
        Times_OD[N,4,a] = Time_5/(len(audio)/22050)
        start = time.time()

        Time_OD = Time_1 + Time_2 + Time_3 + Time_4
        print('Total Time: {:.4f}'.format(Time_OD))
        print('Total Time per second: {:.4f}'.format(Time_OD/(len(audio)/22050)))
        print('\n')
        Times_OD[N,5,a] = Time_OD/(len(audio)/22050)



[0, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.0929
Spectrogram Time per second: 0.0008


Preprocess Time: 0.2065
Preprocess Time per second: 0.0017


Prediction Time: 0.2634
Prediction Time per second: 0.0022


Prediction Time: 11.5263
Prediction Time per second: 0.0961


Prediction Time: 0.0000
Prediction Time per second: 0.0000


Total Time: 12.0891
Total Time per second: 0.1007




[1, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.0964
Spectrogram Time per second: 0.0008


Preprocess Time: 0.2082
Preprocess Time per second: 0.0017


Prediction Time: 0.2203
Prediction Time per second: 0.0018


Prediction Time: 2.0333
Prediction Time per second: 0.0169


Prediction Time: 0.0000
Prediction Time per second: 0.0000


Total Time: 2.5583
Total Time per second: 0.0213




[2, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.0935
Spectrogram Time per second: 0.0008


Preprocess Time: 0.2113
Preprocess Time per second: 0.0018


Prediction Time: 0.2299
Prediction Time per sec

In [4]:
print('CNN: ' + str(np.mean(Times_OD[:,-1,0])))
print('BRNN: ' + str(np.mean(Times_OD[:,-1,1])))
print('RNN: ' + str(np.mean(Times_OD[:,-1,2])))

CNN: 0.029315366347630815
BRNN: 0.02816485067208608
RNN: 0.015786938667297364


In [5]:
print('CNN: ' + str(np.mean(Times_OD[:,-1,0])))
print('BRNN: ' + str(np.mean(Times_OD[:,-1,1])))
print('RNN: ' + str(np.mean(Times_OD[:,-1,2])))

CNN: 0.03049141228199005
BRNN: 0.02603413045406342
RNN: 0.01654434204101563


In [7]:
print('CRNN: ' + str(np.mean(Times_OD[:,-1,0])))

CRNN: 0.013609429200490316
