In [8]:
import pdb
import time
import numpy as np
import librosa

import tensorflow as tf

from networks import *

In [9]:
def flatten_sequence(sequence, factor):
    """Merge overlapping fixed-length windows into one 1-D signal by averaging.

    Window ``n`` is placed at offset ``n * hop`` with
    ``hop = seq_length // factor``; every output sample is the mean of all
    window values that cover it (overlap-add followed by division by the
    per-sample coverage count).

    Parameters
    ----------
    sequence : array-like of shape (num_windows, seq_length)
        Anything ``np.asarray`` can convert (nested lists, NumPy arrays,
        eager tensors exposing ``__array__``).
    factor : int
        Overlap factor; the hop between consecutive windows is
        ``seq_length // factor``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``(num_windows - 1) * hop + seq_length``,
        or an empty array when ``sequence`` holds no data.

    Raises
    ------
    ValueError
        If ``sequence`` is not 2-D, or if ``factor`` exceeds the window
        length so that the hop would be zero.
    """
    # Convert once up front: indexing a NumPy array in the loop below is far
    # cheaper than slicing a tensor object on every iteration.
    windows = np.asarray(sequence, dtype=np.float64)
    if windows.size == 0:
        return np.zeros(0)
    if windows.ndim != 2:
        raise ValueError('sequence must be 2-D (num_windows, seq_length)')

    num_windows, seq_length = windows.shape
    hop = seq_length // factor
    if hop < 1:
        raise ValueError('factor must not exceed the window length')

    total_length = (num_windows - 1) * hop + seq_length
    summed = np.zeros(total_length)
    coverage = np.zeros(total_length)

    for n in range(num_windows):
        begin = n * hop
        summed[begin:begin + seq_length] += windows[n]
        coverage[begin:begin + seq_length] += 1

    # Dividing by the actual coverage stays correct at the edges and for
    # short inputs (fewer windows than the overlap factor), where fixed
    # head/tail divisors would hit the same samples twice.
    return summed / coverage

In [3]:
# Onset Detection
#
# Benchmark: for every model in Models_OD, time each stage of the pipeline
# (spectrogram, windowing, normalisation, forward pass, post-processing) on
# random audio, num_iter times, and store the cost per second of audio.

num_iter = 20
num_stages = 6

sequence_length = 16
factor_div = 4

Models_OD = ['CNN','RNN','CRNN','Online_RNN']
#Models_OD = ['CRNN']
# Times_OD[iteration, stage, model]: stages 0-4 are the timed steps below,
# stage 5 is the reported total.
Times_OD = np.zeros((num_iter,num_stages,len(Models_OD)))

# Preprocess Parameters

hop_size = 128
frame_size = 256

# Number of audio samples treated as one second in the per-second figures.
sample_rate = 22050

for a, Model_OD in enumerate(Models_OD):

    for N in range(num_iter):

        print('\n')
        print([N,a])
        print('\n')

        # Onset Detection Parameters
        # NOTE(review): the model is rebuilt on every iteration, so the timed
        # forward pass is always the first call on a fresh model - confirm
        # that this is the cost that should be measured.
        # NOTE(review): `hop` is assigned but never read in this cell; the
        # window subsampling below uses factor_div for every non-online model.

        if Model_OD=='RNN':
            sequence_length = 16
            model = BRNN_1(sequence_length,0)
            hop = 4
        elif Model_OD=='CNN':
            sequence_length = 16
            model = CNN_T_1(sequence_length,0)
            hop = 4
        elif Model_OD=='Online_RNN':
            sequence_length = 10
            model = RNN_1(sequence_length,0)
            hop = 1
        elif Model_OD=='CRNN':
            sequence_length = 16
            model = CRNN_1S_2D(sequence_length,0)
            hop = 1

        # Onset Detection

        audio = np.random.rand(120*sample_rate)
        audio_duration = len(audio)/sample_rate

        # Stage 0: magnitude spectrogram, transposed to (frames, bins).
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring short intervals.
        start = time.perf_counter()

        Tensor_All_0 = np.abs(librosa.stft(audio, n_fft=frame_size, hop_length=hop_size, win_length=frame_size, window='hann')).T
        print(Tensor_All_0.shape)
        # Drops the first column (bin 0). NOTE(review): np.flipud reverses
        # axis 0, which after the transpose is the frame axis - confirm that
        # reversing frames rather than bins is intended.
        Tensor_All_0 = np.flipud(Tensor_All_0[:,1:])
        print(Tensor_All_0.shape)

        Time_1 = time.perf_counter()-start
        print('Spectrogram Time: {:.4f}'.format(Time_1))
        print('Spectrogram Time per second: {:.4f}'.format(Time_1/audio_duration))
        print('\n')
        Times_OD[N,0,a] = Time_1/audio_duration

        # Stage 1: stack sliding windows of sequence_length frames (stride 1),
        # then keep every factor_div-th window for the non-online models.
        start = time.perf_counter()

        length = Tensor_All_0.shape[0]-sequence_length+1
        Tensor_All = np.zeros(shape=(length,sequence_length,Tensor_All_0.shape[1]))
        for n in range(sequence_length):
            Tensor_All[:,n] = Tensor_All_0[n:length+n]
        if Model_OD!='Online_RNN':
            Tensor_All = Tensor_All[::factor_div]

        Time_2 = time.perf_counter()-start
        print('Preprocess Time: {:.4f}'.format(Time_2))
        print('Preprocess Time per second: {:.4f}'.format(Time_2/audio_duration))
        print('\n')
        Times_OD[N,1,a] = Time_2/audio_duration

        # Stage 2: log compression and (min=0, max=1) scaling; the CNN and
        # CRNN inputs get a trailing axis.
        start = time.perf_counter()

        Tensor_All = np.log(Tensor_All+1e-4)
        Tensor_All = (Tensor_All-0)/(1-0+1e-16)
        if Model_OD=='CNN' or Model_OD=='CRNN':
            Tensor_All = np.expand_dims(Tensor_All,axis=-1)

        Time_3 = time.perf_counter()-start
        # This stage was previously printed as "Prediction Time", which made
        # it indistinguishable from the forward pass in the log.
        print('Normalisation Time: {:.4f}'.format(Time_3))
        print('Normalisation Time per second: {:.4f}'.format(Time_3/audio_duration))
        print('\n')
        Times_OD[N,2,a] = Time_3/audio_duration

        # For CRNN the prepared windows are replaced (outside any timed
        # region) by random data of shape (windows, sequence_length, 8, 128, 1);
        # only the window count is kept.
        if Model_OD=='CRNN':
            Tensor_All = np.random.rand(Tensor_All.shape[0],sequence_length,8,128,1)

        # Stage 3: model forward pass.
        start = time.perf_counter()

        predictions = model(Tensor_All)

        Time_4 = time.perf_counter()-start
        print('Prediction Time: {:.4f}'.format(Time_4))
        print('Prediction Time per second: {:.4f}'.format(Time_4/audio_duration))
        print('\n')
        Times_OD[N,3,a] = Time_4/audio_duration

        # Stage 4: merge the overlapping window predictions (CNN and RNN only).
        start = time.perf_counter()

        if Model_OD!='Online_RNN' and Model_OD!='CRNN':
            Prediction = flatten_sequence(tf.math.sigmoid(predictions), factor_div)

        Time_5 = time.perf_counter()-start
        print('Postprocess Time: {:.4f}'.format(Time_5))
        print('Postprocess Time per second: {:.4f}'.format(Time_5/audio_duration))
        print('\n')
        Times_OD[N,4,a] = Time_5/audio_duration

        # NOTE(review): the total deliberately or accidentally leaves out the
        # post-processing stage (Time_5) - confirm which before reporting it.
        Time_OD = Time_1 + Time_2 + Time_3 + Time_4
        print('Total Time: {:.4f}'.format(Time_OD))
        print('Total Time per second: {:.4f}'.format(Time_OD/audio_duration))
        print('\n')
        Times_OD[N,5,a] = Time_OD/audio_duration



[0, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.0874
Spectrogram Time per second: 0.0007


Preprocess Time: 0.1982
Preprocess Time per second: 0.0017


Prediction Time: 0.2292
Prediction Time per second: 0.0019


Prediction Time: 2.0582
Prediction Time per second: 0.0172


Prediction Time: 2.4531
Prediction Time per second: 0.0204


Total Time: 2.5731
Total Time per second: 0.0214




[1, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.0707
Spectrogram Time per second: 0.0006


Preprocess Time: 0.2017
Preprocess Time per second: 0.0017


Prediction Time: 0.2128
Prediction Time per second: 0.0018


Prediction Time: 1.9374
Prediction Time per second: 0.0161


Prediction Time: 2.4797
Prediction Time per second: 0.0207


Total Time: 2.4225
Total Time per second: 0.0202




[2, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.0713
Spectrogram Time per second: 0.0006


Preprocess Time: 0.1918
Preprocess Time per second: 0.0016


Prediction Time: 0.2183
Prediction Time per secon

In [9]:
# Summary per model of the total time per second of audio (last stage of
# Times_OD): mean, standard deviation over iterations, and the 95% confidence
# half-width 1.96*std/sqrt(n). With n = 20 iterations the factor is
# 1.96/sqrt(20), about 0.43827; deriving it from the array keeps it correct
# when the iteration count changes, and looping over Models_OD keeps labels
# and column indices in step with the benchmarked model list.
ci_scale = 1.96/np.sqrt(Times_OD.shape[0])
for model_index, model_name in enumerate(Models_OD):
    total_per_second = Times_OD[:,-1,model_index]
    spread = np.std(total_per_second)
    print(model_name + ': ' + str(np.mean(total_per_second)) + ' +- ' + str(spread) + '|' + str(spread*ci_scale))
# MKL: 0.010587399303913116 +- 0.00016365101657776944|7.172322034034813e-05

CNN: 0.020885973175366716 +- 0.0004697723257743631|0.00020588679945841184
RNN: 0.014202524721622464 +- 0.00031882290951794405|0.00013973030089941196
CRNN: 0.12667637636264167 +- 0.006358427621735013|0.002786703772873514
Online_RNN: 0.007851984103520712 +- 0.00029159350514345304|0.00012779648826244632


In [44]:
# Scratch arithmetic: 1.96 divided by 5, scaled by 74.
# NOTE(review): 1.96 looks like the 95% normal quantile used for the
# confidence factor elsewhere in this notebook, but what 5 and 74 stand for
# is not shown here - confirm, or remove this cell.
(1.96/5)*74

29.008000000000003

In [10]:
# Onset Detection
#
# Benchmark: for every model in Models_OD, time each stage of the pipeline
# (spectrogram, windowing, normalisation, forward pass, post-processing) on
# random audio, num_iter times, and store the cost per second of audio.
# This cell uses a 10-second input.

num_iter = 20
num_stages = 6

sequence_length = 16
factor_div = 4

Models_OD = ['CNN','RNN','CRNN','Online_RNN']
#Models_OD = ['CRNN']
# Times_OD[iteration, stage, model]: stages 0-4 are the timed steps below,
# stage 5 is the reported total.
Times_OD = np.zeros((num_iter,num_stages,len(Models_OD)))

# Preprocess Parameters

hop_size = 128
frame_size = 256

# Number of audio samples treated as one second in the per-second figures.
sample_rate = 22050

for a, Model_OD in enumerate(Models_OD):

    for N in range(num_iter):

        print('\n')
        print([N,a])
        print('\n')

        # Onset Detection Parameters
        # NOTE(review): the model is rebuilt on every iteration, so the timed
        # forward pass is always the first call on a fresh model - confirm
        # that this is the cost that should be measured.
        # NOTE(review): `hop` is assigned but never read in this cell; the
        # window subsampling below uses factor_div for every non-online model.

        if Model_OD=='RNN':
            sequence_length = 16
            model = BRNN_1(sequence_length,0)
            hop = 4
        elif Model_OD=='CNN':
            sequence_length = 16
            model = CNN_T_1(sequence_length,0)
            hop = 4
        elif Model_OD=='Online_RNN':
            sequence_length = 10
            model = RNN_1(sequence_length,0)
            hop = 1
        elif Model_OD=='CRNN':
            sequence_length = 16
            model = CRNN_1S_2D(sequence_length,0)
            hop = 1

        # Onset Detection

        audio = np.random.rand(10*sample_rate)
        audio_duration = len(audio)/sample_rate

        # Stage 0: magnitude spectrogram, transposed to (frames, bins).
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring short intervals.
        start = time.perf_counter()

        Tensor_All_0 = np.abs(librosa.stft(audio, n_fft=frame_size, hop_length=hop_size, win_length=frame_size, window='hann')).T
        print(Tensor_All_0.shape)
        # Drops the first column (bin 0). NOTE(review): np.flipud reverses
        # axis 0, which after the transpose is the frame axis - confirm that
        # reversing frames rather than bins is intended.
        Tensor_All_0 = np.flipud(Tensor_All_0[:,1:])
        print(Tensor_All_0.shape)

        Time_1 = time.perf_counter()-start
        print('Spectrogram Time: {:.4f}'.format(Time_1))
        print('Spectrogram Time per second: {:.4f}'.format(Time_1/audio_duration))
        print('\n')
        Times_OD[N,0,a] = Time_1/audio_duration

        # Stage 1: stack sliding windows of sequence_length frames (stride 1),
        # then keep every factor_div-th window for the non-online models.
        start = time.perf_counter()

        length = Tensor_All_0.shape[0]-sequence_length+1
        Tensor_All = np.zeros(shape=(length,sequence_length,Tensor_All_0.shape[1]))
        for n in range(sequence_length):
            Tensor_All[:,n] = Tensor_All_0[n:length+n]
        if Model_OD!='Online_RNN':
            Tensor_All = Tensor_All[::factor_div]

        Time_2 = time.perf_counter()-start
        print('Preprocess Time: {:.4f}'.format(Time_2))
        print('Preprocess Time per second: {:.4f}'.format(Time_2/audio_duration))
        print('\n')
        Times_OD[N,1,a] = Time_2/audio_duration

        # Stage 2: log compression and (min=0, max=1) scaling; the CNN and
        # CRNN inputs get a trailing axis.
        start = time.perf_counter()

        Tensor_All = np.log(Tensor_All+1e-4)
        Tensor_All = (Tensor_All-0)/(1-0+1e-16)
        if Model_OD=='CNN' or Model_OD=='CRNN':
            Tensor_All = np.expand_dims(Tensor_All,axis=-1)

        Time_3 = time.perf_counter()-start
        # This stage was previously printed as "Prediction Time", which made
        # it indistinguishable from the forward pass in the log.
        print('Normalisation Time: {:.4f}'.format(Time_3))
        print('Normalisation Time per second: {:.4f}'.format(Time_3/audio_duration))
        print('\n')
        Times_OD[N,2,a] = Time_3/audio_duration

        # For CRNN the prepared windows are replaced (outside any timed
        # region) by random data of shape (windows, sequence_length, 8, 128, 1);
        # only the window count is kept.
        if Model_OD=='CRNN':
            Tensor_All = np.random.rand(Tensor_All.shape[0],sequence_length,8,128,1)

        # Stage 3: model forward pass.
        start = time.perf_counter()

        predictions = model(Tensor_All)

        Time_4 = time.perf_counter()-start
        print('Prediction Time: {:.4f}'.format(Time_4))
        print('Prediction Time per second: {:.4f}'.format(Time_4/audio_duration))
        print('\n')
        Times_OD[N,3,a] = Time_4/audio_duration

        # Stage 4: merge the overlapping window predictions (CNN and RNN only).
        start = time.perf_counter()

        if Model_OD!='Online_RNN' and Model_OD!='CRNN':
            Prediction = flatten_sequence(tf.math.sigmoid(predictions), factor_div)

        Time_5 = time.perf_counter()-start
        print('Postprocess Time: {:.4f}'.format(Time_5))
        print('Postprocess Time per second: {:.4f}'.format(Time_5/audio_duration))
        print('\n')
        Times_OD[N,4,a] = Time_5/audio_duration

        # NOTE(review): the total deliberately or accidentally leaves out the
        # post-processing stage (Time_5) - confirm which before reporting it.
        Time_OD = Time_1 + Time_2 + Time_3 + Time_4
        print('Total Time: {:.4f}'.format(Time_OD))
        print('Total Time per second: {:.4f}'.format(Time_OD/audio_duration))
        print('\n')
        Times_OD[N,5,a] = Time_OD/audio_duration



[0, 0]


(1723, 129)
(1723, 128)
Spectrogram Time: 0.0119
Spectrogram Time per second: 0.0012


Preprocess Time: 0.0194
Preprocess Time per second: 0.0019


Prediction Time: 0.0294
Prediction Time per second: 0.0029


Prediction Time: 0.5336
Prediction Time per second: 0.0534


Prediction Time: 0.2886
Prediction Time per second: 0.0289


Total Time: 0.5943
Total Time per second: 0.0594




[1, 0]


(1723, 129)
(1723, 128)
Spectrogram Time: 0.0071
Spectrogram Time per second: 0.0007


Preprocess Time: 0.0162
Preprocess Time per second: 0.0016


Prediction Time: 0.0168
Prediction Time per second: 0.0017


Prediction Time: 0.2421
Prediction Time per second: 0.0242


Prediction Time: 0.2357
Prediction Time per second: 0.0236


Total Time: 0.2822
Total Time per second: 0.0282




[2, 0]


(1723, 129)
(1723, 128)
Spectrogram Time: 0.0097
Spectrogram Time per second: 0.0010


Preprocess Time: 0.0155
Preprocess Time per second: 0.0016


Prediction Time: 0.0164
Prediction Time per second: 0.0

In [12]:
# Summary per model of the total time per second of audio (last stage of
# Times_OD): mean, standard deviation over iterations, and the 95% confidence
# half-width 1.96*std/sqrt(n). With n = 20 iterations the factor is
# 1.96/sqrt(20), about 0.43827; deriving it from the array keeps it correct
# when the iteration count changes, and looping over Models_OD keeps labels
# and column indices in step with the benchmarked model list.
ci_scale = 1.96/np.sqrt(Times_OD.shape[0])
for model_index, model_name in enumerate(Models_OD):
    total_per_second = Times_OD[:,-1,model_index]
    spread = np.std(total_per_second)
    print(model_name + ': ' + str(np.mean(total_per_second)) + ' +- ' + str(spread) + '|' + str(spread*ci_scale))
# MKL: 0.010587399303913116 +- 0.00016365101657776944|7.172322034034813e-05

CNN: 0.03077136754989624 +- 0.007016713097041222|0.0030752101028650607
RNN: 0.023345335721969604 +- 0.0025444759181406777|0.001115165739534454
CRNN: 0.1387440872192383 +- 0.006381819382093017|0.002796955663863195
Online_RNN: 0.008086923360824585 +- 0.000901286064398591|0.0003950060338050265
