In [18]:
import os
import pdb
import time
import random
import numpy as np
import scipy as sp
import librosa
import matplotlib.pyplot as plt

import tensorflow as tf

from sklearn.metrics import f1_score
from mir_eval.onset import f_measure

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from networks import *
from utils import set_seeds

In [19]:
def flatten_sequence(sequence, factor):
    
    seq_length = sequence.shape[-1]
    length = seq_length//factor
    seq_length_diff = seq_length - length

    sequence_flat = np.zeros(tf.size(sequence).numpy()*factor)
    for n in range(len(sequence)):
        point = n*length
        if n==0:
            sequence_flat[:seq_length] = sequence[n]
        else:
            sequence_flat[point:point+seq_length_diff] = sequence_flat[point:point+seq_length_diff] + sequence[n][:-length]
            sequence_flat[point+seq_length_diff:point+seq_length_diff+length] = sequence[n][-length:]

    sequence_flat = sequence_flat[:point+seq_length]
    
    for n in range(factor-1):
        point = n*length
        sequence_flat[point:point+length] = sequence_flat[point:point+length]/(n+1)
        if n==0:
            sequence_flat[-point-length:] = sequence_flat[-point-length:]/(n+1)
        else:
            sequence_flat[-point-length:-point] = sequence_flat[-point-length:-point]/(n+1)
        
    sequence_flat[(factor-1)*length:-(factor-1)*length] = sequence_flat[(factor-1)*length:-(factor-1)*length]/factor
    
    return sequence_flat

In [None]:
arr = np.array([[0,0,0,0,0,1,0,0],[0,0,0,0,1,0,0,0],[0,0,0,1,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,1,0,0,0,0,0]])

#def add_signal(a, b, ai=0, bi=0):
def flatten_sequence(sequence, 8):
    assert ai >= 0
    assert bi >= 0

    al = len(a)
    bl = len(b)
    cl = max(ai + al, bi + bl)
    c = np.zeros(cl)
    c[ai: ai + al] += a
    c[bi: bi + bl] += b
    return c

flatten_sequence(arr, arr)

In [48]:
# Onset Detection

num_iter = 10
num_stages = 4

sequence_length = 16
factor_div = 4

Models_OD = ['RNN']
Times_OD = np.zeros((num_iter,num_stages,len(Models_OD)))

# Preprocess Parameters

hop_size = 128
frame_size = 256

for a in range(len(Models_OD)):

    for N in range(num_iter):

        Model_OD = Models_OD[a]

        print('\n')
        print([N,a])
        print('\n')

        # Onset Detection Parameters

        if Model_OD=='BRNN':
            model = BRNN_1(16, 0)
            sequence_length = 16
            factor_div = 4
        elif Model_OD=='CNN':
            model = CNN_T_1(16, 0)
            sequence_length = 16
            factor_div = 4
        elif Model_OD=='RNN':
            model = RNN_1(8, 0)
            sequence_length = 8
            factor_div = 8

        # Onset Detection

        audio = np.random.rand(120*22050)

        start = time.time()

        Tensor_All = np.abs(librosa.stft(audio, n_fft=frame_size, hop_length=hop_size, win_length=frame_size, window='hann')).T
        print(Tensor_All.shape)
        Tensor_All = np.flipud(Tensor_All[:,1:])
        print(Tensor_All.shape)

        Time_1 = time.time()-start
        print('Spectrogram Time: {:.4f}'.format(Time_1))
        print('Spectrogram Time per second: {:.4f}'.format(Time_1/(len(audio)/22050)))
        print('\n')
        Times_OD[N,0,a] = Time_1/(len(audio)/22050)
        start = time.time()

        cut_length = int(Tensor_All.shape[0]/sequence_length)*sequence_length
        Tensor_All_0 = Tensor_All[:cut_length]
        div_sequence_length = sequence_length//factor_div
        Tensor_All = np.zeros((int(Tensor_All_0.shape[0]/div_sequence_length)-(factor_div-1), sequence_length, Tensor_All_0.shape[1]))
        for n in range(int(Tensor_All_0.shape[0]/div_sequence_length)-(factor_div-1)):
            point = n*div_sequence_length
            Tensor_All[n] = Tensor_All_0[point:point+sequence_length]

        Tensor_All = np.log(Tensor_All+1e-4)
        Tensor_All = (Tensor_All-0)/(1-0+1e-16)
        if Model_OD=='CNN':
            Tensor_All = np.expand_dims(Tensor_All,axis=-1)

        Time_2 = time.time()-start
        print('Preprocess Time: {:.4f}'.format(Time_2))
        print('Preprocess Time per second: {:.4f}'.format(Time_2/(len(audio)/22050)))
        print('\n')
        Times_OD[N,1,a] = Time_2/(len(audio)/22050)
        start = time.time()

        predictions = model.predict(Tensor_All)
        #predictions = model(Tensor_All)
        if Model_OD!='RNN':
            Prediction = flatten_sequence(tf.math.sigmoid(predictions), factor_div)

        Time_3 = time.time()-start
        print('Prediction Time: {:.4f}'.format(Time_3))
        print('Prediction Time per second: {:.4f}'.format(Time_3/(len(audio)/22050)))
        print('\n')
        Times_OD[N,2,a] = Time_3/(len(audio)/22050)
        start = time.time()

        Time_OD = Time_1 + Time_2 + Time_3
        print('Total Time: {:.4f}'.format(Time_OD))
        print('Total Time per second: {:.4f}'.format(Time_OD/(len(audio)/22050)))
        print('\n')
        Times_OD[N,3,a] = Time_OD/(len(audio)/22050)



[0, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.1268
Spectrogram Time per second: 0.0011


Preprocess Time: 1.6779
Preprocess Time per second: 0.0140


Prediction Time: 2.1538
Prediction Time per second: 0.0179


Total Time: 3.9585
Total Time per second: 0.0330




[1, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.1316
Spectrogram Time per second: 0.0011


Preprocess Time: 1.2067
Preprocess Time per second: 0.0101


Prediction Time: 1.5122
Prediction Time per second: 0.0126


Total Time: 2.8504
Total Time per second: 0.0238




[2, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.1343
Spectrogram Time per second: 0.0011


Preprocess Time: 1.2105
Preprocess Time per second: 0.0101


Prediction Time: 1.5000
Prediction Time per second: 0.0125


Total Time: 2.8447
Total Time per second: 0.0237




[3, 0]


(20672, 129)
(20672, 128)
Spectrogram Time: 0.1292
Spectrogram Time per second: 0.0011


Preprocess Time: 1.2148
Preprocess Time per second: 0.0101


Prediction Time: 1.5

In [49]:
print('CNN: ' + str(np.mean(Times_OD[:,2,0])))
print('BRNN: ' + str(np.mean(Times_OD[:,2,1])))
print('RNN: ' + str(np.mean(Times_OD[:,2,2])))

CNN: 0.01346932013829549


IndexError: index 1 is out of bounds for axis 2 with size 1

In [5]:
np.mean(Times_OD, axis=0)

array([[[[2.39531775e+00, 2.28234763e+00, 2.41398926e+00],
         [2.22199209e+00, 2.43765733e+00, 2.37444961e+00]],

        [[2.42609396e+00, 2.43066318e+00, 2.45239549e+00],
         [2.46355112e+00, 2.47731419e+00, 2.44633756e+00]]],


       [[[1.20051503e-01, 6.20655847e-01, 7.21005034e-01],
         [7.62685776e-02, 6.66998839e-01, 7.34952497e-01]],

        [[1.65684080e-01, 1.25961144e+00, 1.46138747e+00],
         [1.63192844e-01, 1.25456052e+00, 1.47684548e+00]]],


       [[[2.35757685e-01, 3.37651706e+00, 4.18946073e+00],
         [2.65000377e+00, 5.04672352e+01, 9.20342315e+01]],

        [[2.59603786e-01, 7.17884929e+00, 9.93898633e+00],
         [2.72750571e+00, 9.66009681e+01, 1.79088894e+02]]],


       [[[5.64195633e-02, 3.72443914e-02, 5.91342211e-02],
         [6.70372248e-02, 4.10733700e-02, 6.23371124e-02]],

        [[5.54090977e-02, 3.19685221e-02, 5.94607115e-02],
         [6.68742418e-02, 3.28506470e-02, 6.27214670e-02]]],


       [[[2.80754650e+00, 6.3167

In [15]:
np.mean(Times_OD,axis=0)[-1]/(len(audio)/44100)

array([[[0.00502397, 0.01130355, 0.01321259],
        [0.00897465, 0.09593788, 0.17036661]],

       [[0.00520157, 0.01950699, 0.02489528],
        [0.00970085, 0.17959969, 0.32760374]]])

In [7]:
num_iter = 10
num_stages = 5

Times_UC = np.zeros((num_iter, num_stages, len(num_specs),len(Modes_UC),len(Models_UC)))

for N in range(num_iter):

    for a in range(len(num_specs)):
        
        for b in range(len(Modes_UC)):

            for c in range(len(Models_UC)):

                print('\n')
                print('\n')
                print('\n')
                print([N,a,b,c])
                print('\n')
                print('\n')
                print('\n')

                # Global Params

                cuda = torch.cuda.is_available()
                sig = nn.Sigmoid()

                Threshold = 0

                Mode_UC = Modes_UC[b]
                Model_UC = Models_UC[c]
                
                # Preprocess Parameters

                hop_size = 441
                hop_size_ms = hop_size/44100

                frame_size = 2048
                num_spec = num_specs[a]
                delta_bool = False

                # Utterance Classification Parameters

                z_dim = 16
                dropout = 0.0
                time_length_uc = 32

                if Mode_UC=='min':
                    num_filt = 32
                    layers = [1,1,1,1]
                    filters_height = [3,3,3,3]
                    filters_width = [3,3,3,3]
                elif Mode_UC=='max':
                    num_filt = 128
                    layers = [3,3,3,3]
                    filters_height = [5,5,5,5]
                    filters_width = [9,9,9,9]

                if Model_UC=='AE' and num_spec==32:
                    model_class = AE(layers=layers, filters_height=filters_height, filters_width=filters_width, dropout=dropout, h_dim=2*2, z_dim=z_dim, num_filt=num_filt)
                elif Model_UC=='AE' and num_spec==64:
                    model_class = AE(layers=layers, filters_height=filters_height, filters_width=filters_width, dropout=dropout, h_dim=2*4, z_dim=z_dim, num_filt=num_filt)
                elif Model_UC=='VAE' and num_spec==32:
                    model_class = VAE(layers=layers, filters_height=filters_height, filters_width=filters_width, dropout=dropout, h_dim=2*2, z_dim=z_dim, num_filt=num_filt)
                elif Model_UC=='VAE' and num_spec==64:
                    model_class = VAE(layers=layers, filters_height=filters_height, filters_width=filters_width, dropout=dropout, h_dim=2*4, z_dim=z_dim, num_filt=num_filt)

                X_train = np.random((10000, z_dim))
                y_train = np.random(10000)

                neigh = KNeighborsClassifier(n_neighbors=3)
                neigh.fit(X_train, y_train)

                # Utterance Classification
                
                start = time.time()

                Spec = librosa.feature.melspectrogram(audio[hop_size:], sr=44100, n_fft=frame_size, hop_length=hop_size, n_mels=num_spec).T
                
                Time_1 = time.time()-start
                print('Spectrogram Time: {:.4f}'.format(Time_1))
                print('Spectrogram Time per second: {:.4f}'.format(Time_1/(len(audio)/44100)))
                print('\n')
                Times_UC[N,0,a,b,c] = Time_1
                start = time.time()
                
                zp = np.zeros((time_length_uc, Spec.shape[-1]))
                Spec = np.concatenate((Spec, zp))

                onsets_time_frames = onsets//hop_size

                Tensor_All = np.zeros((len(onsets_time_frames), time_length_uc, Spec.shape[-1]))
                for n in range(len(onsets_time_frames)):
                    place = onsets_time_frames[n]
                    Tensor_All[n] = Spec[place:place+time_length_uc]

                std = np.std(Tensor_All)
                mean = np.mean(Tensor_All)
                Tensor_All = (Tensor_All-mean)/std

                with torch.no_grad():

                    fix_seeds(0)

                    data = torch.from_numpy(Tensor_All)
                    data = data.float()

                    Time_2 = time.time()-start
                    print('Preprocess Time: {:.4f}'.format(Time_2))
                    print('Preprocess Time per second: {:.4f}'.format(Time_2/(len(audio)/44100)))
                    print('\n')
                    Times_UC[N,1,a,b,c] = Time_2
                    start = time.time()

                    if Model_UC=='AE':
                        rec, mu = model_class(data)
                    elif Model_UC=='VAE':
                        rec, mu, logvar = model_class(data)

                    #mu = mu.double()

                Time_3 = time.time()-start
                print('Embedding Time: {:.4f}'.format(Time_3))
                print('Embedding Time per second: {:.4f}'.format(Time_3/(len(audio)/44100)))
                print('\n')
                Times_UC[N,2,a,b,c] = Time_3
                start = time.time()

                mu_data = mu.detach().numpy()
                y_hat = neigh.predict(mu_data)

                Time_4 = time.time()-start
                print('Classification Time: {:.4f}'.format(Time_4))
                print('Classification Time per second: {:.4f}'.format(Time_4/(len(audio)/44100)))
                print('\n')
                Times_UC[N,3,a,b,c] = Time_4
                start = time.time()

                Time_UC = Time_2 + Time_3 + Time_4
                print('Total Time: {:.4f}'.format(Time_UC))
                print('Total Time per second: {:.4f}'.format(Time_UC/(len(audio)/44100)))
                Times_UC[N,4,a,b,c] = Time_UC

                







[0, 0, 0, 0]






Spectrogram Time: 2.2856
Spectrogram Time per second: 0.0041


Preprocess Time: 0.0859
Preprocess Time per second: 0.0002


Embedding Time: 0.3400
Embedding Time per second: 0.0006


Classification Time: 0.3401
Classification Time per second: 0.0006


Total Time: 0.7660
Total Time per second: 0.0014






[0, 0, 0, 1]






Spectrogram Time: 2.1847
Spectrogram Time per second: 0.0039


Preprocess Time: 0.0310
Preprocess Time per second: 0.0001


Embedding Time: 0.3309
Embedding Time per second: 0.0006


Classification Time: 0.3590
Classification Time per second: 0.0006


Total Time: 0.7209
Total Time per second: 0.0013






[0, 0, 1, 0]






Spectrogram Time: 2.1757
Spectrogram Time per second: 0.0039


Preprocess Time: 0.0281
Preprocess Time per second: 0.0001


Embedding Time: 24.1836
Embedding Time per second: 0.0433


Classification Time: 0.3457
Classification Time per second: 0.0006


Total Time: 24.5574
Total Time per second: 0.0439






[0, 0, 1, 1]



Spectrogram Time: 2.3363
Spectrogram Time per second: 0.0042


Preprocess Time: 0.0305
Preprocess Time per second: 0.0001


Embedding Time: 0.3413
Embedding Time per second: 0.0006


Classification Time: 0.3355
Classification Time per second: 0.0006


Total Time: 0.7074
Total Time per second: 0.0013






[3, 0, 1, 0]






Spectrogram Time: 2.1809
Spectrogram Time per second: 0.0039


Preprocess Time: 0.0266
Preprocess Time per second: 0.0000


Embedding Time: 24.2711
Embedding Time per second: 0.0434


Classification Time: 0.3711
Classification Time per second: 0.0007


Total Time: 24.6689
Total Time per second: 0.0441






[3, 0, 1, 1]






Spectrogram Time: 2.4674
Spectrogram Time per second: 0.0044


Preprocess Time: 0.0269
Preprocess Time per second: 0.0000


Embedding Time: 24.1758
Embedding Time per second: 0.0433


Classification Time: 0.3483
Classification Time per second: 0.0006


Total Time: 24.5510
Total Time per second: 0.0439






[3, 1, 0, 0]






Spectrogram Time: 

Embedding Time: 24.3558
Embedding Time per second: 0.0436


Classification Time: 0.3595
Classification Time per second: 0.0006


Total Time: 24.7421
Total Time per second: 0.0443






[6, 0, 1, 1]






Spectrogram Time: 2.3479
Spectrogram Time per second: 0.0042


Preprocess Time: 0.0305
Preprocess Time per second: 0.0001


Embedding Time: 24.2990
Embedding Time per second: 0.0435


Classification Time: 0.3669
Classification Time per second: 0.0007


Total Time: 24.6964
Total Time per second: 0.0442






[6, 1, 0, 0]






Spectrogram Time: 2.4134
Spectrogram Time per second: 0.0043


Preprocess Time: 0.0689
Preprocess Time per second: 0.0001


Embedding Time: 0.6220
Embedding Time per second: 0.0011


Classification Time: 0.3482
Classification Time per second: 0.0006


Total Time: 1.0391
Total Time per second: 0.0019






[6, 1, 0, 1]






Spectrogram Time: 2.3971
Spectrogram Time per second: 0.0043


Preprocess Time: 0.0522
Preprocess Time per second: 0.0001


Embedding Time: 0.

Classification Time: 0.3453
Classification Time per second: 0.0006


Total Time: 24.6272
Total Time per second: 0.0441






[9, 1, 0, 0]






Spectrogram Time: 2.2707
Spectrogram Time per second: 0.0041


Preprocess Time: 0.0679
Preprocess Time per second: 0.0001


Embedding Time: 0.6126
Embedding Time per second: 0.0011


Classification Time: 0.3470
Classification Time per second: 0.0006


Total Time: 1.0275
Total Time per second: 0.0018






[9, 1, 0, 1]






Spectrogram Time: 2.2278
Spectrogram Time per second: 0.0040


Preprocess Time: 0.0520
Preprocess Time per second: 0.0001


Embedding Time: 0.6281
Embedding Time per second: 0.0011


Classification Time: 0.3359
Classification Time per second: 0.0006


Total Time: 1.0160
Total Time per second: 0.0018






[9, 1, 1, 0]






Spectrogram Time: 2.2082
Spectrogram Time per second: 0.0040


Preprocess Time: 0.0513
Preprocess Time per second: 0.0001


Embedding Time: 45.4472
Embedding Time per second: 0.0813


Classification Time:

In [8]:
np.save('_Times_UC.npy', Times_UC)

In [9]:
np.mean(Times_UC, axis=0)

array([[[[2.23620355e+00, 2.34903247e+00],
         [2.19495065e+00, 2.24763200e+00]],

        [[2.30333200e+00, 2.29815183e+00],
         [2.34389408e+00, 2.29183965e+00]]],


       [[[4.31814671e-02, 2.64518499e-02],
         [2.72370338e-02, 2.88480759e-02]],

        [[7.26819515e-02, 5.27944803e-02],
         [5.46581745e-02, 5.41644335e-02]]],


       [[[3.08004785e-01, 3.23238969e-01],
         [2.42248369e+01, 2.44132411e+01]],

        [[6.17770839e-01, 6.63885903e-01],
         [4.62865723e+01, 4.56588579e+01]]],


       [[[3.41786146e-01, 3.40113449e-01],
         [3.53116655e-01, 3.50147629e-01]],

        [[3.47223926e-01, 3.46603703e-01],
         [3.46275091e-01, 3.49383187e-01]]],


       [[[6.92972398e-01, 6.89804268e-01],
         [2.46051906e+01, 2.47922368e+01]],

        [[1.03767672e+00, 1.06328409e+00],
         [4.66875056e+01, 4.60624056e+01]]]])

In [16]:
np.mean(Times_UC,axis=0)[-1]/(len(audio)/44100)

array([[[0.00124004, 0.00123437],
        [0.04402983, 0.04436454]],

       [[0.00185687, 0.0019027 ],
        [0.0835451 , 0.08242651]]])