In [46]:

import numpy as np
import tensorflow as tf
import keras
import keras.backend as K
from keras.layers import Input, Conv1D, Conv2D, Dense, Activation, Concatenate, TimeDistributed, Lambda, Reshape, Dropout, Permute
from keras.layers import Multiply, Add, UpSampling1D, MaxPooling1D, BatchNormalization, Bidirectional, LSTM, GRU, MaxPooling2D
from Layers import Conv1D_local, Dense_local, SAAF, Conv1D_tied, Slice, LogMelSpectrogram



sr = 24000

def Frontend(batchsize_, win_length, filters, kernel_size_1, melspec=False, 
            output_dim=64, CRNN_output=False):
    # CRNN_output adds channel dimension to the output (1 channel, data_format=channel last) 
    # for use in any Conv2D model
    x = Input(shape=(batchsize_, win_length,1), name='input')

    if melspec is False:
        conv = Conv1D(filters, kernel_size_1, strides=1, padding='same',
                        kernel_initializer='lecun_uniform', input_shape=(win_length, 1))
        
        activation_abs = Activation(K.abs, name='conv_activation') 
        # Original CAFx model uses softplus activation function
        activation_sp = tf.keras.layers.ReLU()
        max_pooling = MaxPooling1D(pool_size=win_length//output_dim, data_format='channels_last')

        conv_smoothing = Conv1D_local(filters, kernel_size_1*2, strides=1, padding='same',
                                    kernel_initializer='lecun_uniform')
        
        
        X = TimeDistributed(conv, name='conv')(x)
        X_abs = TimeDistributed(activation_abs, name='conv_activation')(X)
        M = TimeDistributed(conv_smoothing, name='conv_smoothing')(X_abs)
        M = TimeDistributed(activation_sp, name='conv_smoothing_activation')(M)
        frontend_output = TimeDistributed(max_pooling, name='max_pooling')(M)
    
    elif melspec is True:
        
        X = TimeDistributed(tf.keras.layers.Lambda(lambda x: tf.squeeze(x, [-1])))(x)
        #frontend_output = TimeDistributed(LogMelSpectrogram(sr, 512,64,128))(x)
        X = TimeDistributed(tf.keras.layers.Lambda(lambda x: tf.pad(x, ([0,0],[int(256//2),int(256//2)]))))(X)
        X = TimeDistributed(tf.keras.layers.Lambda(lambda x: tf.signal.stft(x, frame_length=256, frame_step=65,fft_length=256)))(X)
        X = TimeDistributed(tf.keras.layers.Lambda(lambda x: tf.cast(tf.math.abs(x),dtype=tf.float32)))(X)
        filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=128,
            num_spectrogram_bins=256 // 2 + 1,
            sample_rate=24000,
            lower_edge_hertz=0,
            upper_edge_hertz=sr//2)
        X = TimeDistributed(tf.keras.layers.Lambda(lambda x: tf.linalg.matmul(x,
                                     filterbank)))(X)
        frontend_output = TimeDistributed(tf.keras.layers.Lambda(lambda x: 10*tf.cast(tf.experimental.numpy.log10(tf.keras.backend.clip(x**2,1e-10,None)),dtype=tf.float32)))(x)
    
    
    
    if CRNN_output is True:
        frontend_output = frontend_output[..., tf.newaxis]
    else:
        pass
    

    model = tf.keras.Model(inputs=[x], outputs=[frontend_output], name='Frontend')

    return model


def LSTM_backend(batchsize_, win_length, filters, kernel_size_1, n_of_classes, 
            melspec=False, output_dim=64, frame_level_classification=False, dense_units=32,
            activation='tanh'):
   
    frontend = Frontend(batchsize_, win_length, filters, kernel_size_1, melspec=melspec, output_dim=output_dim)

    bi_rnn = Bidirectional(LSTM(filters//2, activation=activation, stateful=False,
                                 return_sequences=True, dropout=0.1,
                                 recurrent_dropout=0.1, name='BiLSTM'))
    bi_rnn1 = LSTM(filters//2, activation=activation, stateful=False,
                                 return_sequences=True, dropout=0.1,
                                 recurrent_dropout=0.1, name='LSTM_1')
    if frame_level_classification is True:
        bi_rnn2 = LSTM(filters//2, activation=activation, stateful=False,
                                 return_sequences=False, dropout=0.1,
                                 recurrent_dropout=0.1, name='LSTM_2')
    elif frame_level_classification is False:
        bi_rnn2 = LSTM(filters//2, activation=activation, stateful=False,
                                 return_sequences=True, dropout=0.1,
                                 recurrent_dropout=0.1, name='LSTM_2')
    
    


    Z = TimeDistributed(bi_rnn, name='BiLSTM')(frontend.output)
    Z = TimeDistributed(bi_rnn1, name='LSTM1')(Z)
    Z = TimeDistributed(bi_rnn2, name='LSTM2')(Z)
    
    z = TimeDistributed(keras.layers.Dense(dense_units, activation=activation, name='Dense_Xtra'))(Z)
    y = TimeDistributed(keras.layers.Dense(n_of_classes, name='Dense_layer', activation='sigmoid'))(z)
   

    model = tf.keras.Model(inputs=[frontend.input], outputs=[y], name='LSTM')
    
    initial_learning_rate = 0.001
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True)


    # Compile the model
    model.compile(tf.keras.optimizers.Adam(learning_rate=lr_schedule,)
                    , loss='binary_crossentropy', metrics='accuracy') 


    return model

LSTM_RAe2e = LSTM_backend(10, 4096, 128, 64, n_of_classes=11, frame_level_classification=False, output_dim=64, melspec=True)
LSTM_RAe2e.summary()

Model: "LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_42 (InputLayer)       [(None, 10, 64, 128)]     0         
                                                                 
 time_distributed_315 (TimeD  (None, 10, 64, 128)      0         
 istributed)                                                     
                                                                 
 BiLSTM (TimeDistributed)    (None, 10, 64, 128)       98816     
                                                                 
 LSTM1 (TimeDistributed)     (None, 10, 64, 64)        49408     
                                                                 
 LSTM2 (TimeDistributed)     (None, 10, 64, 64)        33024     
                                                                 
 time_distributed_316 (TimeD  (None, 10, 64, 32)       2080      
 istributed)                                                  