In speech processing we can afford really large networks, because most of the processing bottleneck is on the CPU, which runs the CTC function.

Also, the CLARIN mobile corpus seems to be too small for an advanced network.

In [1]:
#!/usr/bin/python3

# For demonstration purposes - Paweł Tomasik
# for CLARIN_MOBILE - generally it is unnormalized

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from keras.models import Model
from keras.layers import LSTM, Conv1D, Dropout, LeakyReLU, Dense, Input, Lambda, TimeDistributed, Flatten, Conv2D, BatchNormalization, GRU, Bidirectional
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.initializers import Orthogonal
from keras.callbacks import Callback

from keras.backend import ctc_batch_cost, expand_dims
import keras.backend as K

import editdistance  # For digit error rate
import keras
import librosa
import numpy as np
import os

import gc
# Drop references to arrays left over from a previous run of this cell so the
# garbage collector can reclaim them before the large buffers are re-allocated.
X = Y = None
gc.collect()

# Buffer dimensions: axis-1 length of X, number of rows, and axis-1 length of Y.
# presumably samples per recording, recording count and labels per transcript — TODO confirm
LENGTH = 1700 * 128
RECS = 1384 * 4
TRANSL = 307

# Zero-initialised buffers; any row/column not overwritten by the loop below stays 0.
X = np.zeros([RECS, LENGTH, 1], np.float32)
Y = np.zeros([RECS, TRANSL], np.int16)
counter = 0  # number of rows filled so far

# Copy the four dataset shards into the buffers, one after another.
for i in range(4):
    Xpart = np.load("datasets/clarin-long/data/clarin-mfcc-rec-pure-timedomain-{}.npy".format(i))
    Ypart = np.load("datasets/clarin-long/data/clarin-mfcc-trans-pure-timedomain-{}.npy".format(i))
    recs = Xpart.shape[0]
    reclen = Xpart.shape[1]
    translen = Ypart.shape[1]
    # A shard narrower than the buffer only fills the leading columns of its rows.
    X[counter : counter + recs, :reclen, :] = Xpart
    Y[counter : counter + recs, :translen] = Ypart
    counter += recs
    
print(counter, RECS)
# Round the row count down to a multiple of 32
# (presumably to match batch_size=32 used in train() — TODO confirm).
counter //= 32
counter *= 32

# Keep only the filled (and rounded-down) prefix of each buffer.
X = X[:counter]
Y = Y[:counter]

# Bare expression: it is not the last statement of the cell, so nothing is displayed.
Xpart.shape

#X = np.clip(X, -3, 3)
# Global statistics over every element of X, zero padding included.
# NOTE(review): mk_model applies these to the CZT layer output, not to X itself — confirm intended.
MEAN = X.mean()
STD = X.std()

# Boolean row mask: True (about 10% of rows) marks the validation subset used in train().
# NOTE(review): no seed is set in this cell, so the split presumably differs between runs.
valid = np.random.random(X.shape[0]) > 0.9

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


5523 5536


In [2]:
import matplotlib.pyplot as plt
def first(x):
    """Return the length of ``x`` after trimming trailing all-zero rows.

    The result is one past the index (along axis 0) of the last row that
    contains a nonzero value.

    Args:
        x: Array-like of shape ``(length,)`` or ``(length, channels)``.

    Returns:
        int: Number of leading rows up to and including the last nonzero row;
        ``0`` when ``x`` is empty or entirely zero (the original lambda raised
        ``IndexError`` in that case).
    """
    samples = np.asarray(x)
    if samples.size == 0:
        return 0
    # Collapse any trailing axes so the answer is always a row index, even for
    # multi-channel input.
    active_rows = np.flatnonzero(samples.reshape(len(samples), -1).any(axis=1))
    return int(active_rows[-1]) + 1 if active_rows.size else 0
# Sanity check: length of the first recording once its trailing zeros are trimmed.
first(X[0])

138560

In [3]:
# Checkpoint callback: writes a numbered .h5 file every 5 epochs; with
# save_weights_only=False the whole model is saved, not just its weights.
mc = keras.callbacks.ModelCheckpoint('models/mfcc-ctc-{epoch:08d}-bigger.h5', 
                                     save_weights_only=False, period=5)

from keras.constraints import max_norm

class MyCallback(keras.callbacks.Callback):
    """Append the epoch number and training loss to a text file after each epoch.

    Args:
        fname: Path of the log file; it is opened in append mode on every epoch end.
    """

    def __init__(self, fname):
        super(MyCallback, self).__init__()
        self.fname = fname

    def on_epoch_end(self, epoch, logs=None):
        """Write one ``Epoch: <epoch>, loss: <loss>`` line to ``self.fname``.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metrics dict for the epoch; ``None`` is treated as empty, in
                which case the loss is written as ``None``.
        """
        # ``None`` replaces the shared mutable ``{}`` default and matches the
        # convention used by StopOnConvergence.
        logs = logs or {}
        content = "Epoch: {}, loss: {}\n".format(epoch, logs.get('loss'))
        with open(self.fname, "a") as f:
            f.write(content)

class StopOnConvergence(Callback):
    """Stop training once ``val_loss`` has stayed above its best value for too long.

    Args:
        max_repetitions: Training is stopped when the count of consecutive
            epochs with ``val_loss`` above the best value seen exceeds this number.
    """

    def __init__(self, max_repetitions=10):
        super().__init__()
        self.max_repetitions = max_repetitions

    def on_train_begin(self, logs=None):
        """Reset the best loss and the non-improvement counter."""
        self.last_loss = np.inf
        self.repetitions = 0

    def on_epoch_end(self, batch, logs=None):
        """Update the counter from ``logs['val_loss']`` and request a stop if needed."""
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            # No validation loss reported for this epoch: nothing to track.
            return

        is_worse = val_loss > self.last_loss
        if is_worse:
            self.repetitions += 1
        else:
            # Equal or lower loss becomes the new reference and clears the counter.
            self.last_loss, self.repetitions = val_loss, 0

        if self.repetitions > self.max_repetitions:
            self.model.stop_training = True

            
# Per-epoch loss logger; appends to the given text file.
logger = MyCallback("./training-bigger-log.txt")
# Number of input feature channels; reassigned from X.shape[2] in the __main__ block.
NFEATS = 1

# exp(2*pi*i/512): a unit-modulus complex number whose real and imaginary parts
# are used as the initial value of CZT.z.
shift = np.exp(2 * np.pi * 1j / 512)


class AdamScale(Adam):
    """Adam variant that multiplies shape-[2] update entries by a decaying scale variable.

    The scale variable starts at 1 and is multiplied by 0.999 through an extra
    update entry appended in ``get_updates``.
    presumably meant to damp the two-element CZT weights over time — TODO confirm
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._scale_decay = 0.999  # per-update multiplicative decay of the scale variable
        self.var = None  # scale variable, created lazily in get_updates
    
    @keras.legacy.interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        """Build Adam's updates, rescale the shape-[2] entries, and add the scale decay.

        Returns:
            The modified ``self.updates`` list.
        """
        if self.var is None:
            self.var = K.variable([1.], name='czt_scale')
            # NOTE(review): the parent get_updates may reassign self.weights, which
            # would drop this entry — verify the variable is tracked and saved.
            self.weights.append(self.var)
        # Return value is discarded; the code relies on the parent populating self.updates.
        super().get_updates(loss, params)
        # Entries whose static shape is exactly [2] are multiplied by the scale variable.
        # NOTE(review): if these entries are assign ops, multiplying their output may not
        # change the value actually assigned — confirm the scaling takes effect. The
        # shape test also matches any other shape-[2] entry, not only CZT weights.
        self.updates = [(self.var * x if x.shape.as_list() == [2] else x) for x in self.updates]
        # (variable, new_value) pair: decay the scale variable on every update.
        self.updates.append((self.var, self._scale_decay * self.var))
        return self.updates



class CZT(keras.layers.Layer):
    """Layer mapping the last axis (512 values) to 257 magnitudes via a complex matrix.

    The matrix is generated from two trainable complex numbers, ``z`` and ``w``,
    each stored as a 2-element real weight ``[real, imag]``.
    presumably a learnable chirp-z transform, as the class name suggests — TODO confirm
    """

    def build(self, input_shape):
        """Create the two 2-element weights; ``input_shape`` is not used."""
        # z starts at the module-level `shift`, w at 1 + 0j. The initializer lambdas
        # ignore the requested shape. Both weights carry a max_norm(1.) constraint.
        self.z = self.add_weight(shape=[2], initializer=lambda shape: K.variable([np.real(shift), np.imag(shift)]),
                                 name='kernel_z', constraint=max_norm(1.))
        self.w = self.add_weight(shape=[2], initializer=lambda shape: K.variable([1, 0]),
                                 name='kernel_w', constraint=max_norm(1.))
    
    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` with its last dimension replaced by 257."""
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = 257
        return tuple(output_shape)
    
    def call(self, inputs):
        """Return ``|inputs @ weights.T|`` for the (257, 512) complex matrix ``weights``."""
        # Column vector 0..256 (output index n) and row vector 0..511 (input index k).
        ints = K.reshape(K.arange(257, dtype='complex64'), [257, 1])
        k = K.reshape(K.arange(512, dtype='complex64'), [1, 512])
        # Reassemble complex scalars from the [real, imag] weight pairs.
        z = K.cast(self.z[0], dtype='complex64_ref') + 1j * K.cast(self.z[1], dtype='complex64_ref')
        w = K.cast(self.w[0], dtype='complex64_ref') + 1j * K.cast(self.w[1], dtype='complex64_ref')
        # Outer product gives z * w**(-k); raising row n to the power -n yields
        # weights[n, k] = (z * w**(-k)) ** (-n).
        # NOTE(review): with the initial w = 1 this does not depend on k, so every
        # output bin starts as a scaled sum of the frame — confirm the roles of
        # z/w and n/k are as intended.
        weights = K.dot(z * K.ones([257, 1], dtype='complex64'), K.reshape(w, [1, -1]) ** (-k)) ** (-ints)
        # Debug output, printed when the graph is built.
        print(z.shape, w.shape, weights.shape)
        # assumes the last axis of `inputs` has length 512 — TODO confirm
        czt = K.dot(K.cast(inputs, dtype='complex64_ref'), K.transpose(weights))
        return K.abs(czt)
    
def mk_model(max_label_length):
    """Build the CTC training model and the inference model that shares its layers.

    Reads the module-level globals ``NFEATS``, ``NPHONES``, ``MEAN`` and ``STD``.

    Args:
        max_label_length: Width of the label matrix fed to the CTC loss.

    Returns:
        tuple: ``(model, predictive)``. ``model`` takes
        ``[features, labels, input_length, label_length]`` and outputs the CTC
        cost; ``predictive`` maps features to the per-step softmax over
        ``NPHONES + 1`` classes.
    """
    feature_input = Input(shape = (None, NFEATS))

    # Frozen framing layer: with an identity kernel each output step holds one
    # 512-sample window, taken every 128 samples.
    # `use_bias` is the Keras 2 name of the legacy `bias` keyword used before.
    # NOTE(review): the ReLU zeroes negative samples in each window — confirm intended.
    framing = Conv1D(512, 512, strides=128, activation='relu', use_bias=False)
    framing.build((None, NFEATS))
    framing.trainable = False
    framing.set_weights([np.eye(512, dtype=np.float32).reshape([512, 1, 512])])
    frames = framing(feature_input)

    # Trainable transform (512 -> 257), then normalisation with the global statistics.
    layer = CZT()(frames)
    layer = Lambda(lambda x: (x - MEAN) / STD)(layer)
    layer = Lambda(K.expand_dims)(layer)

    # Two strided 2-D convolutions along time (kernel 5x1, stride 2 each).
    layer_1 = Conv2D(12, [5,1], activation='linear', strides=(2,1), kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    layer_1 = Conv2D(16, [5,1], activation='linear', strides=(2,1), kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)

    # Flatten the (257, channels) maps of every time step into one feature vector.
    layer = TimeDistributed(Flatten())(layer)
    layer_1 = Conv1D(512, 5, activation='linear', kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)

    # Seven bidirectional GRU blocks.
    for _ in range(7):
        layer = Bidirectional(GRU(256, return_sequences = True, recurrent_dropout=0.01, kernel_initializer=Orthogonal(), activation='linear'))(layer)
        layer = LeakyReLU(0.01)(layer)
        layer = BatchNormalization()(layer)

    layer = Dense(1024)(layer)
    layer = LeakyReLU(0.01)(layer)
    layer = BatchNormalization()(layer)
    # One output class per label value below NPHONES, plus one extra class.
    posteriors = Dense(NPHONES + 1, activation = 'softmax')(layer)

    # Extra inputs required by the CTC cost.
    label_input = Input(shape = (max_label_length,))
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    loss_lambda = Lambda(lambda args:ctc_batch_cost(*args), output_shape=(1,), name='ctc')([label_input, posteriors, input_length, label_length])

    model = Model([feature_input, label_input, input_length, label_length], [loss_lambda])
    model.summary()
    predictive = Model(feature_input, posteriors)
    return model, predictive

def train(model, trainX, trainy, trainXl, trainyl, epochs = 50):
    """Compile ``model`` with the CTC pass-through loss and fit it.

    Rows are split into training and validation subsets by the module-level
    boolean mask ``valid``.

    Args:
        model: Training model whose ``'ctc'`` output already is the loss value.
        trainX: Input features.
        trainy: Label matrix.
        trainXl: Per-row input lengths.
        trainyl: Per-row label lengths.
        epochs: Maximum number of epochs.

    Returns:
        The object returned by ``model.fit``.
    """
    # important: batch_size=1 bugs Tensorflow
    optimizer = AdamScale(0.0003, clipnorm=1.)
    # The 'ctc' layer outputs the loss itself, so the Keras loss just forwards y_pred.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # Select each subset once. Previously the feature array was boolean-indexed
    # (and therefore copied) a second time only to read its row count.
    train_mask = ~valid
    train_inputs = [trainX[train_mask], trainy[train_mask], trainXl[train_mask], trainyl[train_mask]]
    valid_inputs = [trainX[valid], trainy[valid], trainXl[valid], trainyl[valid]]
    # Dummy targets: the loss ignores y_true.
    train_targets = np.zeros(train_inputs[0].shape[0])
    valid_targets = np.zeros(valid_inputs[0].shape[0])

    return model.fit(train_inputs, train_targets, epochs = epochs,
                     batch_size = 32, callbacks=[mc, logger, StopOnConvergence(4)],
                     validation_data=(valid_inputs, valid_targets))

def validate(predictions, valid_length, groundtruth, target_length):
    """Decode network outputs with CTC and score each row against its reference.

    Args:
        predictions: Network outputs passed to ``ctc_decode``.
        valid_length: Per-row input lengths passed to ``ctc_decode``.
        groundtruth: Label matrix; entries equal to ``NPHONES`` are skipped.
        target_length: Per-row reference lengths used as the denominator.

    Returns:
        list: One ``(error_rate, target_length[index])`` tuple per decoded row,
        where ``error_rate`` is the edit distance divided by the target length.
    """
    decoded = keras.backend.ctc_decode(predictions, valid_length, False, 1)[0][0]
    best_paths = decoded.eval(session=keras.backend.get_session())

    results = []
    for index in range(best_paths.shape[0]):
        # -1 entries in the decoded row and NPHONES entries in the reference are dropped.
        hypothesis = [label for label in best_paths[index, :] if label != -1]
        reference = [label for label in groundtruth[index, :] if label != NPHONES]
        distance = float(editdistance.eval(hypothesis, reference))
        results.append((distance / target_length[index], target_length[index]))
    return results

def try_else(exp, exp_else):
    """Call ``exp()`` and return its result, or ``exp_else`` if it raises.

    Args:
        exp: Zero-argument callable to evaluate.
        exp_else: Fallback value returned when ``exp`` raises an ``Exception``.

    Returns:
        The result of ``exp()``, or ``exp_else`` on failure.
    """
    try:
        return exp()
    # Catch ordinary errors only; a bare ``except:`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except Exception:
        return exp_else

if __name__ == '__main__':
    # The largest label value is used as the padding marker when measuring label lengths.
    NPHONES = Y.max()
    NFEATS = X.shape[2]

    # Input length per recording: trimmed sample count divided by 4 * 128.
    X_lens = np.array([first(x) // (4 * 128) for x in X]).astype(np.int64)

    # Label length per transcript: position of the first NPHONES entry, or 0 if there is none.
    pad_positions = [np.where(row == NPHONES)[0] for row in Y]
    Y_lens = np.array([pos[0] if len(pos) else 0 for pos in pad_positions]).astype(np.int64)
    print(X.shape, Y.shape, X_lens.shape, Y_lens.shape)

    # Keep only rows with a nonzero label length; lengths become column vectors.
    keep = np.where(Y_lens)
    data = X[keep], Y[keep], X_lens[keep].reshape(-1, 1), Y_lens[keep].reshape(-1, 1)
    print([part.shape for part in data])

    trn, predict = mk_model(data[1].shape[1])
    train(trn, *data, epochs=100) # at 300 it makes sensible predictions


(5504, 217600, 1) (5504, 307) (5504,) (5504,)
[(5504, 217600, 1), (5504, 307), (5504, 1), (5504, 1)]




() () (257, 512)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 1)      0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 512)    262144      input_1[0][0]                    
__________________________________________________________________________________________________
czt_1 (CZT)                     (None, None, 257)    4           conv1d_1[0][0]                   
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 257)    0           czt_1[0][0]                      
____________________________________________________________________________________________

Train on 4963 samples, validate on 541 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [4]:
# Trained weights of layer index 2 of the predictive model (the CZT layer per the
# model summary), followed by the value kernel_z was initialised from.
print(predict.layers[2].get_weights())
print(np.exp(2 * np.pi * 1j / 512))

[array([0.99398977, 0.00728384], dtype=float32), array([0.99997944, 0.00639564], dtype=float32)]
(0.9999247018391445+0.012271538285719925j)


In [9]:
# Run the first recording through the network, CTC-decode it with the default
# ctc_decode settings, and evaluate the decoded tensor to a NumPy array.
pr = predict.predict(X[:1])
pr = K.ctc_decode(pr, X_lens[:1])[0][0]
pr.eval(session=K.get_session())

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


array([[18, 20, 35, 20, 13, 20, 17, 22,  6, 20, 35, 22, 13, 35, 20, 21,
        20, 25, 29, 20, 24, 20, 22, 35, 27, 20, 35, 22,  6, 27, 22, 18,
        20, 13,  6, 20, 13, 20, 33, 20,  6,  6, 20, 11, 22, 13, 22, 17,
        22, 35, 20, 18]])

In [None]:
# Full label row of the first recording, for comparison with the decoded output.
Y[:1]

In [10]:
import editdistance
# Edit distance between the decoded labels and the first Y_lens[0] reference
# labels of recording 0, divided by that reference length.
editdistance.eval(list(pr.eval(session=K.get_session())[0]), list(Y[0, :Y_lens[0]])) / Y_lens[0]

0.6428571428571429

In [11]:
# Decode the first 160 recordings and compute one normalised edit distance per recording.
pr = predict.predict(X[:160])
pr = K.ctc_decode(pr, X_lens[:160])[0][0]
hypos = [list(x) for x in pr.eval(session=K.get_session())]
# Cut each decoded row at its first -1, if any.
lens = [x.index(-1) if -1 in x else len(x) for x in hypos]
hypos = [x[:lim] for x, lim in zip(hypos, lens)]
gts = [list(x) for x in Y[:160]]
# Cut each reference row at its first 37, if any.
# NOTE(review): 37 is presumably the value of NPHONES — confirm and use the constant.
lens = [x.index(37) if 37 in x else len(x) for x in gts]
gts = [x[:lim] for x, lim in zip(gts, lens)]
# Divides by the trimmed reference length; an empty reference would raise ZeroDivisionError.
[editdistance.eval(gt, hypo) / len(gt) for gt, hypo in zip(gts, hypos)]

[0.6428571428571429,
 0.7325581395348837,
 0.8064516129032258,
 0.775,
 0.7768595041322314,
 0.7021276595744681,
 0.7962962962962963,
 0.6222222222222222,
 0.6470588235294118,
 0.7652173913043478,
 0.7088607594936709,
 0.6805555555555556,
 0.6615384615384615,
 0.75,
 0.7307692307692307,
 0.75,
 0.6578947368421053,
 0.7682926829268293,
 0.7078651685393258,
 0.7831325301204819,
 0.625,
 0.7093023255813954,
 0.7525773195876289,
 0.7058823529411765,
 0.6826923076923077,
 0.7524752475247525,
 0.7272727272727273,
 0.696969696969697,
 0.7083333333333334,
 0.7474747474747475,
 0.6067415730337079,
 0.7083333333333334,
 0.7283950617283951,
 0.7014925373134329,
 0.75,
 0.7948717948717948,
 0.7108433734939759,
 0.7341772151898734,
 0.7441860465116279,
 0.7241379310344828,
 0.6666666666666666,
 0.7261904761904762,
 0.7108433734939759,
 0.7701149425287356,
 0.7692307692307693,
 0.6086956521739131,
 0.676056338028169,
 0.759493670886076,
 0.6891891891891891,
 0.7017543859649122,
 0.6447368421052632,


In [12]:
# Mean of the previous cell's output; `_` is IPython's last-result variable, so this
# only works when run immediately after the cell that produced the list.
np.array(_).mean()

0.7178987010015726

In [9]:
# Checks that a shape-[2] backend tensor satisfies the comparison used in AdamScale.get_updates.
K.zeros([2]).shape.as_list() == [2]

True

In [18]:
# List the variables tracked by the optimizer of the training model.
trn.optimizer.weights

[<tf.Variable 'AdamScale/iterations:0' shape=() dtype=int64_ref>,
 <tf.Variable 'training/AdamScale/Variable:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_1:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_2:0' shape=(5, 1, 1, 12) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_3:0' shape=(12,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_4:0' shape=(12,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_5:0' shape=(12,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_6:0' shape=(5, 1, 12, 16) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_7:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_8:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_9:0' shape=(16,) dtype=float32_ref>,
 <tf.Variable 'training/AdamScale/Variable_10:0' shape=(5, 4112, 512) dtype=float32_ref>,
 <tf.Variable 'training/AdamSca

In [10]:
# Checks that K.variable turns an integer list into a float32 array when evaluated.
K.variable([9]).eval(session=K.get_session())

array([9.], dtype=float32)

In [None]:
# Save the inference model.
# NOTE(review): it contains the custom CZT layer and Lambda layers that read MEAN/STD;
# loading it presumably needs custom_objects and those globals — verify.
predict.save("models/czt-stabilized.h5")