In speech processing we can afford really huge networks, because most of the processing bottleneck is on the CPU, in running the CTC function.

Also, the CLARIN mobile corpus seems to be too small for an advanced network.

In [2]:
#!/usr/bin/python3

# For demonstration purposes - Paweł Tomasik
# for CLARIN_MOBILE - generally it is unnormalized

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from keras.models import Model
from keras.layers import LSTM, Conv1D, Dropout, LeakyReLU, Dense, Input, Lambda, TimeDistributed, Flatten, Conv2D, BatchNormalization, GRU, Bidirectional
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.initializers import Orthogonal
from keras.callbacks import Callback

from keras.backend import ctc_batch_cost, expand_dims
import keras.backend as K

import editdistance  # For digit error rate
import keras
import librosa
import numpy as np
import os

import gc
# Drop any arrays left over from a previous run of this cell before the big
# allocation below, so two copies never coexist in memory.
X = Y = None
gc.collect()

LENGTH = 1700 * 128   # samples reserved per recording (shorter ones stay zero-padded)
RECS = 1384 * 4       # upper bound on the number of recordings over the 4 shards
TRANSL = 307          # label slots reserved per transcription
SEED = 0              # fixes the train/validation split across kernel restarts

X = np.zeros([RECS, LENGTH, 1], np.float32)
Y = np.zeros([RECS, TRANSL], np.int16)
counter = 0

# Copy every shard into the top-left corner of the pre-allocated buffers.
for i in range(4):
    Xpart = np.load("datasets/clarin-long/data/clarin-mfcc-rec-pure-timedomain-{}.npy".format(i))
    Ypart = np.load("datasets/clarin-long/data/clarin-mfcc-trans-pure-timedomain-{}.npy".format(i))
    recs = Xpart.shape[0]
    reclen = Xpart.shape[1]
    translen = Ypart.shape[1]
    X[counter : counter + recs, :reclen, :] = Xpart
    Y[counter : counter + recs, :translen] = Ypart
    counter += recs

print(counter, RECS)
# Round the record count down to a multiple of 32 (the batch size used by train()).
counter //= 32
counter *= 32

X = X[:counter]
Y = Y[:counter]

#X = np.clip(X, -3, 3)
# Global statistics over the whole buffer, zero padding included.
MEAN = X.mean()
STD = X.std()

# Roughly 10% of the recordings are held out for validation. A private, seeded
# generator keeps the split reproducible without touching NumPy's global RNG
# state.
valid = np.random.RandomState(SEED).random_sample(X.shape[0]) > 0.9

5523 5536


In [3]:
import matplotlib.pyplot as plt
def first(x):
    """Return the number of samples in `x` up to and including the last non-zero one.

    `x` is expected to be a 1-D array or a column of shape (n, 1); trailing
    zeros are treated as padding. An all-zero input yields 0 instead of
    raising IndexError.
    """
    nonzero = np.flatnonzero(x)
    if nonzero.size == 0:
        return 0
    # Index of the last non-zero sample, converted to a length.
    return int(nonzero[-1]) + 1
first(X[0])

138560

In [None]:
# Checkpoint callback: writes the full model (not only its weights) every
# 5 epochs, with the zero-padded epoch number in the file name.
mc = keras.callbacks.ModelCheckpoint('models/mfcc-ctc-{epoch:08d}-bigger.h5', 
                                     save_weights_only=False, period=5)

from keras.constraints import max_norm

class MyCallback(keras.callbacks.Callback):
    """Append the training loss of every finished epoch to a text file."""

    def __init__(self, fname):
        """Remember `fname`, the path of the log file that is appended to."""
        self.fname = fname
        super(MyCallback, self).__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Write one "Epoch: N, loss: L" line for the epoch that just ended.

        `logs` may be None or lack a 'loss' entry; the loss is then logged
        as None.
        """
        # A None default replaces the shared mutable `{}` default argument.
        logs = logs or {}
        content = "Epoch: {}, loss: {}\n".format(epoch, logs.get('loss'))
        with open(self.fname, "a") as f:
            f.write(content)

class StopOnConvergence(Callback):
    """Stop training once the validation loss stops improving.

    Training is halted when `val_loss` has been above the best value seen so
    far for more than `max_repetitions` consecutive epochs.
    """

    def __init__(self, max_repetitions=10):
        super().__init__()
        self.max_repetitions = max_repetitions

    def on_train_begin(self, logs=None):
        # Fresh counters for every fit() call.
        self.last_loss = np.inf
        self.repetitions = 0

    def on_epoch_end(self, batch, logs=None):
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            # Nothing to compare against when no validation loss was reported.
            return

        if val_loss > self.last_loss:
            self.repetitions += 1
        else:
            self.last_loss, self.repetitions = val_loss, 0

        if self.repetitions > self.max_repetitions:
            self.model.stop_training = True

            
# Per-epoch loss log, appended to on every epoch end.
logger = MyCallback("./training-bigger-log.txt")
# Number of input features per time step; overwritten from the data in the
# __main__ block before the model is built.
NFEATS = 1

# Unit-modulus complex number with angle 2*pi/512; initial value of CZT's `z` weight.
shift = np.exp(2 * np.pi * 1j / 512)


class CZT(keras.layers.Layer):
    """Layer mapping the last axis of its input to 257 magnitude values.

    The name suggests a chirp z-transform. The layer holds two complex
    parameters, `z` and `w`, each stored as a 2-element (real, imag) weight,
    builds a 257 x 512 complex matrix from them and returns the absolute value
    of the input multiplied by that matrix.

    NOTE(review): with the initial value w = 1 + 0j, `w ** (-k)` equals 1 for
    every k, so each row of `weights` looks constant along the 512-sample
    axis. Confirm that this is the intended transform.
    """

    def build(self, input_shape):
        """Create the (real, imag) weights for `z` and `w`, each constrained by max_norm(1.)."""
        # z starts at `shift` (module-level constant), w starts at 1 + 0j.
        self.z = self.add_weight(shape=[2], initializer=lambda shape: K.variable([np.real(shift), np.imag(shift)]),
                                 name='kernel_z', constraint=max_norm(1.))
        self.w = self.add_weight(shape=[2], initializer=lambda shape: K.variable([1, 0]),
                                 name='kernel_w', constraint=max_norm(1.))
    
    def compute_output_shape(self, input_shape):
        """Return `input_shape` with its last dimension replaced by 257."""
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = 257
        return tuple(output_shape)
    
    def call(self, inputs):
        """Return the magnitude of `inputs` multiplied by the complex weight matrix."""
        # Column vector of output indices 0..256 and row vector of sample indices 0..511.
        ints = K.reshape(K.arange(257, dtype='complex64'), [257, 1])
        k = K.reshape(K.arange(512, dtype='complex64'), [1, 512])
        # Rebuild the complex scalars from their (real, imag) weight pairs.
        # NOTE(review): 'complex64_ref' is an unusual dtype string — confirm the backend accepts it.
        z = K.cast(self.z[0], dtype='complex64_ref') + 1j * K.cast(self.z[1], dtype='complex64_ref')
        w = K.cast(self.w[0], dtype='complex64_ref') + 1j * K.cast(self.w[1], dtype='complex64_ref')
        # Entry [m, k] is (z * w ** (-k)) ** (-m).
        weights = K.dot(z * K.ones([257, 1], dtype='complex64'), K.reshape(w, [1, -1]) ** (-k)) ** (-ints)
        # Debug output of the tensor shapes, emitted each time call() runs.
        print(z.shape, w.shape, weights.shape)
        czt = K.dot(K.cast(inputs, dtype='complex64_ref'), K.transpose(weights))
        return K.abs(czt)
    
def mk_model(max_label_length):
    """Build the CTC training model and the matching prediction model.

    Reads the module-level names NFEATS, NPHONES, MEAN and STD, so those must
    be set before this is called.

    Parameters:
        max_label_length: width of the (padded) label input.

    Returns:
        (model, predictive): `model` takes [features, labels, input_length,
        label_length] and outputs the CTC cost; `predictive` maps the features
        to the per-step softmax output. Both share the same layers.
    """
    feature_input = Input(shape = (None, NFEATS))
    # Frozen framing layer: an identity kernel copies each 512-sample window
    # (hop 128) into 512 channels.
    # NOTE(review): activation='relu' zeroes negative samples before the CZT — confirm intended.
    # NOTE(review): `bias` looks like the legacy spelling of `use_bias` — confirm the installed Keras maps it.
    conv_layer = Conv1D(512, 512, strides=128, activation='relu', bias=False)
    # Built explicitly so the weights can be set before the layer is applied.
    conv_layer.build((None, NFEATS))
    conv_layer.trainable = False
    conv_layer.set_weights([np.eye(512, dtype=np.float32).reshape([512, 1, 512])])
    conv_layer = conv_layer(feature_input)
    # Frozen transform of every frame into 257 magnitudes.
    czt = CZT()
    czt.trainable = False
    layer = czt(conv_layer)
    # NOTE(review): MEAN/STD are module-level statistics; confirm they match
    # the scale of the CZT output they are applied to here.
    layer = Lambda(lambda x: (x - MEAN) / STD)(layer)
    # Add a trailing channel axis for the 2-D convolutions.
    layer = Lambda(K.expand_dims)(layer)
    # Two 2-D convolutions with stride 2 on the time axis; together with the
    # hop of 128 above, one output step covers 4 * 128 input samples.
    layer_1 = Conv2D(12, [5,1], activation='linear', strides=(2,1), kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    layer_1 = Conv2D(16, [5,1], activation='linear', strides=(2,1), kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    # Merge the remaining non-time axes into one feature vector per time step.
    layer = TimeDistributed(Flatten())(layer)
    layer_1 = Conv1D(512, 5, activation='linear', kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    # Stack of 7 bidirectional GRU layers, each followed by LeakyReLU and batch norm.
    for i in range(7):
        layer = Bidirectional(GRU(256, return_sequences = True, recurrent_dropout=0.01, kernel_initializer=Orthogonal(), activation='linear'))(layer)
        layer = LeakyReLU(0.01)(layer)
        layer = BatchNormalization()(layer)
    layer = Dense(1024)(layer)
    layer = LeakyReLU(0.01)(layer)
    layer = BatchNormalization()(layer)
    # Per-step class distribution; presumably the extra class is the CTC blank — confirm.
    layer_15 = Dense(NPHONES + 1, activation = 'softmax')(layer)
    # Extra inputs needed only for computing the CTC cost during training.
    label_input = Input(shape = (max_label_length,))
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    # The CTC cost is computed inside the graph; the layer is named 'ctc' so
    # train() can attach its pass-through loss to it.
    loss_lambda = Lambda(lambda args:ctc_batch_cost(*args), output_shape=(1,), name='ctc')([label_input, layer_15, input_length, label_length])
    model = Model([feature_input, label_input, input_length, label_length], [loss_lambda])
    model.summary()
    predictive = Model(feature_input, layer_15)
    return model, predictive

def train(model, trainX, trainy, trainXl, trainyl, epochs = 50):
    """Compile `model` with the pass-through CTC loss and fit it.

    The module-level boolean mask `valid` splits every array into a training
    part (~valid) and a validation part (valid). The mask is built from the
    unfiltered X, so the arrays passed in must have the same number of rows.

    Parameters:
        model: training model returned by mk_model (its output is the CTC cost).
        trainX, trainy, trainXl, trainyl: features, labels, input lengths and
            label lengths, all indexed by recording along axis 0.
        epochs: maximum number of epochs; StopOnConvergence(4) may stop earlier.

    Returns:
        Whatever model.fit returns.
    """
    # important: batch_size=1 bugs Tensorflow
    optimizer = Adam(0.0003, clipnorm=1.)
    # The 'ctc' layer already outputs the cost, so the loss just forwards it.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    arrays = (trainX, trainy, trainXl, trainyl)
    # Each split is materialized once; the original indexed trainX a second
    # time per split only to read its length, copying the whole array again.
    fit_inputs = [arr[~valid] for arr in arrays]
    val_inputs = [arr[valid] for arr in arrays]
    # Dummy targets: the pass-through loss ignores y_true.
    fit_targets = np.zeros(fit_inputs[0].shape[0])
    val_targets = np.zeros(val_inputs[0].shape[0])

    return model.fit(fit_inputs, fit_targets, epochs = epochs,
                     batch_size = 32, callbacks=[mc, logger, StopOnConvergence(4)],
                     validation_data=(val_inputs, val_targets))

def validate(predictions, valid_length, groundtruth, target_length):
    """Score CTC-decoded predictions against reference label sequences.

    Parameters:
        predictions: network output passed to keras.backend.ctc_decode.
        valid_length: input lengths passed to ctc_decode.
        groundtruth: 2-D array of reference labels, one row per sample;
            entries equal to NPHONES are ignored.
        target_length: per-sample reference length used as the denominator.

    Returns:
        A list of (error_rate, target_length) pairs, one per decoded row.
    """
    best_path = keras.backend.ctc_decode(predictions, valid_length, False, 1)[0][0]
    hypotheses = best_path.eval(session=keras.backend.get_session())

    rates = []
    for row, hypothesis in enumerate(hypotheses):
        # -1 entries in the decoded rows and NPHONES entries in the references are skipped.
        hyp_labels = [label for label in hypothesis if label != -1]
        ref_labels = [label for label in groundtruth[row, :] if label != NPHONES]
        distance = float(editdistance.eval(hyp_labels, ref_labels))
        ref_len = target_length[row]
        rates.append((distance / ref_len, ref_len))
    return rates

def try_else(exp, exp_else):
    """Call `exp()` and return its result, or `exp_else` if it raises.

    Only ordinary errors (subclasses of Exception) are turned into the
    fallback value. KeyboardInterrupt and SystemExit propagate, so a
    long-running cell can still be interrupted.
    """
    try:
        return exp()
    except Exception:
        return exp_else

# Driver: derives the sequence lengths, filters the data, builds the models
# and starts training. The names it defines (NPHONES, NFEATS, X_lens, Y_lens,
# data, trn, predict) are read by the functions above and by later cells.
if __name__=='__main__':
    data = X, Y
    # The largest label value; the first occurrence of it marks the end of a
    # transcription (see Y_lens below).
    NPHONES = Y.max()
    NFEATS = data[0].shape[2]
    # Network output steps per recording: unpadded sample count divided by the
    # total time stride of the model (128 * 2 * 2).
    X_lens = np.array([first(x) // (4 * 128) for x in X]).astype(np.int64)
    # Label length = index of the first NPHONES entry, or 0 when there is none.
    Y_lens = np.array([np.where(x == NPHONES)[0] for x in data[1]])
    Y_lens = np.array([x[0] if len(x) else 0 for x in Y_lens]).astype(np.int64)
    print(X.shape, Y.shape, X_lens.shape, Y_lens.shape)
    # Keep only rows with a non-zero label length; lengths become column vectors.
    # NOTE(review): train() indexes these arrays with the `valid` mask built
    # from the unfiltered X, which only lines up if no row is dropped here.
    data = data[0][np.where(Y_lens)], data[1][np.where(Y_lens)], X_lens[np.where(Y_lens)].reshape(-1, 1), Y_lens[np.where(Y_lens)].reshape(-1, 1)
    print(list(map(lambda x:x.shape, data)))
    trn, predict = mk_model(data[1].shape[1])
    train(trn, *data, epochs=100) # at 300 it makes sensible predictions

(5504, 217600, 1) (5504, 307) (5504,) (5504,)
[(5504, 217600, 1), (5504, 307), (5504, 1), (5504, 1)]




() () (257, 512)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 1)      0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 512)    262144      input_1[0][0]                    
__________________________________________________________________________________________________
czt_1 (CZT)                     (None, None, 257)    4           conv1d_1[0][0]                   
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 257)    0           czt_1[0][0]                      
____________________________________________________________________________________________

Train on 4981 samples, validate on 523 samples
Epoch 1/100
Epoch 2/100

```
Epoch: 0, loss: 373.05563363069706
Epoch: 1, loss: 316.1668281486249
Epoch: 2, loss: 276.7502008333572
Epoch: 3, loss: 265.25142906937964
Epoch: 4, loss: 259.5080414461673
Epoch: 5, loss: 255.1659694412008
Epoch: 6, loss: 251.643098636268
Epoch: 7, loss: 248.39188379648817
Epoch: 8, loss: 245.4191206001752
Epoch: 9, loss: 243.25270757883865
Epoch: 10, loss: 240.34960977017843
Epoch: 11, loss: 237.67886680935405
Epoch: 12, loss: 234.76903058649233
Epoch: 13, loss: 231.53033547745105
Epoch: 14, loss: 228.06671884533296
Epoch: 15, loss: 224.10345142228945
Epoch: 16, loss: 220.12683561608821
Epoch: 17, loss: 215.54999222570672
Epoch: 18, loss: 210.5114803379306
Epoch: 19, loss: 205.0885363557544
```

In [5]:
# Run the prediction model on the first recording and CTC-decode its output.
pr = predict.predict(X[:1])
pr = K.ctc_decode(pr, X_lens[:1])[0][0]
# Evaluate the decoded tensor in the backend session to get a NumPy array.
pr.eval(session=K.get_session())

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


array([[18, 13, 22, 13, 31, 10, 13, 35, 10, 28,  8, 35, 27, 22, 35, 10,
        13, 10, 21,  9, 29, 10,  4, 20, 28, 20, 35, 27, 22, 35,  9, 18,
        33, 32, 22, 18, 35, 20, 13, 13, 22, 13, 22,  6, 31, 22, 32, 13,
         7, 22, 11, 22, 13,  6, 33, 20,  7, 18]])

In [13]:
# Reference label row of the first recording, including its padding.
Y[:1]

array([[ 5,  3,  8, 32, 12, 36,  5, 32, 19,  8, 33, 31, 19, 23, 20,  5,
        19, 31, 32, 28,  7, 25, 16, 36, 13, 31,  2,  5, 31, 19,  5, 23,
        20, 19, 25,  5, 23, 17, 20,  5,  8,  5, 32, 32, 36, 32, 20,  5,
         2,  6,  3, 36, 17, 23,  5, 22, 36,  4, 24, 20,  5, 32, 12, 33,
        20, 12,  5,  8, 18,  5, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [6]:
import editdistance
# Edit distance between the decoded first recording and its reference labels
# (cut at Y_lens[0]), divided by the reference length.
editdistance.eval(list(pr.eval(session=K.get_session())[0]), list(Y[0, :Y_lens[0]])) / Y_lens[0]

0.4714285714285714

In [7]:
# Per-recording error rate (edit distance / reference length) for the first
# N_EVAL recordings.
N_EVAL = 160
pr = predict.predict(X[:N_EVAL])
pr = K.ctc_decode(pr, X_lens[:N_EVAL])[0][0]
hypos = [list(x) for x in pr.eval(session=K.get_session())]
# Cut every hypothesis at its first -1 entry, if it has one.
lens = [x.index(-1) if -1 in x else len(x) for x in hypos]
hypos = [x[:lim] for x, lim in zip(hypos, lens)]
gts = [list(x) for x in Y[:N_EVAL]]
# Cut every reference at the end marker. NPHONES (= Y.max(), set by the
# training cell) replaces the literal 37 so the cell follows the data.
lens = [x.index(NPHONES) if NPHONES in x else len(x) for x in gts]
gts = [x[:lim] for x, lim in zip(gts, lens)]
[editdistance.eval(gt, hypo) / len(gt) for gt, hypo in zip(gts, hypos)]

[0.4714285714285714,
 0.5581395348837209,
 0.6451612903225806,
 0.6375,
 0.6115702479338843,
 0.648936170212766,
 0.6666666666666666,
 0.5333333333333333,
 0.5147058823529411,
 0.6608695652173913,
 0.5569620253164557,
 0.6111111111111112,
 0.6923076923076923,
 0.6617647058823529,
 0.5641025641025641,
 0.6470588235294118,
 0.5526315789473685,
 0.6219512195121951,
 0.550561797752809,
 0.6144578313253012,
 0.5,
 0.5930232558139535,
 0.4948453608247423,
 0.6078431372549019,
 0.5865384615384616,
 0.5148514851485149,
 0.6623376623376623,
 0.696969696969697,
 0.5555555555555556,
 0.6060606060606061,
 0.6067415730337079,
 0.6388888888888888,
 0.5185185185185185,
 0.6567164179104478,
 0.6875,
 0.6153846153846154,
 0.6987951807228916,
 0.4936708860759494,
 0.6744186046511628,
 0.7011494252873564,
 0.5555555555555556,
 0.6071428571428571,
 0.5903614457831325,
 0.6206896551724138,
 0.7252747252747253,
 0.6739130434782609,
 0.7183098591549296,
 0.5822784810126582,
 0.5945945945945946,
 0.5614035087

In [8]:
# Mean over the previous cell's output. `_` is IPython's last-result
# variable, so this only gives the intended number when run directly after
# the cell that produced the list of error rates.
np.array(_).mean()

0.6023905065160992

AttributeError: 'Conv1D' object has no attribute 'z'

In [10]:
# Persist the prediction model (feature input -> softmax output) to disk.
predict.save("exp1_stft.h5")