In speech processing we can afford really large networks, because most of the processing bottleneck is on the CPU, in running the CTC function...

Also, the CLARIN mobile corpus seems to be too small for an advanced network.

In [1]:
#!/usr/bin/python3

# For demonstration purposes - Paweł Tomasik
# for CLARIN_MOBILE - generally it is unnormalized

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from keras.models import Model
from keras.layers import LSTM, Conv1D, Dropout, LeakyReLU, Dense, Input, Lambda, TimeDistributed, Flatten, Conv2D, BatchNormalization, GRU, Bidirectional
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.initializers import Orthogonal
from keras.callbacks import Callback

from keras.backend import ctc_batch_cost, expand_dims
import keras.backend as K

import editdistance  # For digit error rate
import keras
import librosa
import numpy as np
import os

import gc
# Drop any arrays left over from a previous run of this cell before
# allocating the new buffers, so both copies are never alive at once.
X = Y = None
gc.collect()

LENGTH = 1700      # second axis of the feature buffer (shorter shards are zero-padded)
RECS = 1384 * 4    # upper bound on the number of recordings over the 4 shards
TRANSL = 307       # second axis of the label buffer
SPLIT_SEED = 0     # fixed seed so the train/validation split is reproducible

X = np.zeros([RECS, LENGTH, 20], np.float32)
Y = np.zeros([RECS, TRANSL], np.int16)
counter = 0

# Copy every shard into the preallocated buffers; `counter` tracks the
# number of rows filled so far.
for i in range(4):
    Xpart = np.load("datasets/clarin-long/data/clarin-mfcc-rec-{}.npy".format(i))
    Ypart = np.load("datasets/clarin-long/data/clarin-mfcc-trans-{}.npy".format(i))
    recs = Xpart.shape[0]
    reclen = Xpart.shape[1]
    translen = Ypart.shape[1]
    X[counter : counter + recs, :reclen, :] = Xpart
    Y[counter : counter + recs, :translen] = Ypart
    counter += recs

print(counter, RECS)
# Round the number of rows down to a multiple of 32 (the batch_size that
# train() passes to model.fit).
counter //= 32
counter *= 32

X = X[:counter]
Y = Y[:counter]

# Global normalisation statistics, read by the first Lambda layer of the model.
# NOTE(review): computed over the padded buffer, validation rows included.
#X = np.clip(X, -3, 3)
MEAN = X.mean()
STD = X.std()

# Boolean mask selecting roughly 10% of the rows for validation. A private,
# seeded generator keeps the split identical across kernel restarts without
# touching the global NumPy random state.
valid = np.random.RandomState(SPLIT_SEED).random_sample(X.shape[0]) > 0.9

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


5523 5536


-10640

In [3]:
# Checkpoint callback: writes the whole model (not only the weights) every
# 5 epochs; the epoch number is substituted into the file name.
checkpoint_path = 'models/mfcc-ctc-{epoch:08d}-bigger.h5'
mc = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                     period=5,
                                     save_weights_only=False)

from keras.constraints import max_norm

class MyCallback(keras.callbacks.Callback):
    """Append the training loss of every finished epoch to a text file.

    Parameters
    ----------
    fname : str
        Path of the log file. It is opened in append mode, so repeated runs
        keep adding lines to the same file.
    """

    def __init__(self, fname):
        self.fname = fname
        super(MyCallback, self).__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Write one ``Epoch: <n>, loss: <value>`` line for this epoch.

        ``logs`` defaults to ``None`` instead of a shared mutable ``{}``;
        a missing ``'loss'`` entry is written as ``None``.
        """
        logs = logs or {}
        content = "Epoch: {}, loss: {}\n".format(epoch, logs.get('loss'))
        with open(self.fname, "a") as f:
            f.write(content)

class StopOnConvergence(Callback):
    """Stop training once ``val_loss`` has failed to improve too many times in a row.

    Parameters
    ----------
    max_repetitions : int
        Number of consecutive non-improving epochs that is still tolerated;
        training stops when the count exceeds this value.
    """

    def __init__(self, max_repetitions=10):
        super().__init__()
        self.max_repetitions = max_repetitions

    def on_train_begin(self, logs=None):
        """Reset the counter and the best loss seen so far."""
        self.repetitions = 0
        self.last_loss = np.inf

    def on_epoch_end(self, batch, logs=None):
        """Update the non-improvement counter from ``logs['val_loss']``.

        The first positional argument is the epoch index supplied by Keras;
        the parameter keeps its original name ``batch`` for compatibility.
        Epochs without a ``val_loss`` entry are ignored.
        """
        logs = logs or {}
        loss = logs.get('val_loss')
        if loss is None:
            return
        # A NaN/inf loss must count as "no improvement". Comparing with `>`
        # alone would store NaN as the best loss, after which no later loss
        # ever compares greater and the callback could never fire again.
        if not np.isfinite(loss) or loss > self.last_loss:
            self.repetitions += 1
        else:
            self.last_loss = loss
            self.repetitions = 0
        if self.repetitions > self.max_repetitions:
            self.model.stop_training = True

            
# Number of features per frame; reassigned from the data shape in the
# __main__ block before the model is built.
NFEATS = 20
# Per-epoch loss log (the callback opens the file in append mode).
logger = MyCallback("./training-bigger-log.txt")

def mk_model(max_label_length):
    """Build the CTC training model and the inference model sharing its layers.

    The network normalises the input with the module-level MEAN/STD, applies
    two stride-2 Conv2D blocks along the variable-length axis (so that axis is
    shortened by a factor of 4 overall), a Conv1D block, seven bidirectional
    GRU blocks and a dense block, and ends in a softmax over NPHONES + 1
    classes. Reads the module-level names NFEATS, MEAN, STD and NPHONES.

    Parameters
    ----------
    max_label_length : int
        Width of the (padded) label input.

    Returns
    -------
    (model, predictive)
        ``model`` takes [features, labels, input_length, label_length] and
        outputs the CTC cost; ``predictive`` maps features to the softmax
        output. ``model.summary()`` is printed as a side effect.
    """
    feature_input = Input(shape=(None, NFEATS))

    # Normalise, then append a channel axis so Conv2D can be applied.
    net = Lambda(lambda x: (x - MEAN) / STD)(feature_input)
    net = Lambda(K.expand_dims)(net)

    # Two convolutional blocks, each halving the first spatial axis.
    for n_filters in (12, 16):
        net = Conv2D(n_filters, [5, 1], strides=(2, 1), padding='same',
                     activation='linear', kernel_initializer=Orthogonal())(net)
        net = LeakyReLU(0.01)(net)
        net = BatchNormalization()(net)

    # Merge the remaining (feature, channel) axes into one vector per step.
    net = TimeDistributed(Flatten())(net)
    net = Conv1D(512, 5, padding='same', activation='linear',
                 kernel_initializer=Orthogonal())(net)
    net = LeakyReLU(0.01)(net)
    net = BatchNormalization()(net)

    # Recurrent stack.
    for _ in range(7):
        recurrent = GRU(256, return_sequences=True, recurrent_dropout=0.01,
                        kernel_initializer=Orthogonal(), activation='linear')
        net = Bidirectional(recurrent)(net)
        net = LeakyReLU(0.01)(net)
        net = BatchNormalization()(net)

    net = Dense(1024)(net)
    net = LeakyReLU(0.01)(net)
    net = BatchNormalization()(net)
    posteriors = Dense(NPHONES + 1, activation='softmax')(net)

    # Extra inputs needed only to evaluate the CTC cost during training.
    label_input = Input(shape=(max_label_length,))
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    ctc_inputs = [label_input, posteriors, input_length, label_length]
    ctc_cost = Lambda(lambda args: ctc_batch_cost(*args),
                      output_shape=(1,), name='ctc')(ctc_inputs)

    model = Model([feature_input, label_input, input_length, label_length], [ctc_cost])
    model.summary()
    predictive = Model(feature_input, posteriors)
    return model, predictive

def train(model, trainX, trainy, trainXl, trainyl, epochs = 50):
    """Compile and fit the CTC training model, holding out the rows marked in ``valid``.

    The split uses the module-level boolean mask ``valid``; it must have one
    entry per row of the arrays passed in. The model's output already is the
    CTC cost, so the compiled loss simply returns ``y_pred`` and the targets
    are dummy zeros.

    Parameters
    ----------
    model : keras Model returned first by ``mk_model``.
    trainX, trainy, trainXl, trainyl : arrays
        Features, labels, input lengths and label lengths, row-aligned.
    epochs : int
        Maximum number of epochs (early stopping may end training sooner).

    Returns
    -------
    The value returned by ``model.fit``.
    """
    # important: batch_size=1 bugs Tensorflow
    optimizer = Adam(0.0003, clipnorm=1.)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # Build each subset exactly once. Boolean indexing copies the selected
    # rows, so indexing the feature tensor again just to read its row count
    # would duplicate the whole training set in memory.
    train_mask = ~valid
    train_inputs = [trainX[train_mask], trainy[train_mask], trainXl[train_mask], trainyl[train_mask]]
    valid_inputs = [trainX[valid], trainy[valid], trainXl[valid], trainyl[valid]]
    train_targets = np.zeros(train_inputs[0].shape[0])
    valid_targets = np.zeros(valid_inputs[0].shape[0])

    return model.fit(train_inputs, train_targets, epochs = epochs,
                     batch_size = 32, callbacks=[mc, logger, StopOnConvergence(4)],
                     # Keras documents validation_data as a tuple (x_val, y_val).
                     validation_data=(valid_inputs, valid_targets))

def validate(predictions, valid_length, groundtruth, target_length):
    """Decode network outputs with CTC and score them against the labels.

    ``predictions`` is decoded with ``ctc_decode`` (non-greedy, beam width 1).
    For every decoded row the edit distance to the matching ground-truth row
    is computed after dropping ``-1`` entries from the hypothesis and
    ``NPHONES`` entries from the reference, then divided by the row's
    ``target_length``.

    Returns a list of ``(error_rate, target_length[i])`` pairs, one per row.
    """
    backend = keras.backend
    decoded = backend.ctc_decode(predictions, valid_length, False, 1)[0][0]
    decoded = decoded.eval(session=backend.get_session())

    results = []
    for index, hypothesis_row in enumerate(decoded):
        hypothesis = [label for label in hypothesis_row if label != -1]
        reference = [label for label in groundtruth[index, :] if label != NPHONES]
        distance = float(editdistance.eval(hypothesis, reference))
        results.append((distance / target_length[index], target_length[index]))
    return results

def try_else(exp, exp_else):
    """Return ``exp()``, or ``exp_else`` if calling ``exp`` raises an exception.

    Parameters
    ----------
    exp : callable taking no arguments.
    exp_else : value returned when ``exp`` fails.

    Only ``Exception`` subclasses are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so a long-running cell can still be interrupted.
    """
    try:
        return exp()
    except Exception:
        return exp_else

if __name__=='__main__':
    data = X, Y
    # The largest label value is used both as the padding marker in Y and as
    # the index of the extra (NPHONES + 1)-th output class.
    NPHONES = Y.max()
    NFEATS = data[0].shape[2]

    # Length of each recording: index of the first frame whose mean over the
    # features equals its standard deviation (presumably an all-zero padding
    # frame); recordings without such a frame use the full buffer length.
    X_lens = np.array([try_else(
            (lambda:np.where((x).mean(1) == (x).std(1))[0][0]),
            X.shape[1])
        for x in X])
    # The two stride-2 convolutions in mk_model shorten the sequence 4x.
    X_lens = np.ceil(X_lens / 4.0)

    # Length of each label row: position of the first NPHONES marker, or 0 if
    # the row has none. The per-row index arrays differ in length, so they are
    # kept in a plain list; wrapping them in np.array would build a ragged
    # array, which recent NumPy versions reject.
    pad_positions = [np.where(x == NPHONES)[0] for x in data[1]]
    Y_lens = np.array([x[0] if len(x) else 0 for x in pad_positions])
    print(X.shape, Y.shape, X_lens.shape, Y_lens.shape)

    # Keep only rows with a non-zero label length.
    # NOTE(review): train() indexes these arrays with the mask `valid`, whose
    # length is X.shape[0]; if any row is dropped here the lengths no longer
    # match.
    keep = np.where(Y_lens)
    data = data[0][keep], data[1][keep], X_lens[keep].reshape(-1, 1), Y_lens[keep].reshape(-1, 1)
    print(list(map(lambda x:x.shape, data)))
    trn, predict = mk_model(data[1].shape[1])
    train(trn, *data, epochs=100) # at 300 it makes sensible predictions

(5504, 1700, 20) (5504, 307) (5504,) (5504,)
[(5504, 1700, 20), (5504, 307), (5504, 1), (5504, 1)]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 20)     0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 20)     0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, None, 20, 1)  0           lambda_1[0][0]                   
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, None, 20, 12) 72          lambda_2[0][0]                   
__________

Train on 4955 samples, validate on 549 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [8]:
# Weights of the layer at index 2 of the inference model.
layer_weights = predict.layers[2].get_weights()
print(layer_weights)
# exp(2*pi*i / 512), a primitive 512th root of unity.
root_of_unity = np.exp(2 * np.pi * 1j / 512)
print(root_of_unity)

[array([0.99565756, 0.01837571], dtype=float32), array([0.9999578 , 0.00477572], dtype=float32)]
(0.9999247018391445+0.012271538285719925j)


In [9]:
# Run the inference model on the first recording and CTC-decode the result.
first_output = predict.predict(X[:1])
pr = K.ctc_decode(first_output, X_lens[:1])[0][0]
# Evaluate the decoded tensor in the backend session to display its values.
pr.eval(session=K.get_session())

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


array([[18, 20, 35, 20, 13, 20, 17, 22,  6, 20, 35, 22, 13, 35, 20, 21,
        20, 25, 29, 20, 24, 20, 22, 35, 27, 20, 35, 22,  6, 27, 22, 18,
        20, 13,  6, 20, 13, 20, 33, 20,  6,  6, 20, 11, 22, 13, 22, 17,
        22, 35, 20, 18]])

In [None]:
# Label row of the first recording (including its padding), shown as a 2-D slice.
Y[:1]

In [5]:
import editdistance
# Edit distance between the decoded first row of `pr` and the first label row
# cut at its length, divided by that length.
# NOTE(review): expects `pr` to be the decoded tensor from the single-recording cell.
hypothesis = list(pr.eval(session=K.get_session())[0])
reference = list(Y[0, :Y_lens[0]])
editdistance.eval(hypothesis, reference) / Y_lens[0]

1.1857142857142857

In [6]:
def _truncate_at(seq, marker):
    """Return `seq` up to, but not including, the first `marker`; all of it if absent."""
    cut = seq.index(marker) if marker in seq else len(seq)
    return seq[:cut]

N_EVAL = 160  # number of leading recordings to score

# Decode the first N_EVAL recordings.
pr = predict.predict(X[:N_EVAL])
pr = K.ctc_decode(pr, X_lens[:N_EVAL])[0][0]

# Decoded rows are padded with -1. Label rows are cut at the NPHONES marker,
# the same marker validate() and the Y_lens computation use, instead of a
# hard-coded label value.
hypos = [_truncate_at(list(x), -1) for x in pr.eval(session=K.get_session())]
gts = [_truncate_at(list(x), NPHONES) for x in Y[:N_EVAL]]

# Per-recording edit distance divided by the reference length.
error_rates = [editdistance.eval(gt, hypo) / len(gt) for gt, hypo in zip(gts, hypos)]
error_rates

[0.3,
 0.23255813953488372,
 0.45161290322580644,
 0.275,
 0.4297520661157025,
 0.2127659574468085,
 0.24074074074074073,
 0.17777777777777778,
 0.25,
 0.23478260869565218,
 0.24050632911392406,
 0.19444444444444445,
 0.16923076923076924,
 0.5441176470588235,
 0.3333333333333333,
 0.4852941176470588,
 0.27631578947368424,
 0.2926829268292683,
 0.19101123595505617,
 0.3253012048192771,
 0.16071428571428573,
 0.18604651162790697,
 0.1958762886597938,
 0.29411764705882354,
 0.23076923076923078,
 0.3564356435643564,
 0.19480519480519481,
 0.25757575757575757,
 0.2777777777777778,
 0.21212121212121213,
 0.1797752808988764,
 0.2222222222222222,
 0.20987654320987653,
 0.3283582089552239,
 0.28125,
 0.24358974358974358,
 0.4939759036144578,
 0.24050632911392406,
 0.23255813953488372,
 0.19540229885057472,
 0.19753086419753085,
 0.19047619047619047,
 0.13253012048192772,
 0.367816091954023,
 0.24175824175824176,
 0.30434782608695654,
 0.43661971830985913,
 0.27848101265822783,
 0.20270270270270

In [7]:
# Mean of the previous cell's displayed result. `_` is IPython's "last output"
# variable, so this only gives the intended number when run immediately after
# the cell that displays the per-recording error rates.
np.array(_).mean()

0.2719325210045632