In speech processing we may use really huge networks cause majority of the processing bottleneck is on CPU and running the CTC function...

Also, CLARIN mobile corpus seems to be too small for advanced network

In [3]:
#!/usr/bin/python3

# For demonstration purposes - Paweł Tomasik
# for CLARIN_MOBILE - generally it is unnormalized

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from keras.models import Model
from keras.layers import LSTM, Conv1D, Dropout, LeakyReLU, Dense, Input, Lambda, TimeDistributed, Flatten, Conv2D, BatchNormalization, GRU, Bidirectional
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.initializers import Orthogonal
from keras.callbacks import Callback

from keras.backend import ctc_batch_cost, expand_dims
import keras.backend as K

import editdistance  # For digit error rate
import keras
import librosa
import numpy as np
import os

from fwk import dataset, stage, acoustic

dset = dataset.Dataset()
dset.get_from("datasets/clarin-long/data")
dset.select_first(9000)
am = acoustic.MappingGenerator([
    stage.Window(512, 512),
    stage.LogPowerFourier()
]).get(dset)
dset.generate(am, ["clean", "transcripts"])

  7%|▋         | 946/13810 [00:00<00:01, 9447.08it/s]

Getting dataset lengths


100%|██████████| 13810/13810 [00:01<00:00, 9596.20it/s]
  0%|          | 21/9000 [00:00<00:44, 202.15it/s]

Getting clean recordings


100%|██████████| 9000/9000 [01:41<00:00, 88.52it/s] 
 13%|█▎        | 1152/9000 [00:00<00:00, 11515.70it/s]

Getting list of phones


100%|██████████| 9000/9000 [00:01<00:00, 8735.02it/s] 
834it [00:00, 8334.50it/s]

Getting transcriptions


9000it [00:01, 5783.14it/s]


In [5]:
X = dset.clean
Y = dset.transcriptions

MEAN = X.mean()
STD = X.std()

valid = np.random.random(X.shape[0]) > 0.9

In [9]:
mc = keras.callbacks.ModelCheckpoint('models/mfcc-ctc-{epoch:08d}-bigger.h5', 
                                     save_weights_only=False, period=5)

from keras.constraints import max_norm

class MyCallback(keras.callbacks.Callback):
    def __init__(self, fname):
        self.fname = fname
        super(MyCallback, self).__init__()
        
    def on_epoch_end(self, epoch, logs={}):
        content = "Epoch: {}, loss: {}\n".format(epoch, logs.get('loss'))
        with open(self.fname, "a") as f:
            f.write(content)

class StopOnConvergence(Callback):
    def __init__(self, max_repetitions=10):
        super().__init__()
        self.max_repetitions = max_repetitions

    def on_train_begin(self, logs=None):
        self.repetitions = 0
        self.last_loss = np.inf

    def on_epoch_end(self, batch, logs=None):
        logs = logs or {}
        loss = logs.get('val_loss')
        if loss is not None:
            if loss > self.last_loss:
                self.repetitions += 1
            else:
                self.last_loss = loss
                self.repetitions = 0
            if self.repetitions > self.max_repetitions:
                self.model.stop_training = True

            
logger = MyCallback("./training-bigger-log.txt")
NFEATS = 20

def mk_model(max_label_length):
    feature_input = Input(shape = (None, NFEATS))
    layer = Lambda(lambda x: (x - MEAN) / STD)(feature_input)
    layer = Lambda(K.expand_dims)(layer)
    layer_1 = Conv2D(12, [5,1], activation='linear', strides=(1,1), kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    layer_1 = Conv2D(16, [5,1], activation='linear', strides=(1,1), kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    layer = TimeDistributed(Flatten())(layer)
    layer_1 = Conv1D(512, 5, activation='linear', kernel_initializer=Orthogonal(), padding='same')(layer)
    layer_2 = LeakyReLU(0.01)(layer_1)
    layer = BatchNormalization()(layer_2)
    for i in range(7):
        layer = Bidirectional(GRU(256, return_sequences = True, recurrent_dropout=0.01, kernel_initializer=Orthogonal(), activation='linear'))(layer)
        layer = LeakyReLU(0.01)(layer)
        layer = BatchNormalization()(layer)
    layer = Dense(1024)(layer)
    layer = LeakyReLU(0.01)(layer)
    layer = BatchNormalization()(layer)
    layer_15 = Dense(NPHONES + 1, activation = 'softmax')(layer)
    label_input = Input(shape = (max_label_length,))
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    loss_lambda = Lambda(lambda args:ctc_batch_cost(*args), output_shape=(1,), name='ctc')([label_input, layer_15, input_length, label_length])
    model = Model([feature_input, label_input, input_length, label_length], [loss_lambda])
    model.summary()
    predictive = Model(feature_input, layer_15)
    return model, predictive

def train(model, trainX, trainy, trainXl, trainyl, epochs = 50):
    # important: batch_size=1 bugs Tensorflow
    optimizer = Adam(0.0003, clipnorm=1.)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    return model.fit([trainX[~valid], trainy[~valid], trainXl[~valid], trainyl[~valid]], np.zeros(trainX[~valid].shape[0]), epochs = epochs,
                     batch_size = 32, callbacks=[mc, logger, StopOnConvergence(4)], validation_data=[[trainX[valid], trainy[valid], trainXl[valid], trainyl[valid]], np.zeros(trainX[valid].shape[0])])

def validate(predictions, valid_length, groundtruth, target_length):
    predictions = keras.backend.ctc_decode(predictions, valid_length, False, 1)
    predictions = predictions[0][0].eval(session=keras.backend.get_session())
    DERs = []
    for index in range(predictions.shape[0]):
        dist = float(editdistance.eval(
            [x for x in predictions[index, :] if x != -1],
            [x for x in groundtruth[index, :] if x != NPHONES]))
        DER = dist / target_length[index]
        DERs.append((DER, target_length[index]))
    return DERs

def try_else(exp, exp_else):
    try:
        return exp()
    except:
        return exp_else

if __name__=='__main__':
    data = X, Y
    NPHONES = Y.max()
    NFEATS = data[0].shape[2]
    X_lens = np.array([try_else(
            (lambda:np.where((x).mean(1) == (x).std(1))[0][0]),
            X.shape[1])
        for x in X])
    X_lens = np.ceil(X_lens / 4.0)
    Y_lens = np.array([np.where(x == NPHONES)[0] for x in data[1]])
    Y_lens = np.array([x[0] if len(x) else 0 for x in Y_lens])
    print(X.shape, Y.shape, X_lens.shape, Y_lens.shape)
    data = data[0][np.where(Y_lens)], data[1][np.where(Y_lens)], X_lens[np.where(Y_lens)].reshape(-1, 1), Y_lens[np.where(Y_lens)].reshape(-1, 1)
    print(list(map(lambda x:x.shape, data)))
    trn, predict = mk_model(data[1].shape[1])
    train(trn, *data, epochs=100) # at 300 it makes sensible predictions

(9000, 497, 257) (9000, 299) (9000,) (9000,)
[(9000, 497, 257), (9000, 299), (9000, 1), (9000, 1)]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None, 257)    0                                            
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, None, 257)    0           input_3[0][0]                    
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, None, 257, 1) 0           lambda_3[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, None, 257, 12 72          lambda_4[0][0]                   
__________

Train on 8119 samples, validate on 881 samples
Epoch 1/100


InvalidArgumentError: Not enough time for target transition sequence (required: 82, available: 73)0You can turn this error into a warning by using the flag ignore_longer_outputs_than_inputs
	 [[node ctc_1/CTCLoss (defined at /venv/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:3950)  = CTCLoss[_class=["loc:@training_1/Adam/gradients/ctc_1/CTCLoss_grad/mul"], ctc_merge_repeated=true, ignore_longer_outputs_than_inputs=false, preprocess_collapse_repeated=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ctc_1/Log/_1969, ctc_1/ToInt64/_1971, ctc_1/ToInt32_2/_1973, ctc_1/ToInt32_1/_1975)]]

Caused by op 'ctc_1/CTCLoss', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/venv/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/venv/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/venv/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/venv/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 345, in run_forever
    self._run_once()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 1312, in _run_once
    handle._run()
  File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
    self._callback(*self._args)
  File "/venv/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "/venv/lib/python3.5/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/venv/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/venv/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/venv/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/venv/lib/python3.5/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/venv/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/venv/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/venv/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/venv/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/venv/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/venv/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/venv/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/venv/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-cb7555a70aca>", line 111, in <module>
    trn, predict = mk_model(data[1].shape[1])
  File "<ipython-input-9-cb7555a70aca>", line 66, in mk_model
    loss_lambda = Lambda(lambda args:ctc_batch_cost(*args), output_shape=(1,), name='ctc')([label_input, layer_15, input_length, label_length])
  File "/venv/lib/python3.5/site-packages/keras/engine/topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "/venv/lib/python3.5/site-packages/keras/layers/core.py", line 663, in call
    return self.function(inputs, **arguments)
  File "<ipython-input-9-cb7555a70aca>", line 66, in <lambda>
    loss_lambda = Lambda(lambda args:ctc_batch_cost(*args), output_shape=(1,), name='ctc')([label_input, layer_15, input_length, label_length])
  File "/venv/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 3950, in ctc_batch_cost
    sequence_length=input_length), 1)
  File "/venv/lib/python3.5/site-packages/tensorflow/python/ops/ctc_ops.py", line 158, in ctc_loss
    ignore_longer_outputs_than_inputs=ignore_longer_outputs_than_inputs)
  File "/venv/lib/python3.5/site-packages/tensorflow/python/ops/gen_ctc_ops.py", line 286, in ctc_loss
    name=name)
  File "/venv/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/venv/lib/python3.5/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Not enough time for target transition sequence (required: 82, available: 73)0You can turn this error into a warning by using the flag ignore_longer_outputs_than_inputs
	 [[node ctc_1/CTCLoss (defined at /venv/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:3950)  = CTCLoss[_class=["loc:@training_1/Adam/gradients/ctc_1/CTCLoss_grad/mul"], ctc_merge_repeated=true, ignore_longer_outputs_than_inputs=false, preprocess_collapse_repeated=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ctc_1/Log/_1969, ctc_1/ToInt64/_1971, ctc_1/ToInt32_2/_1973, ctc_1/ToInt32_1/_1975)]]


In [8]:
print(predict.layers[2].get_weights())
print(np.exp(2 * np.pi * 1j / 512))

[array([0.99565756, 0.01837571], dtype=float32), array([0.9999578 , 0.00477572], dtype=float32)]
(0.9999247018391445+0.012271538285719925j)


In [9]:
pr = predict.predict(X[:1])
pr = K.ctc_decode(pr, X_lens[:1])[0][0]
pr.eval(session=K.get_session())

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


array([[18, 20, 35, 20, 13, 20, 17, 22,  6, 20, 35, 22, 13, 35, 20, 21,
        20, 25, 29, 20, 24, 20, 22, 35, 27, 20, 35, 22,  6, 27, 22, 18,
        20, 13,  6, 20, 13, 20, 33, 20,  6,  6, 20, 11, 22, 13, 22, 17,
        22, 35, 20, 18]])

In [None]:
Y[:1]

In [5]:
import editdistance
editdistance.eval(list(pr.eval(session=K.get_session())[0]), list(Y[0, :Y_lens[0]])) / Y_lens[0]

1.1857142857142857

In [6]:
pr = predict.predict(X[:160])
pr = K.ctc_decode(pr, X_lens[:160])[0][0]
hypos = [list(x) for x in pr.eval(session=K.get_session())]
lens = [x.index(-1) if -1 in x else len(x) for x in hypos]
hypos = [x[:lim] for x, lim in zip(hypos, lens)]
gts = [list(x) for x in Y[:160]]
lens = [x.index(37) if 37 in x else len(x) for x in gts]
gts = [x[:lim] for x, lim in zip(gts, lens)]
[editdistance.eval(gt, hypo) / len(gt) for gt, hypo in zip(gts, hypos)]

[0.3,
 0.23255813953488372,
 0.45161290322580644,
 0.275,
 0.4297520661157025,
 0.2127659574468085,
 0.24074074074074073,
 0.17777777777777778,
 0.25,
 0.23478260869565218,
 0.24050632911392406,
 0.19444444444444445,
 0.16923076923076924,
 0.5441176470588235,
 0.3333333333333333,
 0.4852941176470588,
 0.27631578947368424,
 0.2926829268292683,
 0.19101123595505617,
 0.3253012048192771,
 0.16071428571428573,
 0.18604651162790697,
 0.1958762886597938,
 0.29411764705882354,
 0.23076923076923078,
 0.3564356435643564,
 0.19480519480519481,
 0.25757575757575757,
 0.2777777777777778,
 0.21212121212121213,
 0.1797752808988764,
 0.2222222222222222,
 0.20987654320987653,
 0.3283582089552239,
 0.28125,
 0.24358974358974358,
 0.4939759036144578,
 0.24050632911392406,
 0.23255813953488372,
 0.19540229885057472,
 0.19753086419753085,
 0.19047619047619047,
 0.13253012048192772,
 0.367816091954023,
 0.24175824175824176,
 0.30434782608695654,
 0.43661971830985913,
 0.27848101265822783,
 0.20270270270270

In [7]:
np.array(_).mean()

0.2719325210045632