In [1]:
import numpy as np
import tensorflow as tf
import os
import tqdm
import glob
import sys
import matplotlib.pyplot as plt
import pickle
import multiprocessing
import itertools
import random
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
with open("train_files.pickle", "rb") as f:
    files_train = pickle.load(f)
with open("val_files.pickle", "rb") as f:
    files_val = pickle.load(f)

In [4]:
labels_train = []
phonemes_train = []
mfcc_train = []
video_train = []
for file in files_train:
    labels_train.append(np.load(file)["labels"])
    phonemes_train.append(np.load(file)["phonemes"])
    mfcc_train.append(np.load(file)["mfcc"])
    video_train.append(np.load(file)["video"])

In [5]:
labels_val = []
phonemes_val = []
mfcc_val = []
video_val = []
for file in tqdm.tqdm(files_val):
    labels_val.append(np.load(file)["labels"])
    phonemes_val.append(np.load(file)["phonemes"])
    mfcc_val.append(np.load(file)["mfcc"])
    video_val.append(np.load(file)["video"])

100%|██████████| 937/937 [00:07<00:00, 133.52it/s]


In [29]:
# def conv_layer(prev_layer, filter_width, stride, input_channels, output_channels,
#                layer_id, use_relu=True, use_batchnorn=False, is_training=None):
#     with tf.variable_scope('conv_{}'.format(layer_id)):
#         convolution_out = tf.layers.conv1d(prev_layer, output_channels, 
#                                            kernel_size=filter_width, strides=stride, activation=None)
#         if use_batchnorn:
#             if is_training is None:
#                 raise Exception("is_training placeholder required")
#             convolution_out = tf.layers.batch_normalization(convolution_out, training=is_training)
#         if use_relu:
#             convolution_out = tf.nn.relu(convolution_out, name='activation')
#         return convolution_out, output_channels

In [34]:
from tensorflow.contrib.layers import xavier_initializer


In [38]:
def conv_layer(prev_layer, filter_width, stride, input_channels, output_channels,
               layer_id, use_relu=True, use_leaky_relu=False, use_batchnorn=False, is_training=None):
    with tf.variable_scope('conv_{}'.format(layer_id)):
        filters = tf.get_variable('filters', shape=[filter_width, input_channels, output_channels],
                                  dtype=tf.float32, initializer=xavier_initializer())
        bias = tf.Variable(tf.constant(0.0, shape=[output_channels]), name='bias')
        convolution_out = tf.nn.conv1d(prev_layer, filters, stride, 'SAME', use_cudnn_on_gpu=True, name='convolution')
        convolution_out = tf.nn.bias_add(convolution_out, bias)
        if use_batchnorn:
            if is_training is None:
                raise Exception("is_training placeholder required")
            convolution_out = tf.layers.batch_normalization(convolution_out, training=is_training)
        if use_relu:
            if not use_leaky_relu:
                activations = tf.nn.relu(convolution_out, name='activation')
            else:
                activations = tflearn.activations.leaky_relu(convolution_out, alpha=0.01, name="activation")
            return activations, output_channels
        else:
            return convolution_out, output_channels

In [82]:
class SpeechDNNModel:
    def __init__(self, num_features=13, num_symbols=23 + 1, optimizer="adam", use_batchnorm=True):
        tf.reset_default_graph()
        self.num_features = num_features
        self.num_symbols = num_symbols
        self.epoch = 0
        self.step = 0
        self.min_dev_loss = np.float('inf')
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        self.optimizer_type = optimizer
        self.use_batchnorm = use_batchnorm
        self._build_graph()

    def _create_placeholders(self):
        self.features_placeholder = tf.placeholder(tf.float32, [None, None, self.num_features], name="features")
        self.features_len_placeholder = tf.placeholder(tf.int32, [None], name="features_len")
        self.target_placeholder = tf.placeholder(tf.int32, [None, None], name="targets")
        self.target_len_placeholder = tf.placeholder(tf.int32, [None], name="targets_len")

        self.learning_rate = tf.placeholder_with_default(1e-4, [], name="learning_rate")
        self.is_training = tf.placeholder_with_default(False, [], name="is_training")

        # transform target to sparse
        target_ind = tf.where(tf.not_equal(tf.sequence_mask(self.target_len_placeholder), False))
        target_val = tf.gather_nd(self.target_placeholder, target_ind)
        self.target_sparse = tf.SparseTensor(target_ind, target_val,
                                             tf.cast(tf.shape(self.target_placeholder), dtype=tf.int64))

    def _inference(self):
        # define architecture
        outputs, channels = conv_layer(self.features_placeholder,
                                       filter_width=16, stride=1,
                                       input_channels=self.num_features, output_channels=256,
                                       layer_id=1,
                                       use_batchnorn=self.use_batchnorm,
                                       is_training=self.is_training)

        # 7 layers without striding of output size [batch_size, max_time / 2, 250]
        for layer_id in range(2):
            outputs, channels = conv_layer(outputs, 4, 1, channels, channels, layer_id + 2,
                                           use_batchnorn=self.use_batchnorm,
                                           is_training=self.is_training)

        # 1 layer with high kernel width and output size [batch_size, max_time / 2, 2000]
        outputs, channels = conv_layer(outputs, 8, 1, channels, channels * 4, layer_id=10,
                                       use_batchnorn=self.use_batchnorm,
                                       is_training=self.is_training)

        # 1 fully connected layer of output size [batch_size, max_time / 2, 2000]
        outputs, channels = conv_layer(outputs, 1, 1, channels, channels, layer_id=11,
                                       use_batchnorn=self.use_batchnorm,
                                       is_training=self.is_training)

        # 1 fully connected layer of output size [batch_size, max_time / 2, num_classes]
        outputs, channels = conv_layer(outputs, 1, 1, channels, self.num_symbols, 12, use_relu=False,
                                       use_batchnorn=False)

        logits_batch_major = outputs

        self.logits = tf.transpose(logits_batch_major, [1, 0, 2])

    def _create_loss(self):
        # predictions = tf.nn.ctc_beam_search_decoder(logits_time_major, mfcc_len_placeholder // 2, beam_width=20)
        self.predictions, _ = tf.nn.ctc_greedy_decoder(self.logits,
                                                       self.features_len_placeholder)  # predictions - sparse tensor!

        self.LER = tf.edit_distance(self.predictions[0], tf.cast(self.target_sparse, tf.int64))
        with tf.name_scope('training'):
#             cost = tf.nn.ctc_loss(self.target_sparse, self.logits, self.features_len_placeholder, time_major=True)
            cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target_placeholder)
            corrected_cost = tf.minimum(cost, 400.0)
            self.loss = tf.reduce_mean(corrected_cost, name='average_loss')

    def _create_optimizer(self):
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif self.optimizer_type == "yellowfin":
            raise ValueError("not now")
#             self.optimizer = YFOptimizer(self.yellowfin_lr)
#             print("using YellowFin with lr {:f}".format(self.yellowfin_lr))
        else:
            raise ValueError("incorrect optimizer")

        gvs = self.optimizer.compute_gradients(self.loss)
        gradients, trainables = zip(*gvs)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, 5.0, name='clip_gradients')
        # batch normalization in tensorflow requires this extra dependency
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            self.train_step = self.optimizer.apply_gradients(zip(clipped_gradients, trainables),
                                                             global_step=self.global_step)
    
    def _build_graph(self):
        self._create_placeholders()
        self._inference()
        self._create_loss()
        self._network_vars = tf.global_variables()  # without optimizer
        self._create_optimizer()
        self.saver = tf.train.Saver()
        # self._create_summary()

    def save_weights(self, sess):
        self.saver.save(sess, '/home/artbataev/Documents/checkpoints/dnn_mfcc/adam_checkpoints_best/ckpt',
                        self.step)

    def restore_weights(self, sess, only_network=False):
        ckpt = tf.train.get_checkpoint_state(
            '/home/artbataev/Documents/checkpoints/dnn_mfcc/adam_checkpoints_best/')
        if ckpt and ckpt.model_checkpoint_path:
            if only_network:
                saver = tf.train.Saver(self._network_vars)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                self.saver.restore(sess, ckpt.model_checkpoint_path)

    def init_op(self, sess, restore=True, restore_only_network=False):
        # sess.run(tf.global_variables_initializer())
        if restore:
            sess.run(tf.global_variables_initializer())
            self.restore_weights(sess, only_network=restore_only_network)
        else:
            sess.run(tf.global_variables_initializer())

    def test_net(self, batch_generator, sess, verbose=False):
        last = False
        full_len = 0
        loss = 0.0
        LER = 0.0
        WER = 0.0
        while not last:
            texts, texts_len, mfcc, mfcc_len, last = next(batch_generator)
            feed_dict = {
                self.features_placeholder: mfcc,
                self.features_len_placeholder: mfcc_len,
                self.target_placeholder: texts,
                self.target_len_placeholder: texts_len
            }

            if verbose:
                current_loss, predicted, current_LER = sess.run([self.loss, self.predictions, self.LER],
                                                                feed_dict=feed_dict)
            else:
                current_loss, current_LER = sess.run([self.loss, self.LER], feed_dict=feed_dict)

            loss += current_loss * len(texts)
            LER += np.sum(current_LER)
            full_len += len(texts)
            if verbose:
                orig_sentence = batch_generator.get_encoder().decode(texts[0][:texts_len[0]])
                pred_sentences_compact = batch_generator.get_encoder().decode_texts_sparse(predicted[0].indices,
                                                                                           predicted[0].values,
                                                                                           predicted[0].dense_shape)
                distance = current_LER[0]
                print("Average loss: {:.5f}, Average LER: {:.3f}% LER: {:.2f}%".format(current_loss,
                                                                                       np.mean(current_LER) * 100,
                                                                                       distance * 100))
                print("Original: ", orig_sentence)
                print("--Predicted: ", pred_sentences_compact[0])
                print("-" * 50)
                for i, text in enumerate(texts):
                    orig = batch_generator.get_encoder().decode(text[:texts_len[i]])
                    WER += editdistance.eval(orig.split(), pred_sentences_compact[i].split()) / len(orig.split())
        LER /= full_len
        WER /= full_len
        loss /= full_len
        if verbose:
            print("Average test loss: {:.5f}".format(loss))
            print("Average test LER: {:.2f}%".format(LER * 100))
            print("Average test WER: {:.2f}%".format(WER * 100))
        return loss, LER

    def run_step(self, batch_generator, sess, lr=1e-4):
        texts, texts_len, mfcc, mfcc_len, last = next(batch_generator)
        feed_dict = {
            self.features_placeholder: mfcc,
            self.features_len_placeholder: mfcc_len,
            self.target_placeholder: texts,
            self.target_len_placeholder: texts_len,
            self.is_training: True,
            self.learning_rate: lr,
        }
        current_loss, current_LER, _ = sess.run([self.loss, self.LER, self.train_step], feed_dict=feed_dict)
        self.step += 1
        if last:
            self.epoch += 1
            print("Epoch {}, end of dataset".format(self.epoch))
        return current_loss, current_LER

    def fit(self, batch_generator, dev_generator, sess, steps=1, lr=1e-4):
        try:
            train_loss = 0.0
            train_LER = 0.0
            for _ in range(steps):
                current_loss, current_LER = self.run_step(batch_generator, sess, lr)
                train_loss += current_loss
                train_LER += np.mean(current_LER)

                if self.step % 10 == 0:
                    train_loss /= 10
                    train_LER /= 10
                    print("Epoch {} step {} average_loss: {:.5f} LER: {:.2f}%".format(
                        self.epoch, self.step, train_loss, train_LER * 100))
                else:
                    print(".", end="")

#                 if self.step % 100 == 0 and train_loss < 350:
#                     epoch_test_loss, epoch_LER = self.test_net(dev_generator, sess)
#                     improved_text = "improved" if epoch_test_loss < self.min_dev_loss else "not improved"
#                     print("Step {} test_loss: {:.5f} LER: {:.2f}%".format(self.step, epoch_test_loss,
#                                                                           np.mean(epoch_LER) * 100), improved_text)
#                     if epoch_test_loss < self.min_dev_loss:
#                         self.min_dev_loss = epoch_test_loss
                        self.save_weights(sess)

                if self.step % 10 == 0:
                    train_loss = 0.0
                    train_LER = 0.0

                if np.isnan(current_loss) or np.isinf(current_loss) or (current_loss == 400.0 and self.step > 200):
                    print("step", self.step, current_loss)
                    print("=" * 20)
                    print("Big loss, restoring weights")
                    print("=" * 20)
                    try:
                        self.restore_weights(sess)
                    except:
                        print("can't restore, initialize")
                        self.init_op(sess, restore=False)


        except KeyboardInterrupt:
            print("Training Interrupted")

IndentationError: unexpected indent (<ipython-input-82-e73c856cbca2>, line 213)

In [None]:
dnn = SpeechDNNModel(optimizer="adam", use_batchnorm=True)
sess = tf.Session()
dnn.init_op(sess, restore=False)

In [83]:
batch_size = 64

In [84]:
def train_generator():
    global batch_size
    while True:
        indices = np.random.choice(len(labels_train), size=batch_size, replace=False)
        texts_len = np.array(list(map(lambda x: len(x), (phonemes_train[i] for i in indices))))
        mfcc_len = np.array(list(map(lambda x: len(x), (mfcc_train[i] for i in indices))))
        texts = np.zeros((batch_size, np.max(texts_len)), dtype=np.int)
        mfcc = np.zeros((batch_size, np.max(mfcc_len), 13))
        #         texts, texts_len, mfcc, mfcc_len, last
        for i in range(batch_size):
            texts[i, :len(phonemes_train[indices[i]])] = phonemes_train[indices[i]]
            mfcc[i, :len(mfcc_train[indices[i]]), :] = mfcc_train[indices[i]]
        yield texts, texts_len, mfcc, mfcc_len, False

In [85]:
t = train_generator()

In [86]:
texts, texts_len, mfcc, mfcc_len, last = next(t)

In [87]:
# texts, texts_len, mfcc, mfcc_len, last = next(t)

In [None]:
dnn.fit(t, t, sess, steps=1000)

.........Epoch 0 step 110 average_loss: 1.68494 LER: 99.72%
.........Epoch 0 step 120 average_loss: 1.64274 LER: 99.72%
.........Epoch 0 step 130 average_loss: 1.59395 LER: 99.71%
.........Epoch 0 step 140 average_loss: 1.76933 LER: 99.71%
.........Epoch 0 step 150 average_loss: 1.65256 LER: 99.71%
.........Epoch 0 step 160 average_loss: 1.70048 LER: 99.72%
.........Epoch 0 step 170 average_loss: 1.68945 LER: 99.72%
.........Epoch 0 step 180 average_loss: 1.59162 LER: 99.72%
.........Epoch 0 step 190 average_loss: 1.59568 LER: 99.72%
.........Epoch 0 step 200 average_loss: 1.57573 LER: 99.71%
