In [1]:
import numpy as np
import tensorflow as tf
import os
import tqdm
import glob
import sys
import matplotlib.pyplot as plt
import pickle
import multiprocessing
import itertools
import random
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Load the pickled collections of training and validation files; each entry is
# later passed to np.load() and os.path.basename().
# NOTE(review): pickle.load can execute arbitrary code stored in the file - only
# load pickles that were produced by a trusted source.
with open("train_files.pickle", "rb") as f:
    files_train = pickle.load(f)
with open("val_files.pickle", "rb") as f:
    files_val = pickle.load(f)

In [4]:
# Per-file containers for the training split; position i in every list
# corresponds to files_train[i].
labels_train = []
phonemes_train = []
mfcc_train = []
fbanks_train = []
video_features_train = []
# video_train = []  # raw "video" array loading is disabled
for file in tqdm.tqdm(files_train):
    with np.load(file) as data:
        cur_labels = data["labels"]
        cur_phonemes = data["phonemes"]
        cur_mfcc = data["mfcc"]
        # Standardize along axis 0 using this file's own statistics (mean 0, std 1).
        # NOTE(review): a column whose std is 0 would yield NaN/inf here - confirm
        # this cannot happen in the data.
        cur_mfcc = (cur_mfcc - cur_mfcc.mean(axis=0))/cur_mfcc.std(axis=0)
        cur_fbanks = data["fbanks"]
        # Same per-file standardization for the "fbanks" array.
        cur_fbanks = (cur_fbanks - cur_fbanks.mean(axis=0)) / cur_fbanks.std(axis=0)
    #     cur_video = np.load(file)["video"]  # disabled together with video_train
    labels_train.append(cur_labels)
    phonemes_train.append(cur_phonemes)
    mfcc_train.append(cur_mfcc)
    fbanks_train.append(cur_fbanks)
    
    # The "video_features" array lives in a separate .npz with the same base name.
    with np.load(os.path.join("../data/lip_reading/synchronized/video_features/train/", os.path.basename(file))) as data2:
        cur_video_features = data2["video_features"]
    video_features_train.append(cur_video_features)
    #     video_train.append(cur_video)  # disabled together with video_train

100%|██████████| 8433/8433 [00:32<00:00, 256.06it/s]


In [6]:
# Per-file containers for the validation split; position i in every list
# corresponds to files_val[i].
labels_val = []
phonemes_val = []
mfcc_val = []
video_val = []  # never filled in this cell
video_features_val = []
fbanks_val = []
for file in tqdm.tqdm(files_val):
    with np.load(file) as data:
        cur_labels = data["labels"]
        cur_phonemes = data["phonemes"]
        cur_mfcc = data["mfcc"]
        # Standardize along axis 0 using this file's own statistics (mean 0, std 1).
        # NOTE(review): a column whose std is 0 would yield NaN/inf here - confirm
        # this cannot happen in the data.
        cur_mfcc = (cur_mfcc - cur_mfcc.mean(axis=0))/cur_mfcc.std(axis=0)
        cur_fbanks = data["fbanks"]
        # Same per-file standardization for the "fbanks" array.
        cur_fbanks = (cur_fbanks - cur_fbanks.mean(axis=0)) / cur_fbanks.std(axis=0)
    labels_val.append(cur_labels)
    phonemes_val.append(cur_phonemes)
    mfcc_val.append(cur_mfcc)
    fbanks_val.append(cur_fbanks)
    
    # NOTE(review): validation video features are read from the ".../train/"
    # directory - presumably the validation files are a held-out part of the
    # training set; confirm this is intended.
    with np.load(os.path.join("../data/lip_reading/synchronized/video_features/train/", os.path.basename(file))) as data2:
        cur_video_features = data2["video_features"]
    video_features_val.append(cur_video_features)

100%|██████████| 937/937 [00:03<00:00, 301.63it/s]


In [9]:
def conv_layer(prev_layer, filter_width, num_filters, strides=1, use_relu=True, use_batchnorn=True, is_training=None):
    """Append a 1-D convolution (optionally followed by batch-norm and ReLU) to the graph.

    Args:
        prev_layer: input tensor passed to ``tf.layers.conv1d``.
        filter_width: kernel size of the convolution.
        num_filters: number of output channels.
        strides: convolution stride.
        use_relu: apply ``tf.nn.relu`` to the result when True.
        use_batchnorn: apply ``tf.layers.batch_normalization`` after the
            convolution when True (the parameter name keeps its historical
            spelling because callers pass it by keyword).
        is_training: value forwarded as ``training=`` to batch normalization;
            required whenever ``use_batchnorn`` is True.

    Returns:
        The output tensor of the last applied operation.

    Raises:
        ValueError: if batch-norm is requested but ``is_training`` is None.
    """
    # Validate before touching the graph so that a bad call does not leave an
    # orphan convolution (and its variables) behind.
    if use_batchnorn and is_training is None:
        raise ValueError("is_training placeholder required")

    convolution_out = tf.layers.conv1d(prev_layer, num_filters, filter_width, strides=strides, padding="same",
                                       activation=None)
    if use_batchnorn:
        convolution_out = tf.layers.batch_normalization(convolution_out, training=is_training)
    if use_relu:
        convolution_out = tf.nn.relu(convolution_out)
    return convolution_out

In [185]:
class CtcFullDNNModel:
    """1-D convolutional network over concatenated audio and video features, trained with CTC.

    Written against the TensorFlow 1.x graph API. Constructing an instance
    resets the default graph and builds placeholders, the network, the CTC loss
    with a beam-search decoder, and the optimizer.

    Args:
        num_features: width of the audio part of each input frame.
        num_symbols: number of output classes of the final layer (the CTC ops
            treat the last class as the blank).
        optimizer: optimizer name; only "adam" is implemented.
        use_batchnorm: insert batch normalization on the input and after the
            hidden convolutions.
        folder: sub-folder of ``checkpoints_root`` used for checkpoints.
        num_video_features: width of the video part of each input frame.
        checkpoints_root: directory under which checkpoint folders live.
    """

    # The network contains three stride-2 convolutions; input lengths are
    # divided by 2 * 2 * 2 before being handed to the CTC ops.
    _TIME_REDUCTION = 8
    # fit() prints a running average every _LOG_EVERY steps and evaluates on the
    # dev generator every _EVAL_EVERY steps.
    _LOG_EVERY = 10
    _EVAL_EVERY = 20
    # Dev evaluation is skipped while the averaged training loss is above this.
    _EVAL_MAX_TRAIN_LOSS = 350

    def __init__(self, num_features=13, num_symbols=23+1, optimizer="adam", use_batchnorm=True, folder="dnn_mfcc",
                 num_video_features=512, checkpoints_root="/home/artbataev/Documents/checkpoints"):
        tf.reset_default_graph()
        self.num_features = num_features
        self.num_video_features = num_video_features
        self.num_symbols = num_symbols
        self.checkpoints_folder = folder
        self.checkpoints_root = checkpoints_root
        self.epoch = 0
        self.step = 0
        # Built-in float: the np.float alias was removed in NumPy 1.24.
        self.min_dev_loss = float('inf')
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        self.optimizer_type = optimizer
        self.use_batchnorm = use_batchnorm
        self._build_graph()

    def _checkpoint_dir(self):
        """Return the directory holding this model's best checkpoints."""
        return os.path.join(self.checkpoints_root, self.checkpoints_folder, "adam_checkpoints_best")

    def _create_placeholders(self):
        """Create the input placeholders and the sparse form of the targets."""
        feature_dim = self.num_features + self.num_video_features
        self.features_placeholder = tf.placeholder(tf.float32, [None, None, feature_dim], name="features")
        self.features_len_placeholder = tf.placeholder(tf.int32, [None], name="features_len")
        self.target_placeholder = tf.placeholder(tf.int32, [None, None], name="targets")
        self.target_len_placeholder = tf.placeholder(tf.int32, [None], name="targets_len")

        self.learning_rate = tf.placeholder_with_default(1e-4, [], name="learning_rate")
        self.is_training = tf.placeholder_with_default(False, [], name="is_training")

        # Transform the padded dense targets to a SparseTensor: keep only the
        # positions that fall inside each sequence's length.
        target_ind = tf.where(tf.sequence_mask(self.target_len_placeholder))
        target_val = tf.gather_nd(self.target_placeholder, target_ind)
        self.target_sparse = tf.SparseTensor(target_ind, target_val,
                                             tf.cast(tf.shape(self.target_placeholder), dtype=tf.int64))

    def _inference(self):
        """Build the convolutional stack and expose batch-major and time-major logits."""
        # NOTE: the order in which layers are created determines the
        # auto-generated variable names, so it must stay stable for existing
        # checkpoints to remain restorable.
        outputs = self.features_placeholder
        if self.use_batchnorm:
            outputs = tf.layers.batch_normalization(outputs, training=self.is_training)
        outputs = conv_layer(outputs, filter_width=48, strides=2, num_filters=256,
                             use_batchnorn=self.use_batchnorm,
                             is_training=self.is_training)

        for _ in range(7):
            outputs = conv_layer(outputs, filter_width=7, num_filters=256,
                                 use_batchnorn=self.use_batchnorm,
                                 is_training=self.is_training)

        outputs = conv_layer(outputs, filter_width=32, strides=2, num_filters=256 * 4,
                             use_batchnorn=self.use_batchnorm,
                             is_training=self.is_training)

        outputs = conv_layer(outputs, filter_width=1, strides=2, num_filters=256,
                             use_batchnorn=self.use_batchnorm,
                             is_training=self.is_training)

        # Per-frame class scores: no ReLU and no batch-norm on the output layer.
        outputs = conv_layer(outputs, 1, num_filters=self.num_symbols, use_relu=False,
                             use_batchnorn=False)

        self.logits = outputs
        self.logits_time_major = tf.transpose(outputs, [1, 0, 2])

    def _create_loss(self):
        """Create the beam-search decoder, the edit-distance metric and the CTC loss."""
        logits_len = self.features_len_placeholder // self._TIME_REDUCTION
        # Only the beam-search decoder is built; a greedy decoder assigned to the
        # same attribute just before it would be dead graph code.
        # NOTE(review): in TF 1.x ctc_beam_search_decoder defaults to
        # merge_repeated=True, which collapses consecutive identical labels in
        # the decoded output - confirm this is intended for sequences that can
        # contain repeated symbols.
        self.predictions, _ = tf.nn.ctc_beam_search_decoder(self.logits_time_major,
                                                            logits_len, beam_width=200)  # list of sparse tensors
        self.LER = tf.edit_distance(self.predictions[0], tf.cast(self.target_sparse, tf.int64))
        with tf.name_scope('training'):
            cost = tf.nn.ctc_loss(self.target_sparse, self.logits_time_major, logits_len, time_major=True)
            # Cap the per-example loss so that a few extreme examples do not dominate the mean.
            corrected_cost = tf.minimum(cost, 400.0)
            self.loss = tf.reduce_mean(corrected_cost, name='average_loss')

    def _create_optimizer(self):
        """Create the optimizer and a train op with global-norm gradient clipping.

        Raises:
            ValueError: for any optimizer name other than "adam".
        """
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif self.optimizer_type == "yellowfin":
            raise ValueError("not now")
        else:
            raise ValueError("incorrect optimizer")

        gvs = self.optimizer.compute_gradients(self.loss)
        gradients, trainables = zip(*gvs)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, 5.0, name='clip_gradients')
        # Batch normalization in TensorFlow requires this extra dependency so
        # that its moving statistics are updated on every training step.
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            self.train_step = self.optimizer.apply_gradients(zip(clipped_gradients, trainables),
                                                             global_step=self.global_step)

    def _build_graph(self):
        """Assemble the whole graph and create the saver."""
        self._create_placeholders()
        self._inference()
        self._create_loss()
        # Snapshot taken before the optimizer exists: network variables only.
        self._network_vars = tf.global_variables()
        self._create_optimizer()
        self.saver = tf.train.Saver()

    def save_weights(self, sess):
        """Save all variables to the checkpoint directory, tagged with the current step."""
        checkpoint_dir = self._checkpoint_dir()
        # Make sure the target directory exists before asking the saver to write.
        os.makedirs(checkpoint_dir, exist_ok=True)
        self.saver.save(sess, os.path.join(checkpoint_dir, "ckpt"), self.step)

    def restore_weights(self, sess, only_network=False):
        """Restore variables from the checkpoint directory.

        Args:
            sess: session to restore into.
            only_network: restore only the variables that existed before the
                optimizer was created (no optimizer slots).

        Returns:
            True if a checkpoint was found and restored, False otherwise.
        """
        checkpoint_dir = self._checkpoint_dir()
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if not (ckpt and ckpt.model_checkpoint_path):
            # Report instead of silently doing nothing.
            print("No checkpoint found in {}; nothing restored".format(checkpoint_dir))
            return False
        saver = tf.train.Saver(self._network_vars) if only_network else self.saver
        saver.restore(sess, ckpt.model_checkpoint_path)
        return True

    def init_op(self, sess, restore=True, restore_only_network=False):
        """Initialize all global variables and optionally restore a checkpoint on top."""
        sess.run(tf.global_variables_initializer())
        if restore:
            self.restore_weights(sess, only_network=restore_only_network)

    def get_predictions(self, sess, features, features_len):
        """Decode a single utterance.

        Args:
            sess: session to run in.
            features: array for one utterance; a leading batch axis is added here.
            features_len: value fed to the ``features_len`` placeholder
                (a length-1 sequence holding the number of frames).

        Returns:
            The first entry of the evaluated beam-search decoder output.
        """
        feed_dict = {
            self.features_placeholder: features[np.newaxis, ...],
            self.features_len_placeholder: features_len,
        }
        pred = sess.run(self.predictions, feed_dict=feed_dict)
        return pred[0]

    def test_net(self, batch_generator, sess, verbose=False):
        """Evaluate on batches from ``batch_generator`` until one is flagged as last.

        Evaluation starts from wherever the generator currently is, so a
        generator left mid-epoch covers only the remainder of that epoch.

        Args:
            batch_generator: yields ``(texts, texts_len, features, features_len, last)``.
            sess: session to run in.
            verbose: unused; kept for interface compatibility.

        Returns:
            ``(loss, LER)`` averaged over all evaluated examples.
        """
        last = False
        full_len = 0
        loss = 0.0
        LER = 0.0
        while not last:
            texts, texts_len, mfcc, mfcc_len, last = next(batch_generator)
            feed_dict = {
                self.features_placeholder: mfcc,
                self.features_len_placeholder: mfcc_len,
                self.target_placeholder: texts,
                self.target_len_placeholder: texts_len
            }

            current_loss, current_LER = sess.run([self.loss, self.LER], feed_dict=feed_dict)

            # self.loss is a batch mean; weight it by the batch size so the
            # final average is per example.
            loss += current_loss * len(texts)
            LER += np.sum(current_LER)
            full_len += len(texts)

        LER /= full_len
        loss /= full_len
        return loss, LER

    def run_step(self, batch_generator, sess, lr=1e-4):
        """Run one training step on the next batch.

        Returns:
            ``(loss, LER)`` for the batch; ``LER`` holds one value per example.
        """
        texts, texts_len, mfcc, mfcc_len, last = next(batch_generator)
        feed_dict = {
            self.features_placeholder: mfcc,
            self.features_len_placeholder: mfcc_len,
            self.target_placeholder: texts,
            self.target_len_placeholder: texts_len,
            self.is_training: True,
            self.learning_rate: lr,
        }
        current_loss, current_LER, _ = sess.run([self.loss, self.LER, self.train_step], feed_dict=feed_dict)
        self.step += 1
        if last:
            self.epoch += 1
            print("Epoch {}, end of dataset".format(self.epoch))
        return current_loss, current_LER

    def fit(self, batch_generator, dev_generator, sess, steps=1, lr=1e-4):
        """Train for ``steps`` steps with periodic logging, dev evaluation and checkpointing.

        A running average of the training loss / LER is printed whenever the
        step counter is a multiple of 10. On multiples of 20 (and only while
        that average loss is below 350) the model is evaluated on
        ``dev_generator`` and saved if the dev loss improved. Ctrl-C stops
        training gracefully.
        """
        window_loss = 0.0
        window_LER = 0.0
        window_steps = 0
        try:
            for _ in range(steps):
                current_loss, current_LER = self.run_step(batch_generator, sess, lr)
                window_loss += current_loss
                window_LER += np.mean(current_LER)
                window_steps += 1

                if self.step % self._LOG_EVERY != 0:
                    print(".", end="")
                    continue

                # Average over the steps actually accumulated; dividing by a
                # fixed 10 is wrong when fit() resumes in the middle of a window.
                train_loss = window_loss / window_steps
                train_LER = window_LER / window_steps
                print("Epoch {} step {} average_loss: {:.5f} LER: {:.2f}%".format(
                    self.epoch, self.step, train_loss, train_LER * 100))

                if self.step % self._EVAL_EVERY == 0 and train_loss < self._EVAL_MAX_TRAIN_LOSS:
                    epoch_test_loss, epoch_LER = self.test_net(dev_generator, sess)
                    improved = epoch_test_loss < self.min_dev_loss
                    improved_text = "improved" if improved else "not improved"
                    print("Step {} test_loss: {:.5f} LER: {:.2f}%".format(self.step, epoch_test_loss,
                                                                          np.mean(epoch_LER) * 100), improved_text)
                    if improved:
                        self.min_dev_loss = epoch_test_loss
                        self.save_weights(sess)

                window_loss = 0.0
                window_LER = 0.0
                window_steps = 0

        except KeyboardInterrupt:
            print("Training Interrupted")

In [186]:
# Mini-batch size; read as a global by train_generator() and val_generator().
batch_size = 256

In [187]:
def train_generator():
    """Yield shuffled, zero-padded training batches forever.

    Reads the module-level ``labels_train``, ``mfcc_train``,
    ``video_features_train`` and ``batch_size``. Every pass over the data uses a
    fresh random permutation.

    Yields:
        ``(texts, texts_len, features, mfcc_len, last)`` where ``texts`` and
        ``features`` are zero-padded to the longest item in the batch,
        ``features`` holds the MFCC and video features stacked along the last
        axis, and ``last`` is True for the final batch of a pass.

    Raises:
        ValueError: if there is no training data (raised on the first ``next``).
    """
    if len(labels_train) == 0:
        # An empty dataset would otherwise spin forever without yielding.
        raise ValueError("labels_train is empty")
    while True:
        num_examples = len(labels_train)
        order = np.random.choice(num_examples, num_examples, replace=False)
        for start in range(0, num_examples, batch_size):
            indices = order[start: start + batch_size]
            batch_labels = [labels_train[i] for i in indices]
            batch_audio = [mfcc_train[i] for i in indices]
            batch_video = [video_features_train[i] for i in indices]

            texts_len = np.array([len(item) for item in batch_labels])
            mfcc_len = np.array([len(item) for item in batch_audio])
            # Derive the frame width from the data instead of hard-coding it.
            feature_dim = batch_audio[0].shape[1] + batch_video[0].shape[1]

            # Built-in int: the np.int alias was removed in NumPy 1.24.
            texts = np.zeros((len(batch_labels), np.max(texts_len)), dtype=int)
            features = np.zeros((len(batch_labels), np.max(mfcc_len), feature_dim))
            for row, (labels, audio, video) in enumerate(zip(batch_labels, batch_audio, batch_video)):
                texts[row, :len(labels)] = labels
                features[row, :len(audio), :] = np.hstack((audio, video))
            yield texts, texts_len, features, mfcc_len, start + batch_size >= num_examples

In [188]:
def val_generator():
    """Yield zero-padded validation batches in file order, cycling forever.

    Reads the module-level ``labels_val``, ``mfcc_val``, ``video_features_val``
    and ``batch_size``.

    Yields:
        ``(texts, texts_len, features, mfcc_len, last)`` where ``texts`` and
        ``features`` are zero-padded to the longest item in the batch,
        ``features`` holds the MFCC and video features stacked along the last
        axis, and ``last`` is True for the final batch of a pass.

    Raises:
        ValueError: if there is no validation data (raised on the first ``next``).
    """
    if len(labels_val) == 0:
        # An empty dataset would otherwise spin forever without yielding.
        raise ValueError("labels_val is empty")
    while True:
        num_examples = len(labels_val)
        for start in range(0, num_examples, batch_size):
            stop = min(start + batch_size, num_examples)
            batch_labels = labels_val[start:stop]
            batch_audio = mfcc_val[start:stop]
            batch_video = video_features_val[start:stop]

            texts_len = np.array([len(item) for item in batch_labels])
            mfcc_len = np.array([len(item) for item in batch_audio])
            # Derive the frame width from the data instead of hard-coding it.
            feature_dim = batch_audio[0].shape[1] + batch_video[0].shape[1]

            # Built-in int: the np.int alias was removed in NumPy 1.24.
            texts = np.zeros((len(batch_labels), np.max(texts_len)), dtype=int)
            features = np.zeros((len(batch_labels), np.max(mfcc_len), feature_dim))
            for row, (labels, audio, video) in enumerate(zip(batch_labels, batch_audio, batch_video)):
                texts[row, :len(labels)] = labels
                features[row, :len(audio), :] = np.hstack((audio, video))
            yield texts, texts_len, features, mfcc_len, start + batch_size >= num_examples

In [189]:
# vgen = val_generator()

In [190]:
# Instantiate the (infinite) batch generators used for training and evaluation.
# Both are stateful: re-run this cell to restart them from the beginning.
tgen = train_generator()
vgen = val_generator()

In [191]:
# texts, texts_len, mfcc, mfcc_len, last = next(t)

In [193]:
# Build a fresh graph/model that checkpoints under the "dnn_onlymfcc_ctc" folder,
# open a session and initialize all variables from scratch (no checkpoint restore).
dnn = CtcFullDNNModel(optimizer="adam", use_batchnorm=True, folder="dnn_onlymfcc_ctc")
sess = tf.Session()
dnn.init_op(sess, restore=False)

In [194]:
# NOTE(review): the comment below says "MFCC only", but the generators in this
# notebook feed MFCC and video features stacked together - confirm which
# configuration actually produced the logged results.
dnn.fit(tgen, vgen, sess, steps=1000, lr=1e-4)  # training the model on MFCC only - results are worse than with MFCC+video

.........Epoch 0 step 10 average_loss: 56.72946 LER: 131.70%
.........Epoch 0 step 20 average_loss: 19.50456 LER: 88.12%
Step 20 test_loss: 148.93799 LER: 315.33% improved
.........Epoch 0 step 30 average_loss: 15.09070 LER: 72.07%
..Epoch 1, end of dataset
.......Epoch 1 step 40 average_loss: 8.57976 LER: 33.49%
Step 40 test_loss: 153.89334 LER: 298.80% not improved
.........Epoch 1 step 50 average_loss: 4.79055 LER: 18.00%
.........Epoch 1 step 60 average_loss: 3.13647 LER: 11.86%
Step 60 test_loss: 138.18154 LER: 280.67% improved
.....Epoch 2, end of dataset
....Epoch 2 step 70 average_loss: 2.26522 LER: 8.69%
.........Epoch 2 step 80 average_loss: 1.81067 LER: 7.25%
Step 80 test_loss: 72.70868 LER: 127.36% improved
.........Epoch 2 step 90 average_loss: 1.66715 LER: 6.60%
........Epoch 3, end of dataset
.Epoch 3 step 100 average_loss: 1.51048 LER: 5.94%
Step 100 test_loss: 36.18597 LER: 89.47% improved
.........Epoch 3 step 110 average_loss: 1.07615 LER: 3.52%
.........Epoch 3 step

In [195]:
# Re-initialize all variables, then load the checkpoint recorded in the model's
# checkpoint folder (if there is one).
dnn.init_op(sess, restore=True)

INFO:tensorflow:Restoring parameters from /home/artbataev/Documents/checkpoints/dnn_onlymfcc_ctc/adam_checkpoints_best/ckpt-380


In [165]:
# Rebuild the model (the constructor resets the default graph) and restore the
# checkpoint stored under the "dnn_full_ctc_best" folder.
# NOTE(review): `sess` is rebound here without closing any previously opened
# session - consider calling sess.close() first.
dnn = CtcFullDNNModel(optimizer="adam", use_batchnorm=True, folder="dnn_full_ctc_best")
sess = tf.Session()
dnn.init_op(sess, restore=True)

INFO:tensorflow:Restoring parameters from /home/artbataev/Documents/checkpoints/dnn_full_ctc_best/adam_checkpoints_best/ckpt-260


In [137]:
%%time
# test_net() returns (loss, LER); index 1 is the average of tf.edit_distance over
# the evaluated validation utterances, shown here as a percentage labelled "WER".
"WER {:.3f}%".format(dnn.test_net(vgen, sess)[1] * 100)

CPU times: user 5.64 s, sys: 864 ms, total: 6.51 s
Wall time: 7.36 s


'WER 2.967%'

In [196]:
# Russian words for the digits 0-9; run_decoding() indexes this list with the
# class ids produced by the decoder.
num2word = ["ноль", "один", "два", "три", "четыре", "пять", "шесть", "семь", "восемь", "девять"]

In [197]:
import FtrFile
test_mfcc = 'ark,t:../data/test_mfcc.txtftr' # features of the recordings
resName = 'decode_results_mfcc_ctc'            # file with the decoding results
testName = 'test_ref.txt'             # file with the text of the recordings

In [198]:
# Collect the names of (at most) the first 1000 entries of the test feature
# archive and accumulate their features.nSamples (presumably the frame count).
num_file = 0
numbFrame = 0

all_args = []
for filename, features in FtrFile.FtrDirectoryReader(test_mfcc):
    if num_file < 1000:
        num_file += 1
        numbFrame += features.nSamples
        all_args.append(filename)
    else: break    

In [199]:
# Directories read by run_decoding(): test_dir1 holds the .npz files with the
# "mfcc" array, test_dir2 the .npz files with the "video_features" array.
test_dir1 = "../data/lip_reading/synchronized/test/"
test_dir2 = "../data/lip_reading/synchronized/video_features/test/"

In [200]:
def run_decoding(filename):
    """Decode one test utterance and return it as a "<filename> <words>" line.

    Loads the "mfcc" array from ``test_dir1`` and the "video_features" array
    from ``test_dir2`` (both from ``<filename>.npz``), stacks them along the
    last axis, runs the model's decoder, and maps the decoded class ids to words
    via ``num2word``.

    NOTE(review): unlike the training/validation loading cells, the MFCC array
    is not standardized here - confirm the test .npz files are already
    normalized, otherwise the model sees differently scaled inputs.
    """
    # Use context managers so the .npz archives are closed after reading.
    with np.load(os.path.join(test_dir1, filename + ".npz")) as audio_npz:
        audio_feats = audio_npz["mfcc"]
    with np.load(os.path.join(test_dir2, filename + ".npz")) as video_npz:
        video_feats = video_npz["video_features"]

    features = np.hstack((audio_feats, video_feats))
    # The length placeholder expects a 1-D array with one entry per batch item.
    features_len = np.array(audio_feats.shape[0]).reshape(-1, )
    nums = dnn.get_predictions(sess, features, features_len).values
    words = " ".join(num2word[x] for x in nums)
    return filename + " " + words

In [201]:
# Decode every collected test utterance; each entry is a "<filename> <words>" line.
results = []
for filename in tqdm.tqdm(all_args):
    results.append(run_decoding(filename))

100%|██████████| 1000/1000 [00:11<00:00, 89.45it/s]


In [202]:
import wer

In [203]:
def computeWer(testName, resName):
    """Compute the WER with ``wer.computeWER(testName, resName)`` and print it under a banner."""
    error_rate = wer.computeWER(testName, resName)
    rule = "-" * 10
    print("\n" + rule + "RESULT OF RECOGNITION:" + rule * 2 + "\n")
    print("%WER is {}".format(error_rate))

In [204]:
# Write one decoded line per utterance to the results file.
with open(resName, 'w') as fn:
    for result in results:
        fn.write(result + "\n")

# Compute and print the WER of the results file against the reference file.
computeWer(testName, resName)


----------RESULT OF RECOGNITION:--------------------

%WER is 5.13
