In [103]:
import tensorflow as tf
import numpy as np
import time
import math
import os
import sys

# sys.path.append('.')
sys.path.append('..')
from utils.timit_dataset import TimitDataset, load_data
os.environ["CUDA_VISIBLE_DEVICES"]='0'

In [72]:
# Hyperparameters shared by the model definition and the training loop.
hps_list = dict(
    num_epochs=200,      # number of passes of the outer training loop
    lr=5e-3,             # learning rate handed to the optimizer
    grad_clip=5,         # global-norm threshold used for gradient clipping
    num_hidden=128,      # units per recurrent cell
    num_features=39,     # size of each input feature frame
    num_classes=61 + 1,  # presumably 61 phoneme labels + 1 CTC blank -- TODO confirm
    num_rnn_layers=4,    # number of stacked recurrent layers
    batch_size=32,
    drop_prob=0.2,       # dropout probability; the model keeps 1 - drop_prob
)


def hparas(hps_list):
    """Expose a hyperparameter mapping through attribute access.

    Args:
        hps_list: dict mapping hyperparameter names to their values.

    Returns:
        A plain object on which every key of ``hps_list`` is set as an
        attribute, so values can be read as ``hps.name``.
    """
    class Hparas(object):
        pass

    hps = Hparas()
    for name, value in hps_list.items():
        setattr(hps, name, value)
    return hps


hps = hparas(hps_list)

In [137]:
class DeepSpeech2:
    """Convolution -> stacked GRU -> linear projection model trained with CTC.

    Written against the TensorFlow 1.x graph API. Typical usage is
    ``DeepSpeech2(sess, hps)`` followed by ``build_graph(...)``, after which
    ``train`` / ``test`` / ``save`` / ``restore`` can be called.
    """

    def __init__(self, sess, hps):
        """Copy the hyperparameters the graph needs and remember the session.

        Args:
            sess: session used to run, save and restore the graph.
            hps: object exposing ``num_features``, ``num_classes``,
                ``num_hidden``, ``num_rnn_layers``, ``drop_prob``, ``lr``,
                ``grad_clip`` and ``batch_size`` as attributes.
        """
        self.num_features = hps.num_features
        self.num_classes = hps.num_classes
        self.num_hidden = hps.num_hidden
        self.num_rnn_layers = hps.num_rnn_layers
        self.keep_prob = 1 - hps.drop_prob
        self.lr = hps.lr
        self.grad_clip = hps.grad_clip
        self.batch_size = hps.batch_size
        self.sess = sess

    def build_graph(self, is_training=False):
        """Build placeholders, network, CTC loss, decoder and train op.

        Variables created by this build are initialized in ``self.sess`` and a
        ``tf.train.Saver`` is stored in ``self.saver``.

        Args:
            is_training: enables dropout and batch statistics in batch
                normalization, and selects the greedy decoder; otherwise the
                beam-search decoder is used.
        """
        # Snapshot what already exists so that a second build on the same
        # graph/session (e.g. an evaluation model sharing weights) only
        # touches the update ops and variables it creates itself.
        num_prior_update_ops = len(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        num_prior_variables = len(tf.global_variables())

        with tf.variable_scope('Input'):
            # Shape = [batch_size, time_step, num_features]
            self.inputs = tf.placeholder(tf.float32, shape=[None, None, self.num_features])
            # Sparse representation is required for ctc_loss
            self.targets = tf.sparse_placeholder(tf.int32)
            self.seq_len = tf.placeholder(tf.int32, [None])
            batch_size = tf.shape(self.inputs)[0]
            # Shape = [batch_size, num_features, time_step, 1].
            # The axes must be swapped with a transpose: a plain reshape would
            # reinterpret the memory layout and mix features across frames.
            self.Xrs = tf.expand_dims(tf.transpose(self.inputs, (0, 2, 1)), axis=-1)

        with tf.variable_scope('Convolution_layer'):
            # (variable name, [height, width, in_channel, out_channel], strides)
            conv_specs = [
                ('filter_1', (41, 11, 1, 32), [1, 2, 1, 1]),
                ('filter_2', (21, 11, 32, 32), [1, 2, 1, 1]),
                ('filter_3', (21, 11, 32, 96), [1, 2, 1, 1]),
            ]
            feature_stride = 1
            time_stride = 1
            conv_out = self.Xrs
            for filter_name, filter_shape, strides in conv_specs:
                kernel = tf.get_variable(filter_name, shape=filter_shape,
                                         initializer=tf.truncated_normal_initializer(stddev=0.02),
                                         dtype=tf.float32)
                conv_out = tf.nn.conv2d(conv_out, kernel, strides, padding='SAME')
                conv_out = tf.layers.batch_normalization(conv_out, training=is_training)
                conv_out = tf.contrib.layers.dropout(conv_out, keep_prob=self.keep_prob,
                                                     is_training=is_training)
                feature_stride *= strides[1]
                time_stride *= strides[2]
            # conv_out has shape [batch_size, height, width, channels]
            out_channels = conv_specs[-1][1][3]

        with tf.variable_scope('Recurrent_layer'):
            # Sequence lengths after the convolution strides along the time axis.
            seq_len_shrinked = tf.to_int32(
                tf.ceil(tf.to_float(self.seq_len) / time_stride))
            # int() because math.ceil returns a float on Python 2.
            num_features_shrinked = int(math.ceil(float(self.num_features) / feature_stride))
            # Shape = [batch_size, time_steps_shrinked, num_features_shrinked, channel]
            rnn_input = tf.transpose(conv_out, (0, 2, 1, 3))
            # Shape = [batch_size, time_steps_shrinked, num_features_shrinked * channel]
            rnn_input = tf.reshape(
                rnn_input, shape=[batch_size, -1, out_channels * num_features_shrinked])
            for i in range(self.num_rnn_layers):
                cell = tf.contrib.rnn.GRUCell(self.num_hidden)
                rnn_output, _ = tf.nn.dynamic_rnn(
                    cell=cell,
                    inputs=rnn_input,
                    sequence_length=seq_len_shrinked,
                    dtype=tf.float32,
                    time_major=False,
                    scope='RNN-%d' % i
                )
                rnn_output = tf.layers.batch_normalization(rnn_output, training=is_training)
                rnn_output = tf.contrib.layers.dropout(rnn_output, keep_prob=self.keep_prob,
                                                       is_training=is_training)
                # output with shape [batch_size, time_steps_shrinked, num_hidden]
                rnn_input = rnn_output

        with tf.variable_scope('Projection_layer'):
            flatten = tf.reshape(rnn_output, shape=[-1, self.num_hidden])
            W_proj = tf.get_variable(name='W_proj', shape=([self.num_hidden, self.num_classes]),
                                     initializer=tf.truncated_normal_initializer(stddev=0.02),
                                     dtype=tf.float32)
            b_proj = tf.get_variable(name='b_proj', shape=([self.num_classes]),
                                     initializer=tf.constant_initializer(value=0.0),
                                     dtype=tf.float32)

            logits = tf.matmul(flatten, W_proj) + b_proj
            # Reshaping back to the original shape
            logits = tf.reshape(logits, [batch_size, -1, self.num_classes])
            # Time major
            self.logits = tf.transpose(logits, (1, 0, 2))

        with tf.variable_scope('Loss'):
            # Use the post-convolution lengths so the loss stays correct if a
            # time stride other than 1 is ever configured above.
            self.loss = tf.nn.ctc_loss(self.targets, self.logits, seq_len_shrinked)
            self.cost = tf.reduce_mean(self.loss)

        with tf.variable_scope('Prediction'):
            if is_training:
                self.decoded, log_prob = tf.nn.ctc_greedy_decoder(self.logits, seq_len_shrinked)
            else:
                self.decoded, log_prob = tf.nn.ctc_beam_search_decoder(self.logits, seq_len_shrinked)

            # Inaccuracy: Phoneme Error Rate (PER)
            self.per = tf.reduce_mean(tf.edit_distance(
                tf.to_int32(self.decoded[0]), self.targets, normalize=True))
            # Prediction
            self.pred = tf.sparse_tensor_to_dense(self.decoded[0], default_value=-1)

        with tf.variable_scope('Optimizer'):
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            self.var_trainable = tf.trainable_variables()
            # Gradient clipping
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.cost, self.var_trainable), self.grad_clip)
            optimizer = tf.train.AdamOptimizer(self.lr, epsilon=1e-3)
            # Batch normalization registers its moving-average updates in
            # UPDATE_OPS; they only run if the train op depends on them.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)[num_prior_update_ops:]
            with tf.control_dependencies(update_ops):
                self.opt = optimizer.apply_gradients(
                    zip(grads, self.var_trainable), global_step=self.global_step)

        # Initialize only what this build created, so weights that are shared
        # with an earlier build (and possibly trained/restored) are not reset.
        new_variables = tf.global_variables()[num_prior_variables:]
        self.sess.run(tf.variables_initializer(new_variables))
        self.saver = tf.train.Saver(max_to_keep=20)

    def train(self, data, ep, ckpt_dir='./ckpt_model', log=True, load_idx=None, re_train=False):
        """Train for one epoch and save a checkpoint named after ``ep``.

        Args:
            data: dataset exposing ``num_samples``, ``data[i] -> (X, y)``,
                ``padding(X) -> (inputs, seq_len)`` and
                ``to_sparse_tuple(y, dtype=...)``.
            ep: epoch index; used in log lines and as the checkpoint index.
            ckpt_dir: directory checkpoints are restored from and saved to.
            log: print one line per batch when true.
            load_idx: checkpoint index to restore; ``None`` means the latest.
            re_train: when true, skip restoring and train from the weights
                currently held by the session.

        Returns:
            Mean PER over the batches processed in this epoch (0.0 if the
            dataset is smaller than one batch).
        """
        num_train_example = data.num_samples
        num_batch_per_epoch = num_train_example // self.batch_size

        if not re_train:
            # restore() itself falls back to the latest checkpoint for None.
            self.restore(ckpt_dir, idx=load_idx)

        log_fmt = "{}:{}/{}, train_cost = {:.3f}, train_per = {:.3f}, time = {:.3f}"
        per_sum = 0.0
        for batch in range(num_batch_per_epoch):
            start = time.time()
            batch_X, batch_y = data[batch]
            batch_train_inputs, batch_train_seq_len = data.padding(batch_X)
            batch_train_targets = data.to_sparse_tuple(batch_y, dtype=np.int64)
            feed = {self.inputs: batch_train_inputs,
                    self.targets: batch_train_targets,
                    self.seq_len: batch_train_seq_len}

            # Fetch cost, PER and the train op in one run instead of paying
            # for a second forward pass just to read the PER.
            batch_cost, batch_per, _ = self.sess.run([self.cost, self.per, self.opt], feed)
            per_sum += batch_per
            if log:
                print(log_fmt.format(ep, batch, num_batch_per_epoch, batch_cost, batch_per,
                                     time.time() - start))
        # Average over the batches actually seen; the remainder samples that
        # do not fill a batch are never fed and must not enter the denominator.
        train_per = per_sum / num_batch_per_epoch if num_batch_per_epoch else 0.0
        print('[!] epoch {}: train_per = {:.3f}'.format(ep, train_per))
        self.save(ckpt_dir=ckpt_dir, idx=ep)
        return train_per

    def test(self, data, num=5):
        """Evaluate PER on the first ``num`` samples of ``data``.

        Args:
            data: dataset supporting ``data[:num] -> (X, y)``, ``padding`` and
                ``to_sparse_tuple``.
            num: number of leading samples to evaluate.

        Returns:
            The PER computed over those samples (also printed).
        """
        start = time.time()
        X, y = data[:num]
        inputs_pad, seq_len = data.padding(X)
        targets_sparse = data.to_sparse_tuple(y, dtype=np.int64)
        feed = {self.inputs: inputs_pad,
                self.seq_len: seq_len,
                self.targets: targets_sparse}
        per = self.sess.run(self.per, feed_dict=feed)
        print('[!] PER: %f, time: %.3f' % (per, time.time() - start))
        return per

    def save(self, ckpt_dir='./ckpt_model', idx=0):
        """Save the session's variables to ``<ckpt_dir>/model-<idx>.ckpt``."""
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)
        self.saver.save(self.sess, os.path.join(
            ckpt_dir, 'model-%d.ckpt' % idx))

    def restore(self, ckpt_dir='./ckpt_model', idx=None):
        """Restore variables from a checkpoint in ``ckpt_dir``.

        Args:
            ckpt_dir: directory holding the checkpoints.
            idx: checkpoint index to load; ``None`` loads the latest one.
                Index 0 is a valid explicit choice.

        Returns:
            True if a checkpoint was restored, False if ``idx`` is None and no
            checkpoint exists in ``ckpt_dir``.
        """
        if idx is not None:
            self.saver.restore(self.sess,
                               os.path.join(ckpt_dir, 'model-%d.ckpt' % idx))
            return True

        latest_ckpt = tf.train.latest_checkpoint(ckpt_dir)
        if latest_ckpt:
            self.saver.restore(self.sess, latest_ckpt)
            return True
        return False
    

In [138]:
# Create a hyperparameter object
hps = hparas(hps_list)

feat_path = '../data.pkl'
X_tr, y_tr, X_val, y_val, X_te, y_te = load_data(feat_path)

data_tr = TimitDataset(X_tr, y_tr, batch_size=hps.batch_size)
data_val = TimitDataset(X_val, y_val)
data_te = TimitDataset(X_te, y_te)

# Used for .ipynb: start from an empty default graph so re-running this cell
# does not pile up on the previous run's graph.
tf.reset_default_graph()

sess = tf.Session()
try:
    # Both models are built under one AUTO_REUSE scope on the same session;
    # the test model is built in inference mode.
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        model_tr = DeepSpeech2(sess, hps=hps)
        model_tr.build_graph(is_training=True)
        model_te = DeepSpeech2(sess, hps=hps)
        # Build the *test* model here; building model_tr a second time would
        # leave model_te without a graph and replace the training graph.
        model_te.build_graph(is_training=False)

    # If you don't want to re-train a model, set it to False
    re_train = True
    for ep in range(hps.num_epochs):
        model_tr.train(data_tr, ep, ckpt_dir='./ckpt_model', log=True, load_idx=None, re_train=re_train)
        # From the second epoch on, continue from the latest saved checkpoint.
        re_train = False
        # See testing PER after ep > 20
        if (ep > 20) and (ep % 5 == 0):
            model_te.test(data_te, num=len(data_te))
        data_tr.shuffle()
finally:
    # Release the session even when training is interrupted from the notebook.
    sess.close()

0:0/129, train_cost = 1165.295, train_per = 3.082, time = 22.568


KeyboardInterrupt: 