initial commit

1 parent 258bca7 commit 215667fac296a49407c4babddc87ea039d9b8fb7 mansour committed Nov 24, 2016
Showing with 307 additions and 0 deletions.
  1. +116 −0 data.py
  2. +108 −0 recognize.py
  3. +83 −0 train.py
data.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+import sugartensor as tf
+import numpy as np
+import pandas as pd
+import librosa
+import glob
+import os
+import string
+import itertools
+
+
+__author__ = 'buriburisuri@gmail.com'
+
+
+class VCTK(object):
+
+    def __init__(self, batch_size=16, data_path='asset/data/'):
+
+        @tf.sg_producer_func
+        def _load_mfcc(src_list):
+            lab, wav = src_list  # label, wave_file
+            # decode string to integer
+            lab = np.fromstring(lab, np.int)
+            # load wave file
+            wav, sr = librosa.load(wav, mono=True)
+            # mfcc
+            mfcc = librosa.feature.mfcc(wav, sr)
+            # return result
+            return lab, mfcc
+
+        # load corpus
+        labels, wave_files = self._load_corpus(data_path)
+
+        # to constant tensor
+        label = tf.convert_to_tensor(labels)
+        wave_file = tf.convert_to_tensor(wave_files)
+
+        # create queue from constant tensor
+        label, wave_file = tf.train.slice_input_producer([label, wave_file], shuffle=True)
+
+        # decode wave file
+        label, mfcc = _load_mfcc(source=[label, wave_file], dtypes=[tf.sg_intx, tf.sg_floatx],
+                                 capacity=128, num_threads=32)
+
+        # create batch queue with dynamic pad
+        batch_queue = tf.train.batch([label, mfcc], batch_size,
+                                     shapes=[(None,), (20, None)],
+                                     num_threads=32, capacity=batch_size*48,
+                                     dynamic_pad=True)
+
+        # split data
+        self.label, self.mfcc = batch_queue
+        # batch * time * dim
+        self.mfcc = self.mfcc.sg_transpose(perm=[0, 2, 1])
+
+        # calc total batch count
+        self.num_batch = len(labels) // batch_size
+
+        # print info
+        tf.sg_info('VCTK corpus loaded.(total data=%d, total batch=%d)' % (len(labels), self.num_batch))
+
+    def _load_corpus(self, data_path):
+
+        # read meta-info
+        df = pd.read_table(data_path + 'speaker-info.txt', usecols=['ID', 'AGE', 'GENDER', 'ACCENTS'],
+                           index_col=False, delim_whitespace=True)
+
+        # make file ID
+        file_ids = []
+        for d in [data_path + 'txt/p%d/' % uid for uid in df.ID.values]:
+            file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])
+
+        # make wave file list
+        wav_files = [data_path + 'wav48/%s/' % f[:4] + f + '.wav' for f in file_ids]
+
+        # exclude extremely short wave files
+        file_id, wav_file = [], []
+        for i, w in zip(file_ids, wav_files):
+            if os.stat(w).st_size > 240000:  # at least ~2.5 seconds at 48 kHz / 16-bit mono
+                file_id.append(i)
+                wav_file.append(w)
+
+        # read label sentence
+        sents = []
+        for f in file_id:
+            # remove punctuation, to lower, clean white space
+            s = ' '.join(open(data_path + 'txt/%s/' % f[:4] + f + '.txt').read()
+                         .translate(None, string.punctuation).lower().split())
+            # append byte code
+            sents.append([ord(ch) for ch in s])
+
+        # make vocabulary
+        self.index2byte = [0] + list(np.unique(list(itertools.chain(*sents))))  # add <EMP> token
+        self.byte2index = {}
+        for i, b in enumerate(self.index2byte):
+            self.byte2index[b] = i
+        self.voca_size = len(self.index2byte)
+        self.max_len = np.max([len(s) for s in sents])
+
+        # byte to index label
+        label = []
+        for s in sents:
+            # save as string for variable-length support.
+            label.append(np.asarray([self.byte2index[ch] for ch in s]).tostring())
+
+        return label, wav_file
+
+    def print_index(self, indices):
+        for i, index in enumerate(indices):
+            str_ = ''
+            for ch in index:
+                if ch > 0:
+                    str_ += unichr(self.index2byte[ch])
+                elif ch == 0:  # <EMP> (zero padding)
+                    break
+            print str_
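
Below is a minimal sketch of how the queue tensors built above can be consumed on their own. It assumes sugartensor's tf.sg_queue_context helper starts and stops the QueueRunner threads (an assumption; only tf.sg_init appears elsewhere in this commit):

# minimal sketch: pull one zero-padded batch from the VCTK pipeline
import sugartensor as tf
from data import VCTK

data = VCTK(batch_size=16)

with tf.Session() as sess:
    tf.sg_init(sess)                 # initialize graph variables
    with tf.sg_queue_context(sess):  # assumed helper: start input queue threads
        label, mfcc = sess.run([data.label, data.mfcc])
        print label.shape            # (16, max_label_len), zero-padded label indices
        print mfcc.shape             # (16, max_time, 20), zero-padded MFCC features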
recognize.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+import sugartensor as tf
+import numpy as np
+import librosa
+from data import VCTK
+
+
+__author__ = 'buriburisuri@gmail.com'
+
+
+# set log level to debug
+tf.sg_verbosity(10)
+
+# command line argument
+tf.sg_arg_def(file=('', 'speech wave file to recognize.'))
+
+#
+# hyper parameters
+#
+
+batch_size = 1 # batch size
+num_blocks = 3 # dilated blocks
+num_dim = 128 # latent dimension
+
+#
+# inputs
+#
+
+# VCTK corpus input tensor ( with QueueRunner )
+data = VCTK()
+
+# vocabulary size
+voca_size = data.voca_size
+
+# mfcc feature of audio
+x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))
+
+# sequence length except zero-padding
+seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)
+
+
+#
+# encode graph ( atrous convolution )
+#
+
+# residual block
+def res_block(tensor, size, rate, dim=num_dim):
+
+    # filter convolution
+    conv_filter = tensor.sg_aconv1d(size=size, rate=rate, act='tanh', bn=True)
+
+    # gate convolution
+    conv_gate = tensor.sg_aconv1d(size=size, rate=rate, act='sigmoid', bn=True)
+
+    # output by gate multiplying
+    out = conv_filter * conv_gate
+
+    # final output
+    out = out.sg_conv1d(size=1, dim=dim, act='tanh', bn=True)
+
+    # residual and skip output
+    return out + tensor, out
+
+# expand dimension
+z = x.sg_conv1d(size=1, dim=num_dim, act='tanh', bn=True)
+
+# dilated conv block loop
+skip = 0 # skip connections
+for i in range(num_blocks):
+    for r in [1, 2, 4, 8, 16]:
+        z, s = res_block(z, size=7, rate=r)
+        skip += s
+
+# final logit layers
+logit = (skip
+         .sg_conv1d(size=1, act='tanh', bn=True)
+         .sg_conv1d(size=1, dim=voca_size))
+
+# ctc decoding
+decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len)
+
+# to dense tensor
+y = tf.sparse_to_dense(decoded[0].indices, decoded[0].shape, decoded[0].values) + 1
+
+#
+# recognize wave file
+#
+
+# load wave file
+wav, sr = librosa.load(tf.sg_arg().file, mono=True)
+# get mfcc feature
+mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, sr), axis=0), [0, 2, 1])
+
+# run network
+with tf.Session() as sess:
+
+    # init variables
+    tf.sg_init(sess)
+
+    # restore parameters
+    saver = tf.train.Saver()
+    saver.restore(sess, tf.train.latest_checkpoint('asset/train/ckpt'))
+
+    # run session
+    label = sess.run(y, feed_dict={x: mfcc})
+
+    # print label
+    data.print_index(label)
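
Usage sketch: sg_arg_def(file=...) registers a --file command-line flag, so once train.py below has written a checkpoint under asset/train/ckpt, recognizing a single utterance should look roughly like this (the wave path is hypothetical):

python recognize.py --file asset/data/wav48/p225/p225_001.wav

Note that recognize.py still constructs the full VCTK input queue even though features are fed through the placeholder x; the instance is only used to recover voca_size and the byte table for print_index.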
train.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+import sugartensor as tf
+from data import VCTK
+
+
+__author__ = 'buriburisuri@gmail.com'
+
+
+# set log level to debug
+tf.sg_verbosity(10)
+
+
+#
+# hyper parameters
+#
+
+batch_size = 16 # batch size
+num_blocks = 3 # dilated blocks
+num_dim = 128 # latent dimension
+
+#
+# inputs
+#
+
+# VCTK corpus input tensor ( with QueueRunner )
+data = VCTK(batch_size=batch_size)
+
+# vocabulary size
+voca_size = data.voca_size
+
+# mfcc feature of audio
+x = data.mfcc
+
+# sequence length except zero-padding
+seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)
+
+# target sentence label
+y = data.label
+
+
+#
+# encode graph ( atrous convolution )
+#
+
+# residual block
+def res_block(tensor, size, rate, dim=num_dim):
+
+    # filter convolution
+    conv_filter = tensor.sg_aconv1d(size=size, rate=rate, act='tanh', bn=True)
+
+    # gate convolution
+    conv_gate = tensor.sg_aconv1d(size=size, rate=rate, act='sigmoid', bn=True)
+
+    # output by gate multiplying
+    out = conv_filter * conv_gate
+
+    # final output
+    out = out.sg_conv1d(size=1, dim=dim, act='tanh', bn=True)
+
+    # residual and skip output
+    return out + tensor, out
+
+# expand dimension
+z = x.sg_conv1d(size=1, dim=num_dim, act='tanh', bn=True)
+
+# dilated conv block loop
+skip = 0 # skip connections
+for i in range(num_blocks):
+    for r in [1, 2, 4, 8, 16]:
+        z, s = res_block(z, size=7, rate=r)
+        skip += s
+
+# final logit layers
+logit = (skip
+         .sg_conv1d(size=1, act='tanh', bn=True)
+         .sg_conv1d(size=1, dim=voca_size))
+
+# CTC loss
+loss = logit.sg_ctc(target=y, seq_len=seq_len)
+
+# train
+tf.sg_train(log_interval=30, lr=0.0001, loss=loss,
+            ep_size=data.num_batch, max_ep=20, early_stop=False)
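
As a sanity check on the hyperparameters, the receptive field of the dilated stack can be worked out from the code above. The filter and gate convolutions inside a block act on the same input in parallel, so each res_block with kernel size 7 and rate r widens the context by (7 - 1)*r frames, and the rates 1, 2, 4, 8, 16 repeat over num_blocks = 3:

# back-of-the-envelope receptive field of the dilated stack
size, rates, num_blocks = 7, [1, 2, 4, 8, 16], 3
rf = 1 + num_blocks * sum((size - 1) * r for r in rates)
print rf  # 559 MFCC frames per output step

At librosa's defaults (22,050 Hz sampling, 512-sample hop) those 559 frames span roughly 13 seconds of audio, comfortably more than the ~2.5-second minimum the data loader enforces.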
