In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np

def dense_to_sparse(dense, dtype=np.int32):
    indices = []
    values = []
    # dense: [batch_sz x time?], phonems

    for n, times in enumerate(dense):
        indices.extend(zip([n]*len(times), range(len(times))))
        values.extend(times)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(dense), np.asarray(indices).max(0)[1]+1], dtype=np.int64)

    return indices, values, shape

def pad_time(dense):
    # dense: [batch_sz x time? x mfcc_len]
    # treats it as [batch_sz x variable_len? x [...data...]]
    batch_size = len(dense)
    time_lens = np.asarray([len(times) for times in dense], dtype=np.int64)
    max_time = np.max(time_lens)

    # take the constant shape of inner data from first sample
    data_shape = np.asarray(dense[0]).shape[1:]

    # create an empty box with zeros to place parts of 'dense'
    padded = (np.ones((batch_size, max_time) + data_shape) * 0.0).astype(np.float32)

    # fill in rows of the box with time x data
    for idx, times in enumerate(dense):
        assert(times.shape[1:] == data_shape)
        padded[idx, :len(times)] = times

    # return padded and the lengths array
    return padded, time_lens

In [3]:
import os
import glob
import numpy as np

# wav -> sph, a simple rename
def step_one():
    wav_glob = '../input/darpa-timit-wav-wav/data/TIMIT/**/*.WAV'
    for wav_filename in glob.glob(wav_glob, recursive=True):
        sph_filename = wav_filename[:-3] + "SPH"
        print(wav_filename)
        os.rename(wav_filename, sph_filename)

# sph -> wav, conversion, so it can be read with scipy
def step_two():
    sph_glob = '../input/darpa-timit-wav-wav/data/TIMIT/**/*.SPH'
    for sph_filename in glob.glob(sph_glob, recursive=True):
        print(sph_filename)
        wav_filename = sph_filename[:-3] + "wav"
        os.system("sox " + sph_filename + " " + wav_filename)

In [4]:
class Dataset():
    """ A generic dataset class offering common accessors """
    def __init__(self):
        pass

    def get_num_training_examples(self):
        return self.x_train.shape[0]

    def get_training_batch(self, batch_idx, batch_size):
        start_idx = batch_idx * batch_size
        end_idx = start_idx + batch_size # if there is less that's ok

        x_batch = self.x_train[start_idx:end_idx]
        y_batch = self.y_train[start_idx:end_idx]

        return x_batch, y_batch

    def get_num_test_examples(self):
        return self.x_test.shape[0]

    def get_test_batch(self, batch_idx, batch_size):
        start_idx = batch_idx * batch_size
        end_idx = start_idx + batch_size # if there is less that's ok

        x_batch = self.x_test[start_idx:end_idx]
        y_batch = self.y_test[start_idx:end_idx]

        return x_batch, y_batch

    def get_test_data(self):
        return self.x_test, self.y_test

    def get_training_data(self):
        return self.x_train, self.y_train

In [5]:
pip install python_speech_features

Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25ldone
[?25h  Created wheel for python-speech-features: filename=python_speech_features-0.6-py3-none-any.whl size=5888 sha256=93cd8072306e44c7fe694630d8526d30a690bf5391f2180b50f79b47e184b4cb
  Stored in directory: /root/.cache/pip/wheels/b0/0e/94/28cd6afa3cd5998a63eef99fe31777acd7d758f59cf24839eb
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6
Note: you may need to restart the kernel to use updated packages.


In [6]:
import python_speech_features as psf
import scipy.io.wavfile as sciwav
import os
import glob
import numpy as np
# from dataset import Dataset

class TimitDataset(Dataset):

    phonemes = ['h#', 'sh', 'ix', 'hv', 'eh', 'dcl', 'jh', 'ih', 'd', 'ah', 
               'kcl', 'k', 's', 'ux', 'q', 'en', 'gcl', 'g', 'r', 'w', 
               'ao', 'epi', 'dx', 'axr', 'l', 'y', 'uh', 'n', 'ae', 'm', 
               'oy', 'ax', 'dh', 'tcl', 'iy', 'v', 'f', 't', 'pcl', 'ow', 
               'hh', 'ch', 'bcl', 'b', 'aa', 'em', 'ng', 'ay', 'th', 'ax-h', 
               'ey', 'p', 'aw', 'er', 'nx', 'z', 'el', 'uw', 'pau', 'zh', 
               'eng', 'BLANK'] # 61 + 1, numbers [0, 61]

    def __init__(self, timit_root, mfcc_size, split_phonemes):
        self.mfcc_size = mfcc_size
        # load the dataset
        training_root = os.path.join(timit_root, 'TRAIN')
        test_root = os.path.join(timit_root, 'TEST')
        if split_phonemes:
#             print('enter1')
            self.x_train, self.y_train = self.load_split_timit_data(training_root)
            self.x_test, self.y_test = self.load_split_timit_data(test_root)
            self.normalize_xs()
            print(self.x_train.shape) # num_examples=142910 x [num_windows?, mfcc_size=13]
            print(self.y_train.shape) # num_examples=142910 x phonem
            print(self.x_test.shape) # num_examples=51681 x [num_windows?, mfcc_size=13]
            print(self.y_test.shape) # num_examples=51681 x phonem
        else:
#             print('enter2')
            self.x_train, self.y_train = self.load_timit_data(training_root)
            self.x_test, self.y_test = self.load_timit_data(test_root)
            self.normalize_xs()
            print(self.x_train.shape) # num_examples=4620 x [num_windows?, mfcc_size=13]
            print(self.y_train.shape) # num_examples=4620 x [num_phonemes?]
            print(self.x_test.shape) # num_examples=1680 x [num_windows?, mfcc_size=13]
            print(self.y_test.shape) # num_examples=1680 x [num_phonemes?]
            # actually 3696 and 1344 when we drop SA

    def num_classes(self):
        return len(self.phonemes)

    def normalize_xs(self):
#         print("self.x_train ", self.x_train)
#         print("self.x_test ", self.x_test)
        all_xs = np.concatenate([self.x_train, self.x_test])
#         print(type(all_xs))
#         print("all_xs 1 :", all_xs[5])
        all_xs = np.vstack(all_xs.flatten())
#         print("all_xs 2 :", all_xs)
        mean = np.mean(all_xs)
        std = np.std(all_xs)
        print("MEAN is ", mean)
        print("STD is ", std)
        self.x_train = (self.x_train - mean) / std
        self.x_test = (self.x_test - mean) / std


    def load_split_timit_data(self, root_dir):
        x_list = []
        y_list = []
        wav_glob = os.path.join(root_dir, '**/*.WAV')
#         print(wav_glob)
        it = 1
        for wav_filename in glob.glob(wav_glob, recursive=True):
            if wav_filename[-3:] == 'WAV':
                continue
#             print(it)
            if wav_filename[-7:] in ['SA1.WAV', 'SA2.WAV', 'SA1.WAV.wav', 'SA2.WAV.wav']:
                # drop SA sentences
                continue
            # load audio
            sample_rate, wav = sciwav.read(wav_filename)
            
            # parse the text file with phonemes
            print(wav_filename)
            phn_filename = wav_filename[:-7] + 'PHN' # fragile, i know
            with open(phn_filename) as f:
                lines = f.readlines()
                phonemes = [line.split() for line in lines]

            # slice the wav file and pair up with the corresponding phoneme
            for l, r, ph in phonemes:
                # add x
                wav_slice = wav[int(l) : (int(r)+1)]
                mfcc_data = psf.mfcc(wav_slice, samplerate=sample_rate, numcep=self.mfcc_size,
                                     winlen=0.0125, winstep=0.005)
                x_list.append(mfcc_data)
                if len(x_list) % 100 == 0:
                    print('Added {} pairs.'.format(len(x_list)))
                # add y
                phonem_idx = TimitDataset.phonemes.index(ph)
                y_list.append(phonem_idx)

            # early break for debugging
            it += 1
            #if it == 20:
            #    break

        # return np arrays, second dimension can vary
        x = np.array(x_list)
        y = np.array(y_list)
#         print ("X: ", x)
#         print ("Y: ", y)
        return x, y


#     def test(self, root_dir):
#         wav_glob = os.path.join(root_dir, '**/*.WAV.wav')
#         for wav_filename in glob.glob(wav_glob, recursive=True):
#             print(wav_filename)
    
    def load_timit_data(self, root_dir):

        x_list = []
        y_list = []
        wav_glob = os.path.join(root_dir, '**/*.WAV.wav')
        it = 1
#         i = 0
#         j = 0
#         self.test(root_dir)
        for wav_filename in glob.glob(wav_glob, recursive=True):
            if wav_filename[-3:] == 'WAV':
                continue
#             i = i+1
#             print('i: ',i)
#             print('before :', wav_filename)
            if wav_filename[-7:] in ['SA1.WAV', 'SA2.WAV', 'SA1.WAV.wav', 'SA2.WAV.wav']:
                continue
            # load audio and get mfcc, add x
#             j = j+1
#             print('j: ',j)
#             print('after :', wav_filename)
            sample_rate, wav = sciwav.read(wav_filename)
            mfcc_data = psf.mfcc(wav, samplerate=sample_rate, numcep=self.mfcc_size)
            x_list.append(mfcc_data)
            if len(x_list) % 100 == 0:
                print('Loaded {} files.'.format(len(x_list)))
            
            # parse the text file with phonemes, and add y
#             wav_filename = wav_filename[:-3]
            phn_filename = wav_filename[:-7] + 'PHN' # fragile, i know
            with open(phn_filename) as f:
                phonemes = [line.split()[2] for line in f.readlines()]
                phonem_idxs = np.array([TimitDataset.phonemes.index(ph) for ph in phonemes])
                y_list.append(phonem_idxs)

            # early break for debugging
            it += 1
            #if it == 200:
            #    break

        # return np arrays, second dimension can vary
        x = np.array(x_list)
        y = np.array(y_list)
#         print ("X: ", x)
#         print()
#         print ("Y: ", y)
#         print()
        return x, y

In [7]:
# import numpy as np
# import random
# import math
# # from timit_dataset import TimitDataset
# # from conversions import *
# from hmmlearn import hmm

# class Ghmm():
#     """ A gaussian HMM for phonem sequence classification """

#     def __init__(self, mfcc_size, n_states, n_iter, num_labels):
#         self.n_states = n_states  # the number of HMM states 
#         self.mfcc_size = mfcc_size   # the size of a single MFCC vector
#         self.hmms = []    # one hmm for each phonem
#         self.phonem_freq = {}    # language model: phoneme frequencies
#         self.num_labels = num_labels    # number of phonemes
#         # create an hmm 
#         for idx in range(num_labels):
#             self.hmms.append(hmm.GaussianHMM(n_components=n_states, n_iter=n_iter))

#     def train_and_test(self, dataset, batch_size):
#         self.batch_size = batch_size
#         x_train, y_train = dataset.get_training_data()
#         num_examples = dataset.get_num_training_examples()
#         dropped = 0
#         for idx in range(num_examples):
#             if idx % 10 == 0:
#                 print('Training on {} out of {} examples'.format(idx, num_examples))
#             mfcc_vec_seq = x_train[idx]
#             phoneme_idx = y_train[idx]
# #             print(phoneme_idx)
# #             print(self.hmms)
#             # drop sequences that are too short
#             if len(mfcc_vec_seq) < self.n_states:
#                 print('dropped')
#                 dropped += 1
#             else:
#                 hmm = [self.hmms[x] for x in phoneme_idx]
# #                 print("Len of hmm: ",len(hmm))
#                 # add phonem frequency
#                 for hmm_idx, ph in enumerate(phoneme_idx):
# #                     print("hmm_idx: ", hmm_idx)
#                     if not ph in self.phonem_freq:
#                         self.phonem_freq[ph] = 0
#                     self.phonem_freq[ph] += 1
# #                     print("self.phonem_freq[ph]", self.phonem_freq[ph])
# #                     print(hmm[hmm_idx])
# #                     print("just before fitting mfcc_vec_seq", mfcc_vec_seq)
#                     hmm[hmm_idx].fit(mfcc_vec_seq)
#         # calculate phonem frequences
#         phonem_num = sum(self.phonem_freq.values())
#         for key in self.phonem_freq:
#             self.phonem_freq[key] /= float(phonem_num)
# #             print(self.phonem_freq[key])
# #             print(key)
#         print('Done training, dropped {} out of {}'.format(dropped, num_examples))
#         self.test_on_random_training_batch(dataset, self.batch_size)
#         self.test(dataset)

#     def test(self, dataset):
#         print("==Testing==")
#         num_examples = dataset.get_num_test_examples()
#         num_batches = math.ceil(num_examples / self.batch_size)
#         all_pairs = []
#         test_ler = 0
#         for batch_idx in range(num_batches):
#                 if batch_idx % 10 == 0:
#                     print('Testing {}'.format(batch_idx))
#                 x_batch, y_batch = dataset.get_test_batch(batch_idx, self.batch_size)
#                 actual_batch_size = x_batch.shape[0]
#                 ler, pairs = self.evaluate_batch(x_batch, y_batch)
#                 test_ler += ler * actual_batch_size
#                 all_pairs.extend(pairs)
#         test_ler /= num_examples
#         self.log(test_ler, random.sample(all_pairs, 10))


#     def test_on_random_training_batch(self, dataset, batch_size):
#         print('===Random training batch===')
#         num_examples = dataset.get_num_training_examples()
#         num_batches = math.ceil(num_examples / batch_size)
#         idx = random.randint(0, num_batches)
#         x_batch, y_batch = dataset.get_training_batch(idx, batch_size)
#         ler, pairs = self.evaluate_batch(x_batch, y_batch)
#         self.log(ler, random.sample(pairs, 10))

#     def evaluate_batch(self, xs, ys):
#         fails = 0
#         actual_batch_size = xs.shape[0]
#         pairs = []
#         # calculate error rate on a batch
#         for idx in range(actual_batch_size):
#             mfcc_vec_seq = xs[idx]
#             target_phoneme = ys[idx]
#             guessed_phoneme = self.infer(mfcc_vec_seq)
#             if guessed_phoneme not in target_phoneme:
#                 fails += 1
#             pairs.append((TimitDataset.phonemes[target_phoneme], 
#                           TimitDataset.phonemes[guessed_phoneme]))
#         ler = float(fails) / actual_batch_size
#         return ler, pairs


#     def log(self, ler, pairs):
#         print("Ler:", '{0:3f}'.format(ler))
#         for pair in pairs:
#             print('\t Target: <%s>' % pair[0])
#             print('\t Decoded: <%s>' % pair[1])

#     def infer(self, mfcc_vec_seq):
#         # infer the most likely phoneme using the forward algorithm and bayes
#         scores = []
#         for idx in range(self.num_labels):
#             if idx not in self.phonem_freq:
#                 continue  # not fitted at all, no examples of this phoneme were seen
#             score = self.phonem_freq[idx] * math.exp(self.hmms[idx].score(mfcc_vec_seq))
#             scores.append((score, idx))
#         best_idx = max(scores)[1]
#         return best_idx

In [8]:
# import numpy as np
# # from timit_dataset import TimitDataset
# # from mock_dataset import MockDataset
# # from ghmm import Ghmm
# import os
# import warnings

# # ignore a deprecation warning in the hmm lib
# warnings.filterwarnings("ignore")

# # hmm hyperparameters
# mfcc_size = 13
# n_states = 3
# batch_size = 10
# n_iter = 100

# def main():
#     # load the dataset with split phonemes
#     dataset = TimitDataset("../input/darpa-timit-wav-wav/data/TIMIT/", mfcc_size, split_phonemes=False)
#     num_labels = len(TimitDataset.phonemes) - 1
#     # create a gaussian hmm, train and test 
#     ghmm = Ghmm(mfcc_size=mfcc_size, n_states=n_states, 
#                 n_iter=n_iter, num_labels=num_labels)
#     ghmm.train_and_test(dataset, batch_size=10)

# if __name__ == "__main__":
#     main()

In [9]:
import numpy as np
import tensorflow as tf
import random
import math
# from timit_dataset import TimitDataset
# from conversions import *

class DeepBiLstmCtc():

    def __init__(self, 
                 mfcc_size, 
                 lstm_num_hidden,
                 num_layers,
                 num_classes,
                ):

        # inputs
        # MFCC vectors
        # max_num_windows_for_batch = max_time
        # size [batch_size, max_time, mfcc_size]
        self.input_x = tf.placeholder(tf.float32, [None, None, mfcc_size], name='input_x')
        # ctc_loss needs sparse tensor and an array of num_windows
        # indices[i, :] = [batch, time], values[i] is in [0, num_labels)
        self.input_y = tf.sparse_placeholder(tf.int32, name='input_y')
        # [batch_size], holds max_time values
        self.len_seq = tf.placeholder(tf.int32, [None], name='len_seq') 

        # architecture, deep bidirectional lstm
        cells_fwd = []
        cells_bwd = []
        for idx in range(num_layers):
            cells_fwd.append(tf.nn.rnn_cell.LSTMCell(lstm_num_hidden, state_is_tuple=True))
            cells_bwd.append(tf.nn.rnn_cell.LSTMCell(lstm_num_hidden, state_is_tuple=True))
        layers_fwd = tf.nn.rnn_cell.MultiRNNCell(cells_fwd, state_is_tuple=True)
        layers_bwd = tf.nn.rnn_cell.MultiRNNCell(cells_bwd, state_is_tuple=True)
        self.out, _ = tf.nn.bidirectional_dynamic_rnn(layers_fwd, layers_bwd, self.input_x, self.len_seq, dtype=tf.float32)
        self.full_out = tf.concat(self.out, 2) 
        # [batch_size, max_time, 2xlstm_num_hidden]

        # dense layer to get num_classes
        self.out_flat = tf.reshape(self.full_out, [-1, 2*lstm_num_hidden])
        self.logits = tf.layers.dense(self.out_flat, num_classes)
        # [batch_size, max_time, num_classes]

        # reshape back and do time major to prepare for ctc_loss
        batch_sz = tf.shape(self.input_x)[0]
        self.logits = tf.reshape(self.logits, [batch_sz, -1, num_classes])
        self.logits = tf.transpose(self.logits, (1, 0, 2))  
        # [max_time, batch_size, num_classes]

        # get ctc loss and cost
        loss = tf.nn.ctc_loss(self.input_y, self.logits, self.len_seq)
        self.total_loss = tf.reduce_mean(loss)

        # set up momentum optimizer
        initial_learning_rate = 1e-3
        momentum = 0.9
        self.optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           momentum).minimize(self.total_loss)

        # decode and calculate edit distance and ler inaccuracy
        decoded, _ = tf.nn.ctc_greedy_decoder(self.logits, self.len_seq)
        self.decoded = tf.cast(decoded[0], tf.int32)
        # sparse tensor: [batch_size, time] -> in range [0, nc-1]
        # input_y is also sparse: [batch_size, time] -> in range [0, nl-1]
        # num classes is 62, num labels is 61
        # inaccuracy: label error rate
        self.ler = tf.reduce_mean(tf.edit_distance(self.decoded, self.input_y)) # [0, 1]

        # summarize metrics for tensorboard
        loss_summary = tf.summary.scalar("batch_loss", self.total_loss)
        ler_summary = tf.summary.scalar("batch_ler", self.ler)
        self.summary_op = tf.summary.merge_all()

        # prepare tensorboard writer
        self.writer = tf.summary.FileWriter('./logs/train_real_test')
        self.writer.add_graph(tf.get_default_graph())

        # save the model after training
        self.saver = tf.train.Saver()

    def train_and_test(self, dataset, batch_size, num_epochs):
        self.batch_size = batch_size

        with tf.Session() as sess:
            # init global vars
            sess.run(tf.global_variables_initializer())

            # get training data
            num_examples = dataset.get_num_training_examples()
            num_batches = math.ceil(num_examples / batch_size)

            # train several epochs
            for epoch_idx in range(num_epochs):
                print("Epoch " + str(epoch_idx) + " starting.")
                epoch_loss = 0
                epoch_ler = 0

                # train several batches
                for batch_idx in range(num_batches):
                    # both num_windows and num_phonemes are time
                    # [batch_size num_windows? mfcc_size] [batch_size num_phonemes?]
                    x_batch, y_batch = dataset.get_training_batch(batch_idx, batch_size)
                    actual_batch_size = x_batch.shape[0]

                    # pad second dim of x_batch
                    x_batch_padded, x_batch_len_seq = pad_time(x_batch) 

                    # convert y_batch to a sparse vector for ctc
                    y_batch_sparse = dense_to_sparse(y_batch)

                    # evaluate and get loss
                    results = [self.ler, 
                               self.total_loss, 
                               self.optimizer, 
                               self.summary_op]
                    feed_dict = {self.input_x: x_batch_padded, 
                                 self.input_y: y_batch_sparse,
                                 self.len_seq: x_batch_len_seq}
                    batch_ler, batch_loss, _, summary = sess.run(results, feed_dict)

                    # add loss/ler to epoch metrics and output to tensorboard
                    self.writer.add_summary(summary, epoch_idx * num_batches + batch_idx)
                    epoch_loss += batch_loss*actual_batch_size
                    epoch_ler += batch_ler*actual_batch_size

                    # print batch loss
                    print("Epoch:", '{0:3d}'.format(epoch_idx), 
                          "|Batch:", '{0:3d}'.format(batch_idx), 
                          "|BatchLoss:", '{0:8.4f}'.format(batch_loss),
                          "|BatchLer:", '{0:8.4f}'.format(batch_ler))

                # average per example
                epoch_loss /= num_examples
                epoch_ler /= num_examples
                print("Epoch over:", '{0:3d}'.format(epoch_idx))
                print("MeanEpochLoss:", '{0:3f}'.format(epoch_loss))
                print("MeanEpochLer:", '{0:3f}'.format(epoch_ler))

            # save and test
            self.saver.save(sess, './saved_model/dblc-3-150')
            self.test_on_random_training_batch(sess, dataset, batch_size)
            self.test(sess, dataset)

    def test(self, sess, dataset):
        print('===Testing===')
        # calculate metrics on the test set
        num_examples = dataset.get_num_test_examples()
        num_batches = math.ceil(num_examples / self.batch_size)
        all_pairs = []
        test_loss = 0
        test_ler = 0
        for batch_idx in range(num_batches):
                x_batch, y_batch = dataset.get_test_batch(batch_idx, self.batch_size)
                actual_batch_size = x_batch.shape[0]
                loss, ler, pairs = self.evaluate_and_decode(sess, x_batch, y_batch)
                all_pairs.extend(pairs)
                test_loss += loss*actual_batch_size
                test_ler += ler*actual_batch_size
        test_loss /= num_examples
        test_ler /= num_examples
        self.log(test_loss, test_ler, random.sample(all_pairs, min(10, len(pairs))))

    def test_on_random_training_batch(self, sess, dataset, batch_size):
        print('===Random training batch===')
        # calculate metrics on a random training batch
        num_examples = dataset.get_num_training_examples()
        num_batches = math.ceil(num_examples / batch_size)
        idx = random.randint(0, num_batches)
        x_batch, y_batch = dataset.get_training_batch(idx, batch_size)

        loss, ler, pairs = self.evaluate_and_decode(sess, x_batch, y_batch)
        self.log(loss, ler, random.sample(pairs, min(10, len(pairs))))

    def log(self, loss, ler, pairs):
        # log metrics
        print("Loss:", '{0:3f}'.format(loss))
        print("Ler:", '{0:3f}'.format(ler))
        for pair in pairs:
            print('\t Target: %s' % pair[0])
            print('\t Decoded: %s' % pair[1])

    def evaluate_and_decode(self, sess, xs, ys):
        # evaluate xs and compare to ys
        xs_padded, xs_len_seq = pad_time(xs) 
        ys_sparse = dense_to_sparse(ys)

        results = [self.total_loss, self.ler, self.decoded]
        feed_dict = {self.input_x: xs_padded, 
                     self.input_y: ys_sparse,
                     self.len_seq: xs_len_seq}
        loss, ler, d = sess.run(results, feed_dict)

        dense_decoded = tf.sparse_tensor_to_dense(d, default_value=-1).eval(session=sess)

        # convert to strings and get 10 pairs of sequences to return
        it = 0
        pairs = []
        for batch_idx, seq in enumerate(dense_decoded):
            it += 1
            seq = [TimitDataset.phonemes[s] for s in seq if s != -1]
            target_seq = [TimitDataset.phonemes[s] for s in ys[batch_idx]]
            pairs.append((' '.join(target_seq), ' '.join(seq)))
            if it == 10:
                break
        return loss, ler, pairs

2021-11-26 08:01:16.221040: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [10]:
# import python_speech_features as psf
# import scipy.io.wavfile as sciwav
# import os
# import glob
# import numpy as np
# # from dataset import Dataset

# class MockDataset(Dataset):

#     # phonems # 61 + 1, numbers [0, 61]

#     def __init__(self, mfcc_size):
#         assert(mfcc_size == 11)
#         self.x_train = np.array([
#             np.array([[4, 5, 2, 54, 2, 3, 12, 12, 45, 43, 32], [4, 5, 10, 54, 22, 3, 10, 12, 40, 43, 32]]),
#             np.array([[3, 5, 33, 54, 11, 11, 11, 44, 43, 22, 23]])
#         ])
#         self.y_train = np.array([
#             np.array([11, 13]),
#             np.array([11, 22])
#         ])
#         # same for text
#         self.x_test = np.array([
#             np.array([[4, 5, 2, 54, 2, 3, 12, 12, 45, 43, 32], [4, 5, 10, 54, 22, 3, 10, 12, 40, 43, 32]]),
#             np.array([[3, 5, 33, 54, 11, 11, 11, 44, 43, 22, 23]])
#         ])
#         self.y_test = np.array([
#             np.array([11, 13]),
#             np.array([11, 22])
#         ])

In [11]:
import numpy as np
import tensorflow as tf
# from timit_dataset import TimitDataset
# from mock_dataset import MockDataset
# from lstm import DeepBiLstmCtc
import os
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# model hyperparameters
batch_size = 64
num_epochs = 170
mfcc_size = 13
lstm_num_hidden = 150
num_layers = 3

def main():
    # load the timit dataset, create a network, train and test it
#     print(os.listdir("C:/Users/shukl/Documents/Georgia-Tech/MUSI-6201/Project on Music Segmentation/Phoneme Detection with HMMs/Github/rand0musername-speech-rec/speech-rec-master/data/TIMIT"))
    dataset = TimitDataset("../input/darpa-timit-wav-wav/data/TIMIT/", mfcc_size, split_phonemes=False)
    
    # dataset = MockDataset(mfcc_size)
    num_cl = dataset.num_classes()  # 61 + blank
    model = DeepBiLstmCtc(mfcc_size, lstm_num_hidden, num_layers, num_cl)
    model.train_and_test(dataset, batch_size, num_epochs)

if __name__ == "__main__":
    main()

Loaded 100 files.
Loaded 200 files.
Loaded 300 files.
Loaded 400 files.
Loaded 500 files.
Loaded 600 files.
Loaded 700 files.
Loaded 800 files.
Loaded 900 files.
Loaded 1000 files.
Loaded 1100 files.
Loaded 1200 files.
Loaded 1300 files.
Loaded 1400 files.
Loaded 1500 files.
Loaded 1600 files.
Loaded 1700 files.
Loaded 1800 files.
Loaded 1900 files.
Loaded 2000 files.
Loaded 2100 files.
Loaded 2200 files.
Loaded 2300 files.
Loaded 2400 files.
Loaded 2500 files.
Loaded 2600 files.
Loaded 2700 files.
Loaded 2800 files.
Loaded 2900 files.
Loaded 3000 files.
Loaded 3100 files.
Loaded 3200 files.
Loaded 3300 files.
Loaded 3400 files.
Loaded 3500 files.
Loaded 3600 files.
Loaded 3700 files.
Loaded 3800 files.
Loaded 3900 files.
Loaded 4000 files.
Loaded 4100 files.
Loaded 4200 files.
Loaded 4300 files.
Loaded 4400 files.
Loaded 4500 files.
Loaded 4600 files.




Loaded 100 files.
Loaded 200 files.
Loaded 300 files.
Loaded 400 files.
Loaded 500 files.
Loaded 600 files.
Loaded 700 files.
Loaded 800 files.
Loaded 900 files.
Loaded 1000 files.
Loaded 1100 files.
Loaded 1200 files.
Loaded 1300 files.
Loaded 1400 files.
Loaded 1500 files.
Loaded 1600 files.
MEAN is  -5.232289605390709
STD is  15.871591419622186
(4620,)
(4620,)
(1680,)
(1680,)


2021-11-26 08:03:03.095049: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-26 08:03:03.100918: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-11-26 08:03:03.138850: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-26 08:03:03.139467: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2021-11-26 08:03:03.139517: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-11-26 08:03:03.168378: I tensorflow/stream_executor/platform/def

Epoch 0 starting.


2021-11-26 08:03:05.794852: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-11-26 08:03:06.580860: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11


Epoch:   0 |Batch:   0 |BatchLoss: 1112.5825 |BatchLer:   0.9539
Epoch:   0 |Batch:   1 |BatchLoss: 895.8468 |BatchLer:   1.0000
Epoch:   0 |Batch:   2 |BatchLoss: 223.4816 |BatchLer:   1.0000
Epoch:   0 |Batch:   3 |BatchLoss: 281.0754 |BatchLer:   1.0000
Epoch:   0 |Batch:   4 |BatchLoss: 379.2457 |BatchLer:   1.0000
Epoch:   0 |Batch:   5 |BatchLoss: 392.1844 |BatchLer:   1.0000
Epoch:   0 |Batch:   6 |BatchLoss: 285.2530 |BatchLer:   1.0000
Epoch:   0 |Batch:   7 |BatchLoss: 162.2343 |BatchLer:   1.0000
Epoch:   0 |Batch:   8 |BatchLoss: 500.9765 |BatchLer:   1.0000
Epoch:   0 |Batch:   9 |BatchLoss: 240.4779 |BatchLer:   1.0000
Epoch:   0 |Batch:  10 |BatchLoss: 370.7809 |BatchLer:   1.0000
Epoch:   0 |Batch:  11 |BatchLoss: 381.9453 |BatchLer:   1.0000
Epoch:   0 |Batch:  12 |BatchLoss: 328.9367 |BatchLer:   1.0000
Epoch:   0 |Batch:  13 |BatchLoss: 239.6346 |BatchLer:   1.0000
Epoch:   0 |Batch:  14 |BatchLoss: 153.4874 |BatchLer:   1.0000
Epoch:   0 |Batch:  15 |BatchLoss: 397.