In [2]:
import os
import python_speech_features as features
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.utils import shuffle
import scipy.io.wavfile as wav
import skimage
from skimage.io import imsave

from keras import layers
from keras.models import Sequential
from keras import regularizers
from tensorflow.keras.layers import Input
from keras.losses import Poisson

from keras.models import Model
from keras.layers import Input, Permute, Reshape, Lambda, Dot, Softmax
from keras.layers import Add, BatchNormalization, Conv2D, Dense, LSTM, Bidirectional, Dropout, Flatten
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D

In [85]:
# Dataset locations. The defaults are the original author's local directories;
# set PHONEME_SOUND_DIR / PHONEME_LABEL_DIR to run the notebook against another
# copy of the data without editing code.
# os.path.join(path, '') guarantees a trailing separator, because other cells
# build file names by plain concatenation (sound_path + '<i>.WAV').
sound_path = os.path.join(
    os.environ.get('PHONEME_SOUND_DIR', '/home/anton/Documents/phonemes/sounds/'), '')
labels_path = os.path.join(
    os.environ.get('PHONEME_LABEL_DIR', '/home/anton/Documents/phonemes/labels/'), '')

In [86]:
# Phone label inventory (61 symbols). The position of a symbol in this list is
# its integer class id, so the order must not change.
phonemes = (
    "b bcl d dcl g gcl p pcl t tcl k kcl dx q jh ch s sh z zh "
    "f th v dh m n ng em en eng nx l r w y "
    "hh hv el iy ih eh ey ae aa aw ay ah ao oy "
    "ow uh uw ux er ax ix axr ax-h pau epi h#"
).split()

In [142]:
# Accumulators for the data-loading cell: audio windows and their one-hot targets.
X_train, y_train = [], []

In [78]:
def get_total_duration(file):
    """Return the end value stored on the last entry of a label file.

    Every non-blank line is expected to hold three whitespace-separated
    fields; the integer value of the middle field of the final such line is
    returned.

    Args:
        file: Path of the label file to read.

    Returns:
        The middle field of the last entry as an int, or None when the file
        contains no entries.

    Raises:
        ValueError: If the last entry does not have exactly three fields or
            its middle field is not an integer.
    """
    last_entry = None
    # The context manager closes the handle; the previous version left it open.
    with open(file) as handle:
        for line in handle:
            # Ignore blank lines so a stray newline at EOF is not mistaken
            # for the last entry.
            if line.strip():
                last_entry = line
    if last_entry is None:
        return None
    _, end_value, _ = last_entry.split()
    return int(end_value)

In [79]:
def find_label_vector(file_name, phoneme_table=None):
    """Expand a label file into one class index per covered sample.

    Each non-blank line must contain ``start end phoneme``. For every line the
    phoneme's index in the table is repeated ``end - start`` times and appended
    to the result, in file order.

    Only segment lengths are used, so positions in the returned list line up
    with sample positions only when the first segment starts at 0 and the
    segments are contiguous.

    Args:
        file_name: Path of the label file.
        phoneme_table: Optional sequence of phoneme symbols; a symbol's
            position is its class index. Defaults to the notebook-level
            ``phonemes`` list.

    Returns:
        list of int class indices.

    Raises:
        ValueError: If a line does not have three fields, a time is not an
            integer, or the phoneme is not in the table.
    """
    if phoneme_table is None:
        phoneme_table = phonemes
    labels = []
    # `with` closes the file even when a malformed line raises below.
    with open(file_name) as label_file:
        for line in label_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at end of file
            start_time, end_time, phoneme = line.split()
            ph_label = phoneme_table.index(phoneme)
            # A non-positive length contributes nothing, like the old range() loop.
            labels.extend([ph_label] * (int(end_time) - int(start_time)))
    return labels

In [80]:
def create_mfcc(rate, sample):
    """Compute MFCC features and append a simple frame-to-frame delta.

    Args:
        rate: Sampling rate passed to ``features.mfcc``.
        sample: Audio signal passed to ``features.mfcc``.

    Returns:
        Tuple ``(feats, n_frames)``: ``feats`` is the MFCC matrix with the
        delta matrix concatenated along axis 1, and ``n_frames`` is
        ``feats.shape[0]``.
    """
    cepstra = features.mfcc(sample, rate, winlen=0.025, winstep=0.01, numcep=13,
                            nfilt=26, preemph=0.97, appendEnergy=True)
    # Delta for frame t is cepstra[t+1] - cepstra[t-1]; the first and last
    # frames have no two-sided neighbours and keep a delta of zero.
    delta = np.zeros(cepstra.shape)
    delta[1:-1, :] = cepstra[2:, :] - cepstra[:-2, :]
    stacked = np.concatenate((cepstra, delta), axis=1)
    return stacked, stacked.shape[0]

In [81]:
def create_y_labels(labels, fr):
    """Resample a per-sample label sequence onto ``fr`` frames.

    Runs of equal labels are located in ``labels``; each run boundary at sample
    index ``i`` is mapped to frame ``int(i / len(labels) * fr)`` (truncated),
    and the frames between consecutive boundaries receive the run's label. The
    final run extends to frame ``fr``.

    Args:
        labels: Non-empty sequence of labels, one per sample.
        fr: Number of frames in the output.

    Returns:
        Float numpy array of length ``fr`` holding one label per frame.

    Raises:
        IndexError: If ``labels`` is empty.
    """
    n_samples = len(labels)
    frame_labels = np.full(fr, -1.0)
    current = labels[0]
    seg_start = 0
    for pos, lab in enumerate(labels):
        if lab == current:
            continue
        # A new run begins here: close the previous one at the scaled boundary.
        boundary = int((float(pos) / float(n_samples)) * float(fr))
        frame_labels[seg_start:boundary] = current
        current = lab
        seg_start = boundary
    # The last run always reaches the end of the frame axis.
    frame_labels[seg_start:fr] = current
    return frame_labels

In [82]:
def calc_norm_param(X):
    """Compute per-feature normalisation statistics over a list of observations.

    Args:
        X: Non-empty sequence of 2-D arrays shaped (frames, features); all
            must share the feature count of ``X[0]``.

    Returns:
        Tuple ``(mean_val, std_val, total_len)``:
        ``mean_val`` is the frame-weighted mean of each feature;
        ``std_val`` is the frame-weighted average of the per-observation
        standard deviations (note: this is not the pooled standard deviation
        of all frames); ``total_len`` is the total number of frames.
    """
    n_features = X[0].shape[1]
    mean_acc = np.zeros(n_features)
    std_acc = np.zeros(n_features)
    frame_count = 0
    for observation in X:
        weight = observation.shape[0]
        frame_count += weight
        # Weight each observation's statistics by its number of frames.
        mean_acc += np.mean(observation, axis=0) * weight
        std_acc += np.std(observation, axis=0) * weight
    return mean_acc / frame_count, std_acc / frame_count, frame_count

In [83]:
def normalize(X, mean_val, std_val):
    """Standardise every observation in ``X`` in place.

    Each element ``X[i]`` is replaced by ``(X[i] - mean_val) / std_val``. The
    container passed in is modified and also returned. No guard is applied
    against zero entries in ``std_val``.

    Args:
        X: Mutable sequence of arrays.
        mean_val: Value(s) subtracted from every observation.
        std_val: Value(s) every observation is divided by.

    Returns:
        The same ``X`` object, now holding the normalised observations.
    """
    for position, observation in enumerate(X):
        X[position] = (observation - mean_val) / std_val
    return X

In [84]:
def create_y_sample(y):
    """One-hot encode a sequence of class ids.

    Args:
        y: Iterable of class ids; each is converted with ``int()`` and used as
            an index into a vector of ``len(phonemes)`` entries (negative ids
            therefore index from the end).

    Returns:
        numpy array with one one-hot row per element of ``y``.
    """
    # Row k of the identity matrix is the one-hot vector for class k.
    identity = np.eye(len(phonemes))
    rows = [identity[int(label)] for label in y]
    return np.array(rows)

In [76]:
def find_class(labels):
    """Return the one-hot vector of the most frequent label in ``labels``.

    Args:
        labels: Non-empty list of integer class ids (a list is required
            because ``labels.count`` is used). Ties are resolved by the
            iteration order of ``set(labels)``.

    Returns:
        Float numpy array of length ``len(phonemes)`` with a 1 at the majority
        class and 0 elsewhere.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    occurrences = {label: labels.count(label) for label in set(labels)}
    majority = max(occurrences, key=occurrences.get)
    one_hot = np.zeros((len(phonemes), ))
    one_hot[majority] = 1
    return one_hot

In [143]:
# Build the training set: slide a fixed-size window over every utterance and
# label each window with its majority phoneme (see find_class).
WINDOW_SAMPLES = 1000   # samples per example; matches the model's default inputLength
HOP_SAMPLES = 100       # stride between consecutive window starts
TAIL_MARGIN = 1500      # window starts stop this many samples before the labels end

# Start from empty lists so that re-running this cell does not append a second
# copy of every window.
X_train, y_train = [], []

# Files are expected to be named 0.WAV, 1.WAV, ... with matching <i>.PHN labels.
for i in tqdm(range(len(os.listdir(sound_path)))):
    y, sr = librosa.load(os.path.join(sound_path, str(i) + '.WAV'), sr=16000)
    label_vector = find_label_vector(os.path.join(labels_path, str(i) + '.PHN'))
    # Drop audio that extends past the labelled region.
    y = y[:len(label_vector)]
    for j in range(0, len(label_vector) - TAIL_MARGIN, HOP_SAMPLES):
        window = y[j:j + WINDOW_SAMPLES]
        if len(window) < WINDOW_SAMPLES:
            # The audio ended before the labels did; a short window would make
            # the stacked training array ragged, and later windows are shorter still.
            break
        X_train.append(window)
        y_train.append(find_class(label_vector[j:j + WINDOW_SAMPLES]))

100%|██████████| 4620/4620 [02:08<00:00, 35.85it/s]


In [144]:
def AttRNNSpeechModel(samplingrate = 16000, inputLength = 1000, rnn_func = LSTM):
    """Build and compile the attention-RNN phoneme classifier.

    The network takes a raw waveform window, converts it to a decibel
    mel-spectrogram inside the graph, applies two small convolutions and two
    bidirectional recurrent layers, pools the sequence with dot-product
    attention and ends in a 61-way softmax.

    Args:
        samplingrate: Sampling rate handed to the Melspectrogram layer (also
            sets its ``fmax`` to half this value).
        inputLength: Number of samples in one input window.
        rnn_func: Recurrent layer class; it is called as
            ``rnn_func(64, return_sequences=True, dropout=0.3)`` and wrapped
            in ``Bidirectional``.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer,
        categorical cross-entropy loss and the ``acc`` metric.
    """
    sr = samplingrate
    iLen = inputLength
    
    inputs = Input((inputLength,), name='input')

    # Add a leading channel axis: (batch, inputLength) -> (batch, 1, inputLength).
    x = Reshape((1, -1)) (inputs)

    # Fixed (non-trainable) STFT + mel filterbank computed inside the model.
    x = Melspectrogram(n_dft=512, n_hop=128, input_shape=(1, iLen),
                             padding='same', sr=sr, n_mels=80,
                             fmin=40.0, fmax=sr/2, power_melgram=1.0,
                             return_decibel_melgram=True, trainable_fb=False,
                             trainable_kernel=False,
                             name='mel_stft') (x)

    # NOTE(review): int_axis=0 is presumably kapre's per-data-sample normalisation - confirm against the kapre docs.
    x = Normalization2D(int_axis=0)(x)

    #note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    #we would rather have it the other way around for LSTMs

    x = Permute((2,1,3)) (x)

    # Assuming the layout described above, the (5,1) kernels span 5 time steps of a single mel bin.
    x = Conv2D(10, (5,1) , activation='relu', padding='same') (x)
    x = BatchNormalization() (x)
    # A single output filter so the trailing axis has size 1 and can be squeezed away below.
    x = Conv2D(1, (5,1) , activation='relu', padding='same') (x)
    x = BatchNormalization() (x)

    x = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim') (x) #keras.backend.squeeze(x, axis)

    x = Bidirectional(rnn_func(64, return_sequences = True, dropout=0.3)) (x) # [b_s, seq_len, vec_dim]
    x = Bidirectional(rnn_func(64, return_sequences = True, dropout=0.3)) (x) # [b_s, seq_len, vec_dim]

    # Despite the name, this picks the time step at index 4, not the first one.
    # NOTE(review): the index is hard-coded; with the default 1000 samples and n_hop=128 it is
    # presumably the middle frame - confirm, and revisit if inputLength or n_hop changes.
    xFirst = Lambda(lambda q: q[:,4]) (x) #[b_s, vec_dim]
    # Project the selected step into the query vector (128 units; presumably equal to vec_dim so the dot product below is defined).
    query = Dense(128) (xFirst)

    #dot product attention
    attScores = Dot(axes=[1,2])([query, x]) 
    attScores = Softmax(name='attSoftmax')(attScores) #[b_s, seq_len]

    #rescale sequence
    attVector = Dot(axes=[1,1])([attScores, x]) #[b_s, vec_dim]

    x = Dense(256, activation = 'relu')(attVector)
    # No activation argument is given for the next two layers.
    x = Dense(128)(x)
    x = Dense(64)(x)

    # 61 output units: the same count as the entries of the notebook's `phonemes` list (hard-coded here).
    output = Dense(61, activation = 'softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['acc'])
    
    return model

In [145]:
# Instantiate the network with its defaults (16000 Hz, 1000-sample input, LSTM).
model = AttRNNSpeechModel()

tracking <tf.Variable 'mel_stft_9/real_kernels:0' shape=(512, 1, 1, 257) dtype=float32> dft_real_kernels
tracking <tf.Variable 'mel_stft_9/imag_kernels:0' shape=(512, 1, 1, 257) dtype=float32> dft_imag_kernels
tracking <tf.Variable 'mel_stft_9/Variable:0' shape=(257, 80) dtype=float32> freq2mel


In [146]:
# Shuffle windows and targets together with a fixed seed.
# NOTE(review): the same names are rebound, so re-running this cell permutes the
# already-permuted lists and the order no longer matches a fresh run.
X_train, y_train = shuffle(X_train, y_train, random_state=30)

In [147]:
# Train on the first 20000 shuffled windows only; validation_split=0.2 holds out
# 4000 of them (see the log below).
# NOTE(review): the loading cell extracts 1000-sample windows every 100 samples, so
# shuffled neighbours overlap heavily; validation and the later hold-out check share
# audio with the training windows and are likely optimistic.
history = model.fit(np.array(X_train[:20000]), np.array(y_train[:20000]), epochs=24, validation_split=0.2)

Train on 16000 samples, validate on 4000 samples
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24

KeyboardInterrupt: 

In [139]:
# Look at one window outside the 20000 used for fitting.
# NOTE(review): this displays the whole 1000-sample array; a slice would keep the output short.
X_train[20002]

array([-1.10473633e-02, -1.36718750e-02, -9.67407227e-03, -7.41577148e-03,
       -1.13525391e-02, -1.16577148e-02, -6.22558594e-03,  1.55639648e-03,
        2.71606445e-03,  1.37329102e-03, -1.22070312e-03, -2.86865234e-03,
       -4.63867188e-03, -1.09863281e-03,  6.89697266e-03,  1.14746094e-02,
        9.03320312e-03,  5.40161133e-03,  6.89697266e-03,  8.51440430e-03,
        6.59179688e-03,  2.80761719e-03,  5.15747070e-03,  9.91821289e-03,
        1.17492676e-02,  9.64355469e-03,  9.18579102e-03,  8.05664062e-03,
        4.30297852e-03, -4.57763672e-04,  2.74658203e-04,  4.69970703e-03,
        7.17163086e-03,  4.69970703e-03,  1.61743164e-03, -2.44140625e-04,
       -2.99072266e-03, -6.43920898e-03, -7.41577148e-03, -6.01196289e-03,
       -5.40161133e-03, -6.59179688e-03, -7.23266602e-03, -7.23266602e-03,
       -8.94165039e-03, -1.21154785e-02, -1.39160156e-02, -1.14440918e-02,
       -8.48388672e-03, -6.16455078e-03, -4.54711914e-03, -2.62451172e-03,
       -3.99780273e-03, -

In [110]:
# Predict class probabilities for that single window (a batch axis is added first).
pr = model.predict(np.expand_dims(X_train[20002], axis=0))

In [111]:
# Highest predicted probability.
pr.max()

0.43401635

In [121]:
# Index of the first class whose probability equals that maximum.
np.where(pr[0] == pr.max())[0][0]

18

In [117]:
# Probability of class 18, the index found in the previous cell.
pr[0][18]

0.43401635

In [122]:
# One-hot target of window 20001.
# NOTE(review): the prediction above was made for X_train[20002], so this target
# belongs to a different window; use index 20002 to compare like with like.
y_train[20001]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [123]:
# Class id of window 20000.
# NOTE(review): again a different index from the window predicted above (20002).
np.where(y_train[20000] == 1)[0][0]

36

In [140]:
# Hold-out check on the 1000 windows at indices 20000-20999, which were not
# passed to fit(). One batched predict call replaces 1000 single-sample calls;
# argmax returns the first maximum, as the former np.where(...)[0][0] lookup did.
eval_inputs = np.array(X_train[20000:21000])
eval_targets = np.array(y_train[20000:21000])
assert len(eval_inputs) == 1000, "expected 1000 hold-out windows"

predicted_classes = np.argmax(model.predict(eval_inputs), axis=1)
true_classes = np.argmax(eval_targets, axis=1)
# Number of windows whose predicted class matches the target.
count = int(np.sum(predicted_classes == true_classes))

In [141]:
# Fraction of the 1000 hold-out windows classified correctly.
count / 1000

0.393

In [3]:
# Load a separate test recording; this rebinds the notebook-level names `y` and `sr`.
# NOTE(review): no sr= argument is given, so the rate shown below (22050) differs from
# the sr=16000 used when loading the training audio; resample before feeding the model.
# NOTE(review): absolute local path; it will not resolve on another machine.
y, sr = librosa.load('/home/anton/ITS_Partner_Lab/its_partnet_lab/source/tst.wav')

In [4]:
# Sampling rate returned by the load call above.
sr

22050

In [5]:
# Waveform samples returned by the load call above.
y

array([ 0.00358838,  0.00738503,  0.00793451, ...,  0.00355835,
       -0.00043123, -0.00221435], dtype=float32)

In [6]:
# Number of samples in the loaded recording.
len(y)

88200