In [6]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.functional as F
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
import string
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
import IPython.display
import keras

## Preprocessing: one-hot encoding words and phonemes

In [None]:
# Only needed when preprocessing has not been done yet.
# Step 1: keep rows that actually have a word.
has_word = words.word.notnull()
words = words[has_word]
# Step 2: discard rows whose pronunciation contains '#'.
has_hash = words.phonem.str.contains('#')
words = words[~has_hash]
def onehotEncoder(row):
    """One-hot encode a (word, phonemes) row into two aligned sequences.

    Both sequences have ``len(word) + 1 + n_phonemes`` steps:

    * input:  the word's characters, the terminator ``%``, then one ``?``
      per phoneme;
    * target: one ``?`` per character and one for the terminator, then the
      phonemes.

    Relies on the notebook-level ``letters_encoder`` and ``phonem_encoder``
    already being fitted when this function is called.

    Parameters
    ----------
    row : object exposing ``word`` (str) and ``phonem`` (space-separated str)
        attributes, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    tuple
        ``(word_encoded, phonem_encoded)``: the dense arrays returned by the
        encoders' ``transform(...).toarray()``.
    """
    phonemes = row.phonem.split()

    # Input side: letters, terminating character, then '?' filler for the rest.
    padded_word = row.word + "%" + "?" * len(phonemes)
    word_enc = letters_encoder.transform([[ch] for ch in padded_word]).toarray()

    # Target side: '?' filler under the word and its terminator, then phonemes.
    padded_phonemes = ["?"] * (len(row.word) + 1) + phonemes
    phonem_enc = phonem_encoder.transform([[ph] for ph in padded_phonemes]).toarray()

    return word_enc, phonem_enc
    

In [527]:
# Encode every row. onehotEncoder returns an (input, target) pair, which
# result_type='expand' spreads over two columns; give those columns names.
encoded = (
    words
    .apply(onehotEncoder, axis=1, result_type='expand')
    .rename(columns={0: 'word_encoded', 1: 'phonem_encoded'})
)

# Cache under the file name and compression that the Training section loads
# with pd.read_json('data.json.gz', compression='gzip'); writing a .bz2 file
# here would leave that cell without its input on a fresh run.
encoded.to_json('data.json.gz', compression='gzip')

In [354]:
# Peek at the first five encoded rows (head() defaults to n=5).
encoded.head()

Unnamed: 0,word_encoded,phonem_encoded
0,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


## Training

In [11]:
# Load the cached encoded dataset, then split it into model inputs (X) and
# targets (y), one array entry per word.
words = pd.read_json('data.json.gz', compression='gzip')
X, y = (words[column].to_numpy() for column in ('word_encoded', 'phonem_encoded'))

In [12]:
# Phoneme inventory: the 'Phonems' column of phonems.csv as a NumPy array.
phonems_table = pd.read_csv('phonems.csv')
phones = phonems_table['Phonems'].to_numpy()
# Raw word / pronunciation table. NOTE: this rebinds `words`.
words = pd.read_csv('words.csv')

In [13]:
# Display the word / phoneme table currently bound to `words`.
words

Unnamed: 0,word,phonem
0,'bout,B AW T
1,'cause,K AH Z
2,'course,K AO R S
3,'cuse,K Y UW Z
4,'em,AH M
...,...,...
125992,zysk,Z AY S K
125993,zyskowski,Z IH S K AO F S K IY
125994,zyuganov,Z Y UW G AA N AA V
125995,zyuganov's,Z Y UW G AA N AA V Z


In [14]:
# Hold out 15% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.15)

In [23]:
# Input symbols: lowercase ASCII letters plus . ' - ? and %.
# Each symbol is wrapped in its own one-element row, as the encoder expects
# a 2-D (n_samples, n_features) input.
alphabet = string.ascii_lowercase + ".'-?%"
letters = [[symbol] for symbol in alphabet]

# Output symbols: every entry of `phones` plus the '?' symbol.
phonem_symbols = np.append(phones, '?')
phonem = [[str(symbol)] for symbol in phonem_symbols]

# fit() returns the fitted encoder itself, so construction and fitting chain.
letters_encoder = OneHotEncoder().fit(letters)
phonem_encoder = OneHotEncoder().fit(phonem)

phonem_encoder

OneHotEncoder()

In [16]:
# Number of categories each encoder learned: the width of one input step and
# the number of classes of one output step, respectively.
input_shape, output_shape = (
    len(encoder.categories_[0]) for encoder in (letters_encoder, phonem_encoder)
)
input_shape, output_shape

(31, 40)

In [17]:
# Sequence-to-sequence tagger: two recurrent branches read the same input,
# are concatenated, and feed a smaller LSTM followed by a per-step softmax.
# The time dimension is None, so sequences of any length are accepted.
input_layer = Input(shape=(None, input_shape))
# Branch 1: unidirectional LSTM emitting one 512-wide vector per time step.
lstm_512 = LSTM(512, return_sequences=True)(input_layer)
dropout_1 = Dropout(0.4)(lstm_512)
# Branch 2: bidirectional LSTM applied to the raw input (not stacked on branch 1).
blstm_512 = Bidirectional(LSTM(512, return_sequences=True))(input_layer)
dropout_2 = Dropout(0.4)(blstm_512)
# Merge both branches feature-wise and pass them through a 128-unit LSTM.
lstm_128 = LSTM(128, return_sequences=True)(keras.layers.concatenate([dropout_1, dropout_2]))
dropout_3 = Dropout(0.3)(lstm_128)
# One softmax distribution over the `output_shape` classes at every time step.
output = Dense(output_shape, activation='softmax')(dropout_3)
model = Model(input_layer, output)

In [18]:
# Targets are one-hot rows, hence categorical cross-entropy; accuracy is
# tracked as an additional metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 31)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 512)    1114112     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, None, 1024)   2228224     input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, None, 512)    0           lstm[0][0]                       
______________________________________________________________________________________________

In [195]:
# Checkpoint location. It must be the same path the Prediction section
# restores with model.load_weights('weights.hdf5'); saving under
# "saved_weights/" would leave that call without a matching file.
filepath = "weights.hdf5"
# Overwrite the checkpoint only when the monitored training loss improves
# (mode='min': lower is better).
checkpoint = ModelCheckpoint(filepath, monitor='loss', mode='min', save_best_only=True)
callbacks_list = [checkpoint]

In [196]:
# Run tf.function-decorated code eagerly instead of as compiled graphs.
# `experimental_run_functions_eagerly` is deprecated in favour of
# `run_functions_eagerly`; use the stable name when this TensorFlow build has
# it and fall back to the experimental one otherwise.
if hasattr(tf.config, "run_functions_eagerly"):
    tf.config.run_functions_eagerly(True)
else:
    tf.config.experimental_run_functions_eagerly(True)

In [163]:
# Distinct array shapes found among the training inputs, as a list of tuples.
size = list({np.asarray(sample).shape for sample in X_train})

In [None]:
def _bucket_by_length(samples, targets):
    """Group (sample, target) pairs by ``len(sample)``.

    Returns a dict mapping each length to an ``(inputs, outputs)`` pair of
    lists of arrays, in the order the pairs were encountered.
    """
    buckets = {}
    for sample, target in zip(samples, targets):
        inputs, outputs = buckets.setdefault(len(sample), ([], []))
        inputs.append(np.asarray(sample))
        outputs.append(np.asarray(target))
    return buckets


# Bucket both splits once up front rather than rescanning every sample for
# every length. Iterating X_test directly also removes the need to index it
# with training indices and swallow the resulting errors with a bare except.
train_buckets = _bucket_by_length(X_train, y_train)
valid_buckets = _bucket_by_length(X_test, y_test)

# Sequences in one batch must share a length, so train one length at a time.
for j in size:
    seq_len = j[0]
    new_X_train, new_Y_train = train_buckets.get(seq_len, ([], []))
    new_X_valid, new_Y_valid = valid_buckets.get(seq_len, ([], []))

    # Lengths with no more than 32 training samples are skipped.
    if len(new_X_train) <= 32:
        continue

    # A length can be missing from the held-out split; in that case fit
    # without validation instead of handing Keras empty arrays.
    validation_data = None
    if new_X_valid:
        validation_data = (np.asarray(new_X_valid), np.asarray(new_Y_valid))

    IPython.display.clear_output(True)
    model.fit(
        np.asarray(new_X_train),
        np.asarray(new_Y_train),
        epochs=20,
        batch_size=32,
        callbacks=callbacks_list,
        validation_data=validation_data,
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
  4/119 [>.............................] - ETA: 58s - loss: 0.0892 - accuracy: 0.9715

## Prediction

In [20]:
# Restore previously saved weights from weights.hdf5 into the model built above.
model.load_weights('weights.hdf5')

In [37]:
# Query string: the word, the '%' terminator, then '?' padding.
s = "barn%????"
# One-hot encode character by character (one single-feature row per character).
encoded_chars = letters_encoder.transform([[ch] for ch in s]).toarray()
# predict() works on batches: wrap the single sequence, then unwrap the result.
batch_predictions = model.predict(np.asarray([encoded_chars]))
p = batch_predictions[0]

In [43]:
# Take the most likely symbol at every time step, then drop the '?' padding
# symbol wherever it occurs. Filtering tokens also removes a '?' in final
# position, which a "? " substring replacement would leave in the output.
decoded = [phonem_encoder.categories_[0][np.argmax(step)] for step in p]
' '.join(symbol for symbol in decoded if symbol != '?')

'B AA R N'