In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, RepeatVector, TimeDistributed, Dense
from keras.initializers import VarianceScaling, RandomUniform
from keras.utils.np_utils import to_categorical

In [2]:
spelling = pd.read_csv("vec_spelling_all.csv", sep="\t")
spelling.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,13,27,42,11,21,20,21,42,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13,27,42,11,21,20,21,42,13,41,...,0,0,0,0,0,0,0,0,0,0
2,13,27,42,11,21,20,21,42,5,29,...,0,0,0,0,0,0,0,0,0,0
3,13,27,42,11,21,20,21,42,8,21,...,0,0,0,0,0,0,0,0,0,0
4,11,41,5,22,14,25,42,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# the three parameters for the "encoding" step: vocab_size, output_dim, and num_cols

In [4]:
# the input dimension/vocabulary size for the model will be the number of rows passed into the embedding layer for training

In [5]:
vocab_size = spelling.shape[0]
vocab_size

88570

In [6]:
# the output dimension for the embedding layer is the number of columns/parameters

In [7]:
output_dim = spelling.shape[1]
output_dim

23

In [8]:
pronunciation = pd.read_csv("vec_pronunciation_all.csv", sep="\t")
pronunciation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,13,27,42,12,21,20,21,42,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13,27,42,12,21,20,21,2,41,0,...,0,0,0,0,0,0,0,0,0,0
2,13,27,42,12,21,20,21,42,6,29,...,0,0,0,0,0,0,0,0,0,0
3,13,27,42,12,21,20,21,62,8,21,...,0,0,0,0,0,0,0,0,0,0
4,11,41,5,22,14,25,42,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# As the decoder RNN's input, repeatedly provide with the last output of
# RNN for each time step. Repeat "number of columns" times as that's the maximum
# length of output of the model

In [10]:
num_cols = pronunciation.shape[1]
num_cols

23

In [11]:
# the parameter for the decoding step: onehot_cols

# one-hot encode the target so that all mistakes are treated the same

In [12]:
onehot_pronunciation = to_categorical(pronunciation)
onehot_pronunciation.shape

(88570, 23, 114)

In [13]:
# inspect the one-hot encoding

In [14]:
pronunciation.iloc[0]

0     13
1     27
2     42
3     12
4     21
5     20
6     21
7     42
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
Name: 0, dtype: int64

In [15]:
len(onehot_pronunciation[0])

23

In [16]:
len(onehot_pronunciation[0][0])

114

In [17]:
# the first value encoded here should match the first value from pronunciation.iloc[0]

In [18]:
onehot_pronunciation[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [19]:
# the output layer should have as many columns as now exist in the one-hot encoded array

In [20]:
onehot_cols = onehot_pronunciation.shape[2]
onehot_cols

114

In [21]:
# NMT models can be divided into two parts: the encoder and the decoder

# the encoder:
# an embedding layer to create word vectors from the input language; an LSTM layer; a RepeatVector which should be repeated
# as many times as the length of the output.
# NOTE: the Bidirectional wrapper around the lSTM layer trains two LSTMs instead of one, with one of the layers provided
#       with reversed copies of the input sequences. this provides more context to the network and can result in faster
#       training and better learning.
# NOTE: mask_zero=True indicates that input value zero is a special "padding" value in the vocabulary that should be
#       masked out. if this is set to True, the vocabulary size should be increased by +1 as index zero cannot be used

# the decoder:
# an LSTM layer which returns sequences (returns the hidden state output at each time step). a TimeDistributed/Dense layer
# which adds one Dense node to each unit of the previous layer. 
# NOTE: using an lSTM layer which returns sequences and a TimeDistributed layer together like this is useful when comparing
#       an entire sequence instead of just a final result like in classification. this way, the loss function is computed
#       for each token

In [22]:
model = Sequential()
model.add(Embedding(vocab_size+1, output_dim, trainable=True, mask_zero=True, 
                    embeddings_initializer=RandomUniform(minval=-0.05, maxval=0.05, seed=42)))
model.add(Bidirectional(LSTM(128, recurrent_activation="hard_sigmoid", 
                             kernel_initializer=VarianceScaling(distribution="uniform", mode="fan_avg"))))
model.add(RepeatVector(num_cols))
model.add(LSTM(256, recurrent_activation="hard_sigmoid", return_sequences=True,
                             kernel_initializer=VarianceScaling(distribution="uniform", mode="fan_avg")))
model.add(TimeDistributed(Dense(onehot_cols, activation="softmax",
                               kernel_initializer=VarianceScaling(distribution="uniform", mode="fan_avg"))))

In [23]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 23)          2037133   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               155648    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 23, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 23, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 23, 114)           29298     
Total params: 2,747,391
Trainable params: 2,747,391
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(spelling, onehot_pronunciation, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d0d20eb548>

In [26]:
# save the weights for later use

In [27]:
# NOTE: after using .save(), load_model() returns an identical, compiled model

In [28]:
model.save("pronunciation_prediction.h5")