In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, RepeatVector, TimeDistributed, Dense
from keras.initializers import VarianceScaling, RandomUniform
from keras.utils.np_utils import to_categorical
from jamotools import Vectorizationer, rules
from unicodedata import normalize

In [2]:
spelling = pd.read_csv("vec_spelling.csv", sep="\t")
spelling.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2,21,2,26,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,21,2,27,42,0,0,0,0,0,0,0,0,0,0,0,0
2,2,21,2,34,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,21,3,21,13,41,0,0,0,0,0,0,0,0,0,0,0
4,2,21,3,21,58,5,21,0,0,0,0,0,0,0,0,0,0


In [3]:
# the three parameters for the "encoding" step: vocab_size, output_dim, and num_cols

In [4]:
# the input dimension/vocabulary size for the model will be the number of rows passed into the embedding layer for training

In [5]:
vocab_size = spelling.shape[0]
vocab_size

10120

In [6]:
# the output dimension for the embedding layer is the number of columns/parameters

In [7]:
output_dim = spelling.shape[1]
output_dim

17

In [8]:
pronunciation = pd.read_csv("vec_pronunciation.csv", sep="\t")
pronunciation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2,21,113,2,26,0,0,0,0,0,0,0,0,0,0,0,0
1,2,21,2,27,42,0,0,0,0,0,0,0,0,0,0,0,0
2,2,21,2,34,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,21,3,21,13,41,0,0,0,0,0,0,0,0,0,0,0
4,2,21,3,21,58,6,21,0,0,0,0,0,0,0,0,0,0


In [9]:
# As the decoder RNN's input, repeatedly provide with the last output of
# RNN for each time step. Repeat "number of columns" times as that's the maximum
# length of output of the model

In [10]:
num_cols = pronunciation.shape[1]
num_cols

17

In [11]:
# the parameter for the decoding step: onehot_pronunciation

# one-hot encode the target so that all mistakes are treated the same

In [12]:
onehot_pronunciation = to_categorical(pronunciation)
onehot_pronunciation.shape

(10120, 17, 114)

In [13]:
# inspect the one-hot encoding

In [14]:
pronunciation.iloc[0]

0       2
1      21
2     113
3       2
4      26
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
Name: 0, dtype: int64

In [15]:
len(onehot_pronunciation[0])

17

In [16]:
len(onehot_pronunciation[0][0])

114

In [17]:
# the first value encoded here should match the first value from pronunciation.iloc[0]

In [18]:
onehot_pronunciation[0][0]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [19]:
# the output layer should have as many columns which now exist in the one-hot encoded array

In [20]:
onehot_cols = onehot_pronunciation.shape[2]
onehot_cols

114

In [21]:
# NMT models can be divided into two parts: the encoder and the decoder

# the encoder:
# an embedding layer to create word vectors from the input language; an LSTM layer; a RepeatVector which should be repeated
# as many times as the length of the output.
# NOTE: the Bidirectional wrapper around the lSTM layer trains two LSTMs instead of one, with one of the layers provided
#       with reversed copies of the input sequences. this provides more context to the network and can result in faster
#       training and better learning.
# NOTE: mask_zero=True indicates that input value zero is a special "padding" value in the vocabulary that should be
#       masked out. if this is set to True, the vocabulary size should be increased by +1 as index zero cannot be used

# the decoder:
# an LSTM layer which returns sequences (returns the hidden state output at each time step). a TimeDistributed/Dense layer
# which adds one Dense node to each unit of the previous layer. 
# NOTE: using an lSTM layer which returns sequences and a TimeDistributed layer together like this is useful when comparing
#       an entire sequence instead of just a final result like in classification. this way, the loss function is computed
#       for each token

In [22]:
model = Sequential()
model.add(Embedding(vocab_size+1, output_dim, trainable=True, mask_zero=True, 
                    embeddings_initializer=RandomUniform(minval=-0.05, maxval=0.05, seed=42)))
model.add(Bidirectional(LSTM(128, recurrent_activation="hard_sigmoid", 
                             kernel_initializer=VarianceScaling(distribution="uniform", mode="fan_avg"))))
model.add(RepeatVector(num_cols))
model.add(LSTM(256, recurrent_activation="hard_sigmoid", return_sequences=True,
                             kernel_initializer=VarianceScaling(distribution="uniform", mode="fan_avg")))
model.add(TimeDistributed(Dense(onehot_cols, activation="softmax",
                               kernel_initializer=VarianceScaling(distribution="uniform", mode="fan_avg"))))

In [23]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 17)          172057    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               149504    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 17, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 17, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 17, 114)           29298     
Total params: 876,171
Trainable params: 876,171
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(spelling, onehot_pronunciation, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2b6a9ce4b88>

In [26]:
# test the model

In [27]:
# first give it something it's already been trained on

In [28]:
# NOTE: make sure to reshape input before predicting

In [29]:
spelling.iloc[0].shape

(17,)

In [30]:
test_easy = spelling.iloc[0].values.reshape(1, spelling.iloc[0].shape[0])
test_easy.shape

(1, 17)

In [31]:
test_easy

array([[ 2, 21,  2, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]], dtype=int64)

In [32]:
# use subscript [0] on output of predict as it returns an array of predictions. 
# (the actual shape for a single prediction is (1, 17, 114))

In [33]:
prediction = model.predict(test_easy)[0]

In [34]:
# the shape stored in prediction now is num_cols x onehot_cols.
# there are 17 arrays of length 114 as there are 17 predictions to be made and 114 possibilites to choose from.
# access the highest probability for each prediction using .argmax() with axis=1

In [35]:
prediction.argmax(axis=1)

array([ 2, 21,  2, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [36]:
# the model predicted the exact same pronunciation as spelling for this word

In [37]:
(test_easy == prediction.argmax(axis=1)).all()

True

In [38]:
# NOTE however, that the pronunciation is actually different

In [39]:
spelling.iloc[0] == pronunciation.iloc[0]

0      True
1      True
2     False
3     False
4     False
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
Name: 0, dtype: bool

In [40]:
# test to see if the model simply learned to output the exact same thing it received

In [41]:
test_output = spelling.iloc[4].values.reshape(1, spelling.iloc[4].shape[0])
test_output

array([[ 2, 21,  3, 21, 58,  5, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]], dtype=int64)

In [42]:
prediction = model.predict(test_output)[0]

In [43]:
prediction.argmax(axis=1)

array([ 2, 21,  3, 21, 58,  6, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [58]:
# the output isn't exactly the same, and this one is actually right

In [45]:
(test_output == prediction.argmax(axis=1)).all()

False

In [46]:
(pronunciation.iloc[4].values == prediction.argmax(axis=1)).all()

True

In [59]:
# it certainly isn't producing jibberish, but not all the predictions were correct, even with
# the same data it trained on, so try with more training data

In [48]:
vec = Vectorizationer(rule=rules.RULE_1, max_length=None)

In [49]:
decoder = {v: k for k, v in vec.symbol_map.items()}

In [50]:
def unvectorize_norm_pad(vector):
    temp_list = [decoder[num] for num in vector if num != 0]
    temp_string = "".join(temp_list)
    return normalize("NFC", temp_string)

In [51]:
def unvectorize_norm_prediction(prediction):
    vector = [np.argmax(x) for x in prediction]
    temp_list = [decoder[num] for num in vector if num != 0]
    temp_string = "".join(temp_list)
    return normalize("NFC", temp_string)

In [52]:
# spelling

In [53]:
unvectorize_norm_pad(spelling.iloc[4])

'가깝다'

In [54]:
# predicted pronunciation

In [55]:
unvectorize_norm_prediction(prediction)

'가깝따'

In [56]:
# actual pronunciation

In [57]:
unvectorize_norm_pad(pronunciation.iloc[4])

'가깝따'

In [60]:
unvectorize_norm_prediction(prediction) == unvectorize_norm_pad(pronunciation.iloc[4])

True