In [1]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

In [2]:
# Number of samples per gradient update; passed to model.fit().
BATCH_SIZE = 64
# Number of passes over the training data; passed to model.fit().
EPOCHS = 20
# Units of the encoder and decoder LSTMs; also used as the decoder embedding width.
LSTM_NODES =256
# Upper bound on the number of lines read from the corpus file.
NUM_SENTENCES = 20000
# NOTE(review): not referenced anywhere in the visible notebook - confirm it is still needed.
MAX_SENTENCE_LENGTH = 50
# Vocabulary cap handed to both tokenizers and used to bound the input embedding matrix.
MAX_NUM_WORDS = 20000
# Width of the input word vectors; the GloVe file loaded later is the 100-dimensional one.
EMBEDDING_SIZE = 100

## Data Preprocessing

##### To develop such a model, we need a dataset that contains English sentences and their Turkish translations.

##### In our dataset, we do not need to process the input; however, we need to generate two copies of each translated sentence: one with the start-of-sentence token and the other with the end-of-sentence token.

In [3]:
# Each line of tur.txt holds an English sentence and its Turkish translation separated by a tab.
# The left field (the English sentence) is added to the input_sentences[] list.
# The second field is the corresponding translated Turkish sentence.
# The <eos> token, which marks the end of the sentence, is appended to the translated sentence,
# and the resulting sentence is added to the output_sentences[] list.
# Similarly, the <sos> token, which stands for "start of sentence",
# is prepended to the translated sentence and the result is added to the output_sentences_inputs[] list.
# At most NUM_SENTENCES lines (i.e. 20,000) are read from the file; lines that do not
# contain both an English and a Turkish field are skipped.

input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
# The context manager guarantees the file handle is closed, even if a line fails to parse.
with open(r'tur.txt', encoding="utf-8") as corpus_file:
    for line in corpus_file:
        count += 1

        if count > NUM_SENTENCES:
            break

        if '\t' not in line:
            continue

        # Some exports of this corpus carry extra tab-separated columns (e.g. attribution),
        # and rstrip() can remove an empty trailing field; only the first two fields are used.
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            continue
        input_sentence, output = fields[0], fields[1]

        output_sentence = output + ' <eos>'
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 20000
num samples output: 20000
num samples output input: 20000


In [4]:
# Show sample 47 from each of the three parallel lists, one per line:
# the English source, the <eos>-terminated target and the <sos>-prefixed decoder input.
print(
    input_sentences[47],
    output_sentences[47],
    output_sentences_inputs[47],
    sep='\n',
)

I'm up.
Ben uyanığım. <eos>
<sos> Ben uyanığım.


## Tokenization and Padding

##### Tokenizing the original and translated sentences and applying padding to the sentences that are longer or shorter than a certain length. This is extremely important since deep learning and machine learning algorithms work with numbers.

In [5]:
# Fit a word-level tokenizer on the English sentences and encode each sentence
# as a list of integer word ids.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# word -> integer id lookup for the English vocabulary.
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

# The longest encoded sentence determines the encoder's padded input length.
max_input_len = max(map(len, input_integer_seq))
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 3951
Length of longest sentence in input: 5


In [6]:
# filters='' disables the tokenizer's character filtering, so the <sos>/<eos> markers
# and punctuation stay part of the tokens.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# One shared vocabulary is fitted over both target variants so that <sos> and <eos> both get ids.
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq, output_input_integer_seq = (
    output_tokenizer.texts_to_sequences(texts)
    for texts in (output_sentences, output_sentences_inputs)
)

# word -> integer id lookup for the Turkish vocabulary.
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# One extra class on top of the vocabulary size, leaving room for index 0 (the padding value).
num_words_output = len(word2idx_outputs) + 1
max_out_len = max(map(len, output_integer_seq))
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 11047
Length of longest sentence in the output: 8


##### We need to pad the input. The reason for padding both the input and the output is that sentences vary in length, whereas the LSTM (the network we are going to use for our model) expects all input instances to have the same length. Therefore, we need to convert our sentences into fixed-length vectors. One way to do this is via padding.

In [7]:
# Pad every encoded English sentence to max_input_len. No `padding` argument is given,
# and the printed sample below shows the zeros being placed in front of the word ids.
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[47]:", encoder_input_sequences[47])

encoder_input_sequences.shape: (20000, 5)
encoder_input_sequences[47]: [ 0  0  0  7 34]


In [8]:
# Ids of the two words of sample 47 ("I'm up."), printed one per line so they can be
# compared with the padded sequence shown above.
print(word2idx_inputs["i'm"], word2idx_inputs["up"], sep='\n')

7
34


In [9]:
# Pad the <sos>-prefixed target sentences to max_out_len. padding='post' puts the zeros
# after the word ids, so each decoder input starts with its <sos> token.
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[47]:", decoder_input_sequences[47])

decoder_input_sequences.shape: (20000, 8)
decoder_input_sequences[47]: [   2    4 1087    0    0    0    0    0]


In [10]:
# Ids of the tokens of sample 47's decoder input, printed one per line so they can be
# compared with the padded sequence shown above.
print(
    word2idx_outputs["<sos>"],
    word2idx_outputs["ben"],
    word2idx_outputs["uyanığım."],
    sep='\n',
)

2
4
1087


## Word Embeddings

##### For English sentences, i.e. the inputs, we will use the GloVe word embeddings. For the translated Turkish sentences in the output, we will use custom word embeddings.

In [11]:
# Load the pre-trained GloVe vectors into a dict mapping word -> float32 vector.
# Each line of the file is a word followed by its vector components, separated by whitespace.

from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

# The context manager closes the file even if a malformed line raises part-way through.
with open(r'glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line (e.g. a stray newline at the end of the file): nothing to parse.
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [12]:
# Recall that we have 3951 unique words in the input.
# We will create a matrix where the row number will represent the integer value
# for the word and the columns will correspond to the dimensions of the word.
# This matrix will contain the word embeddings for the words in our input sentences.
# Row 0 stays all-zero (no word has id 0), as do rows of words that GloVe does not contain.

num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # word_index lists every word seen during fitting, even beyond the num_words cap,
    # so ids that do not fit into the matrix must be skipped to avoid an IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [13]:
# Sanity check: the raw GloVe vector stored for the word "up".
print(embeddings_dictionary["up"])

[ 0.21469    0.43367    0.33964   -0.65715    0.15546    0.15318
 -0.62081    0.27839   -0.3704     0.0029626  0.37131    0.32756
 -0.32802    0.10206    0.52715   -0.33415   -0.012657   0.20382
 -0.19846    0.10483    0.72682    0.30136    0.73955    0.2264
  0.5213    -0.46339   -0.56209   -0.47684    0.056159  -0.46364
 -0.18426    0.15954    0.23868   -0.030124  -0.18315    0.27942
  0.031251  -0.16198   -0.18941    0.2571    -0.48811   -0.70303
 -0.0055224 -0.63184   -0.17694    0.38916   -0.64778   -0.08909
  0.17655   -1.2462    -0.21257   -0.20355    0.11958    1.6196
 -0.77112   -2.8367    -0.21148    0.11873    2.1393     0.78805
  0.41318    0.97607   -0.67157    0.29821    0.12548    0.10129
  0.69104    0.61075    0.58256    0.3346     0.042307  -0.45933
 -0.24029   -0.73154   -0.3054     0.19878   -0.34562    0.0035721
 -0.57002    0.027172   0.68865    0.4502    -0.62077    0.36449
 -1.2001     0.15149    0.58623    0.35867   -0.22877   -0.032302
 -0.18218    0.18319   -

In [14]:
# 34 is the id of "up" in word2idx_inputs, so this row should hold the GloVe vector for "up".
print(embedding_matrix[34])

[ 0.21469     0.43367001  0.33963999 -0.65714997  0.15546     0.15318
 -0.62080997  0.27838999 -0.37040001  0.0029626   0.37131     0.32756001
 -0.32802001  0.10206     0.52714998 -0.33414999 -0.012657    0.20382001
 -0.19846     0.10483     0.72681999  0.30136001  0.73954999  0.2264
  0.52130002 -0.46338999 -0.56208998 -0.47683999  0.056159   -0.46364
 -0.18426     0.15954     0.23868001 -0.030124   -0.18314999  0.27941999
  0.031251   -0.16198    -0.18941     0.25709999 -0.48811001 -0.70302999
 -0.0055224  -0.63183999 -0.17693999  0.38916001 -0.64778    -0.08909
  0.17655    -1.24619997 -0.21257    -0.20355     0.11958     1.61960006
 -0.77112001 -2.83669996 -0.21148001  0.11873     2.13930011  0.78805
  0.41317999  0.97606999 -0.67157     0.29821     0.12548     0.10129
  0.69103998  0.61075002  0.58256     0.3346      0.042307   -0.45932999
 -0.24029    -0.73154002 -0.30540001  0.19878    -0.34562001  0.0035721
 -0.57002002  0.027172    0.68865001  0.45019999 -0.62076998  0.36449
 

In [15]:
# The following script creates the embedding layer for the input:
# num_words rows of EMBEDDING_SIZE values, initialised from embedding_matrix.
# NOTE(review): `trainable` is not set here, so the layer presumably uses the Keras default -
# confirm whether the GloVe vectors are meant to be fine-tuned or frozen.
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

## Creating the Model

##### The first thing we need to do is to define our outputs, as we know that the output will be a sequence of words. Recall that the total number of unique words in the output is 11047; with one extra index reserved for padding, each word in the output can be any of 11048 classes. The length of a padded output sentence is 8. And for each input sentence, we need a corresponding output sentence. Therefore, the final shape of the output will be: (number of inputs, length of the output sentence, the number of words in the output)

In [16]:
# Creating the empty output array: one one-hot vector of num_words_output entries
# for every time step of every sentence.
# NOTE(review): with the shape shown below, (20000, 8, 11048) in float32, this array
# takes roughly 7 GB of RAM.

decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

In [17]:
# Display the shape as (sentences, max_out_len, num_words_output).
decoder_targets_one_hot.shape

(20000, 8, 11048)

In [18]:
# Pad the <eos>-terminated target sentences to max_out_len with trailing zeros (padding='post'),
# the same padding side as used for the decoder inputs.
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')

##### Next, we need to create the encoder and decoders. The input to the encoder will be the sentence in English and the output will be the hidden state and cell state of the LSTM.

In [19]:
# One-hot encode the targets in a single vectorised assignment: for sentence i and time step t,
# set the entry at the word id found in decoder_output_sequences[i, t] to 1.
# Padded steps hold id 0, so they are marked as class 0, exactly as a per-element loop would do.
decoder_targets_one_hot[
    np.arange(decoder_output_sequences.shape[0])[:, None],
    np.arange(decoder_output_sequences.shape[1])[None, :],
    decoder_output_sequences,
] = 1

In [20]:
# Encoder: padded English word ids -> embedding layer -> LSTM.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
# return_state=True makes the layer return its final hidden and cell state next to its output.
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
# Only the two states are carried forward; they become the decoder LSTM's initial state.
encoder_states = [h, c]

##### The decoder will have two inputs: the hidden state and cell state from the encoder, and the input sentence, which is actually the output sentence with an `<sos>` token prepended at the beginning.

In [21]:
# Decoder input: the <sos>-prefixed target sentence, padded to max_out_len word ids.
decoder_inputs_placeholder = Input(shape=(max_out_len,))

# Target-side embedding created without pre-trained weights; its width is LSTM_NODES.
decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True yields one output per time step. The returned states are discarded here;
# the same layer object is reused later to build the inference decoder.
decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [22]:
# The output from the decoder LSTM is passed through a dense layer to predict decoder outputs.
# The softmax produces, for every time step, a distribution over the num_words_output classes.

decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
# Compile the model
# The training model takes the encoder input and the <sos>-prefixed decoder input and
# outputs the per-step softmax; categorical cross-entropy matches the one-hot targets.

model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [24]:
# Train the model
# The decoder is fed the <sos>-prefixed target sentence and trained to output the
# <eos>-terminated one. validation_split=0.1 holds out 10% of the samples for validation.
# The returned History object is kept in `r`.

r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
# Inference-time encoder: maps a padded English sentence to the LSTM's [hidden, cell] states,
# reusing the layers (and therefore the weights) of the trained model.
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

##### Since now at each step we need the decoder hidden and cell states, we will modify our model to accept the hidden and cell states as shown below

In [26]:
# Placeholders through which the previous step's hidden and cell state are fed back
# into the decoder at inference time; each has LSTM_NODES entries.
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

##### Now, at each time step, there will be only a single word in the decoder input, so we need to modify the decoder embedding layer as follows:

In [27]:
# Inference feeds the decoder one word id per step, so the input has length 1.
# The embedding layer created for training is reused, so its weights are shared.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

##### Next, we need to create the placeholder for decoder outputs:

In [28]:
# Run the decoder LSTM created for training for a single step, starting from the states fed in;
# unlike in training, the resulting states are kept so they can seed the next step.
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

##### To make predictions, the decoder output is passed through the dense layer:

In [29]:
# Updated states to hand back to the caller, and the word distribution for this step
# produced by the dense layer created for training.
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

##### The final step is to define the updated decoder model, as shown here:

In [30]:
# Inference-time decoder.
# Inputs:  [current word id, previous hidden state, previous cell state]
# Outputs: [word distribution for this step, new hidden state, new cell state]
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

## Making Prediction

##### In the tokenization steps, we converted words to integers. The outputs from the decoder will also be integers. However, we want our output to be a sequence of words in the Turkish language. To do so, we need to convert the integers back to words. We will create new dictionaries for both inputs and outputs where the keys will be the integers and the corresponding values will be the words.

In [31]:
# Reverse lookups (integer id -> word) built by pairing each id with its word.
idx2word_input = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_target = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))

##### The method accepts a padded English input sentence (in integer form) and returns the translated Turkish sentence.

In [32]:
def translate_sentence(input_seq):
    """Greedily decode a translation for one encoded input sentence.

    The encoder model turns `input_seq` into its final states, which seed the
    decoder. Starting from the `<sos>` id, the decoder is run one step at a
    time; at every step the most probable word id is picked, recorded and fed
    back in as the next input together with the updated states. Decoding
    stops when `<eos>` is predicted or after `max_out_len` steps.

    Args:
        input_seq: padded integer sequence for a single sentence, as passed to
            `encoder_model.predict` (the notebook uses
            `encoder_input_sequences[i:i+1]`).

    Returns:
        The predicted words joined by single spaces. Id 0 (padding) is never
        emitted as a word, and `<eos>` is not included.
    """
    # verbose=0 keeps Keras from printing a progress bar for every decoding step.
    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos = word2idx_outputs['<eos>']
    output_sentence = []

    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        # Greedy choice: most probable class of the single decoded time step.
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        # Id 0 is the padding class and has no entry in idx2word_target.
        if idx > 0:
            output_sentence.append(idx2word_target[idx])

        # Feed the prediction and the updated states back in for the next step.
        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)

## Testing the Model

In [36]:
# Pick one training sentence at random and translate it.
# NOTE(review): no random seed is set in the visible notebook, so every run picks a different sentence.
i = np.random.choice(len(input_sentences))
# Slice with i:i+1 (not [i]) to keep the leading batch axis in the array handed to the encoder.
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: I'll be inside.
Response: i̇çeri olacağım.
