In [1]:
#https://stackabuse.com/python-for-nlp-neural-machine-translation-with-seq2seq-in-keras/
#https://therealschool.in/blog/small-simple-sentences-english-kids-guide-early-childhood-language-training/
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = 20000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [3]:
input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
for line in open(r'Project/Machine-Translation-English-to-Hindi-/hin.txt', encoding="utf-8"):
    count += 1

    if count > NUM_SENTENCES:
        break

    if '\t' not in line:
        continue

    input_sentence, output = line.rstrip().split('\t')

    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output

    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 2869
num samples output: 2869
num samples output input: 2869


In [4]:
print(input_sentences[172])
print(output_sentences[172])
print(output_sentences_inputs[172])

This is my bag.
यह मेरा बस्ता है। <eos>
<sos> यह मेरा बस्ता है।


In [5]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 2402
Length of longest sentence in input: 22


In [6]:
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 3161
Length of longest sentence in the output: 26


In [7]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (2869, 22)
encoder_input_sequences[172]: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
  13   5  10 473]


In [8]:
print(word2idx_inputs["my"])
print(word2idx_inputs["bag"])

10
473


In [9]:
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (2869, 26)
decoder_input_sequences[172]: [  2  23  56 648   3   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]


In [10]:
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["मेरा"])
print(word2idx_outputs["बस्ता"])

2
56
648


In [11]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open(r'glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [12]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [13]:
print(embeddings_dictionary["bag"])

[-0.47981    0.20963    0.28709   -0.96992   -0.082366   0.6799
  0.58455    0.76565    0.27056    0.33818    0.30404    0.035035
 -0.11251    0.77807    0.40316    0.37327    0.62617    0.36223
  0.1469    -0.80491   -0.082232   0.22542   -0.3419    -0.72515
  0.41322    0.56049   -0.18406   -0.84558   -0.33033   -0.32269
 -0.03368   -0.16566    0.11023    0.07246    0.22264    0.27425
  0.11551    0.32004    0.54079   -0.53373    0.36514   -0.95486
  0.96945   -0.96607   -0.71563    0.3354    -0.75834    0.3477
 -0.26012   -0.31523    0.13754    0.73761    0.28643    1.0257
 -0.51289   -1.2064    -0.43001    0.3032     1.4753     0.0052773
  0.68161    0.43473    0.015818   1.0475    -0.035287  -0.42816
  0.35977   -0.41322    0.13189   -0.10468   -0.56283   -0.11545
  0.41647    0.39828   -0.10006    1.0679     0.0045459 -0.19705
 -0.16776    0.94237    0.33944    0.042042  -0.24697   -0.60955
 -0.89005   -1.0979     0.46529   -0.17461   -0.87865    0.91728
  0.079195   0.80083    0

In [14]:
print(embedding_matrix[473])

[-0.47981     0.20963     0.28709    -0.96991998 -0.082366    0.67989999
  0.58455002  0.76564997  0.27056     0.33818001  0.30404001  0.035035
 -0.11251     0.77806997  0.40316001  0.37327     0.62616998  0.36223
  0.1469     -0.80491    -0.082232    0.22542    -0.34189999 -0.72514999
  0.41321999  0.56049001 -0.18406001 -0.84557998 -0.33033001 -0.32269001
 -0.03368    -0.16565999  0.11023     0.07246     0.22263999  0.27425
  0.11551     0.32003999  0.54079002 -0.53372997  0.36513999 -0.95485997
  0.96945    -0.96607    -0.71562999  0.33539999 -0.75834     0.3477
 -0.26012    -0.31523001  0.13754     0.73760998  0.28643     1.02569997
 -0.51288998 -1.20640004 -0.43000999  0.30320001  1.47529995  0.0052773
  0.68160999  0.43472999  0.015818    1.04750001 -0.035287   -0.42816001
  0.35977    -0.41321999  0.13189    -0.10468    -0.56282997 -0.11545
  0.41646999  0.39827999 -0.10006     1.06789994  0.0045459  -0.19705001
 -0.16776     0.94237     0.33943999  0.042042   -0.24697    -0.609

In [15]:
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

In [16]:
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

In [17]:
decoder_targets_one_hot.shape

(2869, 26, 3162)

In [18]:
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [19]:
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [20]:
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [21]:
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [22]:
epochs = 300

In [130]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,batch_size=batch_size,epochs=epochs,validation_split=0.2)

Train on 2284 samples, validate on 572 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300

KeyboardInterrupt: 

In [131]:
model.save('s2s.h5')

In [135]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [136]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence
