In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Input, LSTM
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

In [None]:
import keras.backend as K

# When the backend reports at least one GPU, rebind the name LSTM to the
# cuDNN-backed layer so the model-building cells pick it up unchanged.
gpu_devices = K.tensorflow_backend._get_available_gpus()
if len(gpu_devices) > 0:
    from keras.layers import CuDNNLSTM as LSTM

In [None]:
# Hyperparameters and data limits shared by the cells below.
BATCH_SIZE = 64  # batch_size passed to model.fit
EPOCHS = 100  # epochs passed to model.fit
EMBEDDING_DIM = 100  # selects the GloVe file and sets the encoder embedding width
MAX_VOCAB_SIZE = 20000  # num_words given to both tokenizers
MAX_SEQUENCE_LENGTH = 100  # not referenced by any other cell visible in this notebook
NUM_SAMPLES = 30000  # cap on how many lines are read from the translation file
LATENT_DIM = 256  # LSTM units for encoder and decoder; also the decoder embedding width

In [None]:
# Parallel lists filled while reading the corpus: source sentence,
# target sentence with '<eos>' appended, target sentence with '<sos>' prepended.
input_texts, target_texts, target_input_texts = [], [], []

In [None]:
# Read tab-separated sentence pairs into three parallel lists.
# The lists are reset here so re-running this cell does not append duplicates.
input_texts = []
target_texts = []
target_input_texts = []

count = 0
with open('D:/Downloads/spa-eng/spa.txt', encoding='utf8') as f:
    for line in f:
        # NUM_SAMPLES caps the number of lines read (skipped lines included),
        # not the number of pairs kept.
        count += 1
        if count > NUM_SAMPLES:
            break

        # Skip lines that do not hold at least a source and a target field.
        # Only the first two fields are used, so files that may carry extra
        # tab-separated columns no longer break the unpacking.
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            continue
        input_text, translation = fields[:2]

        input_texts.append(input_text)
        # Decoder input starts with <sos>; decoder target ends with <eos>.
        target_input_texts.append('<sos> '+translation)
        target_texts.append(translation+' <eos>')

print('Number of samples %d' % len(input_texts))

In [None]:
# Fit a tokenizer on the source sentences.
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer_inputs.fit_on_texts(input_texts)

# word -> integer index mapping learned by the fit above
word2idx_inputs = tokenizer_inputs.word_index
print('Num of input words %d' % len(word2idx_inputs))

# Source sentences as lists of integer indices
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

In [None]:
# Length of the longest tokenised source sentence; used as the encoder pad length.
max_len_input = max(map(len, input_sequences))

In [None]:
# filters='' turns off the tokenizer's character filtering, so the
# '<sos>' / '<eos>' markers are kept as tokens.
tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
# Fit on both variants so '<sos>' and '<eos>' are both in the vocabulary.
tokenizer_outputs.fit_on_texts(target_texts + target_input_texts)

# word -> integer index mapping for the target language
word2idx_outputs = tokenizer_outputs.word_index
print('Num of output words %d' % len(word2idx_outputs))

# Integer sequences for the decoder inputs ('<sos> ...') and targets ('... <eos>')
target_input_sequences = tokenizer_outputs.texts_to_sequences(target_input_texts)
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)

In [None]:
# Longest tokenised target sentence; used as the decoder pad length.
max_len_target = max(map(len, target_sequences))
# One more than the vocabulary size, leaving room for index 0.
num_words_output = 1 + len(word2idx_outputs)

In [None]:
# Pad every source sequence to the length of the longest one. No `padding`
# argument is given, so pad_sequences uses its default side (the Keras docs
# list the default as 'pre').
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print('encoder_inputs shape: ', encoder_inputs.shape)

In [None]:
# Decoder inputs are the '<sos> ...' sequences, padded at the end ('post')
# up to max_len_target.
decoder_inputs = pad_sequences(target_input_sequences, maxlen=max_len_target, padding='post')
print('decoder_inputs shape: ', decoder_inputs.shape)

In [None]:
# Decoder targets are the '... <eos>' sequences, padded with the same length
# and side as decoder_inputs so the two arrays line up step by step.
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

In [None]:
# Load pre-trained GloVe vectors into a word -> float32 vector dict.
# Each line of the file is a word followed by its vector components.
word2vec = {}
glove_path = 'D:/Downloads/glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word2vec[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Num of word vectors %d' % len(word2vec))

In [None]:
print('Filling pre-trained embeddings..')
# The matrix needs one row per index the tokenizer can emit: the vocabulary
# size plus one (index 0 is reserved), capped at MAX_VOCAB_SIZE. Using min()
# instead of max() avoids allocating unused rows when the vocabulary is small.
num_words = min(MAX_VOCAB_SIZE, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    # Indices at or beyond num_words have no row in the matrix.
    if i < num_words:
        embedding_vector = word2vec.get(word)
        # Words missing from GloVe keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
# Encoder embedding layer initialised from the GloVe matrix built above and
# frozen (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input,
    trainable=False
)

In [None]:
# One-hot encode the decoder targets: (sample, timestep, word index).
n_samples = len(input_texts)
decoder_targets_one_hot = np.zeros((n_samples, max_len_target, num_words_output), dtype='float32')

# For every (sample, timestep) pair, set the entry at that position's word
# index to 1 in a single indexed assignment. Padded positions have word
# index 0 and therefore mark column 0.
sample_idx = np.arange(decoder_targets.shape[0])[:, None]
step_idx = np.arange(decoder_targets.shape[1])[None, :]
decoder_targets_one_hot[sample_idx, step_idx, decoder_targets] = 1

In [None]:
# Encoder: embed the padded source sequence and run it through an LSTM.
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
# return_state=True makes the layer return its final hidden and cell states
# alongside its output.
encoder = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, h, c = encoder(x)
# The final states are what gets handed to the decoder as its initial state.
encoder_states = [h, c]

In [None]:
# Decoder (training graph): consumes the '<sos> ...' target sequence and
# predicts a word distribution at every timestep.
decoder_inputs_placeholder = Input(shape=(max_len_target,))
# Trainable target-language embedding; its width is LATENT_DIM, not EMBEDDING_DIM.
decoder_embedding = Embedding(num_words_output, LATENT_DIM, trainable=True)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)
# return_sequences=True yields an output per timestep; the returned states are
# discarded here and only used later by the inference graph.
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)
# Softmax over the output vocabulary at each timestep.
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Checkpoint file; defined first because both the resume check and the
# ModelCheckpoint callback below need it.
filepath = "eng2spa.h5"

# Always build the training model from the layers defined in this notebook.
# The inference cells reuse those same layer objects (decoder_embedding,
# decoder_lstm, decoder_dense, ...), so the trained weights must live in them.
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Resume from an earlier checkpoint by loading its weights into this model.
# load_model would return a separate model with its own layers, leaving the
# notebook's layers untrained. Optimizer state is not restored this way.
if os.path.exists(filepath):
    print("loading model")
    model.load_weights(filepath)

# Keep the best model by training loss, checked every 10 epochs.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min', period=10)
callbacks_list = [checkpoint]


In [None]:
# Train on (encoder_inputs, decoder_inputs) -> one-hot decoder targets.
# validation_split=0.2 holds out 20% of the samples for validation.
# `r` keeps the returned history object, which the plotting cells read.
r = model.fit(
    [encoder_inputs, decoder_inputs],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=callbacks_list)

In [None]:
# Save the model as it stands after training to its own file.
model.save('my_model.h5')

In [None]:
# Training vs. validation loss per epoch.
# Keras stores validation metrics under the 'val_' prefix, so the key is
# 'val_loss'; 'valid_loss' raises KeyError.
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='valid_loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
# r.history is a dict and must be indexed, not called. Depending on the Keras
# version the metric is recorded as 'acc' or 'accuracy', so use whichever exists.
acc_key = 'acc' if 'acc' in r.history else 'accuracy'
plt.plot(r.history[acc_key], label='accuracy')
plt.plot(r.history['val_' + acc_key], label='valid_accuracy')
plt.legend()
plt.show()

In [None]:
# Inference-time encoder: maps a padded source sequence to the LSTM's final [h, c].
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

# Inference-time decoder state inputs: the previous step's hidden and cell states.
decoder_state_input_h = Input((LATENT_DIM,))
decoder_state_input_c = Input((LATENT_DIM,))
decoder_states_input = [decoder_state_input_h, decoder_state_input_c]

# The decoder is fed one token per call, reusing the embedding, LSTM and dense
# layers that were used in the training graph.
decoder_input_single = Input((1,))
decoder_input_single_x = decoder_embedding(decoder_input_single)
decoder_outputs, h, c = decoder_lstm(decoder_input_single_x, initial_state=decoder_states_input)
# Updated states are returned so they can be fed back on the next step.
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Single-step decoder model: (token, h, c) -> (word probabilities, new h, new c).
# The inputs are one flat list of three tensors, matching how decode_sequence
# calls predict([target_seq] + states_values). The previous form nested the
# state inputs inside a second list.
decoder_model = Model([decoder_input_single] + decoder_states_input, [decoder_outputs] + decoder_states)

In [None]:
# Reverse lookups (integer index -> word) for the source and target vocabularies.
idx2word_eng = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_trans = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))

In [None]:
def decode_sequence(input_seq):
    """Greedily translate one padded source sequence.

    The encoder turns ``input_seq`` into its final LSTM states. The decoder is
    then run one token at a time, starting from '<sos>', feeding each
    prediction and the updated states back in. Decoding stops when '<eos>' is
    predicted or after ``max_len_target`` steps.

    Parameters
    ----------
    input_seq
        A single padded input sequence, as passed to ``encoder_model.predict``
        (the demo cell passes ``encoder_inputs[i:i+1]``).

    Returns
    -------
    str
        The predicted target words joined by single spaces.
    """
    # The encoder's final [h, c] seed the decoder.
    states_values = encoder_model.predict(input_seq)

    # One-token decoder input, initialised to the start-of-sentence index.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']

    eos = word2idx_outputs['<eos>']

    output_sentence = []
    for _ in range(max_len_target):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_values)

        # Greedy choice: the most probable word at this step.
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        # Index 0 has no entry in the vocabulary, so it contributes no word.
        if idx > 0:
            output_sentence.append(idx2word_trans[idx])

        # Feed the prediction and the updated states into the next step.
        # The states must be written back to `states_values`; otherwise every
        # step restarts from the encoder states.
        target_seq[0, 0] = idx
        states_values = [h, c]

    return ' '.join(output_sentence)

In [None]:
# Interactive demo: translate randomly chosen training sentences until the
# user answers with something starting with 'n' (case-insensitive).
keep_translating = True
while keep_translating:
    i = np.random.choice(len(input_texts))
    # Slice rather than index so the encoder receives a batch of one.
    input_seq = encoder_inputs[i:i+1]
    translation = decode_sequence(input_seq)
    print('---')
    print('Input: '+input_texts[i])
    print('Translation: '+translation)

    ans = input("Continue [Y/n]?")
    # An empty answer keeps the loop going.
    keep_translating = not (ans and ans.lower().startswith('n'))