# Dataset link : http://download.tensorflow.org/data/spa-eng.zip

In [1]:
import numpy as np
import os
import re

In [2]:
spa_eng_file = open('spa-eng/spa.txt',encoding='utf-8').read()

In [3]:
spa_eng_li = spa_eng_file.split('\n')

In [4]:
spa_eng_li[:5]

['Go.\tVe.', 'Go.\tVete.', 'Go.\tVaya.', 'Go.\tVáyase.', 'Hi.\tHola.']

In [5]:
lower_li = []
for i in range(len(spa_eng_li)):
    lower_li.append(spa_eng_li[i].lower().strip())

In [6]:
lower_li[:5]

['go.\tve.', 'go.\tvete.', 'go.\tvaya.', 'go.\tváyase.', 'hi.\thola.']

In [7]:
req_li = []
for i in range(len(lower_li)):
    req_li.append(lower_li[i].split('\t'))

In [8]:
req_li[:5]

[['go.', 've.'],
 ['go.', 'vete.'],
 ['go.', 'vaya.'],
 ['go.', 'váyase.'],
 ['hi.', 'hola.']]

In [9]:
final_li = []
for i in range(len(req_li)):
    li = []
    for j in range(len(req_li[i])):
        w = ''
        w = re.sub(r"([?.!,¿])", r" \1 ", req_li[i][j])
        w = re.sub(r'[" "]+', " ", w)
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
        w = w.rstrip().strip()
        w = 'start ' + w + ' end'
        li.append(w)
    final_li.append(li)

In [10]:
final_li.remove(['start  end'])

In [11]:
len(final_li)

118964

In [12]:
final_li[:5]

[['start go . end', 'start ve . end'],
 ['start go . end', 'start vete . end'],
 ['start go . end', 'start vaya . end'],
 ['start go . end', 'start v yase . end'],
 ['start hi . end', 'start hola . end']]

In [13]:
all_sent_li = []
for i in range(len(final_li)):
    all_sent_li.append(final_li[i][0])
    all_sent_li.append(final_li[i][1])

In [14]:
all_sent_li[:5]

['start go . end',
 'start ve . end',
 'start go . end',
 'start vete . end',
 'start go . end']

In [15]:
len(all_sent_li)

237928

In [16]:
X = []
Y = []

for i in range(len(all_sent_li)):
    if i%2 != 0:
        X.append(all_sent_li[i])
    else:
        Y.append(all_sent_li[i])

In [17]:
X = X[:50000]

In [18]:
Y = Y[:50000]

In [19]:
len(X), len(Y)

(50000, 50000)

In [20]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
tokenizer_X = Tokenizer()
tokenizer_X.fit_on_texts(X)
word2idx_X = tokenizer_X.word_index
idx2word_X = tokenizer_X.index_word

In [22]:
len_word2idx_X = len(word2idx_X) + 1

In [23]:
tokenizer_Y = Tokenizer()
tokenizer_Y.fit_on_texts(Y)
word2idx_Y = tokenizer_Y.word_index
idx2word_Y = tokenizer_Y.index_word

In [24]:
len_word2idx_Y = len(word2idx_Y) + 1

In [25]:
len_word2idx_X, len_word2idx_Y

(12180, 6810)

In [26]:
sequences_X = tokenizer_X.texts_to_sequences(X)

In [27]:
sequences_Y = tokenizer_Y.texts_to_sequences(Y)

In [28]:
len(sequences_X), len(sequences_Y)

(50000, 50000)

In [29]:
li_len_X = []
for i in range(len(sequences_X)):
    li_len_X.append(len(sequences_X[i]))

li_len_Y = []
for i in range(len(sequences_Y)):
    li_len_Y.append(len(sequences_Y[i]))
    
max_len_X = max(li_len_X)
max_len_Y = max(li_len_Y)

print(max_len_X, max_len_Y)

16 10


In [30]:
padded_X = pad_sequences(sequences_X, maxlen = max_len_X, padding='post')

In [31]:
padded_X

array([[   2,  177,    1, ...,    0,    0,    0],
       [   2,  539,    1, ...,    0,    0,    0],
       [   2,  596,    1, ...,    0,    0,    0],
       ...,
       [   2,    9, 4697, ...,    0,    0,    0],
       [   2,   50,  655, ...,    0,    0,    0],
       [   2,    9,  335, ...,    0,    0,    0]])

In [32]:
padded_Y = pad_sequences(sequences_Y, maxlen = max_len_Y, padding='post')

In [33]:
padded_Y

array([[   1,   37,    2, ...,    0,    0,    0],
       [   1,   37,    2, ...,    0,    0,    0],
       [   1,   37,    2, ...,    0,    0,    0],
       ...,
       [   1,    9,  517, ..., 1695,    2,    0],
       [   1,    9,  517, ..., 1695,    2,    0],
       [   1,    9,  282, ...,    0,    0,    0]])

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_X, padded_Y, random_state = 1, test_size = 0.2)

In [36]:
len(X_train), len(Y_train), len(X_test), len(Y_test)

(40000, 40000, 10000, 10000)

In [37]:
len_word2idx_X

12180

In [38]:
X_train[0].shape

(16,)

In [39]:
X_train.shape

(40000, 16)

In [40]:
Y_train.shape

(40000, 10)

In [41]:
import nltk

In [42]:
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import random
import matplotlib.pyplot as plt
%matplotlib inline
from nmt_utils import *

Using TensorFlow backend.


In [43]:
from attention import AttentionLayer

In [44]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [45]:

from keras import backend as K 
K.clear_session()

latent_dim = 200
embedding_dim=256

# Encoder
encoder_inputs = Input(shape=(max_len_X,))
enc_emb =  Embedding(len_word2idx_X, embedding_dim,trainable=True)(encoder_inputs)

encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Decoder
decoder_inputs = Input(shape=(None,))

dec_emb_layer = Embedding(len_word2idx_Y, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

decoder_dense =  TimeDistributed(Dense(len_word2idx_Y, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 16, 256)      3118080     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 16, 200), (N 365600      embedding[0][0]                  
_____________________________________________________________________________________________

In [46]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [47]:
model.fit([X_train, Y_train[:,:-1]], Y_train.reshape(Y_train.shape[0],Y_train.shape[1], 1)[:,1:] ,epochs=30,batch_size=128) 

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 40000 samples
Epoch 1/30




Epoch 2/30




Epoch 3/30




Epoch 4/30




Epoch 5/30




Epoch 6/30




Epoch 7/30




Epoch 8/30




Epoch 9/30




Epoch 10/30




Epoch 11/30




Epoch 12/30




Epoch 13/30




Epoch 14/30




Epoch 15/30




Epoch 16/30




Epoch 17/30




Epoch 18/30




Epoch 19/30




Epoch 20/30




Epoch 21/30




Epoch 22/30




Epoch 23/30




Epoch 24/30




Epoch 25/30




Epoch 26/30




Epoch 27/30




Epoch 28/30




Epoch 29/30




Epoch 30/30






<tensorflow.python.keras.callbacks.History at 0x161afeaba08>

In [48]:
model.save('translator_model.h5')

In [49]:
model = model.load_weights('translator_model.h5')

In [50]:
# encoder inference
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# decoder inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_len_X,latent_dim))

dec_emb2= dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

#attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

decoder_outputs2 = decoder_dense(decoder_inf_concat)

decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])

In [51]:
def make_predictions(input_seq):
    
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    
    decoded_seq = np.zeros((1,1))
    decoded_seq[0,0] = word2idx_Y['start']
    decoded_seq_word = 'start'
    
    
    li=[]
    len_pred_Y = max_len_Y - 1
    
    while decoded_seq_word !='end' and len_pred_Y>=0:
        
        decoded_seq, h,c = decoder_model.predict([decoded_seq] + [e_out,e_h,e_c])
        decoded_seq = np.argmax(decoded_seq[0,-1,:])
        decoded_seq_word = idx2word_Y[decoded_seq]
        decoded_seq = np.zeros((1,1))
        decoded_seq[0,0] = word2idx_Y[decoded_seq_word]
        
        e_h = h
        e_c = c
        
        li.append(decoded_seq_word)
        
        len_pred_Y = len_pred_Y-1
    
    li.remove('end')
    out_final = " ".join(li)
    
    return out_final

In [52]:
for i in range(10):
    
    print("Input:")
    print("  ", end=' ')
    for j in range(1,len(X_train[i])):
        if X_train[i][j] == 2 or X_train[i][j] == 0 :
            break
        else:
            print(idx2word_X[X_train[i][j]], end=' ')
            
    print()
    print("Actual Translation:")  
    print("  ", end=' ')
    for k in range(1,len(Y_train[i])-2):
        if Y_train[i][k] == 2 or Y_train[i][k] == 0 :
            break
        else:
            print(idx2word_Y[Y_train[i][k]], end=' ')
        
    print()
    print("Predicted Translation:") 
    print("  ", end=' ')
    print(make_predictions(X_train[i].reshape(1,max_len_X)))
    
    print()

Input:
   tom s todav a es joven end 
Actual Translation:
   tom is still young 
Predicted Translation:
   tom is still young

Input:
   tengo que tomar medicinas end 
Actual Translation:
   i have to take medicine 
Predicted Translation:
   i have to take pictures

Input:
   apenas conozco a tom end 
Actual Translation:
   i barely know tom 
Predicted Translation:
   i know tom

Input:
   ella canta muy bien end 
Actual Translation:
   she can sing very well 
Predicted Translation:
   she s very good

Input:
   es demasiado lindo para ser verdad end 
Actual Translation:
   it s too good to be true 
Predicted Translation:
   it s too good to be true

Input:
   tom est sudando end 
Actual Translation:
   tom is sweating 
Predicted Translation:
   tom is getting paid

Input:
   rep rtenos los naipes end 
Actual Translation:
   deal us the cards 
Predicted Translation:
   leave the guitar

Input:
   l estaba enfadado con su hijo end 
Actual Translation:
   he was angry with his son 
Predi

In [54]:
for i in range(10,20):
    
    print("Input:")
    print("  ", end=' ')
    for j in range(1,len(X_test[i])):
        if X_test[i][j] == 2 or X_test[i][j] == 0 :
            break
        else:
            print(idx2word_X[X_test[i][j]], end=' ')
            
    print()
    print("Actual Translation:")    
    print("  ", end=' ')
    for k in range(1,len(Y_test[i])-2):
        if Y_test[i][k] == 2 or Y_test[i][k] == 0 :
            break
        else:
            print(idx2word_Y[Y_test[i][k]], end=' ')
        
    print()
    print("Predicted Translation:") 
    print("  ", end=' ')
    print(make_predictions(X_test[i].reshape(1,max_len_X)))
    
    print()

Input:
   escriban con tinta end 
Actual Translation:
   write in ink 
Predicted Translation:
   write in the gun

Input:
   ¿ por qu compraste una flor end 
Actual Translation:
   why did you buy a flower 
Predicted Translation:
   why did you buy a pencil

Input:
   ¿ qu quieren end 
Actual Translation:
   what do they want 
Predicted Translation:
   what do you want

Input:
   tom durmi en el coche end 
Actual Translation:
   tom slept in the car 
Predicted Translation:
   tom slept in the car

Input:
   ¿ por qu no nos vamos a casa end 
Actual Translation:
   why don t we go home 
Predicted Translation:
   why don t we go home

Input:
   m ralo end 
Actual Translation:
   look at it 
Predicted Translation:
   send it to me

Input:
   esta habitaci n est demasiado oscura end 
Actual Translation:
   this room is too dark 
Predicted Translation:
   this room is too dark

Input:
   hoy anduve en un monociclo end 
Actual Translation:
   i rode a unicycle today 
Predicted Translation:
  