In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
lines = pd.read_table('/content/fre_eng_translation_master_french.txt', names=['english', 'french'])
lines = lines[:15000]
lines.sample(5)

Unnamed: 0,english,french
8163,She's a hottie.,C'est une bombe.
5696,No one's home.,Personne n'est à la maison.
5075,I love Arabic.,J'adore l'arabe.
2946,Are you home?,Es-tu chez toi ?
5877,That's a tree.,C’est un arbre.


In [3]:
lines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  15000 non-null  object
 1   french   15000 non-null  object
dtypes: object(2)
memory usage: 234.5+ KB


In [4]:
lines.shape

(15000, 2)

In [5]:
lines.english = lines.english.apply(lambda x: x.lower())
lines.french = lines.french.apply(lambda x: x.lower())

In [6]:
import re
lines.english = lines.english.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", 'COMMA', x))
lines.french = lines.french.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", 'COMMA', x))

In [7]:

import string
exclude = set(string.punctuation)
lines.english = lines.english.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.french = lines.french.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [8]:
from string import digits
remove_digits = str.maketrans('', '', digits)
lines.english = lines.english.apply(lambda x: x.translate(remove_digits))
lines.french = lines.french.apply(lambda x: x.translate(remove_digits))

In [9]:
lines.sample(5)

Unnamed: 0,english,french
1726,be merciful,sois miséricordieuse
1782,come off it,arrête ton char
6606,did i break it,laije brisé
14445,she defeated him,elle la battu
6647,do i need this,aije besoin de ça


In [10]:
# applying start and end tokens in french sentences
lines.french = lines.french.apply(lambda x: 'START_' + ' ' + x + ' ' + '_END')
lines.head()

Unnamed: 0,english,french
0,go,START_ va _END
1,run,START_ cours _END
2,run,START_ courez _END
3,wow,START_ ça alors _END
4,fire,START_ au feu _END


In [11]:
# collecting all unique english words to create a vocabulary
all_english_words = set()
for eng in lines.english:
  for word in eng.split():
    if word not in all_english_words:
      all_english_words.add(word)

# collecting all unique french words to create a vocabulary
all_french_words = set()
for fre in lines.french:
  for word in fre.split():
    if word not in all_french_words:
      all_french_words.add(word)

In [12]:
# printing length of words in each language
print('length of english words: ', len(all_english_words))
print('length of french words: ', len(all_french_words))

length of english words:  2930
length of french words:  6120


In [13]:
# getting maximum sentence length of english sentences
length_list = []
for l in lines.english:
  length_list.append(len(l.split(' ')))

max_input_length = np.max(length_list)
print('max_input_length: ', max_input_length)

max_input_length:  5


In [14]:
# getting maximum sentence length of french sentences
length_list = []
for l in lines.french:
  length_list.append(len(l.split(' ')))

max_output_length = np.max(length_list)
print('max_output_length: ', max_output_length)

max_output_length:  13


In [15]:
# making a list of all input and output words and sorting them out
input_words = sorted(list(all_english_words))
output_words = sorted(list(all_french_words))
print('all input words: ', input_words)
print('all output words: ', output_words)

#getting total tokens(words) from input and output
num_encoder_tokens = len(all_english_words)
num_decoder_tokens = len(all_french_words)
print('encoder tokens: ', num_encoder_tokens)
print('decoder tokens: ', num_encoder_tokens)

all input words:  ['COMMA', 'a', 'abandon', 'abhor', 'able', 'aboard', 'about', 'above', 'absent', 'absurd', 'accept', 'acceptable', 'accessible', 'accidents', 'accurate', 'ache', 'ached', 'acquitted', 'acrobat', 'act', 'action', 'active', 'actor', 'actors', 'acts', 'adaptable', 'addict', 'addicted', 'adjust', 'admire', 'admired', 'admit', 'adopted', 'adorable', 'adore', 'adores', 'adult', 'adults', 'adventurous', 'advice', 'affair', 'afford', 'afraid', 'after', 'afternoon', 'again', 'against', 'age', 'agent', 'ages', 'ago', 'agony', 'agree', 'agreed', 'agrees', 'ahead', 'aim', 'aint', 'air', 'airs', 'alarm', 'alarmed', 'alert', 'alibi', 'alive', 'all', 'allergies', 'allow', 'allowed', 'almost', 'alone', 'along', 'already', 'alright', 'also', 'always', 'am', 'amazed', 'amazing', 'ambidextrous', 'ambition', 'ambitious', 'ambush', 'america', 'american', 'ammo', 'amnesia', 'amuse', 'amused', 'amusing', 'an', 'and', 'angel', 'angry', 'animals', 'annoy', 'annoying', 'another', 'answer', 'an

In [16]:
# getting index for words as these indexes will behave as words for machine interactions
input_token_index = dict([(word,i) for i,word in enumerate(input_words)])
output_token_index = dict([(word,i) for i,word in enumerate(output_words)])

print('input token index: ', input_token_index)
print('output token index: ', output_token_index)

input token index:  {'COMMA': 0, 'a': 1, 'abandon': 2, 'abhor': 3, 'able': 4, 'aboard': 5, 'about': 6, 'above': 7, 'absent': 8, 'absurd': 9, 'accept': 10, 'acceptable': 11, 'accessible': 12, 'accidents': 13, 'accurate': 14, 'ache': 15, 'ached': 16, 'acquitted': 17, 'acrobat': 18, 'act': 19, 'action': 20, 'active': 21, 'actor': 22, 'actors': 23, 'acts': 24, 'adaptable': 25, 'addict': 26, 'addicted': 27, 'adjust': 28, 'admire': 29, 'admired': 30, 'admit': 31, 'adopted': 32, 'adorable': 33, 'adore': 34, 'adores': 35, 'adult': 36, 'adults': 37, 'adventurous': 38, 'advice': 39, 'affair': 40, 'afford': 41, 'afraid': 42, 'after': 43, 'afternoon': 44, 'again': 45, 'against': 46, 'age': 47, 'agent': 48, 'ages': 49, 'ago': 50, 'agony': 51, 'agree': 52, 'agreed': 53, 'agrees': 54, 'ahead': 55, 'aim': 56, 'aint': 57, 'air': 58, 'airs': 59, 'alarm': 60, 'alarmed': 61, 'alert': 62, 'alibi': 63, 'alive': 64, 'all': 65, 'allergies': 66, 'allow': 67, 'allowed': 68, 'almost': 69, 'alone': 70, 'along': 7

In [17]:
# creating arrays of input and output data
encoder_input_data = np.zeros((len(lines.english), max_input_length), dtype='float32')
decoder_input_data = np.zeros((len(lines.french), max_output_length), dtype='float32')

#one hot encoding the target data as Dense layer only gives one output through softmax layer
decoder_target_data = np.zeros((len(lines.french), max_output_length, num_decoder_tokens))

In [18]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)


(15000, 5)
(15000, 13)
(15000, 13, 6120)


In [19]:
# putting all the integer values in input, output data and target data
for i,(input_text, output_text) in enumerate(zip(lines.english, lines.french)):
  for t, word in enumerate(input_text.split()):
    encoder_input_data[i,t] = input_token_index[word]
  for t,word in enumerate(output_text.split()):
    decoder_input_data[i,t] = output_token_index[word]
    # as decoder target data is ahead of decoder input data, it will not include start_ character(which will be given to decoder model at prediction)
    if t > 0:
      decoder_target_data[i,t-1,output_token_index[word]] = 1

In [20]:
print("encoder input data: ", encoder_input_data[1])
print('decoder input data: ', decoder_input_data[1])
print('decoder target data: ',decoder_target_data[1])
print('shape of sample decoder target data: ', decoder_target_data[1].shape)

encoder input data:  [2114.    0.    0.    0.    0.]
decoder input data:  [0.000e+00 1.165e+03 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
decoder target data:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
shape of sample decoder target data:  (13, 6120)


In [21]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [22]:
# setting hyperparameters
embedding_size = 120
lstm_dim = 324

In [23]:
# building model for training stage
#encoder model

encoder_inputs = Input(shape=(None,))
en_x = Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder = LSTM(lstm_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
encoder_states = [state_h, state_c]

In [24]:
# decoder model

decoder_inputs = Input(shape=(None,))
final_dex = Embedding(num_decoder_tokens, embedding_size)(decoder_inputs)

decoder_lstm = LSTM(lstm_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

In [25]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [26]:
model.compile(optimizer='rmsprop',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])

In [27]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 120)    351600      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 120)    734400      ['input_2[0][0]']                
                                                                                              

In [28]:
r = model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=30, validation_split=0.10)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
#Inference Stage

#encoder model
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 120)         351600    
                                                                 
 lstm (LSTM)                 [(None, 324),             576720    
                              (None, 324),                       
                              (None, 324)]                       
                                                                 
Total params: 928,320
Trainable params: 928,320
Non-trainable params: 0
_________________________________________________________________


In [30]:
#decoder model
decoder_state_input_h = Input(shape=(lstm_dim,))
decoder_state_input_c = Input(shape=(lstm_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2 = Embedding(num_decoder_tokens, embedding_size)(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_state_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs2] + decoder_states2)

In [31]:
# reversing the word index dictionary to get words from index values
reverse_input_char_index = dict((i,char) for char, i in input_token_index.items())
reverse_output_char_index = dict((i,char) for char, i in output_token_index.items())
print(reverse_input_char_index)
print(reverse_output_char_index)

{0: 'COMMA', 1: 'a', 2: 'abandon', 3: 'abhor', 4: 'able', 5: 'aboard', 6: 'about', 7: 'above', 8: 'absent', 9: 'absurd', 10: 'accept', 11: 'acceptable', 12: 'accessible', 13: 'accidents', 14: 'accurate', 15: 'ache', 16: 'ached', 17: 'acquitted', 18: 'acrobat', 19: 'act', 20: 'action', 21: 'active', 22: 'actor', 23: 'actors', 24: 'acts', 25: 'adaptable', 26: 'addict', 27: 'addicted', 28: 'adjust', 29: 'admire', 30: 'admired', 31: 'admit', 32: 'adopted', 33: 'adorable', 34: 'adore', 35: 'adores', 36: 'adult', 37: 'adults', 38: 'adventurous', 39: 'advice', 40: 'affair', 41: 'afford', 42: 'afraid', 43: 'after', 44: 'afternoon', 45: 'again', 46: 'against', 47: 'age', 48: 'agent', 49: 'ages', 50: 'ago', 51: 'agony', 52: 'agree', 53: 'agreed', 54: 'agrees', 55: 'ahead', 56: 'aim', 57: 'aint', 58: 'air', 59: 'airs', 60: 'alarm', 61: 'alarmed', 62: 'alert', 63: 'alibi', 64: 'alive', 65: 'all', 66: 'allergies', 67: 'allow', 68: 'allowed', 69: 'almost', 70: 'alone', 71: 'along', 72: 'already', 73

In [32]:
# function to predict translation
def decode_seq(input_seq):
  state_values = encoder_model.predict(input_seq)

  target_seq = np.zeros((1,1))

  target_seq[0,0] = output_token_index['START_']

  stop_condition = False
  decoded_sentence = ''

  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + state_values)

    sampled_token_index = np.argmax(output_tokens[0,-1,:])
    sampled_char = reverse_output_char_index[sampled_token_index]

    decoded_sentence += ' ' + sampled_char

    if(sampled_char == '_END' or len(decoded_sentence) > 52):
      stop_condition = True

    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index

    state_values = [h,c]

  return decoded_sentence

In [33]:
# testing the model for a sample from existing data
for seq_index in [1234, 4356, 4565, 34, 2345, 7656]:
  input_seq = encoder_input_data[seq_index:seq_index+1]
  decoded_sentence = decode_seq(input_seq)
  print('----')
  print('Input_sentence: ', lines.english[seq_index:seq_index+1])
  print('decoded sentence: ', decoded_sentence)

----
Input_sentence:  1234    i wrote it
Name: english, dtype: object
decoded sentence:   jai me _END
----
Input_sentence:  4356    youre funny
Name: english, dtype: object
decoded sentence:   tu _END
----
Input_sentence:  4565    do you get it
Name: english, dtype: object
decoded sentence:   tu _END
----
Input_sentence:  34    got it
Name: english, dtype: object
decoded sentence:   elle _END
----
Input_sentence:  2345    is tom well
Name: english, dtype: object
decoded sentence:   tom tom _END
----
Input_sentence:  7656    im interested
Name: english, dtype: object
decoded sentence:   je _END
