In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [4]:
lines = pd.read_table("C:/Users/91897/Desktop/fra.txt", names=['english', 'french'])
lines = lines[:8000]
lines.sample(5)

Unnamed: 0,english,french
That's hers.,C'est la sienne.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
Can you skate?,Sais-tu patiner ?,CC-BY 2.0 (France) Attribution: tatoeba.org #7...
How lovely!,Comme c'est charmant !,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
Blindfold Tom.,Bandez les yeux de Tom.,CC-BY 2.0 (France) Attribution: tatoeba.org #8...
He pinched me!,Il m'a pincée !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [5]:
lines.shape

(8000, 2)

In [6]:
##converting to lowercase
lines.english = lines.english.apply(lambda x: x.lower())
lines.french = lines.french.apply(lambda x: x.lower())

In [7]:
import re
lines.english = lines.english.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", 'COMMA', x))
lines.french = lines.french.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", 'COMMA', x))

In [8]:
import string
exclude = set(string.punctuation)
lines.english = lines.english.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.french = lines.french.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [9]:
from string import digits
remove_digits = str.maketrans('', '', digits)
lines.english = lines.english.apply(lambda x: x.translate(remove_digits))
lines.french = lines.french.apply(lambda x: x.translate(remove_digits))

In [10]:
lines.sample(5)

Unnamed: 0,english,french
I'm depressed.,je déprime,ccby france attribution tatoebaorg ck mics...
Tom's afraid.,tom est effrayé,ccby france attribution tatoebaorg ck piti...
It's easy.,cest simple,ccby france attribution tatoebaorg ck sacr...
Ask Tom again.,demandez à nouveau à tom,ccby france attribution tatoebaorg ck mics...
Let's move.,on met les voiles,ccby france attribution tatoebaorg meerkat ...


In [11]:
# applying start and end tokens in french sentences
lines.french = lines.french.apply(lambda x: 'START_' + ' ' + x + ' ' + '_END')
lines.head()

Unnamed: 0,english,french
Go.,va,START_ ccby france attribution tatoebaorg cm...
Go.,marche,START_ ccby france attribution tatoebaorg cm...
Go.,bouge,START_ ccby france attribution tatoebaorg cm...
Hi.,salut,START_ ccby france attribution tatoebaorg cm...
Hi.,salut,START_ ccby france attribution tatoebaorg cm...


In [12]:
# collecting all unique english words to create a vocabulary
all_english_words = set()
for eng in lines.english:
  for word in eng.split():
    if word not in all_english_words:
      all_english_words.add(word)

# collecting all unique french words to create a vocabulary
all_french_words = set()
for fre in lines.french:
  for word in fre.split():
    if word not in all_french_words:
      all_french_words.add(word)

In [13]:
# printing length of words in each language
print('length of english words: ', len(all_english_words))
print('length of french words: ', len(all_french_words))

length of english words:  3886
length of french words:  427


In [14]:
# getting maximum sentence length of english sentences
length_list = []
for l in lines.english:
  length_list.append(len(l.split(' ')))

max_input_length = np.max(length_list)
print('max_input_length: ', max_input_length)

max_input_length:  10


In [15]:
# getting maximum sentence length of french sentences
length_list = []
for l in lines.french:
  length_list.append(len(l.split(' ')))

max_output_length = np.max(length_list)
print('max_output_length: ', max_output_length)

max_output_length:  12


In [16]:
# making a list of all input and output words and sorting them out
input_words = sorted(list(all_english_words))
output_words = sorted(list(all_french_words))
print('all input words: ', input_words)
print('all output words: ', output_words)

#getting total tokens(words) from input and output
num_encoder_tokens = len(all_english_words)
num_decoder_tokens = len(all_french_words)
print('encoder tokens: ', num_encoder_tokens)
print('decoder tokens: ', num_encoder_tokens)

all input words:  ['a', 'aaah', 'abandonne', 'abandonner', 'abandonnez', 'abandonnons', 'abandonnèrent', 'abandonné', 'abattu', 'abattue', 'aboient', 'abruti', 'abrutie', 'absurdité', 'accepté', 'accises', 'accompagnemoi', 'accompli', 'accord', 'accordemoi', 'accro', 'accrochetoi', 'accrochezvous', 'accélère', 'accélérez', 'acheter', 'achetezla', 'achetezle', 'achetons', 'acheté', 'achètela', 'achètele', 'achètetoi', 'acquérir', 'actuellement', 'adieu', 'admirateurs', 'admire', 'adorable', 'adorait', 'adore', 'adoré', 'adulte', 'adultes', 'affaiblie', 'affaire', 'affaires', 'affairé', 'affairée', 'affamé', 'affolé', 'affolée', 'affreusement', 'affreux', 'affûtée', 'agent', 'agir', 'agriculteur', 'agréable', 'ah', 'aha', 'ahhh', 'ai', 'aida', 'aide', 'aidemoi', 'aidemoiCOMMA', 'aidenous', 'aidenousCOMMA', 'aident', 'aider', 'aidera', 'aiderai', 'aidetil', 'aidez', 'aidezmoi', 'aidezmoiCOMMA', 'aideznous', 'aideznousCOMMA', 'aidons', 'aidé', 'aije', 'aille', 'ailles', 'ailleurs', 'aima',

In [17]:
# getting index for words as these indexes will behave as words for machine interactions
input_token_index = dict([(word,i) for i,word in enumerate(input_words)])
output_token_index = dict([(word,i) for i,word in enumerate(output_words)])

print('input token index: ', input_token_index)
print('output token index: ', output_token_index)

input token index:  {'a': 0, 'aaah': 1, 'abandonne': 2, 'abandonner': 3, 'abandonnez': 4, 'abandonnons': 5, 'abandonnèrent': 6, 'abandonné': 7, 'abattu': 8, 'abattue': 9, 'aboient': 10, 'abruti': 11, 'abrutie': 12, 'absurdité': 13, 'accepté': 14, 'accises': 15, 'accompagnemoi': 16, 'accompli': 17, 'accord': 18, 'accordemoi': 19, 'accro': 20, 'accrochetoi': 21, 'accrochezvous': 22, 'accélère': 23, 'accélérez': 24, 'acheter': 25, 'achetezla': 26, 'achetezle': 27, 'achetons': 28, 'acheté': 29, 'achètela': 30, 'achètele': 31, 'achètetoi': 32, 'acquérir': 33, 'actuellement': 34, 'adieu': 35, 'admirateurs': 36, 'admire': 37, 'adorable': 38, 'adorait': 39, 'adore': 40, 'adoré': 41, 'adulte': 42, 'adultes': 43, 'affaiblie': 44, 'affaire': 45, 'affaires': 46, 'affairé': 47, 'affairée': 48, 'affamé': 49, 'affolé': 50, 'affolée': 51, 'affreusement': 52, 'affreux': 53, 'affûtée': 54, 'agent': 55, 'agir': 56, 'agriculteur': 57, 'agréable': 58, 'ah': 59, 'aha': 60, 'ahhh': 61, 'ai': 62, 'aida': 63, 

In [18]:
# creating arrays of input and output data
encoder_input_data = np.zeros((len(lines.english), max_input_length), dtype='float32')
decoder_input_data = np.zeros((len(lines.french), max_output_length), dtype='float32')

#one hot encoding the target data as Dense layer only gives one output through softmax layer
decoder_target_data = np.zeros((len(lines.french), max_output_length, num_decoder_tokens))

In [19]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(8000, 10)
(8000, 12)
(8000, 12, 427)


In [20]:
# putting all the integer values in input, output data and target data
for i,(input_text, output_text) in enumerate(zip(lines.english, lines.french)):
  for t, word in enumerate(input_text.split()):
    encoder_input_data[i,t] = input_token_index[word]
  for t,word in enumerate(output_text.split()):
    decoder_input_data[i,t] = output_token_index[word]
    # as decoder target data is ahead of decoder input data, it will not include start_ character(which will be given to decoder model at prediction)
    if t > 0:
      decoder_target_data[i,t-1,output_token_index[word]] = 1

In [21]:
print("encoder input data: ", encoder_input_data[1])
print('decoder input data: ', decoder_input_data[1])
print('decoder target data: ',decoder_target_data[1])
print('shape of sample decoder target data: ', decoder_target_data[1].shape)

encoder input data:  [2078.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
decoder input data:  [  0.  69. 133.  38. 377.  80. 245.   1.   0.   0.   0.   0.]
decoder target data:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
shape of sample decoder target data:  (12, 427)


In [22]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [23]:
# setting hyperparameters
embedding_size = 120
lstm_dim = 324

In [24]:
# building model for training stage
#encoder model

encoder_inputs = Input(shape=(None,))
en_x = Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder = LSTM(lstm_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
encoder_states = [state_h, state_c]

In [25]:
# decoder model

decoder_inputs = Input(shape=(None,))
final_dex = Embedding(num_decoder_tokens, embedding_size)(decoder_inputs)

decoder_lstm = LSTM(lstm_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax') 

decoder_outputs = decoder_dense(decoder_outputs)


In [26]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [27]:
model.compile(optimizer='rmsprop',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])

In [28]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 120)    466320      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 120)    51240       ['input_2[0][0]']                
                                                                                              

In [29]:
r = model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=30, validation_split=0.10)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [30]:
#Inference Stage

#encoder model
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 120)         466320    
                                                                 
 lstm (LSTM)                 [(None, 324),             576720    
                              (None, 324),                       
                              (None, 324)]                       
                                                                 
Total params: 1,043,040
Trainable params: 1,043,040
Non-trainable params: 0
_________________________________________________________________


In [31]:
#decoder model
decoder_state_input_h = Input(shape=(lstm_dim,))
decoder_state_input_c = Input(shape=(lstm_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2 = Embedding(num_decoder_tokens, embedding_size)(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_state_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs2] + decoder_states2)

In [32]:
# reversing the word index dictionary to get words from index values
reverse_input_char_index = dict((i,char) for char, i in input_token_index.items())
reverse_output_char_index = dict((i,char) for char, i in output_token_index.items())
print(reverse_input_char_index)
print(reverse_output_char_index)

{0: 'a', 1: 'aaah', 2: 'abandonne', 3: 'abandonner', 4: 'abandonnez', 5: 'abandonnons', 6: 'abandonnèrent', 7: 'abandonné', 8: 'abattu', 9: 'abattue', 10: 'aboient', 11: 'abruti', 12: 'abrutie', 13: 'absurdité', 14: 'accepté', 15: 'accises', 16: 'accompagnemoi', 17: 'accompli', 18: 'accord', 19: 'accordemoi', 20: 'accro', 21: 'accrochetoi', 22: 'accrochezvous', 23: 'accélère', 24: 'accélérez', 25: 'acheter', 26: 'achetezla', 27: 'achetezle', 28: 'achetons', 29: 'acheté', 30: 'achètela', 31: 'achètele', 32: 'achètetoi', 33: 'acquérir', 34: 'actuellement', 35: 'adieu', 36: 'admirateurs', 37: 'admire', 38: 'adorable', 39: 'adorait', 40: 'adore', 41: 'adoré', 42: 'adulte', 43: 'adultes', 44: 'affaiblie', 45: 'affaire', 46: 'affaires', 47: 'affairé', 48: 'affairée', 49: 'affamé', 50: 'affolé', 51: 'affolée', 52: 'affreusement', 53: 'affreux', 54: 'affûtée', 55: 'agent', 56: 'agir', 57: 'agriculteur', 58: 'agréable', 59: 'ah', 60: 'aha', 61: 'ahhh', 62: 'ai', 63: 'aida', 64: 'aide', 65: 'aid

In [33]:
# function to predict translation
def decode_seq(input_seq):
  state_values = encoder_model.predict(input_seq)

  target_seq = np.zeros((1,1))

  target_seq[0,0] = output_token_index['START_']

  stop_condition = False
  decoded_sentence = ''

  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + state_values)

    sampled_token_index = np.argmax(output_tokens[0,-1,:])
    sampled_char = reverse_output_char_index[sampled_token_index]

    decoded_sentence += ' ' + sampled_char

    if(sampled_char == '_END' or len(decoded_sentence) > 52):
      stop_condition = True

    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index

    state_values = [h,c] 

  return decoded_sentence

In [34]:
# testing the model for a sample from existing data
for seq_index in [1234, 4356, 4565, 34, 2345, 7656]:
  input_seq = encoder_input_data[seq_index:seq_index+1]
  decoded_sentence = decode_seq(input_seq)
  print('----')
  print('Input_sentence: ', lines.english[seq_index:seq_index+1])
  print('decoded sentence: ', decoded_sentence)

----
Input_sentence:  It worked.    ça a fonctionné
Name: english, dtype: object
decoded sentence:   ccby france attribution tatoebaorg ck sacredceltic _END
----
Input_sentence:  All is quiet.    tout est calme
Name: english, dtype: object
decoded sentence:   ccby france attribution tatoebaorg ck sacredceltic _END
----
Input_sentence:  Do you smoke?    tu fumes 
Name: english, dtype: object
decoded sentence:   ccby france attribution tatoebaorg cm micsmithel _END
----
Input_sentence:  Stop!    ça suffit 
Name: english, dtype: object
decoded sentence:   ccby france attribution tatoebaorg cm sacredceltic _END
----
Input_sentence:  No problem.    sans problème
Name: english, dtype: object
decoded sentence:   ccby france attribution tatoebaorg cm sacredceltic _END
----
Input_sentence:  I'll go ahead.    jirai en avant
Name: english, dtype: object
decoded sentence:   ccby france attribution tatoebaorg ck sacredceltic _END
