## Language Translator

In [None]:
import nltk

From `nltk` we can download translated sentence pairs between different languages. The example below uses **English and French**, but feel free to try other language combinations as well.

In [None]:
#import packages
import pandas as pd
import numpy as np
import keras,tensorflow
import io
import re
import string
from unicodedata import normalize
from keras.models import Model
from keras.layers import Input, LSTM, Dense

In [None]:
# Colab-only helper: opens a browser file picker and uploads the chosen file(s)
# into the runtime's working directory (here: bilingual_pairs.txt).
from google.colab import files
upload = files.upload()

Saving bilingual_pairs.txt to bilingual_pairs.txt


In [None]:
def read_data(file, encoding='utf-8'):
  """Read a text file and return its lines with surrounding whitespace stripped.

  Args:
    file: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so that
      accented characters are decoded the same way on every platform instead
      of depending on the locale's default encoding.

  Returns:
    A list with one stripped string per line of the file, in file order.
    Blank lines are kept as empty strings.
  """
  # A separate name for the handle keeps the `file` argument from being shadowed.
  with open(file, encoding=encoding) as handle:
    return [entry.strip() for entry in handle]
# Load every line of the uploaded sentence-pair file into memory.
data = read_data(file='bilingual_pairs.txt')

In [None]:
# Keep only the first MAX_SENTENCE_PAIRS lines of the corpus.
# NOTE(review): presumably chosen to bound memory use / training time — confirm.
MAX_SENTENCE_PAIRS = 140000
data = data[:MAX_SENTENCE_PAIRS]

In [None]:
def build_english_french_sentences(data):
  """Split tab-separated lines into parallel English and French sentence lists.

  The English sentence is taken from the first tab-separated field of each
  line and the French sentence from the second; any further fields are
  ignored. Lines with fewer than two fields (for example blank lines) are
  skipped instead of raising IndexError, so the two lists always stay aligned.

  Args:
    data: Iterable of strings, one sentence pair per item.

  Returns:
    A tuple (english_sentences, french_sentences) of equal-length lists.
  """
  english_sentences = []
  french_sentences = []
  for line in data:
    fields = line.split('\t')  # split once rather than once per column
    if len(fields) < 2:
      continue
    english_sentences.append(fields[0])
    french_sentences.append(fields[1])
  return english_sentences, french_sentences
# Separate the raw lines into parallel English / French sentence lists.
english_sentences, french_sentences = build_english_french_sentences(data=data)

In [None]:
# Preview the first few raw sentences of each language.
print('english_sentences: ', english_sentences[0:5])
print('---' * 30)
print('french_sentences: ', french_sentences[0:5])

english_sentences:  ['Go.', 'Run!', 'Run!', 'Wow!', 'Fire!']
------------------------------------------------------------------------------------------
french_sentences:  ['Va !', 'Cours\u202f!', 'Courez\u202f!', 'Ã‡a alors\u202f!', 'Au feu !']


In [None]:
def clean_sentences(sentence):
  """Normalise one sentence to lowercase ASCII words joined by single spaces.

  Steps, in order: decompose accented characters (NFD) and drop every
  non-ASCII code point, split on whitespace, lowercase each word, delete
  punctuation, delete non-printable characters, and finally discard any word
  that is not purely alphabetic.

  Args:
    sentence: The raw sentence string.

  Returns:
    The cleaned sentence as one space-joined string (possibly empty).
  """
  non_printable = re.compile('[^%s]' % re.escape(string.printable))
  strip_punctuation = str.maketrans('', '', string.punctuation)

  # NFD splits e.g. "é" into "e" plus a combining accent; encoding to ASCII
  # with errors ignored then drops the accent and any other non-ASCII character.
  ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
  ascii_text = ascii_bytes.decode('UTF-8')

  kept_words = []
  for raw_word in ascii_text.split():
    word = raw_word.lower().translate(strip_punctuation)
    word = non_printable.sub('', word)
    # Words left empty or containing digits are dropped here.
    if word.isalpha():
      kept_words.append(word)
  return ' '.join(kept_words)


In [None]:
def build_clean_english_french_sentences(english_sentences,
                                         french_sentences):
  """Apply clean_sentences to every sentence of both corpora.

  Args:
    english_sentences: Iterable of raw English sentences.
    french_sentences: Iterable of raw French sentences.

  Returns:
    A tuple (english_sentences_cleaned, french_sentences_cleaned) of lists,
    each in the same order as its input.
  """
  # French is processed first, then English, as in the original loops.
  french_sentences_cleaned = [clean_sentences(sent) for sent in french_sentences]
  english_sentences_cleaned = [clean_sentences(sent) for sent in english_sentences]
  return english_sentences_cleaned, french_sentences_cleaned


In [None]:
# Clean both corpora; the results stay index-aligned with the raw lists.
english_sentences_cleaned, french_sentences_cleaned = build_clean_english_french_sentences(
    english_sentences,
    french_sentences,
)

In the previous steps we cleaned the data; now it is time to split it into input and target sequences and to set aside a portion of the data for testing.


In [None]:
def build_data(english_sentences_cleaned,french_sentences_cleaned):
  """Build the encoder inputs, decoder targets and their character vocabularies.

  French sentences are used unchanged as inputs. Each English sentence becomes
  a target wrapped as "\\t" + sentence + "\\n", so the tab and newline
  characters are part of the target vocabulary.

  Args:
    english_sentences_cleaned: Iterable of cleaned English sentences.
    french_sentences_cleaned: Iterable of cleaned French sentences.

  Returns:
    A tuple (input_dataset, target_dataset, input_characters,
    target_characters); the two character collections are sorted lists of the
    distinct characters found in the respective dataset.
  """
  input_dataset = list(french_sentences_cleaned)
  target_dataset = ["\t" + sentence + "\n" for sentence in english_sentences_cleaned]

  input_characters = set()
  for text in input_dataset:
    input_characters.update(text)

  target_characters = set()
  for text in target_dataset:
    target_characters.update(text)

  return input_dataset, target_dataset, sorted(input_characters), sorted(target_characters)

In [None]:
# Derive the input/target datasets and their character vocabularies.
input_dataset, target_dataset, input_characters, target_characters = build_data(
    english_sentences_cleaned=english_sentences_cleaned,
    french_sentences_cleaned=french_sentences_cleaned,
)

In [None]:
# Show a small sample of each dataset and vocabulary.
print("input_dataset: ", input_dataset[0:5])
print('---' * 20)
print('target_dataset: ', target_dataset[10:15])
print('---' * 20)
print('input_characters: ', input_characters[0:5])
print('----' * 20)
print('target_characters: ', target_characters[0:5])

input_dataset:  ['va', 'cours', 'courez', 'ca alors', 'au feu']
------------------------------------------------------------
target_dataset:  ['\twait\n', '\twait\n', '\ti see\n', '\ti try\n', '\ti won\n']
------------------------------------------------------------
input_characters:  [' ', 'a', 'b', 'c', 'd']
--------------------------------------------------------------------------------
target_characters:  ['\t', '\n', ' ', 'a', 'b']


We have cleaned our data and broken everything down into characters. For the target sequences we added `\t` and `\n` to mark the start and the end of the sequence for the decoder. Next we are going to compute some metadata from our input and target variables.

In [None]:
def build_metadata(input_dataset, target_dataset, input_characters, target_characters):
  """Compute and print vocabulary sizes and maximum sequence lengths.

  Args:
    input_dataset: Sequence of input strings.
    target_dataset: Sequence of target strings.
    input_characters: Collection of distinct input characters.
    target_characters: Collection of distinct target characters.

  Returns:
    A tuple (num_Encoder_tokens, num_Decoder_tokens, max_Encoder_seq_length,
    max_Decoder_seq_length).

  Raises:
    ValueError: If either dataset is empty (max() of an empty sequence).
  """
  num_Encoder_tokens = len(input_characters)
  num_Decoder_tokens = len(target_characters)
  max_Encoder_seq_length = max(map(len, input_dataset))
  max_Decoder_seq_length = max(map(len, target_dataset))

  summary = (
    ('Number of data points:', len(input_dataset)),
    ('Number of unique input tokens:', num_Encoder_tokens),
    ('Number of unique output tokens', num_Decoder_tokens),
    ('Max sequence length for inputs:', max_Encoder_seq_length),
    ('Max sequence length for outputs', max_Decoder_seq_length),
  )
  for label, value in summary:
    print(label, value)

  return num_Encoder_tokens, num_Decoder_tokens, max_Encoder_seq_length, max_Decoder_seq_length

In [None]:
# Vocabulary sizes and longest sequence lengths; these fix the tensor shapes used below.
num_Encoder_tokens, num_Decoder_tokens, max_Encoder_seq_length, max_Decoder_seq_length = build_metadata(
    input_dataset=input_dataset,
    target_dataset=target_dataset,
    input_characters=input_characters,
    target_characters=target_characters,
)

Number of data points: 140000
Number of unique input tokens: 27
Number of unique output tokens 29
Max sequence length for inputs: 117
Max sequence length for outputs 58


Building a map from characters to indices and vice-versa

**Represent** our input characters as indices

**Convert** the indices into characters

In [None]:
def build_indices(input_characters, target_characters):
  """Create character<->index lookup tables for both vocabularies.

  Indices are assigned by position, starting at 0, in the order the characters
  are given.

  Args:
    input_characters: Ordered iterable of input characters.
    target_characters: Ordered iterable of target characters.

  Returns:
    A tuple (input_char_to_idx, input_idx_to_char, target_char_to_idx,
    target_idx_to_char) of dictionaries.
  """
  input_idx_to_char = dict(enumerate(input_characters))
  input_char_to_idx = {char: idx for idx, char in input_idx_to_char.items()}

  target_idx_to_char = dict(enumerate(target_characters))
  target_char_to_idx = {char: idx for idx, char in target_idx_to_char.items()}

  return input_char_to_idx, input_idx_to_char, target_char_to_idx, target_idx_to_char

In [None]:
# Lookup tables used for one-hot encoding (char -> idx) and decoding (idx -> char).
input_char_to_idx, input_idx_to_char, target_char_to_idx, target_idx_to_char = build_indices(
    input_characters=input_characters,
    target_characters=target_characters,
)

In [None]:
# Display the four lookup tables.
print('input_char_to_idx:', input_char_to_idx)
print('---' * 50)
print('input_idx_to_char:', input_idx_to_char)
print('---' * 50)
print('target_char_to_idx:', target_char_to_idx)
print('---' * 50)
print('target_idx_to_char:', target_idx_to_char)

input_char_to_idx: {' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
------------------------------------------------------------------------------------------------------------------------------------------------------
input_idx_to_char: {0: ' ', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
------------------------------------------------------------------------------------------------------------------------------------------------------
target_char_to_idx: {'\t': 0, '\n': 1, ' ': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19,

In [None]:
def build_data_structures(length_input_dataset,max_Encoder_seq_length,max_Decoder_seq_length,
                          num_Encoder_tokens,num_Decoder_tokens):
  """Allocate the zero-filled float32 tensors that are fed to the network.

  Args:
    length_input_dataset: Number of samples (first axis of every tensor).
    max_Encoder_seq_length: Second axis of the encoder input tensor.
    max_Decoder_seq_length: Second axis of both decoder tensors.
    num_Encoder_tokens: Last axis of the encoder input tensor.
    num_Decoder_tokens: Last axis of both decoder tensors.

  Returns:
    A tuple (Encoder_input_data, Decoder_input_data, Decoder_target_data) of
    three independent arrays; their shapes are also printed.
  """
  encoder_shape = (length_input_dataset, max_Encoder_seq_length, num_Encoder_tokens)
  decoder_shape = (length_input_dataset, max_Decoder_seq_length, num_Decoder_tokens)

  Encoder_input_data = np.zeros(encoder_shape, dtype='float32')
  # Two separate allocations: decoder input and target must not share memory.
  Decoder_input_data = np.zeros(decoder_shape, dtype='float32')
  Decoder_target_data = np.zeros(decoder_shape, dtype='float32')

  print("Dimensionality of Encoder input data is : ", Encoder_input_data.shape)
  print("Dimensionality of Decoder input data is : ", Decoder_input_data.shape)
  print("Dimensionality of Decoder target data is : ", Decoder_target_data.shape)

  return Encoder_input_data, Decoder_input_data, Decoder_target_data

In [None]:
# Allocate the (samples, timesteps, tokens) tensors, still all zeros at this point.
Encoder_input_data, Decoder_input_data, Decoder_target_data = build_data_structures(
    length_input_dataset=len(input_dataset),
    max_Encoder_seq_length=max_Encoder_seq_length,
    max_Decoder_seq_length=max_Decoder_seq_length,
    num_Encoder_tokens=num_Encoder_tokens,
    num_Decoder_tokens=num_Decoder_tokens,
)

Dimensionality of Encoder input data is :  (140000, 117, 27)
Dimensionality of Decoder input data is :  (140000, 58, 29)
Dimensionality of Decoder target data is :  (140000, 58, 29)


In [None]:
def add_data_to_data_structures(input_dataset, target_dataset,Encoder_input_data, Decoder_input_data, Decoder_target_data):
  """One-hot encode the sentence pairs into the pre-allocated tensors, in place.

  Character positions are looked up in the module-level `input_char_to_idx`
  and `target_char_to_idx` dictionaries. The target tensor holds the same
  characters as the decoder input tensor shifted one timestep earlier, so it
  never contains the first (start) character.

  Args:
    input_dataset: Sequence of input strings.
    target_dataset: Sequence of target strings, paired with input_dataset by
      position.
    Encoder_input_data: Array indexed [sample, timestep, input token].
    Decoder_input_data: Array indexed [sample, timestep, target token].
    Decoder_target_data: Array indexed [sample, timestep, target token].

  Returns:
    The same three arrays, after modification.
  """
  sentence_pairs = zip(input_dataset, target_dataset)
  for sample, (source_text, target_text) in enumerate(sentence_pairs):
    for step, symbol in enumerate(source_text):
      Encoder_input_data[sample, step, input_char_to_idx[symbol]] = 1.
    for step, symbol in enumerate(target_text):
      column = target_char_to_idx[symbol]
      Decoder_input_data[sample, step, column] = 1.
      if step >= 1:
        # The target is the decoder input shifted left by one timestep.
        Decoder_target_data[sample, step - 1, column] = 1.
  return Encoder_input_data, Decoder_input_data, Decoder_target_data

In [None]:
# Fill the zero tensors with the one-hot encoding of every sentence pair.
Encoder_input_data, Decoder_input_data, Decoder_target_data = add_data_to_data_structures(
    input_dataset=input_dataset,
    target_dataset=target_dataset,
    Encoder_input_data=Encoder_input_data,
    Decoder_input_data=Decoder_input_data,
    Decoder_target_data=Decoder_target_data,
)

In [None]:
# Training hyperparameters.
batch_size = 200   # samples per gradient update, passed to model.fit
epochs = 100       # number of passes over the training data, passed to model.fit
latent_dim = 256   # number of units of the encoder and decoder LSTM layers

In [None]:
# Encoder: reads the one-hot input sequence and summarises it in its final LSTM states.
# The time dimension is None, so sequences of any length are accepted.
Encoder_inputs = Input(shape=(None,num_Encoder_tokens))
# return_state=True makes the layer return its final hidden and cell states as well.
Encoder = LSTM(latent_dim,return_state=True)
Encoder_outputs, state_h, state_c = Encoder(Encoder_inputs)
# Only the states are kept; they are used as the decoder's initial state.
Encoder_states = [state_h,state_c]

In [None]:
# Decoder: an LSTM started from the encoder states, followed by a softmax over target tokens.
Decoder_inputs = Input(shape=(None,num_Decoder_tokens))
# return_sequences=True yields one output per timestep; the returned states are
# ignored here and only used later by the inference model.
Decoder_lstm = LSTM(latent_dim,return_sequences=True,return_state=True)
Decoder_outputs, _, _ = Decoder_lstm(Decoder_inputs,initial_state=Encoder_states)
# Per-timestep probability distribution over the num_Decoder_tokens characters.
Decoder_dense = Dense(num_Decoder_tokens, activation='softmax')
Decoder_outputs = Decoder_dense(Decoder_outputs)

In [None]:
# Training model: takes [encoder input, decoder input] and outputs the decoder's
# per-timestep token distribution.
model = Model(inputs=[Encoder_inputs,Decoder_inputs],
              outputs=Decoder_outputs)
# categorical_crossentropy matches the one-hot encoded targets.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 29)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 290816      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  292864      input_2[0][0]                    
                                                                 lstm[0][1]            

In [None]:
# Train on the one-hot tensors; Decoder_target_data is Decoder_input_data shifted
# one timestep earlier. validation_split=0.2 sets aside 20% of the samples for
# validation instead of training on them.
model.fit([Encoder_input_data, Decoder_input_data],
          Decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fe1200bd4a8>

In [None]:
# Save the trained training-time model to disk under this file name.
# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 format — confirm for the Keras version in use.
model.save('neural_machine_translation_french_to_english.h5')

The model is trained and saved. The next step is to find a way to run inference with the model we built.

In [None]:
# Inference-time encoder: maps an input sequence to the encoder's final [h, c] states.
Encoder_model = Model(Encoder_inputs, Encoder_states)
# Explicit inputs for the decoder's previous cell and hidden state, so the
# decoder can be run one step at a time.
Decoder_state_input_c = Input(shape=(latent_dim,))
Decoder_state_input_h = Input(shape=(latent_dim,))
# Order is [h, c], the same order as Encoder_states.
Decoder_states_inputs = [Decoder_state_input_h,Decoder_state_input_c]
# The already-built Decoder_lstm and Decoder_dense layer objects are called again
# here rather than creating new layers. Note that this rebinds Decoder_outputs,
# state_h and state_c.
Decoder_outputs, state_h, state_c = Decoder_lstm(Decoder_inputs,initial_state=Decoder_states_inputs)
Decoder_states = [state_h, state_c]
Decoder_outputs = Decoder_dense(Decoder_outputs)
# Inputs: [decoder input, h, c]; outputs: [token probabilities, new h, new c].
Decoder_model = Model([Decoder_inputs] + Decoder_states_inputs,[Decoder_outputs] + Decoder_states)


In [None]:
def decode_sequence(input_seq):
  """Greedily decode one encoded input sequence into target-language text.

  The input is run through `Encoder_model` to obtain the initial decoder
  states. `Decoder_model` is then called one character at a time, starting
  from the tab start character; at every step the most probable character is
  appended to the result and fed back as the next decoder input.

  Args:
    input_seq: One-hot encoded input passed to `Encoder_model.predict`.
      NOTE(review): the (1, 1, num_Decoder_tokens) decoder input implies a
      batch of exactly one sequence — confirm against callers.

  Returns:
    The decoded characters as a string with no leading padding. The
    end-of-sequence newline is included when the model produced it. Decoding
    also stops once the result is longer than `max_Decoder_seq_length`.
  """
  states_value = Encoder_model.predict(input_seq)

  # Decoder input for the first step: the one-hot start character.
  target_seq = np.zeros((1,1,num_Decoder_tokens))
  target_seq[0, 0, target_char_to_idx['\t']] = 1.

  # Start from an empty string: seeding it with a space put a spurious leading
  # blank into every result and made it count towards the length limit.
  decoded_sentence = ''
  while True:
    output_tokens, h, c = Decoder_model.predict([target_seq]+states_value)
    # Greedy choice: most probable token at the last (only) timestep.
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = target_idx_to_char[sampled_token_index]
    decoded_sentence += sampled_char

    if sampled_char == '\n' or len(decoded_sentence) > max_Decoder_seq_length:
      break

    # Feed the sampled character and the updated states into the next step.
    target_seq = np.zeros((1, 1, num_Decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    states_value = [h, c]
  return decoded_sentence

In [None]:
def decode(seq_index):
 """Translate one sample of the dataset, print a short report and return the text.

 Args:
  seq_index: Position of the sample in `Encoder_input_data` / `input_dataset`.

 Returns:
  The decoded sentence produced by `decode_sequence`. It used to return
  nothing, which made callers that print the result show "None".
 """
 # Slice (not index) so the leading batch axis of size 1 is preserved.
 input_seq = Encoder_input_data[seq_index: seq_index + 1]
 decoded_sentence = decode_sequence(input_seq)
 print('-')
 print('Input sentence:', input_dataset[seq_index])
 print('Decoded sentence:', decoded_sentence)
 return decoded_sentence


In [None]:
# Translate 20 samples and show the reference translation after each one.
# These indices lie in the last 20% of the 140000 samples.
# NOTE(review): Keras' validation_split presumably holds out the last samples, so these were not trained on — confirm.
# decode() already prints its own report, so it is called for that side effect
# only; printing its return value as well just adds a redundant line.
for i in range(130000,130020):
  decode(i)
  print(target_dataset[i])
  print('----'*50)

-
Input sentence: si vous ne voulez pas le faire vous ny etes pas oblige
Decoded sentence:  if you dont want to do anything for

None
	if you dont want to do it you dont have to

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
Input sentence: si tu ne veux pas le faire tu ny es pas obligee
Decoded sentence:  if you dont want to do anything for

None
	if you dont want to do it you dont have to

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
Input sentence: si vous ne voulez pas le faire vous ny etes pas obliges
Decoded sentence:  if you dont want to do anything for

None
	if you dont want to do it you dont have to

-------------------------------------------------------------------