#### Reference:-
https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

# Importing the required libraries

In [9]:
import numpy as np # to manipulate the data
import matplotlib.pyplot as plt # to visualise the result of our model

import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input #to create the Embedding matrix
from tensorflow.keras.layers import LSTM # predicting using the Long Short Term Memory model
from tensorflow.keras.layers import Dropout #for the regularization(handling the overfitting)
from tensorflow.keras.layers import Dense #to create the output layer

# Configuration for the Project

In [10]:
# --- Data source ---
# Path to the tab-separated sentence-pair text file on disk.
data_path = 'deu-eng/deu.txt'
# Upper bound on the number of lines (sentence pairs) taken from that file.
num_samples = 100000

# --- Model / training hyper-parameters ---
# Dimensionality of the LSTM state vectors (the encoding space).
latent_dim = 256
# Number of samples per training batch.
batch_size = 64
# Number of epochs to train for.
epochs = 100

# Data Preparation

In [11]:
# Read the parallel corpus and collect the sentence pairs together with the
# set of distinct characters used on each side.
input_texts = []  # source (English) sentences
target_texts = []  # target (German) sentences, wrapped in start/end markers
input_characters = set()  # distinct characters seen in the source sentences
target_characters = set()  # distinct characters seen in the wrapped target sentences

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Use at most `num_samples` lines. `len(lines) - 1` drops the last element of
# the split (normally the empty string that follows the file's final newline).
for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split("\t")
    if len(fields) < 2:
        # Blank or malformed line: there is no source/target pair to extract.
        continue
    # The first two tab-separated fields are the sentence pair; any further
    # fields on the line are ignored.
    input_text, target_text = fields[0], fields[1]
    # "\t" is the start-of-sequence character for the targets and "\n" is the
    # end-of-sequence character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    # A set ignores duplicates, so every character can be added unconditionally.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [12]:
# Turn each character set into a sorted list so every character gets a
# stable position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
# Vocabulary size (number of distinct characters) on each side.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Length, in characters, of the longest sentence on each side.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [13]:
# Summary of the prepared corpus. Tokens are single characters, so the two
# sequence lengths are counted in characters (not words).
for label, value in (
    ("Number of samples:", len(input_texts)),
    ("Number of unique input tokens:", num_encoder_tokens),
    ("Number of unique output tokens:", num_decoder_tokens),
    ("Max sequence length for inputs:", max_encoder_seq_length),
    ("Max sequence length for outputs:", max_decoder_seq_length),
):
    print(label, value)

Number of samples: 100000
Number of unique input tokens: 80
Number of unique output tokens: 104
Max sequence length for inputs: 28
Max sequence length for outputs: 122


In [14]:
# Map every character to its position in the sorted vocabulary list
# (first character -> 0, second -> 1, and so on).
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [15]:
# Display the source-side character -> index mapping for inspection.
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 "'": 5,
 '+': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '/': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 ':': 21,
 '?': 22,
 'A': 23,
 'B': 24,
 'C': 25,
 'D': 26,
 'E': 27,
 'F': 28,
 'G': 29,
 'H': 30,
 'I': 31,
 'J': 32,
 'K': 33,
 'L': 34,
 'M': 35,
 'N': 36,
 'O': 37,
 'P': 38,
 'Q': 39,
 'R': 40,
 'S': 41,
 'T': 42,
 'U': 43,
 'V': 44,
 'W': 45,
 'Y': 46,
 'Z': 47,
 'a': 48,
 'b': 49,
 'c': 50,
 'd': 51,
 'e': 52,
 'f': 53,
 'g': 54,
 'h': 55,
 'i': 56,
 'j': 57,
 'k': 58,
 'l': 59,
 'm': 60,
 'n': 61,
 'o': 62,
 'p': 63,
 'q': 64,
 'r': 65,
 's': 66,
 't': 67,
 'u': 68,
 'v': 69,
 'w': 70,
 'x': 71,
 'y': 72,
 'z': 73,
 '\xa0': 74,
 'é': 75,
 'ï': 76,
 'ñ': 77,
 '’': 78,
 '€': 79}

In [16]:
# Pre-allocate the encoder/decoder tensors, filled with zeros. Each has shape
# (number of sentence pairs, longest sequence in characters, vocabulary size):
#   len(input_texts)        -> number of sentence pairs
#   max_*_seq_length        -> longest sentence on that side
#   num_*_tokens            -> number of distinct characters on that side
encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)
decoder_input_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)
decoder_target_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)

In [17]:
# One-hot encode every sentence pair, character by character.
# `i` is the sentence index and `t` is the character position (timestep).
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad the remaining timesteps with the space character. The slice starts
    # at len(input_text) rather than the loop variable `t + 1`, so an empty
    # sentence cannot pick up a stale `t` from a previous iteration.
    encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Pad both decoder tensors with the space character. The target is one
    # step shorter than the input, so its padding starts one step earlier.
    decoder_input_data[i, len(target_text):, target_token_index[" "]] = 1.0
    decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

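Here `decoder_target_data` is `decoder_input_data` shifted one timestep ahead. The decoder starts from the context (state) vectors provided by the encoder and, at each step, receives the previous character as input and is trained to output the next one. This is why the target array is built with a one-step offset above. At inference time the character predicted by the decoder is fed back in as the input of the next decoder step.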

# Model Creation

In [18]:
# --- Encoder ---
# Input: one-hot sequences of any length over the source vocabulary.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the LSTM return its final hidden and cell states
# in addition to its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# `encoder_outputs` is discarded; only the two state vectors are kept.
encoder_states = [state_h, state_c]

# --- Decoder ---
# Input: one-hot sequences of any length over the target vocabulary.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# The decoder returns the full output sequence (return_sequences=True) and
# its internal states. The states are not used by the training model, but
# they are needed later for inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target vocabulary, applied to the decoder outputs.
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: maps `encoder_input_data` and `decoder_input_data`
# to `decoder_target_data`.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Training the Model

In [19]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Train on the one-hot tensors, holding out 20% of the samples for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Train on 80000 samples, validate on 20000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

<tensorflow.python.keras.callbacks.History at 0x20fff88cef0>

In [20]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 80)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 104)]  0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 345088      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  369664      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [21]:
# --- Inference encoder: input sequence -> [hidden state, cell state] ---
encoder_model = Model(encoder_inputs, encoder_states)

# --- Inference decoder ---
# The decoder states become explicit model inputs so the states returned by
# one prediction can be fed into the next one.
decoder_state_input_h = Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the layers of the trained model, looked up by position.
# NOTE(review): indices 3 and 4 are expected to be the decoder LSTM and the
# dense output layer (the summary output lists `lstm_1` fourth) - re-check
# these indices if the architecture changes.
decoder_lstm = model.layers[3]
decoder_dense = model.layers[4]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

# Reverse lookups (index -> character), used to turn predicted indices back
# into readable text.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target-language string.

    The input is run through `encoder_model` to obtain the initial decoder
    states. `decoder_model` is then called one character at a time, starting
    from the start character "\\t" and always feeding back the most probable
    character, until "\\n" is produced or more than `max_decoder_seq_length`
    characters have been generated.

    Args:
        input_seq: one-hot encoded input passed to `encoder_model.predict`.
            The loop assumes a batch of size 1.

    Returns:
        The decoded characters joined into a string, including the final
        "\\n" when the decoder emitted one.
    """

    def one_hot(token_index):
        # Length-1 target sequence with a single character switched on.
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.0
        return step

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # The first decoder input is the start-of-sequence character.
    target_seq = one_hot(target_token_index["\t"])

    decoded_chars = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable character at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        # Stop on the end character or once the length limit is exceeded.
        if sampled_char == "\n" or len(decoded_chars) > max_decoder_seq_length:
            break

        # Feed the sampled character and the new states into the next step.
        target_seq = one_hot(sampled_token_index)
        states_value = [h, c]

    return "".join(decoded_chars)

In [176]:
# Interactively read the English sentence to translate.
# NOTE(review): this cell blocks on user input, so the cells below are not
# reproducible from code alone.
input_sentence = input('Enter the English sentence you want to convert into the German \n')

Enter the English sentence you want to convert into the German 
Really?


In [177]:
# One-hot encode the user's sentence in the same layout as the training
# inputs: (1, max_encoder_seq_length, num_encoder_tokens).
encoder_translation_input_data = np.zeros(
    (1, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)

# Validate the free-form user input first, so failures are reported clearly
# instead of surfacing as an IndexError / KeyError from the loop below.
if len(input_sentence) > max_encoder_seq_length:
    raise ValueError(
        "Input sentence has %d characters; at most %d are supported."
        % (len(input_sentence), max_encoder_seq_length)
    )
unknown_chars = sorted(set(input_sentence) - set(input_token_index))
if unknown_chars:
    raise ValueError(
        "Input sentence contains characters not seen during training: %r"
        % (unknown_chars,)
    )

# `t` is the character position; switch on the index of each character.
for t, char in enumerate(input_sentence):
    encoder_translation_input_data[0, t, input_token_index[char]] = 1.0
# Pad the remaining timesteps with the space character. The slice starts at
# len(input_sentence) rather than `t + 1`, so an empty sentence does not rely
# on a `t` left over from an earlier cell.
encoder_translation_input_data[0, len(input_sentence):, input_token_index[" "]] = 1.0

In [178]:
# Sanity check: expected (1, max_encoder_seq_length, num_encoder_tokens).
encoder_translation_input_data.shape

(1, 28, 80)

In [179]:
# Translate the encoded sentence and show it next to the original input.
# The decoded string keeps the end-of-sequence "\n" when the decoder emitted one.
decoded_sentence = decode_sequence(encoder_translation_input_data)
print("Input sentence:", input_sentence)
print("Decoded sentence:", decoded_sentence)

Input sentence: Really?
Decoded sentence: Wirklich?

