In [1]:
# Import required modules (numpy, re, tensorflow, keras[layers, models])
import numpy as np
import re
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

Using TensorFlow backend.


In [2]:
# Path for the file from dataset Tatoeba Project
filename="C:\\Users\\saxen\\Desktop\\Dataset\cmn.txt"
with open(filename,'r', encoding='utf-8') as file: # 'with open' closes the file when it is no longer required
     lines = file.read().split('\n') # read each line

In [3]:
# Get data in a lists
docs_in = []
docs_out= []

# Vocabulary sets
tokens_in = set()
tokens_out = set()

# Change the value '150' to read more lines from file
#(The program may take longer to run in that case.)
for line in lines[:100]:
    doc_in, doc_out = line.split('\t')[:2]
    # Append input sentences to docs_in
    docs_in.append(doc_in)
    doc_out=" ".join(re.findall(r"[\w]+[^\s\w]", doc_out))
    # add <START/END> tags to each output sentence
    doc_out='<START> '+ doc_out + ' END'
    # Append output sentences to docs_out
    docs_out.append(doc_out)
    
    # Split input sentences to words from each sentence
    # Add each unique word only once
    for token in re.findall(r"[\w']+|[^\s\w]", doc_in):
        if token not in tokens_in:
            tokens_in.add(token)
    # Split output sentences to words from each sentence
    # Add each unique word only once
    for token in doc_out.split():
        if token not in tokens_out:
            tokens_out.add(token)

In [4]:
# Sort the set of tokens
tokens_in = sorted(list(tokens_in))
tokens_out = sorted(list(tokens_out))

In [5]:
# Define Encoder & Decoder
num_encoder_tokens = len(tokens_in)
num_decoder_tokens = len(tokens_out)

# Assign seq length as max length from each word in each line in list of sentences.
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", doc_in)) for doc_out in docs_in])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", doc_out)) for doc_out in docs_out])

# Token index as a dict for reference
input_features_dict = dict([(token, i) for i, token in enumerate(tokens_in)])
target_features_dict = dict([(token, i) for i, token in enumerate(tokens_out)])

# Reverse token index as a dict for reference
reverse_input_features_dict = dict((i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict((i, token) for token, i in target_features_dict.items())

# Create arrays for max possible length with all values initialised as zeros.
encoder_input_data = np.zeros((len(docs_in), max_encoder_seq_length, num_encoder_tokens), dtype= 'float32')
decoder_input_data = np.zeros((len(docs_in), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros((len(docs_in), max_decoder_seq_length, num_decoder_tokens), dtype= 'float32')

In [6]:
# Iterate over bot input and output simultaneously using enumerate(zip(l1,l2))
for line, (doc_in, doc_out) in enumerate(zip(docs_in, docs_out)):
    for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", doc_in)):
        encoder_input_data[line, timestep, input_features_dict[token]] = 1.
    for timestep, token in enumerate(doc_out.split()):
        decoder_input_data[line, timestep, target_features_dict[token]] = 1.
        if timestep > 0:
            decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.

In [7]:
latent_dim=256 # Dimensionality
batch_size=100 # No. of samples processed in one iteration
epochs=100 # No. of passes over the training set

In [8]:
# Encoder training using LSTM
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

In [9]:
# Decoder training using LSTM, Dense
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [10]:
# Define a training model 
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
print("Model summary:")
training_model.summary()

Model summary:
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 105)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 93)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 370688      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  358400      input_2[0][0]                    
                                                                 lstm_1[0][1]

In [11]:
# Train the model
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print("Training the model:\n")
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size = batch_size, epochs = epochs, validation_split = 0.2)
training_model.save('lang_translate_model.h5')

Training the model:

Train on 80 samples, validate on 20 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73

In [12]:
# ??
encoder_inputs = training_model.input[0]
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

In [13]:
# ??
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [14]:
def decode_sequence(test_input):
    states_value = encoder_model.predict(test_input)
    # Create empty output_seq of length 1
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Initialise the first value in each as '<START>' label
    target_seq[0, 0, target_features_dict['<START>']] = 1.
    decoded_sentence = ''
    stop_condition = False
    
    # Run the model to get possible outputs with probabiliy
    while not stop_condition:
        output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)
        # Choose the output with max probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_features_dict[sampled_token_index]
        decoded_sentence += " " + sampled_token
        # Exit if '<END>' or max length reached
        if (sampled_token == '<END>' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
            
        # Update the output_seq
        #target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [hidden_state, cell_state]
    # return the sentence selected    
    return decoded_sentence

In [15]:
for i in range(15, 25):
    temp_in = encoder_input_data[i: i + 1]
    decoded_sentence = decode_sequence(temp_in)
    print('-----')
    print('Input sentence:', docs_in[i])
    print('Decoded sentence:', decoded_sentence)

-----
Input sentence: Listen.
Decoded sentence:  听着。 END END
-----
Input sentence: No way!
Decoded sentence:  没门！ END END
-----
Input sentence: No way!
Decoded sentence:  没门！ END END
-----
Input sentence: Really?
Decoded sentence:  你确定？ END
-----
Input sentence: Try it.
Decoded sentence:  试试吧。 END
-----
Input sentence: We try.
Decoded sentence:  我们来试试。 END
-----
Input sentence: Why me?
Decoded sentence:  为什么是我？ END
-----
Input sentence: Ask Tom.
Decoded sentence:  去问汤姆。 END
-----
Input sentence: Awesome!
Decoded sentence:  好棒！ END END
-----
Input sentence: Be calm.
Decoded sentence:  冷静点。 END
