## Machine translation using LSTM

### Pre-requisites

- Download the data: http://www.manythings.org/anki/
- Download the word vectors: http://nlp.stanford.edu/data/glove.6B.zip

In [13]:
import os, sys
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K

In [14]:
if len(K.tensorflow_backend._get_available_gpus()) > 0:
    from keras.layers import CuDNNLSTM as LSTM
    from keras.layers import CuDNNGRU as GRU

In [15]:
# Configuration

BATCH_SIZE=64             # Batch size for training.
EPOCHS = 100              # Number of epochs to train for.
LATENT_DIM = 256          # Latent dimensionality of the encoding space.
NUM_SAMPLES = 10000       # Number of samples to train on
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

In [16]:
# Where we will store the data
input_texts = []          # sentence in original language
target_texts = []         # sentence in target language
target_texts_inputs = []  # sentence in target language offset by 1

### Prepare training data

In [17]:
t = 0
for line in open('data/hin-eng/hin.txt'):
    # only keep a limited number of samples.
    t += 1
    if t > NUM_SAMPLES:
        break
        
    # input and target are separated by tab
    if '\t' not in line:
        continue
        
    # split up the input and translation
    input_text, translation = line.rstrip().split('\t')
    
    # make the target input and output using teacher forcing
    target_text = translation + '<eos>'
    target_text_input = '<sos>' + translation
    
    input_texts.append(input_text)
    target_texts.append(target_text)
    target_texts_inputs.append(target_text_input)
    
print("Number of samples:", len(input_texts))

Number of samples: 2808


### Tokenize the inputs (Convert the sentences (string) into integers)

In [23]:
tokenizer_inputs = Tokenizer(num_words = MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

### Word to integer mapping for input text

In [44]:
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' %len(word2idx_inputs))
print(word2idx_inputs)

Found 2388 unique input tokens.
{'the': 1, 'i': 2, 'to': 3, 'you': 4, 'is': 5, 'a': 6, 'he': 7, 'of': 8, 'in': 9, 'my': 10, 'it': 11, 'me': 12, 'this': 13, 'have': 14, 'she': 15, 'for': 16, 'was': 17, 'are': 18, 'do': 19, 'that': 20, 'his': 21, 'your': 22, 'we': 23, 'will': 24, 'on': 25, 'what': 26, "don't": 27, 'at': 28, 'him': 29, 'her': 30, 'not': 31, 'like': 32, 'go': 33, 'with': 34, 'be': 35, 'how': 36, 'and': 37, "i'm": 38, 'can': 39, 'time': 40, 'has': 41, 'there': 42, 'know': 43, 'all': 44, 'up': 45, 'they': 46, 'come': 47, 'very': 48, 'as': 49, 'had': 50, 'please': 51, 'from': 52, 'want': 53, "it's": 54, 'here': 55, 'by': 56, 'out': 57, 'am': 58, 'when': 59, 'did': 60, 'no': 61, 'been': 62, 'get': 63, "can't": 64, 'take': 65, 'going': 66, 'an': 67, 'father': 68, 'book': 69, 'if': 70, 'about': 71, 'one': 72, 'india': 73, 'were': 74, 'money': 75, 'would': 76, 'tom': 77, 'day': 78, 'long': 79, 'two': 80, 'tomorrow': 81, 'today': 82, 'where': 83, 'but': 84, 'must': 85, 'make': 86,

In [64]:
# Determine maximum length input sequence
max_len_input = max(len(s) for s in input_sequences)

print('max_len_input:', max_len_input)

max_len_input: 22


### Tokenize the output (Convert the sentences (string) into integers)

In [40]:
# We don't filter out special characters, otherwise <eos> and <sos> won't appear.
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS)
# This is an inefficient way as we are tokenizing it on the same data (target_texts and target_texts_input)
# just offset by <sos> and <eos>
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs)

target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_input = tokenizer_outputs.texts_to_sequences(target_texts_inputs)

### Word to integer mapping for output texts

In [52]:
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique output tokens.' %len(word2idx_outputs))

print(word2idx_outputs)

Found 2992 unique output tokens.
{'eos': 1, 'sos': 2, 'है।': 3, 'में': 4, 'नहीं': 5, 'मैं': 6, 'वह': 7, 'से': 8, 'के': 9, 'क्या': 10, 'मुझे': 11, 'है': 12, 'को': 13, 'हैं।': 14, 'की': 15, 'हो': 16, 'बहुत': 17, 'का': 18, 'एक': 19, 'पर': 20, 'उसने': 21, 'हूँ।': 22, 'तुम': 23, 'था।': 24, 'यह': 25, 'कर': 26, 'लिए': 27, 'मेरे': 28, 'कि': 29, 'और': 30, 'हैं': 31, 'इस': 32, 'उसे': 33, 'भी': 34, 'अपने': 35, 'मेरी': 36, 'मैंने': 37, 'रहा': 38, 'ने': 39, 'पास': 40, 'तो': 41, 'करने': 42, 'अपनी': 43, 'कल': 44, 'हम': 45, 'गया।': 46, 'ही': 47, 'काम': 48, 'कोई': 49, 'उसके': 50, 'तुम्हारे': 51, 'उसकी': 52, 'उस': 53, 'तुम्हें': 54, 'था': 55, 'साथ': 56, 'घर': 57, 'करना': 58, 'थी।': 59, 'समय': 60, 'मेरा': 61, 'गया': 62, 'सकते': 63, 'आप': 64, 'बात': 65, 'रही': 66, 'पता': 67, 'कुछ': 68, 'यहाँ': 69, 'आज': 70, 'अभी': 71, 'बजे': 72, 'तक': 73, 'तुम्हे': 74, 'उसको': 75, 'चाहिए।': 76, 'लगता': 77, 'किताब': 78, 'दिया।': 79, 'मुझसे': 80, 'पसंद': 81, 'अच्छा': 82, 'थे।': 83, 'कभी': 84, 'मत': 85, 'रहे': 86, 'आ': 87, '

In [59]:
# store number of output words for later. remember to add 1 since indexing starts at 1
num_words_output = len(word2idx_outputs) + 1
print('num_words_output:', num_words_output)

# determine the maximum length output sequence
max_len_target = max(len(s) for s in target_sequences)
print ('Maximum length of target/translated text is:', max_len_target)

num_words_output: 2993
Maximum length of target/translated text is: 26


### Pad Sequences

In [67]:
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print('encoder_inputs.shape:', encoder_inputs.shape)
print('encoder_inputs[0]:', encoder_inputs[0])

encoder_inputs.shape: (2808, 22)
encoder_inputs[0]: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0 1274]


In [69]:
decoder_inputs = pad_sequences(target_sequences_input, maxlen=max_len_target, padding='post')
# 26 is the maximum length of the sentence in the translated data in the training set
print('decoder_inputs.shape:', decoder_inputs.shape)
print('decoder_inputs[0]:', decoder_inputs[0])

decoder_inputs.shape: (2808, 26)
decoder_inputs[0]: [  2 757   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]


In [91]:
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')
print('decoder_targets.shape:', decoder_targets.shape)
print('decoder_targets[0]:', decoder_targets[0])

decoder_targets.shape: (2808, 26)
decoder_targets[0]: [757   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]


### Load pre-trained word vectors - Embeddings

In [74]:
word2vec = {}
# Load word vector of 100 dimensions.
with open (os.path.join('Embeddings/glove.6B/glove.6B.%sd.txt' %EMBEDDING_DIM)) as f:
    for line in f:
        # split at spaces
        values = line.split()
        word = values[0]
        #np.asarray Converts the input to an array.
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        
print('Found %s word vectors.' %len(word2vec))
print('Embedding dimensions: ', word2vec['the'].shape)

Found 400000 word vectors.
Embedding dimensions:  (100,)


### Prepare Embedding matrix

In [87]:
print('Filling pre-trained embeddings')
# MAX_NUM_WORDS = 20000
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    if i < MAX_NUM_WORDS:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not foun20000d in the embedding index will all be zero
            embedding_matrix[i] = embedding_vector

print('num_words:', num_words)
# 2389 words each of 100 dimensions
print('Shape of Embeddings matrix: ', embedding_matrix.shape)

Filling pre-trained embeddings
num_words: 2389
Shape of Embeddings matrix:  (2389, 100)


### Embedding layer

In [86]:
embedding_layer = Embedding(num_words, 
                           EMBEDDING_DIM,
                           weights = [embedding_matrix],
                           input_length = max_len_input,
                           #trainable=true
                           )

In [92]:
# create targets, since we cannot use sparse
# categorical cross entropy when we have sequences
decoder_targets_one_hot = np.zeros(
  (
    len(input_texts),
    max_len_target,
    num_words_output
  ),
  dtype='float32'
)

# assign the values
for i, d in enumerate(decoder_targets):
  for t, word in enumerate(d):
    decoder_targets_one_hot[i, t, word] = 1

### Build Encoder-decoder LSTM model

In [101]:
encoder_inputs_placeholder = Input(shape=(max_len_input, ))
x = embedding_layer(encoder_inputs_placeholder)

encoder_lstm = LSTM(
    LATENT_DIM,
    return_state=True,
    dropout=0.5
)

encoder_outputs, h, c = encoder_lstm(x)

# keep only the states to pass into decoder
encoder_states = [h,c]

# setup the decoder, using [h,c] as initial state
decoder_inputs_placeholder = Input(shape=(max_len_target, ))

# this word embedding will not use pre-trained vectors although we could
decoder_embedding_layer = Embedding(num_words_output, LATENT_DIM)
decoder_inputs_x = decoder_embedding_layer(decoder_inputs_placeholder)

decoder_lstm = LSTM(
        LATENT_DIM, 
        return_sequences=True,
        return_state=True,
        # dropout = 0.5
    )

decoder_outputs, _, _ = decoder_lstm(
                          decoder_inputs_x,
                          initial_state=encoder_states
                        )

print('num_words_output:', num_words_output)

# Final dense layers
decoder_dense  = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Create the model object
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)

num_words_output: 2993


### Compile the model

In [102]:
model.compile(
        optimizer = 'rmsprop',
        loss='categorical_crossentropy',
        metrics=['acc'])

W0923 11:31:13.260341 139675065751360 deprecation_wrapper.py:119] From /home/agoel/anaconda3/envs/ML/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [None]:
model.fit(
    [encoder_inputs, decoder_inputs], decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    
    
)

### Training Model

### Visualize accuracy and loss curves