In [1]:
#imports
import re
import tensorflow as tf
import numpy as np
import keras
from keras.models import Model
from keras.layers import Input, Dense, GRU, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [24]:
# Sentinel words that readEnglishfile prepends/appends to every English line.
# The embedded space keeps each marker a separate word from the sentence text.
mark_start = "SSSS "
mark_end = " EEEE"
def readDanishfile(filename, encoding="utf-8"):
    """Read a text file and return its lines without the trailing newline.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly
            (UTF-8 by default) so that non-ASCII characters such as the
            Danish letters ae/oe/aa decode the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one string per line, in file order. Empty lines are kept
        as empty strings.
    """
    with open(filename, encoding=encoding) as file:
        return [line.rstrip('\n') for line in file]

def readEnglishfile(filename, encoding="utf-8"):
    """Read a text file and wrap every line in the start/end marker words.

    Each line has its trailing newline removed and is then returned as
    ``mark_start + line + mark_end`` (both markers are module-level strings).

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly
            (UTF-8 by default) so decoding does not depend on the platform's
            default encoding.

    Returns:
        A list with one marker-wrapped string per line, in file order.
    """
    with open(filename, encoding=encoding) as file:
        return [mark_start + line.rstrip('\n') + mark_end for line in file]

# Load the two sides of the parallel corpus, one sentence per line.
# NOTE(review): "enlish_data.txt" looks like a misspelling of "english" -
# confirm it matches the real file name before changing it.
eng_data = readEnglishfile("../input/enlish_data.txt")
da_data = readDanishfile("../input/danish-data.txt")
# Vocabulary cap: passed as num_words to both tokenizers and used as the
# Embedding input_dim / Dense output size further down.
num_words = 10000

In [25]:
# Sanity check: show the first five English sentences with their markers.
print(eng_data[:5])

['SSSS Resumption of the session EEEE', 'SSSS I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. EEEE', "SSSS Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful. EEEE", 'SSSS You have requested a debate on this subject in the course of the next few days, during this part-session. EEEE', "SSSS In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union. EEEE"]


In [47]:
# inspired by open source code from keras models
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
class Wrapper(Tokenizer):
    """Keras ``Tokenizer`` that is fitted on construction and stores padded sequences.

    Attributes set by ``__init__``:
        tokens: List of integer token lists, one per input text (each list is
            reversed when ``reverse`` is true).
        index_to_word: Inverse of ``word_index`` (integer id -> word).
        num_tokens: Length of every sequence in ``tokens``.
        max_tokens: ``int(mean + 2 * std)`` of ``num_tokens``; used as the
            fixed sequence length for padding/truncation.
        tokens_padded: Result of ``pad_sequences`` on ``tokens`` with
            ``maxlen=max_tokens``.
    """

    def __init__(self, texts, padding, reverse=False, num_words=None):
        """Fit the vocabulary on ``texts`` and pre-compute the padded sequences.

        Args:
            texts: List of strings to fit on and convert.
            padding: ``'pre'`` or ``'post'``; forwarded to ``pad_sequences``.
            reverse: If true, every token sequence is reversed and sequences
                longer than ``max_tokens`` are truncated at the front
                (``'pre'``) instead of the back (``'post'``).
            num_words: Forwarded to ``Tokenizer`` as its vocabulary cap.
        """
        super().__init__(num_words=num_words)

        self.fit_on_texts(texts)

        self.tokens = self.texts_to_sequences(texts)

        # Inverse vocabulary for turning predicted ids back into words.
        self.index_to_word = {index: word for word, index in self.word_index.items()}

        self.num_tokens = [len(x) for x in self.tokens]

        # Fixed sequence length: mean + 2 standard deviations, truncated to int.
        self.max_tokens = int(np.mean(self.num_tokens) + 2 * np.std(self.num_tokens))

        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = 'pre'
        else:
            truncating = 'post'

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def tokens_to_string(self, tokens):
        """Convert a sequence of token ids to a space-separated sentence.

        Id 0 (padding) is skipped. Words are joined with a single space; the
        previous implementation joined with an empty string, which ran all
        words together.
        """
        words = [self.index_to_word[token] for token in tokens if token != 0]
        return " ".join(words)

    def token_to_word(self, token):
        """Return the word for ``token``; id 0 (padding) maps to a single space."""
        return " " if token == 0 else self.index_to_word[token]

    def text_to_tokens(self, text, reverse=False, padding=False):
        """Convert one string to a ``(1, n)`` integer array of token ids.

        Args:
            text: The string to tokenize.
            reverse: If true, reverse the token order and truncate at the
                front when padding.
            padding: Boolean (unlike the ``padding`` argument of ``__init__``).
                If true, the result is padded at the front / truncated to
                ``self.max_tokens`` columns.

        Returns:
            The token array. It is returned for ``padding=False`` as well;
            previously that path fell through and returned ``None``.
        """
        tokens = np.array(self.texts_to_sequences([text]))

        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = "pre"
        else:
            truncating = "post"

        if padding:
            # Padding side is always "pre" here, independent of the mode the
            # tokenizer was constructed with.
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding="pre",
                                   truncating=truncating)

        return tokens

In [48]:
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
# Properties of machine translation https://arxiv.org/pdf/1409.1259.pdf
# Fit one tokenizer per language on (at most) the first 100000 sentences.
# Source (Danish) sequences are reversed and padded at the front; target
# (English) sequences keep their order and are padded at the back.
da_tokenizer = Wrapper(texts=da_data[:100000],padding="pre",reverse=True, num_words=num_words)
eng_tokenizer = Wrapper(texts=eng_data[:100000],padding="post",reverse=False, num_words=num_words)

In [49]:
# Show the first padded English sequence; the output below is how the ids of
# the start/end marker words can be read off.
eng_tokens = eng_tokenizer.tokens_padded
print(eng_tokens[:1])

[[   2 4535    4    1  928    3    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]]


In [50]:
# Ids of the start/end marker words in the English vocabulary. They are looked
# up instead of hard-coded (previously 2 and 3) so they stay correct if the
# corpus or vocabulary size changes. word_index keys are lower-cased because
# the tokenizer is built with Keras' default lower=True.
token_start = eng_tokenizer.word_index[mark_start.strip().lower()]
token_end = eng_tokenizer.word_index[mark_end.strip().lower()]

In [51]:
# Padded integer matrices, one row per sentence; the two languages have
# different widths because each tokenizer computes its own max_tokens.
da_tokens = da_tokenizer.tokens_padded
eng_tokens = eng_tokenizer.tokens_padded
print(eng_tokens.shape)
print(da_tokens.shape)

(50000, 57)
(50000, 50)


In [52]:
# Spot-check the inverse vocabulary: print the word stored under id 1.
print(eng_tokenizer.token_to_word(1))

the


In [53]:
# shift technique for providing decoder data
# https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/
encoder_input_data = da_tokens
# Decoder input drops the last column and the target drops the first, so the
# target at position t is the decoder-input token at position t + 1.
decoder_input_data = eng_tokens[:, :-1]
decoder_output_data = eng_tokens[:, 1:]

Building encoder neural architecture  
1. Reference:  
Keras documentation: https://keras.io/layers/about-keras-layers/  
Keras open source code: https://github.com/keras-team/keras/blob/master/keras/layers/core.py#L796  

In [54]:
# Building blocks of the encoder (layer objects only; they are wired together
# in assemble_encoder).
# Variable-length sequence of integer token ids.
encoder_input = Input(shape=(None,), name="encoder_input")
embedding_size = 128
encoder_embedding = Embedding(input_dim=num_words,output_dim=embedding_size, name="encoder_embedding")
# Number of GRU units; also the size of the state handed to the decoder.
state_size = 512

# The first two GRUs emit the full sequence so the next GRU can consume it;
# the last one emits only its final output (the single "thought vector").
E_GRU1 = GRU(state_size, name='E_GRU1',return_sequences=True)
E_GRU2 = GRU(state_size, name='E_GRU2',return_sequences=True)
E_GRU3 = GRU(state_size, name='E_GRU3', return_sequences=False)

In [55]:
# assembling the encoder
def assemble_encoder():
    """Wire the module-level encoder layers together and return the output tensor.

    The graph is: encoder_input -> encoder_embedding -> E_GRU1 -> E_GRU2 -> E_GRU3.
    """
    # Token ids -> dense embedding vectors.
    hidden = encoder_embedding(encoder_input)

    # Feed the sequence through the stacked GRU layers in order.
    for gru_layer in (E_GRU1, E_GRU2, E_GRU3):
        hidden = gru_layer(hidden)

    return hidden

# Output tensor of the assembled encoder graph (a tensor, not yet a Model).
encoder_output = assemble_encoder()

Building decoder neural architecture  
1. Reference:  
Keras documentation: https://keras.io/layers/about-keras-layers/    
Keras open source code: https://github.com/keras-team/keras/blob/master/keras/layers/core.py#L796   

In [56]:
# Basic building blocks inspired by keras open source layers documentation
# Stand-alone state input, used when the decoder is run without the encoder
# graph attached (see model_decoder).
decoder_initial_state = Input(shape=(state_size,), name='decoder_initial_state')
# Variable-length sequence of integer token ids.
decoder_input = Input(shape=(None, ), name='decoder_input')
decoder_embedding = Embedding(input_dim=num_words,output_dim=embedding_size, name='decoder_embedding')

# All decoder GRUs return full sequences: one output vector per time step.
# inspired by https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/
D_GRU1 = GRU(state_size, name='D_GRU1', return_sequences=True)
D_GRU2 = GRU(state_size, name='D_GRU2', return_sequences=True)
D_GRU3 = GRU(state_size, name='D_GRU3', return_sequences=True)

# Linear activation: the layer outputs one raw score per vocabulary entry
# (no softmax is applied here).
decoder_dense = Dense(num_words, activation='linear', name='decoder_output')

In [57]:
# assembling decoder with initial state
def assemble_decoder(initial_state):
    """Wire the module-level decoder layers together and return the output tensor.

    Args:
        initial_state: Tensor passed as ``initial_state`` to each of the three
            decoder GRU layers.

    Returns:
        The output tensor of ``decoder_dense`` applied to the last GRU's sequence.
    """
    # Token ids -> dense embedding vectors.
    hidden = decoder_embedding(decoder_input)

    # Every GRU layer is started from the same initial state.
    for gru_layer in (D_GRU1, D_GRU2, D_GRU3):
        hidden = gru_layer(hidden, initial_state=initial_state)

    # Project every time step onto the vocabulary.
    return decoder_dense(hidden)

# Decoder graph for training: its GRUs start from the encoder's output tensor.
decoder_output = assemble_decoder(initial_state=encoder_output)

In [58]:
# Reference: Keras Model API

# Complete architecture: end-to-end training model (encoder + decoder).
# It must be built BEFORE decoder_output is rebound below.
DA_ENG_MODEL = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

# Encoder model: maps source tokens to the encoder output used as decoder state.
model_encoder = Model(inputs=[encoder_input],outputs=[encoder_output])

# Second decoder graph over the SAME layer objects (so it shares their weights),
# but started from the stand-alone decoder_initial_state input.
# Note: this rebinds the name decoder_output.
decoder_output = assemble_decoder(initial_state=decoder_initial_state)

# Decoder model used at inference time: (tokens so far, state) -> scores.
model_decoder = Model(inputs=[decoder_input, decoder_initial_state],outputs=[decoder_output])


Loss function: sparse cross-entropy loss  
Reference:  
https://keras.io/losses/

In [59]:
# defining sparse cross entropy loss
# reference: https://github.com/keras-team/keras/tree/master/docs
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between labels and logits.

    Args:
        y_true: Labels, passed as ``labels`` to
            ``tf.nn.sparse_softmax_cross_entropy_with_logits``.
        y_pred: Un-normalised scores, passed as ``logits`` to the same op.

    Returns:
        A scalar tensor: the mean of the per-element losses.
    """
    per_element_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true,
        logits=y_pred,
    )
    return tf.reduce_mean(per_element_loss)

In [60]:
# RMSprop optimizer
optimizer = RMSprop(lr=1e-3)
# Integer placeholder for the target token ids, handed to compile() as the
# target tensor so the custom loss receives int32 labels.
# NOTE(review): presumably needed because the loss takes integer ids rather
# than targets shaped like the predictions - confirm against the Keras docs.
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

#compiling model
DA_ENG_MODEL.compile(optimizer=optimizer,loss=sparse_cross_entropy, target_tensors=[decoder_target])

In [61]:
# Inputs and targets, keyed by the names of the model's input/output layers.
x_data = {
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data,
}
y_data = {'decoder_output': decoder_output_data}

# Fraction of the data that corresponds to 10000 held-out validation samples.
validation_split = 10000 / len(encoder_input_data)
print(validation_split)

0.2


In [62]:
# Train for a single epoch in batches of 512, holding out the validation
# fraction computed above.
DA_ENG_MODEL.fit(x=x_data, y=y_data, batch_size=512, epochs=1, validation_split=validation_split)

Train on 40000 samples, validate on 10000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f2e7888c5f8>

In [65]:
# Translate text by index in danish source language
# inspired by https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
def translate(input_text, true_output_text=None):
    """Greedily translate one Danish sentence to English and print the result.

    The sentence is tokenized with ``da_tokenizer`` (reversed and padded),
    encoded with ``model_encoder``, and then decoded one token per step with
    ``model_decoder``, always taking the highest-scoring token. Decoding stops
    when the end marker is predicted or after ``eng_tokenizer.max_tokens``
    steps.

    Args:
        input_text: Source sentence (string).
        true_output_text: Optional reference translation; printed after the
            prediction when given.

    Returns:
        None. The input, the translation and (optionally) the reference are
        printed.
    """
    # Tokenize the source sentence (reversed, padded to the tokenizer's max length).
    input_tokens = da_tokenizer.text_to_tokens(text=input_text, reverse=True, padding=True)

    # Run the encoder; its output is the initial state for the decoder GRUs.
    initial_state = model_encoder.predict(input_tokens)

    # Upper bound on the number of decoding steps.
    max_tokens = eng_tokenizer.max_tokens

    # Decoder input buffer, filled in one token per step; unfilled slots stay 0.
    # The builtin int replaces the np.int alias, which NumPy deprecated in 1.20
    # and removed in 1.24 (same dtype).
    decoder_tokens = np.zeros(shape=(1, max_tokens), dtype=int)

    token_int = token_start
    output_words = []
    count_tokens = 0

    while token_int != token_end and count_tokens < max_tokens:
        # Append the most recent token to the decoder input.
        decoder_tokens[0, count_tokens] = token_int

        decoder_inputs = {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_tokens
        }

        # The whole buffer is re-run each step; only the scores at the current
        # position are used.
        decoder_output = model_decoder.predict(decoder_inputs)
        token_scores = decoder_output[0, count_tokens, :]

        # Greedy choice: the vocabulary id with the highest score.
        token_int = int(np.argmax(token_scores))
        count_tokens += 1

        if token_int == token_end:
            # The end marker terminates decoding and is not part of the text.
            break
        if token_int != 0:
            # Id 0 is padding, not a word; keep decoding but do not print it.
            output_words.append(eng_tokenizer.token_to_word(token_int))

    output_text = " ".join(output_words)

    print("Input text:")
    print(input_text)
    print()

    print("Translated text:")
    print(output_text)
    print()

    if true_output_text is not None:
        print("True output text:")
        print(true_output_text)
        print()

In [66]:
# Translate the sentence at index 2 and print the reference next to it.
x = 2
translate(input_text=da_data[x],true_output_text=eng_data[x])

Input text:
Som De kan se, indfandt det store "år 2000-problem" sig ikke. Til gengæld har borgerne i en del af medlemslandene været ramt af meget forfærdelige naturkatastrofer.

Translated text:
 i the the the the the the                                                                                                    

True output text:
SSSS Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful. EEEE

