# Machine Translation
    
    
Our model consists of two parts: an encoder, which maps the source text to a vector that summarizes the text's content, and a decoder, which takes that vector as input and decodes it into the destination text.

## Imports

In [1]:
import os
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input , Dense, GRU , Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping , ModelCheckpoint , TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
tf.__version__

'2.9.1'

In [4]:
tf.keras.__version__

'2.9.0'

## Load Data 

In [5]:
startMark = "ssss "
endMark = " eeee"

In [6]:
ls

 Volume in drive H is Courses
 Volume Serial Number is 4408-8541

 Directory of H:\Projects\Machine Translation

09/18/2022  02:33 PM    <DIR>          .
09/18/2022  02:33 PM    <DIR>          ..
09/16/2022  05:22 PM    <DIR>          .ipynb_checkpoints
09/16/2022  05:52 PM    <DIR>          __pycache__
05/15/2012  03:19 AM       295,533,752 europarl-v7.da-en.da
05/15/2012  03:19 AM       291,801,750 europarl-v7.da-en.en
09/18/2022  02:33 PM            28,670 Untitled.ipynb
               3 File(s)    587,364,172 bytes
               4 Dir(s)  94,831,833,088 bytes free


In [7]:
with open('europarl-v7.da-en.da', encoding='utf-8') as f:
    # One entry per line of the source-language file, surrounding whitespace removed.
    dataSrc = list(map(str.strip, f))

In [8]:
with open('europarl-v7.da-en.en', encoding='utf-8') as f:
    # Each stripped destination line is wrapped in the start/end marker words
    # so the decoder can learn where a sentence begins and ends.
    dataDest = [f"{startMark}{line.strip()}{endMark}" for line in f]

In [9]:
dataSrc[2]

'Som De kan se, indfandt det store "år 2000-problem" sig ikke. Til gengæld har borgerne i en del af medlemslandene været ramt af meget forfærdelige naturkatastrofer.'

In [10]:
dataDest[2]

"ssss Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful. eeee"

We need a few more functions than provided by Keras' Tokenizer-class so we wrap it.


In [11]:
num_words = 10000

In [12]:
class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that also keeps the padded token matrix of its corpus.

    On construction the tokenizer is fitted on ``texts``, the texts are
    converted to integer-token sequences (optionally reversed) and the
    sequences are padded/truncated to a common length.

    Attributes set by ``__init__``:
        index_to_word: dict mapping integer token -> word (inverse of ``word_index``).
        tokens: list of token lists, one per text (reversed when ``reverse=True``).
        numTokens: list with the length of every entry of ``tokens``.
        maxTokens: int, the length every sequence is padded/truncated to.
        tokenPadded: result of ``pad_sequences`` applied to ``tokens``.
    """

    def __init__(self, texts, padding, reverse=False, num_words=None):
        """Fit the tokenizer on ``texts`` and pre-compute the padded tokens.

        Args:
            texts: list of strings to fit on and to tokenize.
            padding: ``'pre'`` or ``'post'``; forwarded to ``pad_sequences``.
            reverse: if True, every token sequence is reversed before padding.
            num_words: maximum vocabulary size, forwarded to ``Tokenizer``.

        Raises:
            ValueError: if ``texts`` yields no sequences, because the padded
                length cannot be derived from an empty corpus.
        """
        Tokenizer.__init__(self, num_words=num_words)

        self.fit_on_texts(texts)

        # Inverse lookup: integer token -> word.
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))

        self.tokens = self.texts_to_sequences(texts)

        if not self.tokens:
            # Without this guard np.mean([]) is NaN and int(NaN) raises a
            # far less helpful ValueError further down.
            raise ValueError("texts must contain at least one sequence")

        if reverse:
            # Reverse every token sequence.
            self.tokens = [list(reversed(x)) for x in self.tokens]
            # Sequences that are too long are truncated at the beginning,
            # which corresponds to the end of the original sequence.
            truncating = 'pre'
        else:
            # Sequences that are too long are truncated at the end.
            truncating = 'post'

        self.numTokens = [len(x) for x in self.tokens]

        # mean + 2 * std covers roughly 95% of the sequences when lengths are
        # approximately normally distributed, so only a small share of the
        # sequences has to be truncated.
        self.maxTokens = np.mean(self.numTokens) + 2 * np.std(self.numTokens)
        self.maxTokens = int(self.maxTokens)

        self.tokenPadded = pad_sequences(self.tokens, maxlen=self.maxTokens,
                                         padding=padding, truncating=truncating)

    def token_to_word(self, token):
        """Look up the word for a single integer token.

        Token 0 is the padding value and maps to a single space. Any other
        token missing from ``index_to_word`` raises ``KeyError``.
        """
        return " " if token == 0 else self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Convert a sequence of integer tokens to a space-separated string.

        Padding tokens (0) are skipped.
        """
        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]
        return " ".join(words)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """Convert a string (or a list of strings) to integer tokens.

        Args:
            text: a single string, or a list of strings.
            reverse: if True, reverse each token sequence.
            padding: if True, pad/truncate every sequence to ``maxTokens``.
                Padding is always added at the beginning (``'pre'``).

        Returns:
            2-D array with one row per input text. A single string therefore
            yields shape ``(1, n)``. Without ``padding`` a list of texts with
            differing token counts cannot form a rectangular array.
        """
        # texts_to_sequences expects a list of texts. A bare string would be
        # iterated character by character, producing one sequence per
        # character, so a single string is wrapped in a list first.
        texts = [text] if isinstance(text, str) else list(text)
        tokens = self.texts_to_sequences(texts)

        if reverse:
            # Reverse per sequence so batches of unequal length also work.
            tokens = [seq[::-1] for seq in tokens]
            truncating = 'pre'
        else:
            truncating = 'post'

        if padding:
            return pad_sequences(tokens, maxlen=self.maxTokens,
                                 truncating=truncating, padding='pre')

        return np.array(tokens)


Note that we pad zeros at the beginning ('pre') of the sequences. We also reverse the sequences of tokens because the research literature suggests that this might improve performance, because the last words seen by the encoder match the first words produced by the decoder, so short-term dependencies are supposedly modelled more accurately.

In [13]:
%%time
tokenizerSrc = TokenizerWrap(texts=dataSrc,
                              padding='pre',
                              reverse=True,
                              num_words=num_words)

Wall time: 2min 33s


In [14]:
%%time
tokenizerDest = TokenizerWrap(texts=dataDest,
                              padding='post',
                              reverse=False,
                              num_words=num_words)

Wall time: 1min 31s


In [15]:
tokenSrc = tokenizerSrc.tokenPadded
tokenDest = tokenizerDest.tokenPadded
print(f'The shape of Source tokens is {tokenSrc.shape}')
print(f'The shape of Destination tokens is {tokenDest.shape}')

The shape of Source tokens is (1968800, 47)
The shape of Destination tokens is (1968800, 55)


In [16]:
startMark.strip()

'ssss'

In [17]:
startToken = tokenizerDest.word_index[startMark.strip()]
startToken

2

This is the integer token used to mark the beginning of a sentence in the destination language.

In [18]:
endToken = tokenizerDest.word_index[endMark.strip()]
endToken

3

### Example of Tokens 

In [19]:
idx = 2

In [20]:
tokenSrc[idx]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 3069,
       3374,   43,    7, 1386,  108, 1995,    7,  178,    9,    3,  302,
         19, 2076,    8,   20,   39,  285,  499,   69,  136,    5,  166,
         24,   10,   13])

Note how it is padded with zeros at the beginning.

In [21]:
tokenizerSrc.tokens_to_string(tokenSrc[idx])

'naturkatastrofer forfærdelige meget af ramt været medlemslandene af del en i borgerne har gengæld til ikke sig problem 2000 år store det se kan de som'

In [22]:
dataSrc[idx]

'Som De kan se, indfandt det store "år 2000-problem" sig ikke. Til gengæld har borgerne i en del af medlemslandene været ramt af meget forfærdelige naturkatastrofer.'

In [23]:
tokenDest[idx]

array([   2,  404,   19,   43,   26,   20,  618,    1, 1451,    5, 9785,
        174,    1,   81,    7,    9,  214,    4,   67, 2200,    9, 1596,
          4,  892, 1762,    8, 1480,  107, 5494,    3,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [24]:
tokenizerDest.tokens_to_string(tokenDest[idx])

'ssss although as you will have seen the failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful eeee'

In [25]:
dataDest[idx]

"ssss Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful. eeee"

## Training Data

In [78]:
encoderInputData = tokenSrc
encoderInputData.shape

(1968800, 47)

In [76]:
# Decoder input: every destination sequence without its last column, so it
# has the same length as the target, which is the same sequence shifted one
# step to the left.
decoderInputData = tokenDest[:,:-1]
decoderInputData.shape

(1968800, 54)

In [81]:
# Decoder target: every destination sequence without its first column, i.e.
# at each position the token that follows the corresponding decoder input.
decoderOutData = tokenDest[:,1:]
decoderOutData.shape

(1968800, 54)

In [79]:
idx = 2

In [80]:
decoderInputData[idx]

array([   2,  404,   19,   43,   26,   20,  618,    1, 1451,    5, 9785,
        174,    1,   81,    7,    9,  214,    4,   67, 2200,    9, 1596,
          4,  892, 1762,    8, 1480,  107, 5494,    3,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [82]:
decoderOutData[idx]

array([ 404,   19,   43,   26,   20,  618,    1, 1451,    5, 9785,  174,
          1,   81,    7,    9,  214,    4,   67, 2200,    9, 1596,    4,
        892, 1762,    8, 1480,  107, 5494,    3,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [83]:
tokenizerDest.tokens_to_string(decoderInputData[idx])

'ssss although as you will have seen the failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful eeee'

In [84]:
tokenizerDest.tokens_to_string(decoderOutData[idx])

'although as you will have seen the failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful eeee'

## Create the Neural Network

### 1) Create the Encoder 

First we create the encoder part of our neural network, which maps a sequence of integer tokens to a vector.

In [96]:
encoderInput = Input(shape=(None,),name='encoderInput')

In [97]:
embeddingSize = 128

In [98]:
encoderEmbedding = Embedding(input_dim=num_words , output_dim=embeddingSize
                            ,name='encoder_embedding')

In [99]:
stateSize = 512

In [100]:
encoderGru1 = GRU(stateSize , return_sequences=True , name='encoderGRU1')
encoderGru2 = GRU(stateSize , return_sequences=True , name='encoderGRU2')
encoderGru3 = GRU(stateSize , return_sequences=False , name='encoderGRU3')

In [101]:
def encoderConnection():
    """Wire the encoder layers together and return the encoder's output tensor.

    The chain is: input -> embedding -> GRU1 -> GRU2 -> GRU3.
    """
    # Embed the integer tokens coming from the encoder's input layer.
    hidden = encoderEmbedding(encoderInput)

    # Feed the result through the stacked GRU layers in order.
    for gruLayer in (encoderGru1, encoderGru2, encoderGru3):
        hidden = gruLayer(hidden)

    # Output of the last GRU is the output of the encoder.
    return hidden
    

In [102]:
encoderOutput = encoderConnection()

### 2) Create the Decoder

The decoder takes two inputs. First, it needs the vector produced by the encoder.

In [103]:
decoderIntialState = Input(shape=(stateSize,),name='decoderIntialState')

Second, it needs a sequence of integer tokens for the destination language.

In [104]:
decoderInput = Input(shape=(None,),
                              name='decoderInput')

In [105]:
decoderEmbedding = Embedding(input_dim = num_words,
                            output_dim = embeddingSize,
                            name='decoderEmbedding')

In [106]:
decoderGru1 = GRU(stateSize , return_sequences=True , name='decoderGRU1')
decoderGru2 = GRU(stateSize , return_sequences=True , name='decoderGRU2')
decoderGru3 = GRU(stateSize , return_sequences=True , name='decoderGRU3')

In [132]:
decoderDense = Dense(num_words,
                    activation='softmax',
                    name='decoderOutput')

In [133]:
def decoderConnection(initailState_param):
    """Wire the decoder layers together and return the decoder's output tensor.

    Args:
        initailState_param: tensor used as the initial state of every
            decoder GRU layer.

    Returns:
        Output of the final dense layer applied to the last GRU's sequence.
    """
    # Embed the integer tokens coming from the decoder's input layer.
    hidden = decoderEmbedding(decoderInput)

    # Every GRU layer is started from the same initial state.
    for gruLayer in (decoderGru1, decoderGru2, decoderGru3):
        hidden = gruLayer(hidden, initial_state=initailState_param)

    return decoderDense(hidden)

In [134]:
decoderOutput = decoderConnection(initailState_param=encoderOutput)

In [135]:
modelTrain = Model(inputs=[encoderInput, decoderInput],
                    outputs=[decoderOutput])


In [136]:
modelEncoder = Model(inputs=[encoderInput],outputs=[encoderOutput])

In [137]:
# Stand-alone decoder: the same decoder layers, but started from an explicit
# initial-state input instead of the encoder's output.
# NOTE: this rebinds `decoderOutput`, which previously held the output tensor
# used to build `modelTrain`.
decoderOutput = decoderConnection(decoderIntialState)

decoderModel = Model(inputs=[decoderInput,decoderIntialState],
                    outputs=[decoderOutput])

In [148]:
modelTrain.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoderInput (InputLayer)      [(None, None)]       0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, None, 128)    1280000     ['encoderInput[0][0]']           
                                                                                                  
 encoderGRU1 (GRU)              (None, None, 512)    986112      ['encoder_embedding[0][0]']      
                                                                                                  
 decoderInput (InputLayer)      [(None, None)]       0           []                               
                                                                                           

In [152]:
tf.keras.utils.plot_model(
    modelTrain,
    to_file="model.png",
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=False,
)


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [138]:
modelTrain.compile(optimizer=RMSprop(learning_rate=1e-3),
                  loss='sparse_categorical_crossentropy')

In [139]:
# Checkpoint callback: monitors the validation loss and, with
# save_best_only=True, keeps only the best result; only the weights are
# written to `checkPath`.
checkPath = '21_checkpoint.keras'
checkPoint = ModelCheckpoint(filepath=checkPath,
                            monitor='val_loss',
                            verbose=1,
                            save_weights_only=True,
                            save_best_only=True)

Stop the optimization when performance on the validation set stops improving.

In [140]:
earlyStop = EarlyStopping(monitor='val_loss',
                         patience=3,
                         verbose=1)

In [141]:
callTensorBoard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [142]:
callBacks = [checkPoint,
            earlyStop,
            callTensorBoard]

### Load check point

In [143]:
# Best-effort resume: try to restore weights from `checkPath`. Any failure
# (e.g. the file does not exist yet) is only printed, so the notebook
# continues with the model's current weights.
try:
    modelTrain.load_weights(checkPath)
except Exception as e:
    print('Error with loading weights')
    print(e)

Error with loading weights
Unable to open file (unable to open file: name = '21_checkpoint.keras', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


### Training Model

In [144]:
xData = {
    'encoderInput':encoderInputData,
    'decoderInput':decoderInputData
}

In [145]:
yData= {
    'decoderOutput':decoderOutData
}

In [146]:
# Fraction of the data set that corresponds to 10000 samples; passed to
# `fit` as `validation_split`.
validationSplit = 10000/ len(encoderInputData)
validationSplit

0.0050792360828931325

In [147]:
# Train on the named input/output dictionaries. `validationSplit` of the
# samples is held out for validation, and the callbacks handle
# checkpointing, early stopping and TensorBoard logging.
modelTrain.fit(x=xData,
               y=yData,
               epochs=10,
              batch_size=284,
              validation_split=validationSplit,
              callbacks=callBacks)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
  22/6898 [..............................] - ETA: 20:51:19 - loss: 4.1387

KeyboardInterrupt: 