Tutorial from: https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/

# 1 Preparing Text Data
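mwp (math word problem) - takes the place of English (eng) in the tutorial
eqn (equation) - takes the place of German (ger) in the tutorial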

In [8]:
import pandas as pd
import numpy as np

In [1]:
# clean text
def load_doc(fname):
    """Read a UTF-8 text file and return its entire contents.

    Args:
        fname: Path of the text file to read.

    Returns:
        The decoded contents of the file as a single string.
    """
    # The context manager closes the handle even when read() raises.
    with open(fname, mode='rt', encoding='utf-8') as file:
        return file.read()

In [2]:
# split loaded text document into sentences
def to_pairs(doc):
    """Split a document into lines and each line into tab-separated fields.

    Args:
        doc: Text with one record per line and fields separated by tabs.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's tab-separated fields.
    """
    pairs = []
    # Leading/trailing whitespace is stripped from the document as a whole,
    # not from the individual lines.
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

In [3]:
def extract_pairs_csv(fname):
    """Load (source, target) text pairs from a JSON records file.

    Despite the ``_csv`` suffix, the file is parsed as JSON. Each record
    must provide the keys ``Body``, ``Question`` and ``Equation``.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        A list of ``(text, equation)`` tuples, where ``text`` is the
        record's ``Body`` and ``Question`` joined by a single space.
    """
    # Hand the open handle to pandas rather than a JSON string: newer pandas
    # versions deprecate passing literal JSON text to read_json().
    with open(fname, 'r', encoding='utf-8') as data_file:
        df = pd.read_json(data_file, orient='records')

    questions = df["Body"] + " " + df["Question"]
    return list(zip(questions, df["Equation"]))

In [4]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [10]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every string of every pair and return them as an array.

    Each string is reduced to ASCII (accented characters lose their
    accents, other non-ASCII characters are dropped), split on whitespace,
    lower-cased, stripped of non-printable characters and re-joined with
    single spaces. Punctuation is kept.

    Args:
        lines: Iterable of pairs (sequences of strings).

    Returns:
        A numpy array built from the list of cleaned pairs.
    """
    # Matches any character that is not in string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))

    def scrub(text):
        # Decompose accented characters, then drop everything non-ASCII.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        # Lower-case each whitespace-separated token and remove non-printables.
        tokens = [non_printable.sub('', token.lower()) for token in ascii_text.split()]
        return ' '.join(tokens)

    return array([[scrub(text) for text in pair] for pair in lines])

In [11]:
# save a list of clean sentences to file
def save_clean_data(sentences, fname):
    """Pickle ``sentences`` to ``fname`` and print a confirmation line.

    Args:
        sentences: Object to serialise (here, the array of cleaned pairs).
        fname: Path of the output file; it is overwritten if it exists.
    """
    # Close the handle deterministically so the data is flushed to disk
    # before the confirmation is printed.
    with open(fname, 'wb') as out_file:
        dump(sentences, out_file)
    print(f'saved {fname}')

LOAD THE DATASET

In [12]:
# Load the raw (question, equation) pairs from the SVAMP JSON file.
filename = '../Datasets/SVAMP.json'
pairs = extract_pairs_csv(filename)
# Keep the result under its own name: assigning it to `clean_pairs` would
# replace the clean_pairs() function, so this cell could not be run twice.
cleaned_pairs = clean_pairs(pairs)
save_clean_data(cleaned_pairs, 'SVAMP.pkl')

# check: print the first 100 cleaned pairs as "question => equation"
for i in range(100):
    print(f'{cleaned_pairs[i,0]} => {cleaned_pairs[i,1]}')

saved SVAMP.pkl
each pack of dvds costs 76 dollars. if there is a discount of 25 dollars on each pack how much do you have to pay to buy each pack? => ( 76.0 - 25.0 )
dan had $ 3 left with him after he bought a candy bar. if he had $ 4 at the start how much did the candy bar cost? => ( 4.0 - 3.0 )
paco had 26 salty cookies and 17 sweet cookies. he ate 14 sweet cookies and 9 salty cookies. how many salty cookies did paco have left? => ( 26.0 - 9.0 )
43 children were riding on the bus. at the bus stop some children got off the bus. then there were 21 children left on the bus. how many children got off the bus at the bus stop? => ( 43.0 - 21.0 )
28 children were riding on the bus. at the bus stop 82 children got on the bus while some got off the bus. then there were 30 children altogether on the bus. how many more children got on the bus than those that got off? => ( 30.0 - 28.0 )
there were 3 dollars in olivia's wallet. she collected 49 more dollars from an atm. after she visited a super

# 2 Split Text
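The dataset used in the original tutorial is very long: 150,000 phrase pairs. That is still good for a small NMT model.
The tutorial simplifies the problem by reducing the dataset to only the first 10,000 examples in the file; these are the shortest phrases in the dataset. The SVAMP data used here has only 1,000 pairs, so we keep all of them.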

In [14]:
# number of raw (question, equation) pairs returned by extract_pairs_csv()
len(pairs)

1000

In [16]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file, e.g. one written by
            save_clean_data().

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    with open(filename, 'rb') as in_file:
        return load(in_file)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialise (here, an array of cleaned pairs).
        filename: Path of the output file; it is overwritten if it exists.
    """
    # Close the handle deterministically so the data is flushed to disk
    # before the confirmation is printed.
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print(f'Saved: {filename}')
 
# load the cleaned pairs that were pickled to SVAMP.pkl
raw_dataset = load_clean_sentences('SVAMP.pkl')
 
# keep at most the first n_sentences rows (all columns)
n_sentences = 1000
dataset = raw_dataset[:n_sentences, :]
# shuffle the rows in place; no seed is set, so the order (and therefore
# the train/test split) differs from run to run
shuffle(dataset)
# split into train/test: first 900 rows for training, the remainder for testing
train, test = dataset[:900], dataset[900:]
# save -- note that SVAMP.pkl is overwritten with the shuffled rows
save_clean_data(dataset, 'SVAMP.pkl')
save_clean_data(train, 'SVAMPtrain.pkl')
save_clean_data(test, 'SVAMPtest.pkl')

Saved: SVAMP.pkl
Saved: SVAMPtrain.pkl
Saved: SVAMPtest.pkl


In [17]:
# preview the first 50 training rows as "question => equation"
for i in range(50):
    print(f'{train[i,0]} => {train[i,1]}')

ed had 29 more marbles than doug. ed lost 17 of his marbles at the playground. how many more marbles did ed have than doug then? => ( 29.0 - 17.0 )
paige was helping her mom plant flowers and together they planted some seeds. they put 10 seeds in each flower bed. if there are 45 flowerbeds how many seeds did they plant? => ( 10.0 * 45.0 )
helen the hippo and her friends are preparing for thanksgiving at helen's house. helen baked 90 chocolate chip cookies yesterday and 51 raisin cookies and 484 chocolate chip cookies this morning. how many chocolate chip cookies did helen bake? => ( 90.0 + 484.0 )
helen the hippo and her friends are preparing for thanksgiving at helen's house. helen baked 527 chocolate chip cookies and 86 raisin cookies yesterday. and she baked 86 raisin cookies and 554 chocolate chip cookies this morning. how many chocolate chip cookies did helen bake? => ( 527.0 + 554.0 )
they decided to hold the party in their backyard. if they have 11 sets of tables and each set ha

Running the cell above writes three files: SVAMP.pkl (the counterpart of the tutorial's english-german-both.pkl), which contains all of the train and test examples and which we can use to define the parameters of the problem, such as max phrase lengths and the vocabulary, and SVAMPtrain.pkl and SVAMPtest.pkl (the counterparts of english-german-train.pkl and english-german-test.pkl) for the train and test datasets.

We are now ready to start developing our translation model.

# 3 Train Neural Machine Translation Model

In [18]:
# import necessary modules
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from pickle import load
from numpy import array
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [19]:
# load datasets: the full shuffled set plus the train and test splits
# pickled by the split cell
dataset = load_clean_sentences('SVAMP.pkl')
train = load_clean_sentences('SVAMPtrain.pkl')
test = load_clean_sentences('SVAMPtest.pkl')

We will use separate tokenizers for the word-problem sequences (the tutorial's English) and the equation sequences (the tutorial's German).

In [20]:
# fit a tokenizer 
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it.

    Args:
        lines: The texts to build the vocabulary from.

    Returns:
        The fitted ``Tokenizer``.
    """
    # NOTE(review): Tokenizer() is built with its defaults; the Keras default
    # `filters` removes most punctuation, which presumably also strips the
    # operators and brackets from equation strings -- confirm this is wanted.
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest number of whitespace-separated tokens in ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The token count of the longest line, or 0 when ``lines`` is empty.
    """
    # default=0 keeps an empty dataset from raising ValueError inside max().
    return max((len(line.split()) for line in lines), default=0)

In [23]:
# prepare the MWP (word problem) tokenizer -- the "English" side of the tutorial
mwp_tokenizer = create_tokenizer(dataset[:, 0])
# +1 leaves room for index 0, which encode_sequences() uses as padding
mwp_vocab_size = len(mwp_tokenizer.word_index) + 1
# longest word problem, counted in whitespace-separated tokens
mwp_length = max_length(dataset[:, 0])
print('MWP Vocabulary Size: %d' % mwp_vocab_size)
print('MWP Max Length: %d' % (mwp_length))

# prepare the equation tokenizer -- the "German" side of the tutorial
# NOTE(review): create_tokenizer() uses the Tokenizer() defaults; the Keras
# default `filters` drops punctuation, which presumably removes operators and
# brackets from the equations -- confirm this is intended.
eqn_tokenizer = create_tokenizer(dataset[:, 1])
eqn_vocab_size = len(eqn_tokenizer.word_index) + 1
# longest equation, counted in whitespace-separated tokens
eqn_length = max_length(dataset[:, 1])
print('Equation Vocabulary Size: %d' % eqn_vocab_size)
print('Equation Max Length: %d' % (eqn_length))

MWP Vocabulary Size: 1237
MWP Max Length: 57
Equation Vocabulary Size: 350
Equation Max Length: 9


We are now ready to prepare the training dataset.

Each input and output sequence must be encoded to integers and padded to the maximum phrase length. This is because we will use a word embedding for the input sequences and one-hot encode the output sequences. The function below, named encode_sequences(), will perform these operations and return the result.

In [24]:
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer sequences padded to a common length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` on the integer-encoded texts.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    # padding='post' appends the 0s after the tokens instead of before them
    return pad_sequences(token_ids, maxlen=length, padding='post')

Now we have to one-hot encode it, because we'll predict the probability of each word in the vocabulary as output.

https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical

In [25]:
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence.

    Args:
        sequences: 2-D array of integer token ids (its first two shape
            entries give the dimensions of the result).
        vocab_size: Number of classes passed to ``to_categorical``.

    Returns:
        An array of shape ``(sequences.shape[0], sequences.shape[1],
        vocab_size)``.
    """
    one_hot = array([
        to_categorical(seq, num_classes=vocab_size) for seq in sequences
    ])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

In [26]:
# training data: equations are the model input (X); word problems are the
# one-hot encoded output (Y)
trainX = encode_sequences(eqn_tokenizer, eqn_length, train[:, 1])
trainY = encode_output(
    encode_sequences(mwp_tokenizer, mwp_length, train[:, 0]),
    mwp_vocab_size,
)

# validation data, encoded the same way from the test split
testX = encode_sequences(eqn_tokenizer, eqn_length, test[:, 1])
testY = encode_output(
    encode_sequences(mwp_tokenizer, mwp_length, test[:, 0]),
    mwp_vocab_size,
)

### Let's define model.

enc-dec LSTM

https://machinelearningmastery.com/configure-encoder-decoder-model-neural-machine-translation/

In [27]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder LSTM model.

    Args:
        src_vocab: Input dimension of the source embedding.
        tar_vocab: Size of the softmax output layer.
        src_timesteps: Length of the (padded) source sequences.
        tar_timesteps: Number of times the encoder output is repeated,
            i.e. the length of the output sequence.
        n_units: Embedding size and number of units in both LSTM layers.

    Returns:
        A ``Sequential`` model with the layers added in order.
    """
    stack = [
        # source embedding; mask_zero=True masks the 0 padding value
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # encoder: a single n_units LSTM
        LSTM(n_units),
        # repeat the encoder output tar_timesteps times
        RepeatVector(tar_timesteps),
        # decoder: n_units LSTM that returns the whole sequence
        LSTM(n_units, return_sequences=True),
        # softmax over the target vocabulary at every timestep
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model


In [28]:
# define model: equations are the source, word problems the target
model = define_model(eqn_vocab_size, mwp_vocab_size, eqn_length, mwp_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model; summary() prints the table itself, so wrapping it
# in print() only adds a stray "None" line
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)

# fit model, saving a checkpoint whenever the validation loss improves
# NOTE(review): the test split doubles as validation data, so the saved
# checkpoint is selected on the same data it is later evaluated on
filename = 'svampsmodel.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 256)            89600     
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 57, 256)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 57, 256)           525312    
                                                                 
 time_distributed (TimeDistr  (None, 57, 1237)         317909    
 ibuted)                                                         
                                                                 
Total params: 1,458,133
Trainable params: 1,458,133
Non-

<keras.callbacks.History at 0x25b2c62b490>

#  4 Evaluate Model

In [29]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word that ``tokenizer`` maps to the index ``integer``.

    Args:
        integer: Word index to look up.
        tokenizer: Object whose ``word_index`` maps words to indices.

    Returns:
        The first word whose index equals ``integer``, or ``None`` when no
        word has that index.
    """
    matches = (
        word for word, index in tokenizer.word_index.items() if index == integer
    )
    return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Predict the target text for a single encoded source sequence.

    Args:
        model: Model whose ``predict`` output is indexed per timestep.
        tokenizer: Tokenizer used to map predicted indices back to words.
        source: Encoded source sequence passed to ``model.predict``.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first timestep whose index has no word in the tokenizer.
    """
    # predict() returns one result per input; only the first one is used
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        # the highest-scoring index is taken as the predicted word id
        word = word_for_id(argmax(scores), tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

In [30]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence and print BLEU-1 to BLEU-4 scores.

    The first 10 source / reference / prediction triples are printed as a
    qualitative check.

    Args:
        model: Trained model handed to predict_sequence().
        tokenizer: Tokenizer of the *target* text, used to decode predictions.
        sources: Array of encoded source sequences, one row per example.
        raw_dataset: Rows aligned with ``sources``; each row unpacks into
            ``(reference target text, source text)``.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text; predict expects a batch axis
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer that was passed in (previously the global
        # mwp_tokenizer was used and this argument was silently ignored)
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu takes a list of references per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9, not 1.0 -- kept as in the source
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
# load datasets: the full shuffled set plus the train and test splits
dataset = load_clean_sentences('SVAMP.pkl')
train = load_clean_sentences('SVAMPtrain.pkl')
test = load_clean_sentences('SVAMPtest.pkl')
# prepare mwp tokenizer (column 0: word problems, the model's target text)
mwp_tokenizer = create_tokenizer(dataset[:, 0])
mwp_vocab_size = len(mwp_tokenizer.word_index) + 1
mwp_length = max_length(dataset[:, 0])
# prepare eqn tokenizer (column 1: equations, the model's source text)
# NOTE(review): the tokenizers are re-fitted here rather than reloaded, so
# they presumably need to be built from the same SVAMP.pkl that was used
# for training to match the saved model's vocabulary -- confirm
eqn_tokenizer = create_tokenizer(dataset[:, 1])
eqn_vocab_size = len(eqn_tokenizer.word_index) + 1
eqn_length = max_length(dataset[:, 1])

# prepare data: only the encoded sources are needed, the references are
# read as raw text inside evaluate_model()
trainX = encode_sequences(eqn_tokenizer, eqn_length, train[:, 1])
testX = encode_sequences(eqn_tokenizer, eqn_length, test[:, 1])

# load model from the checkpoint file written during training
model = load_model('svampsmodel.h5')
# test on some training sequences
print('train')
evaluate_model(model, mwp_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, mwp_tokenizer, testX, test)

train
src=[( 29.0 - 17.0 )], target=[ed had 29 more marbles than doug. ed lost 17 of his marbles at the playground. how many more marbles did ed have than doug then?], predicted=[had had had the the the the the the the and and and and how how how how how how]
src=[( 10.0 * 45.0 )], target=[paige was helping her mom plant flowers and together they planted some seeds. they put 10 seeds in each flower bed. if there are 45 flowerbeds how many seeds did they plant?], predicted=[had had had the and the the the the the the the the the and and and and and how how how how how how how how]
src=[( 90.0 + 484.0 )], target=[helen the hippo and her friends are preparing for thanksgiving at helen's house. helen baked 90 chocolate chip cookies yesterday and 51 raisin cookies and 484 chocolate chip cookies this morning. how many chocolate chip cookies did helen bake?], predicted=[had had had the and and the the the the the the the the the the and and and and and and how how how how how how how how how]