# Train Neural Translation Model

This involves two steps: loading and preparing the clean text data for modeling, then defining and training the model on the prepared data.

In [3]:
#import libraries
from pickle import load
from numpy import array

In [4]:
# load a clean dataset
def load_clean_sentences(fname):
    """Load a pickled dataset from disk.

    Args:
        fname: path of the pickle file to read.

    Returns:
        Whatever object was stored in the pickle file.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    # Use a context manager so the file handle is closed as soon as
    # unpickling finishes (or fails) instead of being left open.
    with open(fname, 'rb') as handle:
        return load(handle)

In [5]:
# load datasets: the combined sentence pairs ('both') plus the separate
# train and test files; the relative paths are resolved against the
# current working directory
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [10]:
# peek at the first ten rows of the combined dataset
dataset[:10]

array([['he can barely read', 'er kann kaum lesen'],
       ['you have to hurry', 'du musst dich beeilen'],
       ['i am the same age', 'ich bin im gleichen alter'],
       ['id like some coffee', 'ich hatte gerne etwas kaffee'],
       ['let them know were busy',
        'gib ihnen bescheid dass wir zu tun haben'],
       ['the girls had a catfight',
        'zwischen den madchen kam es zum zickenkrieg'],
       ['i dont have any money', 'ich hab keinen pfennig'],
       ['i sure hope youre right', 'ich hoffe wirklich dass du recht hast'],
       ['many people do this', 'viele leute machen das'],
       ['they must be americans', 'sie mussen amerikaner sein']],
      dtype='<U370')

I'm using the combination of the train and test datasets to define the maximum length and vocabulary of the problem.

We can use the Keras `Tokenizer` class to map words to integers, as needed for modeling. We will use a separate tokenizer for the English sequences and for the German sequences.

In [6]:
# import libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [7]:
# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: the texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    # fit_on_texts updates the tokenizer in place, so return the object itself
    fitted.fit_on_texts(lines)
    return fitted

In [8]:
# find the length of the longest sequence in a list of phrases.
def max_length(lines):
    """Return the largest word count found among ``lines``.

    Each phrase is split on whitespace and its words are counted; the
    maximum of those counts is returned. An empty ``lines`` raises
    ``ValueError`` because ``max`` has nothing to compare.
    """
    word_counts = [len(phrase.split()) for phrase in lines]
    return max(word_counts)

In [9]:
# build the English tokenizer from column 0 of the combined dataset and
# report the vocabulary size and the longest phrase (in words)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# one extra slot on top of the known words, for index 0 (the padding value)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

English Vocabulary Size: 6715
English Max Length: 8


In [11]:
# build the German tokenizer from column 1 of the combined dataset and
# report the vocabulary size and the longest phrase (in words)
ger_tokenizer = create_tokenizer(dataset[:, 1])
# one extra slot on top of the known words, for index 0 (the padding value)
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)

German Vocabulary Size: 11532
German Max Length: 17


Each input and output sequence must be encoded to integers and padded to the maximum phrase length. This is because we will use a word embedding for the input sequences and one-hot encode the output sequences.

In [12]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: value passed to ``pad_sequences`` as ``maxlen``.
        lines: the texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end of
        each sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

The output sequence needs to be one-hot encoded. This is because the model will predict the probability of each word in the vocabulary as output.

In [13]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode a 2-D array of integer word ids.

    Args:
        sequences: 2-D integer array exposing ``.shape``; one row per
            phrase, one column per (padded) word position.
        vocab_size: number of classes, i.e. the width of every one-hot
            vector.

    Returns:
        Array of shape
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.

    NOTE: the result holds rows * columns * vocab_size values. If that
    does not fit in memory, consider keeping integer targets with a
    sparse categorical loss, or encoding in batches.
    """
    # Encode the whole matrix in one call. Encoding row by row into a
    # Python list and then copying that list into a new array keeps two
    # full copies of the one-hot data alive at the same time.
    y = to_categorical(sequences, num_classes=vocab_size)
    # Reshape explicitly: depending on the Keras version, to_categorical may
    # return a flattened (rows * columns, vocab_size) array or drop a
    # trailing axis of size 1.
    return y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

In [14]:
# prepare training data: German phrases (column 1) are the model input,
# English phrases (column 0) are the one-hot encoded target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
# Bind trainY only to the final one-hot tensor. If encode_output raises
# (e.g. MemoryError), trainY is not left holding the intermediate integer
# ids for later cells to pick up by mistake.
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

MemoryError: 

In [None]:
# prepare validation data: German phrases (column 1) are the model input,
# English phrases (column 0) are the one-hot encoded target
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
# Bind testY only to the final one-hot tensor. If encode_output raises,
# testY is not left holding the intermediate integer ids for later cells
# to pick up by mistake.
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)