In [1]:
import re
import string
from pickle import dump
from unicodedata import normalize
from numpy import array
from keras.preprocessing.text import Tokenizer
import os
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

Using TensorFlow backend.


In [3]:
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

In [4]:
# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

In [5]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
filename = 'data/deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
cp = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cp, 'old.pkl')
# spot check


Saved: old.pkl


In [6]:
len(cp)

176692

In [7]:
cp[140000]

array(['is it true that nobody lives around here',
       'stimmt es dass in dieser gegend niemand wohnt'], dtype='<U370')

In [8]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('old.pkl')

# reduce dataset size
n_sentences = 100000
dataset = raw_dataset[:n_sentences]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:90000], dataset[90000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [9]:
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [10]:

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer


In [11]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [12]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

English Vocabulary Size: 9734
English Max Length: 9
German Vocabulary Size: 17785
German Max Length: 17


In [25]:
dataset[:, 0]

array(['throw the ball to tom', 'do you have any soft drinks',
       'you have to keep your promise', ..., 'thats toms family',
       'arent you new here', 'i forgot to close the curtains'],
      dtype='<U370')

In [32]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [2]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data
print('Encoding trainX')
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
print('Encoding trainY...')
trainY = encode_output(trainY, eng_vocab_size)
print('Done')
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
print('Encoding testY...')
testY = encode_output(testY, eng_vocab_size)
print('Done')
# define model
model = load_model('model.h5')
# model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
# model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
# plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model_26march.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


English Vocabulary Size: 9734
English Max Length: 9
German Vocabulary Size: 17785
German Max Length: 17
Encoding trainX
Encoding trainY...
Done
Encoding testY...
Done


NameError: name 'load_model' is not defined

In [68]:
from keras.models import load_model
model = load_model('model_26march.h5')
print(model.summary())
# plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model_27march.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 17, 256)           4552960   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 9, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 9, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 9, 9734)           2501638   
Total params: 8,105,222
Trainable params: 8,105,222
Non-trainable params: 0
_________________________________________________________________
None


In [64]:
model.fit(trainX, trainY, epochs=2, batch_size=64, validation_data=(testX[:1000], testY[:1000]), callbacks=[checkpoint], verbose=1)

Train on 90000 samples, validate on 1000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f07a5a1fef0>

In [6]:
source = testX[0]
from numpy import argmax

# load model
# model = load_model('model.h5')
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	source = source.reshape((1, source.shape[0]))
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None


In [88]:
from keras import backend as K
K.clear_session()


In [84]:
for i in range(len(test)-20, len(test)):
    print(test[i][0],'|', test[i][1], '|', predict_sequence(model, eng_tokenizer, testX[i]))

NameError: name 'model' is not defined

In [80]:
text = "Ich wollte nach paris"
mine = encode_sequences(ger_tokenizer, ger_length, [text])
predict_sequence(model, eng_tokenizer, mine[0])

'i wanted to go to paris'

In [58]:
import numpy as np
np.save('ger_tokenizer', ger_tokenizer)
np.save('eng_tokenizer', eng_tokenizer)

array([ 22,   3, 105, 588,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0], dtype=int32)

In [50]:
testX[0]

array([   8,  196,   16,  305,   25,    1,  855,   34,  100, 5006,   15,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [69]:
# evaluate the skill of the model
from nltk.translate.bleu_score import corpus_bleu
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
#       source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
#       if i < 10:
#           print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
        if i>1000:
            break
#   actual = [[['this', 'is', 'a', 'test']], [['this', 'is', 'a', 'test']]]
#   predicted = [['this', 'is', 'a', 'test'], ['this', 'is', 'a', 'test']]
    actual2 = []
    for a in actual:
        actual2.append([a])
    actual = actual2

    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [70]:
evaluate_model(model, eng_tokenizer, testX, test)

BLEU-1: 0.620245
BLEU-2: 0.494344
BLEU-3: 0.432624
BLEU-4: 0.312695


In [89]:
# 1-gram individual BLEU
from nltk.translate.bleu_score import sentence_bleu
reference = [['meet', 'me', 'in', 'an', 'hour']]
candidate = ['me', 'in' ,'an' ,'hour']
score = sentence_bleu(reference, candidate,weights=(0.25, 0.25,0.25, 0.25))
print(score)

0.7788007830714049
