In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [2]:
# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from disk.

	Args:
		filename: Path of the pickle file to read.

	Returns:
		Whatever object was pickled into ``filename``.

	The file is opened in binary mode inside a ``with`` block so the handle
	is closed as soon as unpickling finishes or fails.

	NOTE: unpickling can execute arbitrary code; only load files that come
	from a trusted source.
	"""
	with open(filename, 'rb') as handle:
		return load(handle)

In [3]:
# fit a tokenizer
def create_tokenizer(lines):
	"""Build a ``Tokenizer`` with default settings and fit it on ``lines``.

	Args:
		lines: Iterable of text lines used to build the vocabulary.

	Returns:
		The fitted tokenizer.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [4]:
# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated token count among ``lines``.

	Args:
		lines: Iterable of strings.

	Returns:
		The length, in tokens, of the longest line; 0 when ``lines`` is empty
		(instead of raising ``ValueError`` from ``max``).
	"""
	return max((len(line.split()) for line in lines), default=0)

In [5]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Turn text lines into integer sequences padded to a common length.

	Args:
		tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
		length: Value passed to ``pad_sequences`` as ``maxlen``.
		lines: Text lines to encode.

	Returns:
		The result of ``pad_sequences`` with ``padding='post'``, i.e. padding
		is appended after each sequence rather than before it.
	"""
	encoded = tokenizer.texts_to_sequences(lines)
	return pad_sequences(encoded, maxlen=length, padding='post')

In [6]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Map an integer id back to its word.

	Args:
		integer: Id to look up.
		tokenizer: Object exposing ``word_index`` (word -> id) and, optionally,
			``index_word`` (id -> word).

	Returns:
		The word whose id equals ``integer``, or ``None`` when no word has
		that id.
	"""
	# Prefer the tokenizer's own reverse mapping when it is populated: a single
	# dict lookup instead of scanning the whole vocabulary on every call.
	reverse = getattr(tokenizer, 'index_word', None)
	if reverse:
		return reverse.get(integer)
	# Fallback for tokenizers that only provide the forward mapping.
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None


In [7]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Decode the model's prediction for ``source`` into a space-joined string.

	Args:
		model: Object whose ``predict(source, verbose=0)`` result is indexed
			with ``[0]`` to get one score vector per output position.
		tokenizer: Tokenizer used to map predicted ids back to words.
		source: Encoded input handed to ``model.predict`` unchanged.

	Returns:
		The predicted words joined by single spaces. Decoding stops at the
		first position whose arg-max id has no word.
	"""
	scores = model.predict(source, verbose=0)[0]
	words = []
	for step in scores:
		token = word_for_id(argmax(step), tokenizer)
		if token is None:
			# An id with no vocabulary entry ends the sentence.
			break
		words.append(token)
	return ' '.join(words)

In [8]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every encoded source, print samples and corpus BLEU-1..4.

	Args:
		model: Trained model passed through to ``predict_sequence``.
		tokenizer: Tokenizer used to decode predictions back into words.
		sources: Sequence of encoded source rows; each row is reshaped to
			``(1, row_length)`` before prediction.
		raw_dataset: Indexable collection aligned with ``sources`` whose
			items unpack as ``(raw_target, raw_src)``.

	Prints the first 10 source/target/prediction triples followed by the
	four BLEU scores. Returns ``None``.
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text (as a single-row 2-D array)
		source = source.reshape((1, source.shape[0]))
		# Decode with the tokenizer that was passed in, not a notebook-level global.
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		# NOTE(review): references keep their original casing while the decoded
		# words come from the tokenizer's vocabulary; if the tokenizer lower-cases,
		# capitalised reference words never match — confirm this is intended.
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	# NOTE(review): these weights sum to 0.9 rather than 1.0; left unchanged so
	# scores stay comparable with earlier runs.
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [9]:
# load datasets
# full corpus plus its train/test splits
dataset = load_clean_sentences('../PKL Files/english-hiero-both.pkl')
train = load_clean_sentences('../PKL Files/english-hiero-train.pkl')
test = load_clean_sentences('../PKL Files/english-hiero-test.pkl')
# column 0 is the english text, column 1 the hieroglyph text; the `ger_`
# names below refer to the hieroglyph side
# tokenizers, fitted on the full corpus
eng_tokenizer, ger_tokenizer = (create_tokenizer(dataset[:, col]) for col in (0, 1))
# vocabulary sizes (number of indexed words plus one)
eng_vocab_size, ger_vocab_size = (
	len(tok.word_index) + 1 for tok in (eng_tokenizer, ger_tokenizer)
)
# longest sentence, in tokens, on each side
eng_length, ger_length = (max_length(dataset[:, col]) for col in (0, 1))
# encoded and padded hieroglyph inputs for both splits
trainX, testX = (
	encode_sequences(ger_tokenizer, ger_length, split[:, 1]) for split in (train, test)
)

In [10]:
# load model
model = load_model('../Model Files/model.h5')
# evaluate on the training sequences first, then on the held-out test sequences
for split_name, split_X, split_raw in (('train', trainX, train), ('test', testX, test)):
	print(split_name)
	evaluate_model(model, eng_tokenizer, split_X, split_raw)

train
src=[N29Q3D36S29D36N37 V28AA1N29Q3I9V28D058 X1D36 Z4G01V28 M17AAA1 G01M170AA1V31N29 I9N29], target=[Theres nothing we can do about it], predicted=[theres nothing we can do about it]
src=[I9N29 X1G01N37 N29D36S29S29I9M170D21D46 Z4AA1D21M17A D46D36N37N29D36S29M17AG01D46], target=[It was terribly cold yesterday], predicted=[it was terribly cold yesterday]
src=[I9 Q3G01W11D36 V28AA1 N37N29G01N29I9N37N29I9Z4N37], target=[I have no statistics], predicted=[i have no statistics]
src=[D46AA1V31S29D36 D058AA1I9V28D058 N29AA1AA1 G43G01N37N29], target=[Youre going too fast], predicted=[youre going too fast]
src=[N29AA1O4N37 D36D46D36N37 G01S29D36 M170D21V31D36], target=[Toms eyes are blue], predicted=[toms eyes are blue]
src=[Q3D36 I9N37 N29Q3D36 D21G01N37N29 O4G01V28 I9 X1G01V28N29 N29AA1 N37D36D36], target=[He is the last man I want to see], predicted=[he is the last man i want to see]
src=[G01D21D21 AA1G43 N29Q3D36N37D36 O4D36D36N29I9V28D058N37 G01S29D36 I9V28 D36V28D058D21I9N37Q3], targe