In [87]:
# %load 1_prepare_data-1.py
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_punc = re.compile('[%s]' % re.escape(string.punctuation))
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [re_punc.sub('', w) for w in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[go on] => [mach weiter]
[hello] => [hallo]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im

In [154]:
# %load 2_split_data-1.py
from pickle import load
from pickle import dump
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 20000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:18000], dataset[18000:]


# q,w=train.T
# for i in range(0, len(q)):
#     my_string = q[i]
#     reversed_string = " ".join(my_string.split(" ")[::-1])
#     q[i]=reversed_string
# import numpy as np
# train=np.column_stack((q,w))



# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [155]:
train

array([['call the doctor', 'rufen sie einen arzt'],
       ['come quickly', 'kommt schnell'],
       ['i know how to fly', 'ich wei wie man fliegt'],
       ...,
       ['fix the clock', 'repariere die uhr'],
       ['raise your hand', 'hebe deine hand'],
       ['this cd costs', 'diese cd kostet zehn dollar']], dtype='<U370')

In [156]:
# %load 3_train_model-1.py
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

# define NMT model
def define_model(src_vocab, tar_vocab, source_steps, target_steps, embedding_dim):
	model = Sequential()
	model.add(Embedding(src_vocab, embedding_dim, input_length=source_steps, mask_zero=True))
	model.add(LSTM(embedding_dim))
	model.add(RepeatVector(target_steps))
	model.add(LSTM(embedding_dim, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	# compile model
	model.compile(optimizer='adam', loss='categorical_crossentropy')
	# summarize defined model
	model.summary()
	#plot_model(model, to_file='model.png', show_shapes=True)
	return model

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 512)
# fit model
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


English Vocabulary Size: 3755
English Max Length: 6
German Vocabulary Size: 5815
German Max Length: 10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 10, 512)           2977280   
_________________________________________________________________
lstm_39 (LSTM)               (None, 512)               2099200   
_________________________________________________________________
repeat_vector_20 (RepeatVect (None, 6, 512)            0         
_________________________________________________________________
lstm_40 (LSTM)               (None, 6, 512)            2099200   
_________________________________________________________________
time_distributed_20 (TimeDis (None, 6, 3755)           1926315   
Total params: 9,101,995
Trainable params: 9,101,995
Non-trainable params: 0
_________________________________________________________________
Train on 18000 samples, valid

<keras.callbacks.History at 0x7ff1e86d3e48>

In [157]:
train

array([['call the doctor', 'rufen sie einen arzt'],
       ['come quickly', 'kommt schnell'],
       ['i know how to fly', 'ich wei wie man fliegt'],
       ...,
       ['fix the clock', 'repariere die uhr'],
       ['raise your hand', 'hebe deine hand'],
       ['this cd costs', 'diese cd kostet zehn dollar']], dtype='<U370')

In [158]:
# %load 4_generate-1.py
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append(raw_target.split())
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, testX, test)

train
src=[rufen sie einen arzt], target=[call the doctor], predicted=[call a doctor]
src=[kommt schnell], target=[come quickly], predicted=[come quickly]
src=[ich wei wie man fliegt], target=[i know how to fly], predicted=[i know what do fly]
src=[tom stand auf], target=[tom got up], predicted=[tom stood up]
src=[frohliche ostern], target=[happy easter], predicted=[happy easter]
src=[das war armselig], target=[that was pathetic], predicted=[that was pathetic]
src=[vertraue ihm nicht], target=[dont trust him], predicted=[dont trust him]
src=[ich werde mit dir gehen], target=[ill go with you], predicted=[ill go with you]
src=[du bist sehr lustig], target=[youre very funny], predicted=[youre very funny]
src=[ich versuchte zu schreien], target=[i tried to scream], predicted=[i tried to scream]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.069663
BLEU-2: 0.262001
BLEU-3: 0.445064
BLEU-4: 0.508103
test
src=[schlagt tom], target=[hit tom], predicted=[save tom]
src=[er ist rechtsanwalt], target=[hes a lawyer], predicted=[he is]
src=[wollen sie reden], target=[do you wanna talk], predicted=[do you talk talk]
src=[ich tote dich], target=[ill kill you], predicted=[ill call you]
src=[das ist ein schlag unter die gurtellinie], target=[thats a low blow], predicted=[its a fine party]
src=[schreiben sie weiter], target=[keep writing], predicted=[take writing]
src=[tom hat veranlasst dass maria geht], target=[tom made mary go], predicted=[tom just mary mary]
src=[wofur ist das], target=[what is this for], predicted=[what is for]
src=[er wirkt hungrig], target=[he seems hungry], predicted=[he hungry hungry hungry]
src=[er ist ohnmachtig], target=[hes unconscious], predicted=[hes unconscious]
BLEU-1: 0.068916
BLEU-2: 0.257797
BLEU-3: 0.436981
BLEU-4: 0.498607


In [159]:
train

array([['call the doctor', 'rufen sie einen arzt'],
       ['come quickly', 'kommt schnell'],
       ['i know how to fly', 'ich wei wie man fliegt'],
       ...,
       ['fix the clock', 'repariere die uhr'],
       ['raise your hand', 'hebe deine hand'],
       ['this cd costs', 'diese cd kostet zehn dollar']], dtype='<U370')

In [160]:
test

array([['hit tom', 'schlagt tom'],
       ['hes a lawyer', 'er ist rechtsanwalt'],
       ['do you wanna talk', 'wollen sie reden'],
       ...,
       ['please go', 'bitte geh'],
       ['he kept his word', 'er hat sein wort gehalten'],
       ['this is my home', 'das ist mein haus']], dtype='<U370')

In [161]:
q

array(['tom is dubious', 'it was night', 'i love games', ...,
       'im not sick', 'show us the way', 'i misunderstood'], dtype='<U370')

In [162]:
w

array(['tom ist fragwurdig', 'es war nacht', 'ich liebe spiele', ...,
       'ich bin nicht krank', 'zeige uns den weg',
       'ich habe es falsch verstanden'], dtype='<U370')