In [1]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import os
from keras.models import load_model
from random import shuffle
from pickle import dump


# threshold for minimum count to be considered a valid word
MIN_VOCAB_COUNT = 5
OOV_TOKEN = "UNK"


os.environ["CUDA_VISIBLE_DEVICES"]="0"
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

def tokenized_to_single(pairs_list):
    new_pairs_list = []
    for pair in pairs_list:
        a = " ".join(pair[0])
        b = " ".join(pair[1])
        new_pairs_list.append([a,b])
    return new_pairs_list

def split_eng_german(pairs):
    eng = []
    ger = []
    for pair in pairs:
        eng.append(pair[0])
        ger.append(pair[1])
    return eng, ger

def max_length_char(data):
    length = 0
    for d in data:
        if len(d) > length:
            length = len(d)
    return length

raw_dataset = load_clean_sentences('english-german-nounk.pkl')



Using TensorFlow backend.


In [2]:
# reduce dataset size
MAX_SENT_LENGTH = 10

dataset = []
for line in raw_dataset:
    engline = line[0]
    gerline = line[1]
    if(len(engline) <= MAX_SENT_LENGTH and len(gerline) <= MAX_SENT_LENGTH):
        dataset.append(line)

# random shuffle
shuffle(dataset)
# split into train/test
NUM_TRAIN = int(len(dataset)*0.8)
train, test = dataset[:NUM_TRAIN], dataset[NUM_TRAIN:]
# save

dataset = tokenized_to_single(dataset)
train = tokenized_to_single(train)
test = tokenized_to_single(test)

save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')
len(dataset)


Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


20376

In [3]:


# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')



data_eng, data_ger = split_eng_german(dataset)
train_eng, train_ger = split_eng_german(train)
test_eng, test_ger = split_eng_german(test)

# prepare english tokenizer
eng_tokenizer = create_tokenizer(data_eng)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(data_eng)

print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(data_ger)
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(data_ger)
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))



English Vocabulary Size: 10036
English Max Length: 10
German Vocabulary Size: 14729
German Max Length: 10


In [4]:
data_ger[:]

['andere werden folgen',
 'auch an dieser front fällt amerika zurück',
 'all das trifft zu',
 '###',
 'ich dachte dass alles was ich sagte vollkommen offensichtlich war',
 'was müsste noch geschehen',
 'aber frankreich ist kein normales land',
 'armer maradona',
 'einfache maßnahmen können hier enorme wirkung entfalten',
 'die psychologischen risiken der weltwirtschaft',
 'die jüngsten proteste sind viel strategischer als zuvor',
 'abenomics und asien',
 'einige davon waren möglicherweise nie berechtigt',
 'wie kann es dann die übrige welt',
 'das ende der naturwissenschaften',
 'japan dagegen steht mit seinem massiven schuldenberg alleine da',
 'jetzt ist die ezb europas anker',
 'wie kommt das',
 'zwei faktoren helfen die aktuellen europäischen unsicherheiten zu erklären',
 'nur der staat kann hier in die verantwortung genommen werden',
 'auf kommunaler ebene sind sie noch einflussreicher',
 'die geldpolitischen instrumente funktionieren nicht mehr',
 'die iranische politik ist sehr 

In [5]:
# prepare training data
print('Encoding trainX')

trainX = encode_sequences(ger_tokenizer, ger_length, train_ger)
print('Encoding trainY...')

trainY = encode_sequences(eng_tokenizer, eng_length, train_eng)
trainY = encode_output(trainY, eng_vocab_size)


Encoding trainX
Encoding trainY...


In [6]:
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test_ger)
testY = encode_sequences(eng_tokenizer, eng_length, test_eng)
testY = encode_output(testY, eng_vocab_size)


In [5]:
import numpy as np


In [23]:
# np.savez('trainY.npy', trainY)
# np.savez('trainX.npy', trainX)
# np.savez('testX.npy', testX)
# np.savez('testY.npy', testY)

# trainY = np.load('trainY.npy.npz')['arr_0']
# trainX = np.load('trainX.npy.npz')['arr_0']
# testX = np.load('testX.npy.npz')['arr_0']
# testY = np.load('testY.npy.npz')['arr_0']

In [7]:

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
# plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model_real.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           3770624   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 10, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 10, 10036)         2579252   
Total params: 7,400,500
Trainable params: 7,400,500
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
model.fit(trainX, trainY, epochs=5, batch_size=64, validation_data=(testX[:100], testY[:100]), callbacks=[checkpoint], verbose=1)

Train on 16300 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
 1600/16300 [=>............................] - ETA: 2:11 - loss: 4.4038

KeyboardInterrupt: 

In [9]:
source = testX[0]
from numpy import argmax

# load model
# model = load_model('model.h5')
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	source = source.reshape((1, source.shape[0]))
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None



In [10]:
predict_sequence(model, eng_tokenizer, testX[100])

'the is is is is is'

In [16]:
len(testY[1])

89

In [14]:
for i in range(len(test)-20, len(test)):
    print(test[i][0],'|', test[i][1], '|', predict_sequence(model, eng_tokenizer, testX[i]))

the new monetary disorder | das neue währungschaos | the the
that situation is much worse among ethnic minorities | bei den ethnischen minderheiten ist die situation noch viel schlimmer | the the is is the the the
good governance and economic performance | gute regierungsführung und wirtschaftsleistung | the the
the new new thing in economics | ein neuer ansatz für die wirtschaft | the is is the
france 's case is different | der fall frankreich liegt anders | the is is the
of course that old indifference was not universal | selbstverständlich galt diese alte gleichgültigkeit nicht für alle | the the is is the the
but what does this really mean | aber was bedeutet das wirklich | the is is
the euro is gaining ground for several reasons | der euro erstarkt aus mehreren gründen | the is is the
but few people took those promises seriously | kaum jemand freilich nahm diese versprechungen ernst | the is is
all of that investment requires plans and years of implementation | das alles bedarf de