In [1]:
# Clean Text

In [2]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [3]:
def load_doc(filename):
    file = open(filename, mode ='rt', encoding = 'utf-8')
    text = file.read()
    file.close()
    return text

In [4]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    pairs = [[pair[0], pair[1]] for pair in pairs ]
    return pairs

In [5]:
def clean_pairs(lines):
    cleaned = list()
    re_punc = re.compile('[%s]' %re.escape(string.punctuation))
    re_print = re.compile('[^%s]'%re.escape(string.printable))
    for pair in lines:
        clean_pair = list()
        for line in pair:
            line = normalize('NFD',line).encode('ascii','ignore')
            line = line.decode('UTF-8')
            line=line.split()
            line = [word.lower() for word in line]
            line = [re_punc.sub('',w) for w in line]
            line = [re_print.sub('',w) for w in line]
            line = [word for word in line if word.isalpha()]
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [6]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' %filename)

In [7]:
filename='deu.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
clean_pairs = clean_pairs(pairs)
save_clean_data(clean_pairs, 'english-german.pkl')
for i in range(100):
    print('[%s] => [%s]' %(clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[go on] => [mach weiter]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[attack] => [angriff]
[attack] => [attacke]
[cheers] => [zum wohl]
[eat it] => [iss es]
[eat up] => [iss auf]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [aha]
[got it] => [ich habs]
[got it] => [kapiert]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me

In [8]:
# modified dataset and train test split

In [9]:
from pickle import load
from pickle import dump
from numpy.random import shuffle

In [10]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [11]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' %filename)

In [12]:
raw_dataset = load_clean_sentences('english-german.pkl')

In [13]:
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
shuffle(dataset)

In [14]:
train, test = dataset[:9000], dataset[9000:]

In [15]:
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [23]:
# Training neural translation model

In [24]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [25]:
def load_clean_sentences(filename):
    return load(open(filename,'rb'))

In [31]:
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [26]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [27]:
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [32]:
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) +1
eng_length = max_length(dataset[:,0])
print("English Vocabulary Size: %d" %eng_vocab_size)
print("English Max Length: %d" %(eng_length))

English Vocabulary Size: 2214
English Max Length: 5


In [33]:
ger_tokenizer = create_tokenizer(dataset[:,1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:,1])
print("German Vocabulary Size: %d" %ger_vocab_size)
print("German Max Length: %d" %ger_length)

German Vocabulary Size: 3526
German Max Length: 9


In [39]:
def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding = 'post')
    return X

In [29]:
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes = vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [40]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:,1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:,0])
trainY = encode_output(trainY, eng_vocab_size)

In [42]:
testX = encode_sequences(ger_tokenizer, ger_length, test[:,1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:,0])
testY = encode_output(testY, eng_vocab_size)

In [30]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length = src_timesteps, mask_zero = True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences = True))
    model.add(TimeDistributed(Dense(tar_vocab, activation = "softmax")))
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')
    model.summary()
    plot_model(model, to_file='mode.png', show_shapes = True)
    return model

In [43]:
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

W0508 19:55:22.621117 140242132813632 deprecation.py:506] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1633: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0508 19:55:23.125029 140242132813632 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py:3854: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 9, 256)            902656    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 2214)           568998    
Total params: 2,522,278
Trainable params: 2,522,278
Non-trainable params: 0
_________________________________________________________________


In [45]:
checkpoint = ModelCheckpoint('model.h5', monitor = 'val_loss', verbose=1, save_best_only = True, mode = 'min')
model.fit(trainX, trainY, epochs=1, batch_size=64, validation_data = (testX, testY), callbacks = [checkpoint], verbose = 2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/1
 - 20s - loss: 3.3736 - val_loss: 3.3388

Epoch 00001: val_loss improved from inf to 3.33883, saving model to model.h5


<keras.callbacks.callbacks.History at 0x7f8c044d3080>

In [46]:
# Evaluate neural translation model

In [52]:
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [57]:
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [58]:
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index)+1
eng_length = max_length(dataset[:,0])

In [59]:
ger_tokenizer = create_tokenizer(dataset[:,1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:,1])

In [60]:
trainX = encode_sequences(ger_tokenizer, ger_length, train[:,1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:,1])

In [61]:
model = load_model('model.h5')

In [54]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [65]:
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [70]:
def evaluate_model(model,sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        source = source.reshape((1,source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src = [%s], target = [%s], predicted = [%s]' %(raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
        
    print('BLEU-1: %f' %corpus_bleu(actual, predicted, weights = (1.0,0,0,0)))
    print('BLEU-2: %f' %corpus_bleu(actual, predicted, weights = (0.5,0.5,0,0)))
    print('BLEU-3: %f' %corpus_bleu(actual, predicted, weights = (0.3,0.3,0.3,0)))
    print('BLEU-4: %f' %corpus_bleu(actual, predicted, weights = (0.25,0.25,0.25,0.25)))

In [71]:
print('train')
evaluate_model(model, trainX, train)

train
src = [wir hatten angst], target = [we were scared], predicted = [i is]
src = [tom ist nicht schwach], target = [toms not weak], predicted = [i is]
src = [er macht es gut], target = [hes all right], predicted = [i is]
src = [schneide es in zwei halften], target = [cut it in half], predicted = [i is]
src = [wer bin ich], target = [who am i], predicted = [i is]
src = [komm nicht herein], target = [keep out], predicted = [i is]
src = [tom verschlief], target = [tom slept late], predicted = [i is]
src = [versuch doch mal zu lacheln], target = [try to smile], predicted = [i is]
src = [bist du verruckt], target = [are you mad], predicted = [i is]
src = [er hat auf mich geschossen], target = [he shot at me], predicted = [i is]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.228757
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


In [72]:
print('test')
evaluate_model(model, testX, test)

test
src = [tom jammert], target = [tom is whining], predicted = [i is]
src = [er hat uns verraten], target = [he sold us out], predicted = [i is]
src = [tom ist vielleicht auf], target = [tom may be up], predicted = [i is]
src = [er kann nicht fahren], target = [he cant drive], predicted = [i is]
src = [tom ist koch], target = [tom is a cook], predicted = [i is]
src = [lasst uns fortgehen], target = [lets go away], predicted = [i]
src = [niemand sah mich], target = [nobody saw me], predicted = [i is]
src = [probieren sie die hier an], target = [try these on], predicted = [i is]
src = [das ist nicht gut], target = [thats no good], predicted = [i is]
src = [sie sind jung], target = [youre young], predicted = [i is]
BLEU-1: 0.236053
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
