In [16]:
import string
import re
import pickle
import numpy as np
import unicodedata
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import ModelCheckpoint
from keras import models    
from nltk.translate.bleu_score import corpus_bleu
import tensorflow

## Text Preprocessing

In [7]:
def load_file(filename):
    file = open(filename,mode='r',encoding='utf-8')
    content = file.read()
    file.close()
    return content

In [8]:
def get_translations(content):
    lines = content.split('\n')
    translations = [ line.split('\t') for line in lines]
    return translations

In [9]:
content = load_file('english_to_german.txt')

In [15]:
translations = get_translations(content)

In [16]:
print(len(translations))

152821


In [17]:
translations

[['Hi.', 'Hallo!'],
 ['Hi.', 'Grüß Gott!'],
 ['Run!', 'Lauf!'],
 ['Wow!', 'Potzdonner!'],
 ['Wow!', 'Donnerwetter!'],
 ['Fire!', 'Feuer!'],
 ['Help!', 'Hilfe!'],
 ['Help!', 'Zu Hülf!'],
 ['Stop!', 'Stopp!'],
 ['Wait!', 'Warte!'],
 ['Hello!', 'Hallo!'],
 ['I try.', 'Ich probiere es.'],
 ['I won!', 'Ich hab gewonnen!'],
 ['I won!', 'Ich habe gewonnen!'],
 ['Smile.', 'Lächeln!'],
 ['Cheers!', 'Zum Wohl!'],
 ['Freeze!', 'Keine Bewegung!'],
 ['Freeze!', 'Stehenbleiben!'],
 ['Got it?', 'Verstanden?'],
 ['Got it?', 'Einverstanden?'],
 ['He ran.', 'Er rannte.'],
 ['He ran.', 'Er lief.'],
 ['Hop in.', 'Mach mit!'],
 ['Hug me.', 'Drück mich!'],
 ['Hug me.', 'Nimm mich in den Arm!'],
 ['Hug me.', 'Umarme mich!'],
 ['I fell.', 'Ich fiel.'],
 ['I fell.', 'Ich fiel hin.'],
 ['I fell.', 'Ich stürzte.'],
 ['I fell.', 'Ich bin hingefallen.'],
 ['I fell.', 'Ich bin gestürzt.'],
 ['I know.', 'Ich weiß.'],
 ['I lied.', 'Ich habe gelogen.'],
 ['I lost.', 'Ich habe verloren.'],
 ["I'm 19.", 'Ich bin 19 Jahr

In [46]:
def clean_translations(translations):
    cleaned_translations = []
    
    #prepare regex for character filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    
    #create a table to remove the punctuations
    table = str.maketrans('','',string.punctuation)
    
    for translation in translations:
        #print("translation= ",translation)
        clean_translation = []
        for line in translation:
            
            #normalize the unicode characters
            line = unicodedata.normalize('NFD',line).encode('ascii','ignore')        
            line = line.decode('UTF-8')
            
            #tokenize on white space
            line = line.split()
            
            #convert to lower case
            line = [word.lower() for word in line]
            
            #remove punctuation from each token
            line = [word.translate(table) for word in line]
            
            #remove non-printable chars from each token.
            line = [re_print.sub('',w) for w in line]
            
            #remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            
            #store as string
            clean_translation.append(' '.join(line))
        cleaned_translations.append(clean_translation)
    return cleaned_translations

In [95]:
cleaned_translations = clean_translations(transalations)

In [48]:
len(cleaned_translations)

152821

In [83]:
def save_the_cleaned_data(cleaned_data):
    with open("saved_clean_data.pkl","wb") as f:
        pickle.dump(cleaned_data,f)

In [85]:
save_the_cleaned_data(cleaned_translations)

In [86]:
def load_the_cleaned_data():
    with open("saved_clean_data.pkl","rb") as f:
        cleaned_translations = pickle.load(f)
    return cleaned_translations

## Split the Text

In [147]:
#Take 10,000 transalations
n_translations = 10000
dataset = cleaned_translations[:n_translations]

In [148]:
len(dataset)

10000

In [149]:
np.random.shuffle(dataset)

In [150]:
train,test = dataset[:9000],dataset[9000:]

In [4]:
# Save the dataset,training and testing data
def saveData():
    with open("training-data.pkl","wb") as f:
        pickle.dump(train,f)

    with open("testing-data.pkl","wb") as f:
        pickle.dump(test,f)

    with open("total-dataset.pkl","wb") as f:
        pickle.dump(dataset,f)
        
# Load the dataset,training and testing data
def loadData():    
    with open("training-data.pkl","rb") as f:
        train_data = pickle.load(f)
        
    with open("testing-data.pkl","rb") as f:
        test_data = pickle.load(f)
        
    with open("total-dataset.pkl","rb") as f:
        total_data = pickle.load(f)
        
    return train_data,test_data,total_data

In [152]:
saveData()

In [159]:
# train=None
# test=None
# dataset=None

In [5]:
train,test,dataset = loadData()

In [6]:
len(train),len(test),len(dataset)

(9000, 1000, 10000)

In [7]:
train = np.array(train)
test = np.array(test)
dataset = np.array(dataset)

## Convert Data to Numerical Values

In [8]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [9]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [10]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index)+1
eng_length = max_length(dataset[:,0])

print("English Vocabulary Size: %d"%eng_vocab_size)
print("English Max Length: %d"%eng_length)

English Vocabulary Size: 2404
English Max Length: 5


In [11]:
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:,1])
ger_vocab_size = len(ger_tokenizer.word_index)+1
ger_length = max_length(dataset[:,1])
print("German Vocabulary Size:%d"%ger_vocab_size)
print("German Max Length: %d"%ger_length)

German Vocabulary Size:3856
German Max Length: 10


In [12]:
#eng_tokenizer.word_index
#ger_tokenizer.word_index

In [13]:
#encode and pad sequence
def encode_sequences(tokenizer,length,lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X,maxlen=length,padding='post')
    return X

In [14]:
# one hot encode target sequence
def encode_output(sequences,vocab_size):
    ylist = []
    for sequence in sequences:
        encoded = to_categorical(sequence,num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0],sequences.shape[1],vocab_size)
    return y

In [15]:
#prepare training data
trainX = encode_sequences(ger_tokenizer,ger_length,train[:,1])
trainY = encode_sequences(eng_tokenizer,eng_length,train[:,0])
trainY = encode_output(trainY,eng_vocab_size)

#prepare validation data
testX = encode_sequences(ger_tokenizer,ger_length,test[:,1])
testY = encode_sequences(eng_tokenizer,eng_length,test[:,0])
testY = encode_output(testY,eng_vocab_size)

## Define the NMT Model

In [24]:
def define_model(src_vocab,tar_vocab,src_timestamps,tar_timesteps,n_units):
    model = Sequential()
    model.add(Embedding(src_vocab,n_units,input_length=src_timestamps,mask_zero=True))
    model.add(Bidirectional(LSTM(n_units)))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units,return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab,activation='softmax')))
    return model

In [25]:
# define model
model = define_model(ger_vocab_size,eng_vocab_size,ger_length,eng_length,256)
model.compile(optimizer='adam',loss='categorical_crossentropy')

In [26]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10, 256)           987136    
                                                                 
 bidirectional (Bidirectiona  (None, 512)              1050624   
 l)                                                              
                                                                 
 repeat_vector_2 (RepeatVect  (None, 5, 512)           0         
 or)                                                             
                                                                 
 lstm_4 (LSTM)               (None, 5, 256)            787456    
                                                                 
 time_distributed_1 (TimeDis  (None, 5, 2404)          617828    
 tributed)                                                       
                                                      

In [28]:
#fit model
filename = 'best_model.h5'
checkpoint = ModelCheckpoint(filename,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
model.fit(trainX,trainY,epochs=30,batch_size=64,validation_data=(testX,testY),callbacks=[checkpoint],verbose=2)

Epoch 1/30

Epoch 1: val_loss improved from inf to 3.70944, saving model to best_model.h5
141/141 - 11s - loss: 4.3571 - val_loss: 3.7094 - 11s/epoch - 75ms/step
Epoch 2/30

Epoch 2: val_loss improved from 3.70944 to 3.53423, saving model to best_model.h5
141/141 - 9s - loss: 3.5263 - val_loss: 3.5342 - 9s/epoch - 65ms/step
Epoch 3/30

Epoch 3: val_loss improved from 3.53423 to 3.33698, saving model to best_model.h5
141/141 - 9s - loss: 3.3206 - val_loss: 3.3370 - 9s/epoch - 65ms/step
Epoch 4/30

Epoch 4: val_loss improved from 3.33698 to 3.15624, saving model to best_model.h5
141/141 - 11s - loss: 3.0684 - val_loss: 3.1562 - 11s/epoch - 75ms/step
Epoch 5/30

Epoch 5: val_loss improved from 3.15624 to 2.99687, saving model to best_model.h5
141/141 - 12s - loss: 2.8454 - val_loss: 2.9969 - 12s/epoch - 84ms/step
Epoch 6/30

Epoch 6: val_loss improved from 2.99687 to 2.87627, saving model to best_model.h5
141/141 - 14s - loss: 2.6445 - val_loss: 2.8763 - 14s/epoch - 101ms/step
Epoch 7/30


<keras.callbacks.History at 0x7f786b532d00>

In [29]:
#load model
model = models.load_model('best_model.h5')

In [30]:
#map an integer to a word
def word_for_id(integer,tokenizer):
    for word,index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [31]:
# generate target given source sequence
def predict_sequence(model,tokenizer,source):
    prediction = model.predict(source,verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_for_id(i,tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [32]:
# evaluate the skill of the model
def evaluate_model(model,tokenizer,sources,raw_dataset):
    actual = []
    predicted = []
    for i,source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1,source.shape[0]))
        translation = predict_sequence(model,eng_tokenizer,source)
        raw_target,raw_src = raw_dataset[i]
        if i<10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    #calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual,predicted,weights=(1.0,0,0,0)))
    print('BLEU-2: %f' % corpus_bleu(actual,predicted,weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual,predicted,weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [33]:
evaluate_model(model,eng_tokenizer,testX,test)

src=[vielleicht ist es wahr], target=[maybe its true], predicted=[it isnt here]
src=[tom bleibt], target=[tom is staying], predicted=[tom is]
src=[ich mag deine idee], target=[i like your idea], predicted=[i like the]
src=[sie haben post], target=[you have mail], predicted=[you have mail]
src=[nehmen sie noch eine], target=[have another], predicted=[take another]
src=[nicht der rede wert], target=[it was nothing], predicted=[mums make to drugs]
src=[wir war es], target=[how was it], predicted=[we have it]
src=[geh auf dein zimmer], target=[go to your room], predicted=[go to your room]
src=[das hat mich umgehauen], target=[it blew my mind], predicted=[that annoys me]
src=[ignoriere sie einfach], target=[just ignore her], predicted=[just it up]
BLEU-1: 0.087959
BLEU-2: 0.006452
BLEU-3: 0.000000
BLEU-4: 0.000000


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
