In [None]:
## https://www.analyticsvidhya.com/blog/2019/01/neural-machine-translation-keras/

In [None]:
import string
import re
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata

In [None]:
# Strip diacritics: decompose to NFD, then drop the combining marks
def unicode_to_ascii(s):
    """Return ``s`` with every combining mark (Unicode category ``Mn``) removed.

    The string is NFD-normalised first, so an accented letter is split into
    its base letter followed by combining marks, and only the marks are
    discarded. Characters without such a decomposition pass through
    unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)


def preprocess_sentence(w, akk=False):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps, in order: lower-case and strip diacritics; replace every run of
    characters other than ASCII letters, digits, ``-``, ``^`` and ``…`` with a
    single space; turn the editorial gap notes ("N lines missing",
    "N lines fragmentary", "unknown no of lines missing") into ``…``; apply
    the language-specific rule; collapse repeated ``…``.

    Args:
        w: The raw sentence. It is passed through ``str()`` first, so
            non-string values are accepted (and become their string form).
        akk: If true, hyphens are kept and every ``x`` is replaced by ``…``.
            If false, runs of hyphens/spaces are replaced by a single space.

    Returns:
        The cleaned sentence in the form ``'<start> ... <end>'``.
    """
    w = unicode_to_ascii(str(w).lower().strip())

    # Collapse runs of spaces and double quotes into one space.
    w = re.sub(r'[" "]+', " ", w)

    # Replace everything except letters, digits, "-", "^" and "…" with a space.
    w = re.sub(r"[^a-zA-Z0-9-\^…]+", " ", w)

    # Hyphens are deliberately NOT split off here: in Akkadian they join the
    # signs of a single word.

    # Editorial gap notes become an ellipsis. `[0-9]+` consumes the whole
    # count; matching a single digit would leave the leading digits of a
    # multi-digit count behind (e.g. "12 lines missing" -> "1…").
    w = re.sub(r"[0-9]+ lines missing|[0-9]+ lines fragmentary|unknown no of lines missing", "…", w)

    if akk:
        # Every "x" in the Akkadian text is treated as a gap marker.
        w = re.sub(r"x", "…", w)
    else:
        # Hyphens carry no meaning on the English side.
        w = re.sub(r"[- ]+", " ", w)

    # Collapse repeated ellipses, adjacent or space-separated.
    w = re.sub(r"…+", "…", w)
    w = re.sub(r"(… )+", "… ", w)
    w = re.sub(r"( …)+", " …", w)

    w = w.strip()

    # Add a start and an end token so the model knows when to start and stop
    # predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [None]:
import pandas as pd

# Load the semicolon-delimited parallel corpus. `sep` is given by keyword:
# pandas >= 2.0 accepts only the path positionally in read_csv.
pairs = pd.read_csv("data/6882pairs.csv", sep=";")
# Reorder the rows by the character length of the raw `akk` text, ascending.
pairs = pairs.reindex(pairs.akk.str.len().sort_values().index)

In [None]:
# Show one sentence pair, raw and then preprocessed.
# `t` is looked up as an index label of `pairs`.
t = 0
en_sentence = pairs["en"][t]
akk_sentence = pairs["akk"][t]

for raw_text in (en_sentence, akk_sentence):
    print(raw_text + "\n")

print(preprocess_sentence(en_sentence) + "\n")
print(preprocess_sentence(akk_sentence, akk=True) + "\n")

In [None]:
def prep_en(t):
    """Preprocess an English sentence (``preprocess_sentence`` defaults)."""
    return preprocess_sentence(t)


def prep_akk(t):
    """Preprocess an Akkadian sentence (``preprocess_sentence`` with ``akk=True``)."""
    return preprocess_sentence(t, akk=True)

In [None]:
# A negative limit trims that many entries off the end of each array.
limit = -1166
en = np.array(list(map(prep_en, np.array(pairs["en"]))))[:limit]
akk = np.array(list(map(prep_akk, np.array(pairs["akk"]))))[:limit]

In [None]:
# Number of whitespace-separated tokens in every sentence, per language
en_l = [len(sentence.split()) for sentence in en]
akk_l = [len(sentence.split()) for sentence in akk]

length_df = pd.DataFrame({'en': en_l, 'akk': akk_l})

# One histogram per column, 10 bins over the 0-500 token range
length_df.hist(bins=10, range=[0, 500])
plt.show()

In [None]:
# Inspect the preprocessed Akkadian sentences
akk

In [None]:
# function to build a tokenizer
def tokenization(lines, filters=''):
    """Fit a Keras ``Tokenizer`` on ``lines`` and return it.

    Args:
        lines: Iterable of already-preprocessed sentences.
        filters: Characters the tokenizer removes before splitting on spaces.
            Defaults to the empty string, because the sentences are cleaned
            beforehand and Keras' own default filter set contains ``<``,
            ``>``, ``-`` and ``^``. That default would turn the
            ``<start>`` / ``<end>`` markers into plain words and split
            hyphen-joined Akkadian words into separate tokens.

    Returns:
        The fitted ``Tokenizer``.
    """
    tokenizer = Tokenizer(filters=filters)
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [None]:
# English side: padded sequence length, fitted tokenizer, vocabulary size
en_length = 200
en_tokenizer = tokenization(en)

# One more than the number of distinct words in the tokenizer's word index
en_vocab_size = 1 + len(en_tokenizer.word_index)
print('English Vocabulary Size: %d' % (en_vocab_size,))

In [None]:
# Akkadian side: padded sequence length, fitted tokenizer, vocabulary size
akk_length = 200
akk_tokenizer = tokenization(akk)

# One more than the number of distinct words in the tokenizer's word index
akk_vocab_size = 1 + len(akk_tokenizer.word_index)
print('Akkadian Vocabulary Size: %d' % (akk_vocab_size,))

In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines, truncating='post'):
    """Turn sentences into fixed-length integer sequences.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Number of timesteps every returned sequence has.
        lines: Iterable of sentences to encode.
        truncating: Which end of an over-long sequence is cut, ``'post'``
            or ``'pre'``. Defaults to ``'post'`` so that truncation happens
            on the same side as the padding and the beginning of a sentence
            is kept; Keras' own default (``'pre'``) drops the beginning.

    Returns:
        The array produced by ``pad_sequences``: one row per sentence,
        ``length`` columns, zero-padded at the end.
    """
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad with 0 values after the sentence; cut over-long ones per `truncating`
    seq = pad_sequences(seq, maxlen=length, padding='post', truncating=truncating)
    return seq


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs as the test set (fixed seed)
xTrain_, xTest_, yTrain_, yTest_ = train_test_split(
    en, akk, test_size=0.2, random_state=12)

# Integer-encode and pad each split with the tokenizer of its language
xTrain, xTest = [encode_sequences(en_tokenizer, en_length, part)
                 for part in (xTrain_, xTest_)]
yTrain, yTest = [encode_sequences(akk_tokenizer, akk_length, part)
                 for part in (yTrain_, yTest_)]

In [None]:
# build NMT model
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the (uncompiled) encoder-decoder ``Sequential`` model.

    Args:
        in_vocab: Size of the input vocabulary (embedding input dimension).
        out_vocab: Size of the output vocabulary (softmax width).
        in_timesteps: Length of the padded input sequences.
        out_timesteps: Number of output timesteps to produce.
        units: Width of the embedding and of both LSTM layers.

    Returns:
        The model: Embedding -> LSTM -> RepeatVector -> LSTM -> Dense(softmax).
    """
    # Encoder: embed the tokens (index 0 is masked) and reduce to one vector
    encoder_layers = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
    ]
    # Decoder: repeat that vector per output step, then score the vocabulary
    decoder_layers = [
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]
    return Sequential(encoder_layers + decoder_layers)

In [None]:
# build the English -> Akkadian model with 512 units per layer
model = define_model(
    in_vocab=en_vocab_size,
    out_vocab=akk_vocab_size,
    in_timesteps=en_length,
    out_timesteps=akk_length,
    units=512,
)

In [None]:
# Display the English vocabulary size
en_vocab_size

In [None]:
# The learning rate is passed positionally: it is the first parameter of
# RMSprop under both its older keyword (`lr`) and its newer one
# (`learning_rate`), so this works whichever Keras version is installed.
rms = optimizers.RMSprop(0.001)
# Targets are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
filename = 'chkpts/model.h1'
# Save the model only when the validation loss improves on the best so far
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')

# train model
# Targets get a trailing axis of size 1: (samples, timesteps, 1).
# Validation data is passed explicitly as an (x, y) tuple. Do not also pass
# `validation_split`: Keras ignores it whenever `validation_data` is given.
# NOTE(review): validating and checkpointing on xTest/yTest means that split
# is no longer an untouched hold-out - confirm this is intended.
history = model.fit(
    xTrain, yTrain.reshape(yTrain.shape[0], yTrain.shape[1], 1),
    validation_data=(xTest, yTest.reshape(yTest.shape[0], yTest.shape[1], 1)),
    epochs=10, batch_size=512, callbacks=[checkpoint],
    verbose=1)

In [None]:
# Training and validation loss recorded by `fit`, one curve each
for metric in ('loss', 'val_loss'):
    plt.plot(history.history[metric])
plt.legend(['train', 'validation'])
plt.show()

In [None]:
# Most probable target-vocabulary index at every output timestep.
# `Sequential.predict_classes` is not available in current Keras; for a
# softmax output it was the argmax over the last axis of `predict`.
preds = np.argmax(model.predict(xTest), axis=-1)

In [None]:
def get_word(n, tokenizer):
    """Look up the word whose integer index in ``tokenizer.word_index`` is ``n``.

    Returns the first matching word in the mapping's iteration order, or
    ``None`` when no word carries that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

In [None]:
# Invert the tokenizer's word -> index mapping once, so each predicted index
# is resolved with a dict lookup instead of a scan over the whole vocabulary.
index_to_word = {index: word for word, index in akk_tokenizer.word_index.items()}

# Decode every predicted index sequence into one space-joined string.
# Indices with no word, and a word identical to the one predicted at the
# previous timestep, are emitted as empty strings.
preds_text = []
for predicted_row in preds:
    decoded = []
    previous_word = None
    for position, token_id in enumerate(predicted_row):
        word = index_to_word.get(int(token_id))
        if word is None or (position > 0 and word == previous_word):
            decoded.append('')
        else:
            decoded.append(word)
        previous_word = word

    preds_text.append(' '.join(decoded))

In [None]:
# Reference Akkadian sentences of the test split (preprocessed text)
yTest_

In [None]:
# Show only the first few decoded predictions; the full list holds one long
# string per test sentence and would flood the cell output.
preds_text[:10]