In [0]:
import re
import string
import numpy as np
from unicodedata import normalize
from numpy.random import shuffle
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


class TranslateData:
  """Loads an English/German sentence-pair file and prepares it for training.

  The file is read as tab-separated lines; column 0 is treated as English and
  column 1 as German. After construction the instance exposes:

    pairs           -- numpy array of cleaned (and shuffled) sentence pairs
    eng_tok/ger_tok -- Keras Tokenizers fitted on the respective column
    eng_seq/ger_seq -- zero post-padded integer sequences
    eng_maxlen/ger_maxlen         -- longest tokenized sentence per language
    eng_vocab_size/ger_vocab_size -- len(word_index) + 1 (index 0 is padding)
    x               -- model input, identical to eng_seq
    y               -- one-hot German targets, shape
                       (n_pairs, ger_maxlen, ger_vocab_size)
  """

  def __init__(self, file, max_pairs=None, skip_set=0, seed=None):
    """Read, clean, shuffle, tokenize, pad and one-hot encode the data set.

    Args:
      file: path of the tab-separated sentence-pair file.
      max_pairs: if given, only a window of this many raw pairs is used.
      skip_set: zero-based index of the ``max_pairs``-sized window to take
        (ignored when ``max_pairs`` is None).
      seed: optional seed for the shuffle. None (the default) keeps the
        previous behaviour of using numpy's global random state.
    """
    print("Reading file...")
    self.pairs = self.read_pairs(file)

    if max_pairs is not None:
      start = skip_set * max_pairs
      self.pairs = self.pairs[start:start + max_pairs]

    print("Cleaning data...")
    self.pairs = self.clean_pairs(self.pairs)
    print(len(self.pairs))

    # Rows are shuffled in place; a seed makes the order reproducible.
    if seed is None:
      shuffle(self.pairs)
    else:
      np.random.RandomState(seed).shuffle(self.pairs)

    self.eng_tok = Tokenizer()
    self.ger_tok = Tokenizer()

    print("Fitting tokenizers...")
    self.eng_tok.fit_on_texts(self.pairs[:,0])
    self.ger_tok.fit_on_texts(self.pairs[:,1])

    print("Tokenizing english...")
    self.eng_seq = self.eng_tok.texts_to_sequences(self.pairs[:,0])

    print("Tokenizing german...")
    self.ger_seq = self.ger_tok.texts_to_sequences(self.pairs[:,1])

    self.eng_maxlen = max(len(s) for s in self.eng_seq)
    self.ger_maxlen = max(len(s) for s in self.ger_seq)

    print("Padding sequences...")
    self.eng_seq = pad_sequences(self.eng_seq, maxlen=self.eng_maxlen, padding='post')
    self.ger_seq = pad_sequences(self.ger_seq, maxlen=self.ger_maxlen, padding='post')

    # +1 because tokenizer indices start at 1; 0 is the padding value.
    self.eng_vocab_size = len(self.eng_tok.word_index) + 1
    self.ger_vocab_size = len(self.ger_tok.word_index) + 1

    self.x = self.eng_seq

    print("Ger to one hot...")
    # Encode the whole padded matrix in one call instead of building a Python
    # list of per-sentence arrays and copying it into a second array.
    self.y = to_categorical(self.ger_seq, num_classes=self.ger_vocab_size)
    # The explicit target shape keeps the result 3-D even when to_categorical
    # flattens its input or squeezes a trailing axis of length 1.
    self.y = self.y.reshape(len(self.ger_seq), self.ger_maxlen, self.ger_vocab_size)

  def text_to_input(self, text):
    """Tokenize one English sentence and pad it to ``eng_maxlen``.

    Returns an array of shape (1, eng_maxlen) as produced by pad_sequences.
    Note that ``text`` is passed to the tokenizer as given; it is not run
    through ``clean_pairs`` first.
    """
    sequences = self.eng_tok.texts_to_sequences([text])
    return pad_sequences(sequences, maxlen=self.eng_maxlen, padding='post')

  def pred_to_text(self, pred):
    """Turn per-timestep class scores into a German sentence.

    Args:
      pred: iterable of score vectors, one per timestep; the argmax of each
        vector is looked up in the German tokenizer's vocabulary.

    Returns:
      The decoded words joined by single spaces. Indices that are not in the
      vocabulary (such as the padding index 0) are skipped.
    """
    # Build the reverse mapping once instead of scanning word_index per token.
    index_to_word = {i: word for word, i in self.ger_tok.word_index.items()}

    words = []
    for scores in pred:
      word = index_to_word.get(int(np.argmax(scores)))
      if word is not None:
        words.append(word)

    return ' '.join(words).strip()

  def read_pairs(self, file):
    """Read ``file`` as UTF-8 and split every line on tabs.

    Returns a list with one list of fields per line.
    """
    with open(file, 'rt', encoding='utf-8') as f:
      text = f.read()
    lines = text.strip().split('\n')
    pairs = [line.split('\t') for line in lines]

    return pairs

  def clean_pairs(self, lines):
    """Normalize every sentence of every pair.

    Each sentence is stripped of non-ASCII characters (after NFD
    decomposition, so accented letters keep their base letter), lower-cased,
    stripped of punctuation and non-printable characters, and reduced to its
    purely alphabetic tokens.

    Args:
      lines: iterable of pairs, each an iterable of strings.

    Returns:
      A numpy array built from the cleaned pairs.
    """
    cleaned = list()
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
      clean_pair = list()
      for line in pair:
        # Decompose accents, then drop everything that is not ASCII.
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        words = [word.lower().translate(table) for word in line.split()]
        words = [re_print.sub('', word) for word in words]
        # Tokens containing digits, or left empty by the steps above, are dropped.
        words = [word for word in words if word.isalpha()]
        clean_pair.append(' '.join(words))
      cleaned.append(clean_pair)
    return np.array(cleaned)

In [0]:
# Load the sentence pairs; max_pairs limits the run to the first 12500 raw
# pairs of the file (skip_set keeps its default of 0).
data = TranslateData(file='./deu.txt', max_pairs=12500)

In [0]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector, Dropout, Bidirectional


def get_model(eng_vocab_size, ger_vocab_size, eng_maxlen, ger_maxlen, n_cells):
  """Build and compile the bidirectional encoder/decoder translation model.

  Args:
    eng_vocab_size: input dimension of the embedding layer.
    ger_vocab_size: number of softmax classes emitted per output timestep.
    eng_maxlen: length of the (padded) input sequences.
    ger_maxlen: number of timesteps the encoder output is repeated for.
    n_cells: embedding size and number of units of each LSTM.

  Returns:
    A Sequential model compiled with the adam optimizer and
    categorical cross-entropy loss.
  """
  # Encoder: embedded tokens (0 is masked) summarized by a bidirectional LSTM.
  encoder_layers = [
      Embedding(eng_vocab_size, n_cells, input_shape=(eng_maxlen, ), mask_zero=True),
      Bidirectional(LSTM(n_cells)),
  ]

  # Decoder: the summary is repeated once per output timestep, run through a
  # sequence-returning bidirectional LSTM and classified step by step.
  decoder_layers = [
      RepeatVector(ger_maxlen),
      Bidirectional(LSTM(n_cells, return_sequences=True)),
      Dropout(0.1),
      TimeDistributed(Dense(ger_vocab_size, activation='softmax')),
  ]

  seq2seq = Sequential(encoder_layers + decoder_layers)
  seq2seq.compile(loss='categorical_crossentropy', optimizer='adam')

  return seq2seq

In [0]:
# Size the network from the prepared data set; 512 is used both as the
# embedding size and as the unit count of each LSTM (see get_model).
model = get_model(
    eng_vocab_size=data.eng_vocab_size,
    ger_vocab_size=data.ger_vocab_size,
    eng_maxlen=data.eng_maxlen,
    ger_maxlen=data.ger_maxlen,
    n_cells=512,
)

# Write a diagram of the layer graph, including tensor shapes, to disk.
plot_model(model, to_file='model-bi.png', show_shapes=True)

In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Both callbacks watch the *training* loss. Training stops at the first epoch
# whose loss fails to improve by at least 0.01 (patience=0).
# NOTE(review): 10% of the data is held out via validation_split, yet neither
# callback monitors 'val_loss' -- confirm that this is intentional.
checkpoint = ModelCheckpoint('./translate-model-ger-bi.h5', monitor='loss')
early_stop = EarlyStopping(monitor='loss', min_delta=0.01, patience=0)
callback_list = [checkpoint, early_stop]

model.fit(
    data.x,
    data.y,
    epochs=100,
    verbose=True,
    validation_split=.1,
    callbacks=callback_list,
)

In [0]:
from random import choice

# Spot-check the model on 20 randomly drawn pairs from the data set,
# printing: source sentence => model output (reference translation).
for _ in range(20):
  pair = choice(data.pairs)
  encoded = data.text_to_input(pair[0])
  # predict() returns a batch; the single sample is at index 0.
  scores = model.predict(encoded)[0]
  p = data.pred_to_text(scores)
  print("%s => %s (%s)" % (pair[0], p, pair[1]))