# NN translator example

Reference: https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/

Simple German -> English translator using an Embedding layer and an LSTM neural network

In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Global options 

Option_training : whether to train the model or not (T/F)
                  if F, load model.h5 file                   
Option_evaluation : evaluate the model performance with train/test data set 

In [21]:
# Option_training gates the model.fit() call in the "fit model" cell.
Option_training = False
# Option_evaluation gates the evaluate_model() calls in the evaluation cell.
Option_evaluation = False

## 1. Clean Text

In [3]:
## The helper functions are defined here

# load doc into memory
# Reads the file and returns its contents as a single string.
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full file contents as one ``str``.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

# split a loaded document into sentences
# The document string is split on '\n' first and every line is then split
# on '\t', giving pairs = [[english sentence, german sentence], [...], ...].
def to_pairs(doc):
    """Split a tab-separated document into one list of fields per line.

    Args:
        doc: Whole document as a single string.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's tab-separated fields.
    """
    pairs = []
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

# clean a list of lines
# Each line has the form [english sentence, german sentence].
# For every sentence:
# Remove all non-printable characters.
# Remove all punctuation characters.
# Normalize all Unicode characters to ASCII (e.g. Latin characters).
# Normalize the case to lowercase. (Could this make it harder to tell
# proper nouns apart in German?)
# Remove any remaining tokens that are not alphabetic.
def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as an array.

    Args:
        lines: Iterable of pairs, each pair being an iterable of sentence
            strings.

    Returns:
        A numpy array built from the list of cleaned pairs.
    """
    # regex matching any character outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(sentence):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            # lowercase, strip punctuation, strip non-printable characters
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # keep purely alphabetic tokens only
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(sentence) for sentence in pair] for pair in lines])

# save a list of clean sentences to file
# The data is stored with pickle.
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` into ``filename`` and print a confirmation.

    Args:
        sentences: Object to serialise.
        filename: Destination path; the file is opened in binary write mode.
    """
    # Close the handle deterministically so the pickle is fully flushed
    # before any later cell reads the file back.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

### Create "english-german.pkl" file

In [9]:
# if pkl file does not exist, create it
from pathlib import Path
#----target language file---------------
filename = 'deu.txt'
pkl_file = 'english-german.pkl'
#----------------------------------------
path = Path(pkl_file)
if path.is_file():
    print(f'The file {pkl_file} exists')
else:
    print(f'The file {pkl_file} does not exist')
    doc = load_doc(filename)
    # split into english-german pairs
    pairs = to_pairs(doc)
    # clean sentences
    # Stored under a new name: assigning the result to `clean_pairs` would
    # replace the function, so this cell could not be run a second time.
    cleaned_pairs = clean_pairs(pairs)
    # save clean pairs to file
    save_clean_data(cleaned_pairs, pkl_file)
    # spot check (the slice tolerates corpora with fewer than 100 pairs)
    for pair in cleaned_pairs[:100]:
        print('[%s] => [%s]' % (pair[0], pair[1]))

The file english-german.pkl exists


## 2. Split Text and Simplification

* Here only 50,000 examples (among 150,000 phrases) will be used as an example
* 45,000 are used for training, 5,000 are for testing

In [10]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by this notebook, never files from an untrusted source.
    """
    # The context manager guarantees the handle is closed after loading.
    with open(filename, 'rb') as handle:
        return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` into ``filename`` and print a confirmation.

    Args:
        sentences: Object to serialise.
        filename: Destination path; the file is opened in binary write mode.
    """
    # Close the handle deterministically so the pickle is fully flushed
    # before the file is read back.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

### save cleaned train/test set data

In [14]:
# output files for the reduced dataset and its train/test split
path_clean_both = 'english-german-both.pkl'
path_clean_train = 'english-german-train.pkl'
path_clean_test = 'english-german-test.pkl'
path = Path(path_clean_both)

# reduce dataset size: rows kept in total and rows assigned to training
n_sentences = 50000
n_train = 45000

# load the full cleaned dataset
raw_dataset = load_clean_sentences(pkl_file)

def save_clean_data_files(n_sentences,n_train,
       path_clean_both,path_clean_train,path_clean_test):
    """Shuffle the first rows of ``raw_dataset`` and save the train/test split.

    Args:
        n_sentences: Number of leading rows of the module-level
            ``raw_dataset`` to keep.
        n_train: Number of shuffled rows written to the training file; the
            remaining rows go to the test file.
        path_clean_both: Output path for the whole shuffled subset.
        path_clean_train: Output path for the training rows.
        path_clean_test: Output path for the test rows.
    """
    # Copy before shuffling: shuffle() works in place, and a plain slice of
    # a numpy array is a view, so without the copy the leading rows of the
    # shared raw_dataset would be reordered as a side effect.
    # (assumes raw_dataset is a 2-D numpy array, as the [rows, cols] index implies)
    dataset = raw_dataset[:n_sentences, :].copy()
    # random shuffle
    shuffle(dataset)
    # split into train/test
    train, test = dataset[:n_train], dataset[n_train:]
    # save
    save_clean_data(dataset, path_clean_both)
    save_clean_data(train, path_clean_train)
    save_clean_data(test, path_clean_test)
    
# Reuse the saved split only when all three files are present and their
# sizes match the current n_sentences / n_train settings.
if path.is_file():
    #-check consistency with dataset size
    dataset = load_clean_sentences(path_clean_both)
    # Slicing silently caps at the rows that exist, so compare against the
    # capped sizes; otherwise a short corpus would be rebuilt on every run.
    expected_total = min(n_sentences, len(raw_dataset))
    expected_train = min(n_train, expected_total)
    split_is_current = (
        len(dataset) == expected_total
        and Path(path_clean_train).is_file()
        and Path(path_clean_test).is_file()
        and len(load_clean_sentences(path_clean_train)) == expected_train
    )
    if split_is_current:
        print(f'The file {path_clean_both} exists')
    else:
        print(f'The file {path_clean_both} mismatch. Re-create it.')
        save_clean_data_files(n_sentences,n_train,
           path_clean_both,path_clean_train,path_clean_test)
else:
    print(f'The file {path_clean_both} does not exist. Re-create it')
    save_clean_data_files(n_sentences,n_train,
           path_clean_both,path_clean_train,path_clean_test)
    


The file english-german-both.pkl exists


### 3. Train Neural Translation Model

The function define_model() below defines the model and takes a number of arguments used to configure the model, such as the size of the input and output vocabularies, the maximum length of input and output phrases, and the number of memory units used to configure the model.

The model is trained using the efficient Adam approach to stochastic gradient descent and minimizes the categorical cross-entropy loss function, because we have framed the prediction problem as multi-class classification.

The model configuration was not optimized for this problem, meaning that there is plenty of opportunity for you to tune it and lift the skill of the translations. I would love to see what you can come up with.

In [15]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by this notebook, never files from an untrusted source.
    """
    # The context manager guarantees the handle is closed after loading.
    with open(filename, 'rb') as handle:
        return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad the result to ``length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed as ``maxlen`` to ``pad_sequences``.
        lines: Texts to encode.

    Returns:
        The output of ``pad_sequences`` called with ``padding='post'``.
    """
    # integer encode, then pad with 0 values after each sequence
    return pad_sequences(tokenizer.texts_to_sequences(lines),
                         maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence.

    Args:
        sequences: 2-D array of integer token ids (its ``shape[0]`` and
            ``shape[1]`` are used for the final reshape).
        vocab_size: Number of classes for ``to_categorical``.

    Returns:
        Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return array(one_hot_rows).reshape(n_rows, n_steps, vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Args:
        src_vocab: Input dimension of the source embedding.
        tar_vocab: Number of units of the softmax output layer.
        src_timesteps: ``input_length`` of the embedding layer.
        tar_timesteps: How many times the encoder output is repeated.
        n_units: Embedding size and number of units of both LSTM layers.

    Returns:
        A ``Sequential`` model; compiling it is left to the caller.
    """
    return Sequential([
        # source tokens -> dense vectors; index 0 is masked (mask_zero=True)
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # encoder: reduce the source sequence to a single vector
        LSTM(n_units),
        # feed that vector to the decoder once per target timestep
        RepeatVector(tar_timesteps),
        # decoder: emit one hidden state per target timestep
        LSTM(n_units, return_sequences=True),
        # softmax over the target vocabulary at every timestep
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ])


### Prepare tokenizer/vocabulary and train data 

In [16]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# column 0 holds the English sentences, column 1 the German ones
eng_sentences, ger_sentences = dataset[:, 0], dataset[:, 1]

# prepare english tokenizer (+1 because index 0 is the padding value)
eng_tokenizer = create_tokenizer(eng_sentences)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(eng_sentences)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(ger_sentences)
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_length = max_length(ger_sentences)
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data: German is the model input, English the target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]), eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]), eng_vocab_size)

print(' train and test dada prepared. Start model.')

English Vocabulary Size: 6707
English Max Length: 8
German Vocabulary Size: 11545
German Max Length: 17
 train and test dada prepared. Start model.


### define model

In [17]:
# define model: German input vocabulary/length, English output vocabulary/length
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# write a diagram of the layer graph to model.png
plot_model(model, to_file='model.png', show_shapes=True)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 256)           2955520   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 8, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 8, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 8, 6707)          1723699   
 ibuted)                                                         
                                                                 
Total params: 5,729,843
Trainable params: 5,729,843
Non-

### fit model 

In [19]:
# fit model
filename = 'model.h5'
if not Option_training:
    print('no new training for the model.')
else:
    print('start training the model.')
    # keep only the weights with the lowest validation loss seen so far
    checkpoint = ModelCheckpoint(
        filename,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )
    # patience=0: stop as soon as val_loss fails to improve
    earlystop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=0,
        verbose=0,
        mode='auto',
        baseline=None,
        restore_best_weights=False,
    )
    model.fit(
        trainX,
        trainY,
        epochs=30,
        batch_size=64,
        validation_data=(testX, testY),
        callbacks=[checkpoint, earlystop],
        verbose=2,
    )

no new training for the model.


##  Evaluation NMT

If one wants to use existing model, 
one should skip training part and start from here. 

In [20]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by this notebook, never files from an untrusted source.
    """
    # The context manager guarantees the handle is closed after loading.
    with open(filename, 'rb') as handle:
        return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad the result to ``length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed as ``maxlen`` to ``pad_sequences``.
        lines: Texts to encode.

    Returns:
        The output of ``pad_sequences`` called with ``padding='post'``.
    """
    # integer encode, then pad with 0 values after each sequence
    return pad_sequences(tokenizer.texts_to_sequences(lines),
                         maxlen=length, padding='post')

# map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the word whose vocabulary index is ``integer``.

    Args:
        integer: Vocabulary index to look up.
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Use the reverse mapping when the tokenizer provides one: a single dict
    # lookup instead of a scan of the whole vocabulary for every token.
    reverse_index = getattr(tokenizer, 'index_word', None)
    if reverse_index is not None:
        return reverse_index.get(integer)
    # Fallback for tokenizer objects that only expose word_index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's first prediction for ``source`` into a sentence.

    Args:
        model: Object whose ``predict(source, verbose=0)`` returns one score
            vector per timestep for each input.
        tokenizer: Tokenizer used to map indices back to words.
        source: Encoded input passed straight to ``model.predict``.

    Returns:
        The decoded words joined by single spaces.
    """
    first_prediction = model.predict(source, verbose=0)[0]
    words = []
    for step_scores in first_prediction:
        # highest-scoring vocabulary index at this timestep
        word = word_for_id(argmax(step_scores), tokenizer)
        # an index without a word (e.g. the padding index) ends the sentence
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus BLEU-1..4 scores.

    Args:
        model: Trained model handed to ``predict_sequence``.
        tokenizer: Target-language tokenizer used to decode predictions.
        sources: Encoded source sequences, one row per sentence.
        raw_dataset: Rows aligned with ``sources``; the first two fields of
            each row are the target sentence and the source sentence.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        # Decode with the tokenizer passed in; the previous code ignored the
        # parameter and always used the global eng_tokenizer.
        translation = predict_sequence(model, tokenizer, source)
        # Only the first two fields are needed; slicing keeps rows that
        # carry extra columns from breaking the unpacking.
        raw_target, raw_src = raw_dataset[i][:2]
        # show the first ten translations as a spot check
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of reference token lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


In [23]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# column 0 holds the English sentences, column 1 the German ones
eng_sentences, ger_sentences = dataset[:, 0], dataset[:, 1]
# prepare english tokenizer
eng_tokenizer = create_tokenizer(eng_sentences)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(eng_sentences)
# prepare german tokenizer
ger_tokenizer = create_tokenizer(ger_sentences)
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_length = max_length(ger_sentences)
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
if not Option_evaluation:
    print('no new evaluation of the model.')
else:
    # test on some training sequences
    print('evaluate model for train data')
    evaluate_model(model, eng_tokenizer, trainX, train)
    # test on some test sequences
    print('evaluate model for test data')
    evaluate_model(model, eng_tokenizer, testX, test)

no new evaluation of the model.


## Usage of NMT
After training, use the NMT for translation from German-> English 

For arbitrary German text input, one has to convert the German text into an NMT input sequence:
* (1) clean up the text
* (2) tokenize the cleaned text
* (3) run the model prediction on the tokenized input
* (4) translate the model output into English

In [24]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# column 0 holds the English sentences, column 1 the German ones
eng_sentences, ger_sentences = dataset[:, 0], dataset[:, 1]
# prepare english tokenizer
eng_tokenizer = create_tokenizer(eng_sentences)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(eng_sentences)
# prepare german tokenizer
ger_tokenizer = create_tokenizer(ger_sentences)
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_length = max_length(ger_sentences)
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')

def clean_sentences(sentence_list):
    """Normalise each sentence and return the results as a numpy array.

    Every sentence is reduced to ASCII, split on whitespace, lowercased and
    stripped of punctuation and non-printable characters; tokens that are
    not purely alphabetic are dropped.

    Args:
        sentence_list: Iterable of sentence strings.

    Returns:
        A numpy array holding one cleaned string per input sentence.
    """
    # regex matching any character outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(sentence):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # keep purely alphabetic tokens only
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([scrub(sentence) for sentence in sentence_list])

def translate_german(input_sentences):
    """Translate a batch of German sentences with the loaded model.

    Pipeline: clean the text, encode it with the module-level German
    tokenizer, run ``model.predict``, then decode each prediction with the
    module-level English tokenizer.

    Args:
        input_sentences: Iterable of German sentence strings.

    Returns:
        A numpy array with one decoded string per input sentence.
    """
    cleaned = clean_sentences(input_sentences)
    encoded = encode_sequences(ger_tokenizer, ger_length, cleaned)
    translations = []
    for prediction in model.predict(encoded, verbose=0):
        words = []
        for step_scores in prediction:
            # highest-scoring vocabulary index at this timestep
            word = word_for_id(argmax(step_scores), eng_tokenizer)
            # an index without a word ends the sentence
            if word is None:
                break
            words.append(word)
        translations.append(' '.join(words))
    return array(translations)

def translate_german_sentence(input_text):
    """Translate a single German sentence.

    Wraps ``input_text`` in a one-element list, delegates to
    ``translate_german`` and returns the first (only) result.
    """
    return translate_german([input_text])[0]

In [27]:
# Example: translate one German sentence; the cell output is the decoded string.
translate_german_sentence('Ich liebe dich')

'i love you'