# Introduction

We train a character-to-character (char2char) neural language model (NLM) using a seq2seq model with attention.

The encoder-decoder follows the standard seq2seq architecture.

Decoding is based on attending to the encoder outputs together with the current decoder state.
The attention model follows Luong et al., 2015 (https://arxiv.org/abs/1508.04025) instead of the traditional Bahdanau et al., 2014 (https://arxiv.org/abs/1409.0473). The implementation of attention follows this blog post: https://wanasit.github.io/attention-based-sequence-to-sequence-in-keras.html. Luong attention was chosen because it works at the output level of the LSTM, while Bahdanau attention needs to work at the state level of the encoder.

The plan is to move to the Attention() layer from Jeremy Howard's fast.ai implementation (https://github.com/fastai/courses/blob/master/deeplearning2/attention_wrapper.py), which follows Bahdanau attention.

# Imports

In [1]:
from __future__ import print_function
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, GRU, Dot, TimeDistributed, Activation, Embedding
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk.tokenize import word_tokenize
from utils import *
import pdb
%matplotlib inline

Using TensorFlow backend.


In [2]:

# gpu_alloc is not defined in this notebook; presumably it comes from
# `from utils import *` and selects/configures GPU "1" — TODO confirm in utils.
gpu_alloc("1")

# Load data

In [3]:
# Location of the training corpus: directory and file name are kept separate
# so that the full path can be joined later.
data_path, file_name = '.', 'wonderland.txt'

In [4]:
# Upper bound handed to load_data; its unit (characters, lines, ...) is decided
# by the helper in utils — TODO confirm.
num_samples = 100000
full_file_name = os.path.join(data_path, file_name)
# load_data is provided by utils (star import); raw_texts is sliceable, and the
# first 200 items are printed below as a sanity check.
raw_texts = load_data(full_file_name, num_samples)
print(raw_texts[:200])

ALICE'S ADVENTURES IN WONDERLAND

Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the
bank, and of


In [5]:


# Window sizes handed to generate_lm_data for the input and the target side.
in_seq_len = 20
out_seq_len = 20
# generate_lm_data comes from utils; it returns two parallel sequences
# (inputs, targets). NOTE(review): the sample output suggests a sliding window
# advanced one character at a time — confirm in utils.
input_texts, target_texts = generate_lm_data(raw_texts, in_seq_len, out_seq_len)

In [6]:
# Report the number of generated pairs, then show the first ten
# (input, target) pairs, each separated by a newline.
print(len(input_texts))
for i in range(10):
    source_text, following_text = input_texts[i], target_texts[i]
    print(source_text, '\n', following_text)

86641
ALICE'S ADVENTURES I 
 	N WONDERLAND

Lewis 

LICE'S ADVENTURES IN 
 	 WONDERLAND

Lewis C

ICE'S ADVENTURES IN  
 	WONDERLAND

Lewis Ca

CE'S ADVENTURES IN W 
 	ONDERLAND

Lewis Car

E'S ADVENTURES IN WO 
 	NDERLAND

Lewis Carr

'S ADVENTURES IN WON 
 	DERLAND

Lewis Carro

S ADVENTURES IN WOND 
 	ERLAND

Lewis Carrol

 ADVENTURES IN WONDE 
 	RLAND

Lewis Carroll

ADVENTURES IN WONDER 
 	LAND

Lewis Carroll


DVENTURES IN WONDERL 
 	AND

Lewis Carroll





## Build vocab

In [7]:
# Build a single vocabulary over both sides so encoder and decoder share the
# same character <-> index mappings.
all_texts = target_texts + input_texts
vocab_to_int, int_to_vocab = build_vocab(all_texts)
# NOTE(review): the '{}' placeholder is never formatted, so the archive is
# written literally as 'vocab-{}.npz' (np.savez appends '.npz'). Confirm the
# intended suffix before renaming — other scripts may load this exact name.
# NOTE(review): the mappings are dicts, so they are stored as pickled object
# arrays; loading them back presumably needs allow_pickle=True — verify.
np.savez('vocab-{}', vocab_to_int=vocab_to_int, int_to_vocab=int_to_vocab)

In [8]:
# Encoder and decoder share one vocabulary, so both character lists are the
# sorted keys of vocab_to_int (two independent list objects).
input_characters = sorted(vocab_to_int)
target_characters = sorted(vocab_to_int)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Longest text on each side plus two extra positions for '\t' and '\n'.
# NOTE(review): the printed lengths (22 vs 24) suggest the target texts may
# already contain those two markers — confirm whether +2 double-counts them.
max_encoder_seq_length = max(map(len, input_texts)) + 2
max_decoder_seq_length = max(map(len, target_texts)) + 2

In [9]:
# Summarise corpus size, vocabulary sizes and padded sequence lengths.
summary_rows = (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
)
for label, value in summary_rows:
    print(label, value)

Number of samples: 86641
Number of unique input tokens: 72
Number of unique output tokens: 72
Max sequence length for inputs: 22
Max sequence length for outputs: 24


In [10]:
# Display the character -> index mapping for inspection.
# TODO: data cleaning — some special characters still need to be removed.
vocab_to_int

{'\t': 2,
 '\n': 3,
 ' ': 1,
 '!': 61,
 '"': 64,
 "'": 46,
 '(': 49,
 ')': 50,
 '*': 65,
 ',': 42,
 '-': 51,
 '.': 28,
 '0': 29,
 '3': 27,
 ':': 43,
 ';': 54,
 '?': 47,
 'A': 11,
 'B': 66,
 'C': 16,
 'D': 7,
 'E': 8,
 'F': 26,
 'G': 60,
 'H': 22,
 'I': 24,
 'J': 67,
 'K': 56,
 'L': 10,
 'M': 23,
 'N': 4,
 'O': 6,
 'P': 30,
 'Q': 68,
 'R': 9,
 'S': 57,
 'T': 21,
 'U': 25,
 'UNK': 0,
 'V': 52,
 'W': 5,
 'Y': 53,
 'Z': 62,
 '[': 69,
 ']': 70,
 '_': 71,
 'a': 17,
 'b': 32,
 'c': 31,
 'd': 38,
 'e': 12,
 'f': 39,
 'g': 33,
 'h': 40,
 'i': 14,
 'j': 58,
 'k': 41,
 'l': 20,
 'm': 48,
 'n': 34,
 'o': 19,
 'p': 44,
 'q': 55,
 'r': 18,
 's': 15,
 't': 35,
 'u': 45,
 'v': 36,
 'w': 13,
 'x': 59,
 'y': 37,
 'z': 63}

In [11]:
# Display the inverse mapping (index -> character) for inspection.
int_to_vocab

{0: 'UNK',
 1: ' ',
 2: '\t',
 3: '\n',
 4: 'N',
 5: 'W',
 6: 'O',
 7: 'D',
 8: 'E',
 9: 'R',
 10: 'L',
 11: 'A',
 12: 'e',
 13: 'w',
 14: 'i',
 15: 's',
 16: 'C',
 17: 'a',
 18: 'r',
 19: 'o',
 20: 'l',
 21: 'T',
 22: 'H',
 23: 'M',
 24: 'I',
 25: 'U',
 26: 'F',
 27: '3',
 28: '.',
 29: '0',
 30: 'P',
 31: 'c',
 32: 'b',
 33: 'g',
 34: 'n',
 35: 't',
 36: 'v',
 37: 'y',
 38: 'd',
 39: 'f',
 40: 'h',
 41: 'k',
 42: ',',
 43: ':',
 44: 'p',
 45: 'u',
 46: "'",
 47: '?',
 48: 'm',
 49: '(',
 50: ')',
 51: '-',
 52: 'V',
 53: 'Y',
 54: ';',
 55: 'q',
 56: 'K',
 57: 'S',
 58: 'j',
 59: 'x',
 60: 'G',
 61: '!',
 62: 'Z',
 63: 'z',
 64: '"',
 65: '*',
 66: 'B',
 67: 'J',
 68: 'Q',
 69: '[',
 70: ']',
 71: '_'}

In [12]:
# Vocabulary size; the displayed mapping includes the 'UNK' entry at index 0.
len(int_to_vocab)

72

# Prepare training data

## Train/test split

In [13]:
# Split the data into training and testing sentences (85% / 15%, fixed seed).
# The names input_texts / target_texts are rebound to the training subset here,
# so re-running this cell without regenerating the data splits the already
# reduced training set again.
# NOTE(review): the sample output shows windows shifted by a single character,
# so a random split likely places near-duplicates of test windows in the
# training set — confirm whether a contiguous hold-out is wanted instead.
input_texts, test_input_texts, target_texts, test_target_texts  = train_test_split(input_texts, target_texts, test_size = 0.15, random_state = 42)

## Vectorize data

## Train data

In [14]:
# Vectorise the training pairs. vectorize_data (from utils) returns three
# arrays: encoder inputs, decoder inputs and decoder targets. Only the encoder
# length/token count are passed; the decoder-side values computed earlier are
# not used by this call.
encoder_input_data, decoder_input_data, decoder_target_data = vectorize_data(input_texts=input_texts,
                                                                             target_texts=target_texts, 
                                                                             max_encoder_seq_length=max_encoder_seq_length, 
                                                                             num_encoder_tokens=num_encoder_tokens, 
                                                                             vocab_to_int=vocab_to_int)

In [15]:
# Sanity-check the array shapes. The recorded output shows integer-coded
# encoder input (samples, 22) and one-hot decoder targets (samples, 22, 72).
print(encoder_input_data.shape)
print(decoder_target_data.shape)

(73644, 22)
(73644, 22, 72)


## Test data

In [16]:
# Vectorise the held-out pairs with the same vocabulary and sequence length as
# the training data so that both sets are shape-compatible.
test_encoder_input_data, test_decoder_input_data, test_decoder_target_data = vectorize_data(input_texts=test_input_texts,
                                                                                            target_texts=test_target_texts, 
                                                                                            max_encoder_seq_length=max_encoder_seq_length, 
                                                                                            num_encoder_tokens=num_encoder_tokens, 
                                                                                            vocab_to_int=vocab_to_int)

# Encoder-decoder model

In [17]:

# Size passed to build_model. The recorded model output shows 512-wide
# concatenated tensors, presumably 2 * latent_dim from the bidirectional
# encoder — TODO confirm in utils.build_model.
latent_dim = 256  # Latent dimensionality of the encoding space.

In [18]:
# build_model (from utils) returns three models: the full training model plus
# separate encoder and decoder models used by the inference cells below.
model, encoder_model, decoder_model = build_model(latent_dim=latent_dim, num_encoder_tokens=num_encoder_tokens)

[<tf.Tensor 'concatenate_1/concat:0' shape=(?, 512) dtype=float32>, <tf.Tensor 'concatenate_2/concat:0' shape=(?, 512) dtype=float32>]
Tensor("lstm_2/transpose_2:0", shape=(?, ?, 512), dtype=float32)
Tensor("bidirectional_1/concat:0", shape=(?, ?, 512), dtype=float32)
attention Tensor("attention/truediv:0", shape=(?, ?, ?), dtype=float32)
encoder-decoder  model:
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 72)     5184        input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, No

  encoder_model = Model(input=encoder_inputs, output=[encoder_outputs] + encoder_states)


# Training

In [19]:
# Optimisation hyper-parameters: mini-batch size, number of training epochs
# and the Adam learning rate used when the model is compiled.
batch_size, epochs, lr = 64, 20, 0.01

# Learning rate decay

In [20]:
# Compile with Adam at the configured learning rate. Targets are one-hot
# (see the decoder target shape above), hence categorical cross-entropy, with
# categorical accuracy tracked for checkpointing.
model.compile(optimizer=optimizers.Adam(lr=lr), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [22]:
# Alternative kept for reference: one checkpoint file per epoch/metric value.
#filepath="weights-improvement-{epoch:02d}-{val_categorical_accuracy:.2f}.hdf5"
filepath="best_model.hdf5" # Save only the best model for inference step, as saving the epoch and metric might confuse the inference function which model to use
# Overwrite best_model.hdf5 only when validation accuracy improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')
# TensorBoard logs are written to ./Graph.
tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)
callbacks_list = [checkpoint, tbCallBack]
# Variant with a learning-rate scheduler; `lrate` is only defined in later cells.
#callbacks_list = [checkpoint, tbCallBack, lrate]



In [None]:
def exp_decay(epoch):
    """Exponentially decaying learning rate.

    Args:
        epoch: Epoch index the rate is requested for.

    Returns:
        ``0.1 * exp(-0.1 * epoch)`` as computed by NumPy.
    """
    k = 0.1              # decay constant per epoch
    initial_lrate = 0.1  # rate at epoch 0
    decay_factor = np.exp(-k*epoch)
    return initial_lrate * decay_factor
# Wrap the exponential schedule in a Keras callback. The callback only takes
# effect if it is added to the callbacks list passed to model.fit.
lrate = LearningRateScheduler(exp_decay)
#lr = 0

In [None]:
def step_decay(epoch):
    """Step-wise learning-rate schedule: halve the rate every 10 epochs.

    Args:
        epoch: Epoch index the rate is requested for (as passed by the
            LearningRateScheduler callback).

    Returns:
        float: ``0.1 * 0.5 ** floor((1 + epoch) / 10)``.
    """
    # The notebook's import cell does not import math, so import it locally to
    # keep this function usable on a fresh kernel.
    import math

    initial_lrate = 0.1
    drop = 0.5
    epochs_drop = 10.0
    # (1 + epoch) so the first drop happens at epoch index 9, i.e. the 10th epoch.
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate
# Wrap the step schedule in a Keras callback. This rebinds `lrate`, so any
# scheduler previously stored under that name is replaced.
lrate = LearningRateScheduler(step_decay)
#lr = 0

In [None]:
#callbacks_list.append(lrate)

In [None]:
# Train with teacher forcing: the model receives the encoder input and the
# decoder input and is fitted against the decoder targets.
# Validation uses a 10% split of the training arrays rather than the held-out
# test arrays (that alternative is left commented out below).
# NOTE(review): Keras takes validation_split from the end of the arrays before
# shuffling — confirm this is acceptable for this data.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          #validation_data = ([test_encoder_input_data, test_decoder_input_data], test_decoder_target_data),
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks_list,
          validation_split=0.1,
          shuffle=True)

Train on 66279 samples, validate on 7365 samples
Epoch 1/20

Epoch 00001: val_categorical_accuracy improved from -inf to 0.58832, saving model to best_model.hdf5


  '. They will not be included '


Epoch 2/20

Epoch 00002: val_categorical_accuracy improved from 0.58832 to 0.65087, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_categorical_accuracy improved from 0.65087 to 0.68015, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_categorical_accuracy improved from 0.68015 to 0.69955, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_categorical_accuracy improved from 0.69955 to 0.71596, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_categorical_accuracy improved from 0.71596 to 0.72282, saving model to best_model.hdf5
Epoch 7/20

Epoch 00007: val_categorical_accuracy improved from 0.72282 to 0.73676, saving model to best_model.hdf5
Epoch 8/20

Epoch 00008: val_categorical_accuracy improved from 0.73676 to 0.74652, saving model to best_model.hdf5
Epoch 9/20

Epoch 00009: val_categorical_accuracy improved from 0.74652 to 0.75306, saving model to best_model.hdf5
Epoch 10/20

Epoch 00010: val_categorical_accuracy improved from 0.75306

In [None]:
# Persist the inference models in their state at the end of training.
# NOTE(review): the checkpoint callback stores the best full model in
# best_model.hdf5; these two files reflect the last epoch instead, which may
# not be the best one — confirm which weights inference should use.
encoder_model.save('encoder_model.hdf5')
decoder_model.save('decoder_model.hdf5')

# Inference

In [None]:
# Sample output from train data: decode the first 100 training inputs and
# print them next to their ground-truth continuation.
decoded_sentences = []
target_texts_ =  []
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_text = input_texts[seq_index]
    # Drop the first and last character of the target (presumably the '\t'
    # start and '\n' end markers — confirm against generate_lm_data).
    target_text = target_texts[seq_index][1:-1]

    # Use cell-local names for the single-sample vectors so the training
    # tensors (encoder_input_data, decoder_input_data, decoder_target_data)
    # are not overwritten; otherwise re-running model.fit after this cell
    # would train on one sample.
    # NOTE(review): a single string is passed where the training call passes a
    # list of strings — confirm that vectorize_data supports this.
    input_seq, sample_decoder_input, sample_decoder_target = vectorize_data(input_texts=input_text,
                                                                            target_texts=target_text,
                                                                            max_encoder_seq_length=max_encoder_seq_length,
                                                                            num_encoder_tokens=num_encoder_tokens,
                                                                            vocab_to_int=vocab_to_int)

    decoded_seq, _ = decode_sequence(input_seq, encoder_model, decoder_model, num_decoder_tokens, max_encoder_seq_length, int_to_vocab, vocab_to_int)

    # NOTE(review): joining with ' ' puts a space between every element of
    # decoded_seq; the test-set cell uses the decoder output unjoined — confirm
    # which form is intended for a character-level model.
    decoded_sentence = ' '.join(decoded_seq) 
    print('-')
    print('Input sentence:', input_text)
    print('GT sentence:', target_text)
    print('Decoded sentence:', decoded_sentence)   
    decoded_sentences.append(decoded_sentence)
    target_texts_.append(target_text)

# Visualize attention

In [None]:
# Plot the attention for the first 100 training inputs and print each decoded
# result next to its ground-truth continuation.
for seq_index in range(100):
    text = input_texts[seq_index]
    # Ground truth without its first and last character.
    target_text = target_texts[seq_index][1:-1]
    decoded_sentence = visualize_attention(
        text,
        encoder_model,
        decoder_model,
        max_encoder_seq_length,
        num_decoder_tokens,
        vocab_to_int,
        int_to_vocab,
    )
    print('-')
    print('Input sentence:', text)
    print('GT sentence:', target_text)
    print('Decoded sentence:', decoded_sentence)


# Test - Short inference

In [None]:
# Decode every held-out test input, print the result and log one markdown
# table row per sample to RESULTS.md.
decoded_sentences = []
target_texts_ =  []
# These lists are kept so that the names stay defined; nothing in this cell
# produces corrected sentences, so they remain empty.
corrected_sentences = []
corrected_input_sentences = []
# The context manager closes the file even if decoding fails part-way through.
with open('RESULTS.md', 'w') as results:
    results.write('|OCR sentence|GT sentence|Decoded sentence|\n')
    results.write('|------------|-----------|----------------|\n')
    for seq_index in range(len(test_input_texts)):
        # Take one sequence (part of the test set) for trying out decoding.
        input_text = test_input_texts[seq_index]
        # Drop the first and last character of the target (presumably the
        # '\t' start and '\n' end markers — confirm against generate_lm_data).
        target_text = test_target_texts[seq_index][1:-1]

        # Cell-local names keep the training tensors (encoder_input_data,
        # decoder_input_data, decoder_target_data) intact.
        # NOTE(review): a single string is passed where the training call
        # passes a list of strings — confirm vectorize_data supports this.
        input_seq, sample_decoder_input, sample_decoder_target = vectorize_data(input_texts=input_text,
                                                                                target_texts=target_text,
                                                                                max_encoder_seq_length=max_encoder_seq_length,
                                                                                num_encoder_tokens=num_encoder_tokens,
                                                                                vocab_to_int=vocab_to_int)

        decoded_sentence, _ = decode_sequence(input_seq, encoder_model, decoder_model, num_decoder_tokens, max_encoder_seq_length, int_to_vocab, vocab_to_int)
        print('-')
        print('Input sentence:', input_text)
        print('GT sentence:', target_text)
        print('Decoded sentence:', decoded_sentence)
        # NOTE(review): input_text is written unescaped; embedded newlines or
        # '|' characters would break the markdown table layout.
        results.write(' | ' + input_text + ' | ' + target_text.strip() + ' | ' + decoded_sentence + ' | \n')
        decoded_sentences.append(decoded_sentence)
        target_texts_.append(target_text)

In [None]:

# Plot the attention for the first 100 held-out test inputs and print each
# decoded result next to its ground-truth continuation.
for seq_index in range(100):
    # Ground truth without its first and last character.
    target_text = test_target_texts[seq_index][1:-1]
    text = test_input_texts[seq_index]
    decoded_sentence = visualize_attention(
        text,
        encoder_model,
        decoder_model,
        max_encoder_seq_length,
        num_decoder_tokens,
        vocab_to_int,
        int_to_vocab,
    )
    print('-')
    print('Input sentence:', text)
    print('GT sentence:', target_text)
    print('Decoded sentence:', decoded_sentence)
