# Text Denoising

Inspired by "Neural Networks for Text Correction and Completion in Keyboard Decoding" by Shaona Ghosh and Per Ola Kristensson. https://arxiv.org/pdf/1709.06429.pdf

In [58]:
from collections import defaultdict
import json
import os
import random
import string

In [2]:
from gluoncv.data.batchify import Tuple, Stack, Append, Pad
import gluonnlp as nlp
import hnswlib # https://github.com/nmslib/hnswlib
import mxboard
import mxnet as mx
from mxnet import gluon, autograd
import numpy as np
import re
from tqdm import tqdm

In [3]:
from ocr.utils.encoder_decoder import get_transformer_encoder_decoder, Denoiser, encode_char, decode_char, LabelSmoothing, SoftmaxCEMaskedLoss

In [4]:
# Run on the default GPU when MXNet reports at least one, otherwise on the CPU.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

gpu(0)

## Data

In [10]:
# See get_mode.py
# Text corpus read line by line by NoisyTextDataset (data_type='corpus').
text_filepath = 'dataset/typo/all.txt'

In [192]:
# Character vocabulary: four special tokens followed by space, ASCII letters, digits and punctuation.
ALPHABET = ['<UNK>', '<PAD>', '<BOS>', '<EOS>']+list(' ' + string.ascii_letters + string.digits + string.punctuation)
ALPHABET_INDEX = {letter: index for index, letter in enumerate(ALPHABET)} # {'<UNK>': 0, '<PAD>': 1, '<BOS>': 2, '<EOS>': 3, ' ': 4, 'a': 5, ...}
FEATURE_LEN = 150 # max-length in characters for one document
NUM_WORKERS = 8 # number of workers used in the data loading
BATCH_SIZE = 64 # number of documents per batch
MAX_LEN_SENTENCE = 150
# Indices of the special tokens; they must match their positions in ALPHABET above.
PAD = 1
BOS = 2
EOS = 3
UNK = 0
# Number of vocabulary entries indexed for the KNN synonym lookup (see get_knn_index).
max_len_vocab = 500000

# Word-level tokenizer / detokenizer used by NoisyTextDataset to split and rebuild lines.
moses_detokenizer = nlp.data.SacreMosesDetokenizer()
moses_tokenizer = nlp.data.SacreMosesTokenizer()

### Generic Dataset

In [188]:
def get_knn_index():
    """Build a nearest-neighbour index over pretrained word embeddings.

    Loads the pretrained `big_rnn_lm_2048_512` language model (GBW) on CPU, copies
    the embeddings of the first `max_len_vocab` token ids into a numpy array and
    indexes them with hnswlib using the cosine space.

    Returns
    -------
    tuple
        `(p, data, vocab)`: the hnswlib index, the float32 embedding matrix of
        shape `(max_len_vocab + step, dim)` and the vocabulary of the language model.
    """
    model, vocab = nlp.model.big_rnn_lm_2048_512(dataset_name='gbw', pretrained=True, ctx=mx.cpu())

    step = 1024  # number of token ids embedded per forward call
    dim = 512  # width of each row of `data`; presumably the model's embedding size -- TODO confirm
    # `step` extra rows so the last chunk, which overshoots max_len_vocab, still fits.
    num_elements = max_len_vocab+step
    data = np.zeros((num_elements, dim), dtype='float32')
    data_labels = np.arange(max_len_vocab)
    # NOTE(review): the range starts at 1, so row 0 is never filled and stays all-zero.
    # Presumably id 0 is a special token skipped on purpose -- confirm.
    for i in tqdm(range(1, max_len_vocab, step)):
        data[i:i+step,:] = model.embedding(mx.nd.arange(i,i+step)).asnumpy()
    # Declaring index
    p = hnswlib.Index(space = 'cosine', dim = dim) # possible options are l2, cosine or ip

    # Initing index - the maximum number of elements should be known beforehand
    p.init_index(max_elements = max_len_vocab, ef_construction = 200, M = 16)

    # Element insertion (can be called several times):
    p.add_items(data[:max_len_vocab], data_labels)
    # Controlling the recall by setting ef:
    # NOTE(review): queries in this notebook use k >= 100, which is larger than this ef.
    p.set_ef(50) # ef should always be > k
    return p, data, vocab

In [224]:
class NoisyTextDataset(mx.gluon.data.Dataset):
    """Dataset of `(noisy_line, clean_line)` string pairs used to train the denoiser.

    The clean line comes either from a text file (`data_type='corpus'`) or from an
    already loaded GBW corpus (`data_type='GBW'`). On every access some words of the
    line are first swapped for embedding-space neighbours, then a noisy copy is
    produced by substituting, inserting and deleting characters and by gluing
    consecutive tokens together. All steps are random, so the same index returns
    a different pair on each access.

    Parameters
    ----------
    text_filepath : str
        Text file read line by line when `data_type='corpus'`.
    substitute_costs_filepath : str
        JSON file mapping a letter to `{replacement: threshold}`. The first
        replacement whose threshold exceeds a uniform draw is used (the thresholds
        are presumably cumulative probabilities -- TODO confirm).
    insert_weight, delete_weight, substitute_weight : number
        Relative weights of the three character-level edits.
    glue_prob : float
        Probability that a token is concatenated with the following one.
    max_replace : float
        Upper bound on the fraction of characters selected for an edit.
    is_train : bool
        Selects the first (`True`) or last (`False`) part of the text file.
    split : float
        Fraction of the lines that belongs to the training part.
    data_type : str
        `'corpus'` or `'GBW'`.
    gbw_corpus : sequence
        Indexable corpus of token sequences, used when `data_type='GBW'`.
    knn_index, knn_data, knn_vocab
        Index, embedding matrix and vocabulary as returned by `get_knn_index()`.
        When left to `None` they are looked up among the module-level variables
        of the same names at construction time.
    proba_synonym : float
        Probability of replacing an eligible word by one of its neighbours.
    """

    def __init__(self, 
                 text_filepath=None, 
                 substitute_costs_filepath='models/substitute_probs.json', 
                 insert_weight=1, 
                 delete_weight=1, 
                 glue_prob=0.05, 
                 substitute_weight=2,
                 max_replace=0.3,
                 is_train=True, 
                 split=0.9, 
                 data_type='corpus', 
                 gbw_corpus=None,
                 knn_index=None,
                 knn_data=None,
                 knn_vocab=None,
                 proba_synonym=0.1
                ):
        self.max_replace = max_replace
        self.replace_weight = 0 #replace_prob  #Ignore typo dataset

        # Cumulative thresholds on a uniform draw: [0, substitute) -> substitution,
        # [substitute, insert) -> insertion, the remainder -> deletion.
        total_weight = insert_weight + delete_weight + substitute_weight
        self.substitute_threshold = float(substitute_weight) / total_weight
        self.insert_threshold = self.substitute_threshold + float(insert_weight) / total_weight
        self.delete_threshold = self.insert_threshold + float(delete_weight) / total_weight
        self.glue_prob = glue_prob

        # Context manager so the file handle is closed once the JSON is parsed.
        with open(substitute_costs_filepath, 'r') as f:
            self.substitute_dict = json.load(f)

        self.split = split
        self.data_type = data_type
        self.text = None
        self.gbw_corpus = None
        if self.data_type == 'corpus':
            self.text = self._process_text(text_filepath, is_train)
        elif self.data_type == 'GBW':
            self.gbw_corpus = gbw_corpus
        else:
            raise ValueError("data_type must be 'corpus' or 'GBW', got {!r}".format(data_type))

        # The KNN defaults are resolved here instead of in the signature: a default such
        # as `knn_index=knn_index` is evaluated when the class is defined and raises a
        # NameError unless get_knn_index() has already been run.
        module_globals = globals()
        self.knn_index = knn_index if knn_index is not None else module_globals.get('knn_index')
        self.knn_data = knn_data if knn_data is not None else module_globals.get('knn_data')
        self.knn_vocab = knn_vocab if knn_vocab is not None else module_globals.get('knn_vocab')
        self.proba_synonym = proba_synonym
    
    def _process_text(self, filename, is_train):
        """Read the non-blank lines of `filename` and return the train or test part."""
        with open(filename, 'r', encoding='Latin-1') as f:
            # Lines keep their trailing newline, so blank lines are detected after stripping.
            text = [line.strip() for line in f if line.strip() != '']

        split_index = int(self.split*len(text))
        if is_train:
            return text[:split_index]
        return text[split_index:]
    
    def _replace_synonym(self, line):
        """Randomly replace words of `line` by one of their 100 nearest embedding neighbours."""
        processed_line = self._pre_process_line(line)
        # Without the KNN resources the line is only tokenized and detokenized.
        can_replace = (self.knn_index is not None
                       and self.knn_data is not None
                       and self.knn_vocab is not None)
        words = []
        num_words = 100
        for word in processed_line:
            draw = random.random()
            if (can_replace
                    and draw < self.proba_synonym
                    and word not in string.punctuation
                    and word in self.knn_vocab
                    and self.knn_vocab[word] < max_len_vocab):
                index_list = self.knn_index.knn_query(self.knn_data[self.knn_vocab[word]], k=num_words)[0][0]
                word = self.knn_vocab.idx_to_token[int(index_list[random.randint(0, num_words-1)])]
            words.append(word)
        return self._post_process_line(words)
    
    def _transform_line(self, line):
        """Return a noisy copy of `line`.

        Up to `self.max_replace` of the character positions are drawn at random and
        each one is assigned a substitution, an insertion or a deletion according
        to the thresholds computed in `__init__`. Tokens are then glued to the
        following one with probability `self.glue_prob`.
        """
        processed_line = self._pre_process_line(line)

        # Sets give O(1) membership tests and are always defined, even for an empty line.
        substitute_letters = set()
        insert_letters = set()
        delete_letters = set()

        # We get randomly the index of the modifications
        num_chars = len(''.join(processed_line))
        if num_chars:
            num_modifications = random.randint(0, int(self.max_replace*num_chars))
            index_modifications = np.random.choice(num_chars, num_modifications, replace=False)
            # We randomly assign these indices to modifications based on precalculated thresholds
            for index in index_modifications.tolist():
                draw = random.random()
                if draw < self.substitute_threshold:
                    substitute_letters.add(index)
                elif draw < self.insert_threshold:
                    insert_letters.add(index)
                else:
                    delete_letters.add(index)

        output = []
        j = 0  # running character offset of the current word within the line
        for word in processed_line:
            if word != '' and word not in string.punctuation:
                # Substitute letters
                word_ = []
                for k, letter in enumerate(word, start=j):
                    if k in substitute_letters and letter in self.substitute_dict:
                        draw = random.random()
                        for replace, prob in self.substitute_dict[letter].items():
                            if draw < prob:
                                letter = replace
                                break
                    word_.append(letter)
                word = ''.join(word_)

                # Insert random letter (any non-special symbol of ALPHABET) before the position
                word_ = []
                for k, letter in enumerate(word, start=j):
                    if k in insert_letters:
                        word_.append(ALPHABET[random.randint(4, len(ALPHABET)-1)])
                    word_.append(letter)
                word = ''.join(word_)

                # Delete random letter
                word = ''.join(letter for k, letter in enumerate(word, start=j)
                               if k not in delete_letters)

            output.append(word)
            # The offset advances by the length of the word as emitted, i.e. after edits.
            j += len(word)

        # Glue: a word stays in the current slot with probability glue_prob, so the
        # next word is appended to it without a separator.
        output_ = [""]*len(output)
        j = 0
        for word in output:
            output_[j] += word
            if random.random() > self.glue_prob:
                j += 1

        line = self._post_process_line(output_)
        return line.strip()
    
    def _pre_process_line(self, line):
        """Normalise newlines, backticks and double dashes, then tokenize with Moses."""
        line = line.replace('\n','').replace('`',"'").replace('--',' -- ')
        return moses_tokenizer(line)
        
    def _post_process_line(self, words):
        """Detokenize `words` with Moses and join the result with single spaces."""
        output = ' '.join(moses_detokenizer(words))
        return output
    
    def _match_caps(self, original, typo):
        """Give `typo` the upper-case or title-case pattern of `original`."""
        if original.isupper():
            return typo.upper()
        elif original.istitle():
            return typo.capitalize()
        else:
            return typo
    
    def __getitem__(self, idx):
        """Return `(noisy_line, clean_line)` for sample `idx`."""
        if self.data_type == 'GBW':
            # The last token of each GBW sentence is dropped (presumably the EOS marker).
            tokens = moses_detokenizer(self.gbw_corpus[idx][:-1])
            if len(tokens) > 6:
                # Keep a random contiguous span of the sentence.
                # NOTE(review): `end` may equal `start`, which yields an empty sample.
                start = random.randint(0, len(tokens)-3)
                end = random.randint(start, len(tokens))
                tokens = tokens[start:end]
            line = ' '.join(tokens)
        else:
            line = self.text[idx]
        line = self._replace_synonym(line)
        line_typo = self._transform_line(line)
        return line_typo, line

    def __len__(self):
        """Number of samples available in the underlying corpus."""
        if self.data_type == 'GBW':
            return len(self.gbw_corpus)
        else:
            return len(self.text)

In [225]:
def encode_char(text, src=True):
    """Encode `text` as a fixed-length array of character indices.

    The text is truncated to `FEATURE_LEN-2` characters. A target sequence
    (`src=False`) starts with BOS; both kinds end with EOS and are padded with
    PAD up to `FEATURE_LEN`. Characters missing from ALPHABET_INDEX keep the
    PAD value at their position.

    Returns the encoded float32 array and a one-element float32 array holding
    the number of positions used, including BOS/EOS.
    """
    clipped = text[:FEATURE_LEN-2]
    encoded = np.full(FEATURE_LEN, PAD, dtype='float32')

    offset = 0
    if not src:
        encoded[0] = BOS
        offset = 1

    for position, letter in enumerate(clipped, start=offset):
        code = ALPHABET_INDEX.get(letter)
        if code is not None:
            encoded[position] = code

    end = offset + len(clipped)
    encoded[end] = EOS
    return encoded, np.array([end+1]).astype('float32')

def encode_word(text, src=True):
    """Encode `text` as word indices wrapped in '<BOS>' ... '<EOS>'.

    Returns the list of indices and a one-element float32 array with its length.
    `src` is accepted but not used.

    NOTE(review): relies on module-level `tokenizer` and `vocab`, which are not
    defined before this cell in the visible notebook -- confirm before use.
    """
    tokens = tokenizer(text)
    indices = [vocab['<BOS>']] + vocab[tokens] + [vocab['<EOS>']]
    length = np.array([len(indices)]).astype('float32')
    return indices, length

def transform(data, label):
    """Turn a (noisy text, clean text) pair into the 6-tuple consumed by the data loaders.

    Returns `(src, src_valid_length, tgt, tgt_valid_length, data, label)`, where the
    noisy text is encoded as a source sequence and the clean text as a target one.
    """
    encoded_src = encode_char(data, src=True)
    encoded_tgt = encode_char(label, src=False)
    return encoded_src[0], encoded_src[1], encoded_tgt[0], encoded_tgt[1], data, label

def decode_char(text):
    """Decode a sequence of character indices back into a string.

    Decoding stops at the first EOS; PAD and BOS entries are skipped.
    """
    chars = []
    for val in text:
        if val == EOS:
            break
        if val != PAD and val != BOS:
            chars.append(ALPHABET[int(val)])
    return "".join(chars)


def decode_word(indices):
    """Map word indices back to a detokenized string with '<PAD>' markers removed.

    NOTE(review): relies on module-level `detokenizer` and `vocab`, which are not
    defined before this cell in the visible notebook -- confirm before use.
    """
    tokens = []
    for i in indices:
        tokens.append(vocab.idx_to_token[int(i)])
    sentence = detokenizer(tokens, return_str=True)
    return sentence.replace('<PAD>','')

In [157]:
%%time
# We get a knn index to substitute words
# (index, embedding matrix and vocabulary used by NoisyTextDataset._replace_synonym)
knn_index, knn_data, knn_vocab  = get_knn_index()


  0%|          | 0/489 [00:00<?, ?it/s][A
  0%|          | 1/489 [00:00<05:01,  1.62it/s][A
 14%|█▍        | 70/489 [00:00<00:04, 97.53it/s][A
 28%|██▊       | 139/489 [00:00<00:02, 169.22it/s][A
 43%|████▎     | 208/489 [00:00<00:01, 225.60it/s][A
 56%|█████▋    | 276/489 [00:01<00:00, 270.03it/s][A
 70%|███████   | 343/489 [00:01<00:00, 305.41it/s][A
 84%|████████▍ | 411/489 [00:01<00:00, 336.10it/s][A
 98%|█████████▊| 480/489 [00:01<00:00, 362.55it/s][A
100%|██████████| 489/489 [00:01<00:00, 365.14it/s][A

CPU times: user 45min 20s, sys: 0 ns, total: 45min 20s
Wall time: 1min 33s


We test our synonym replacer

In [173]:
# Pick a random word among the `num_words` nearest embedding neighbours of `word`.
word = "test"
num_words = 200
# Only the first `num_words` neighbours are sampled below, so that is all we query.
index_list = knn_index.knn_query(knn_data[knn_vocab[word]], k=num_words)[0][0]
knn_vocab.idx_to_token[index_list[random.randint(0,num_words-1)]]

'display'

In [226]:
# Train / test parts of the same text file (the default split keeps the first 90% for training).
# Each sample is encoded on the fly by `transform`.
dataset_train = NoisyTextDataset(text_filepath=text_filepath, glue_prob=0.2, is_train=True).transform(transform)
dataset_test = NoisyTextDataset(text_filepath=text_filepath, glue_prob=0.2, is_train=False).transform(transform)

# Finetuning on the text from the IAM dataset
# split=1.0 with is_train=True keeps every line of the file.
dataset_train_ft = NoisyTextDataset(text_filepath='dataset/typo/text_train.txt', is_train=True, split=1.0, knn_index=knn_index).transform(transform)

In [227]:
# Inspect one random training sample: (src, src_valid_length, tgt, tgt_valid_length, typo, label).
dataset_train[random.randint(0, len(dataset_train)-1)]

(array([18.,  9., 90., 26., 20.,  9., 22., 24., 12.,  9., 16.,  9., 76.,
        23., 23., 78.,  4., 43., 22.,  4., 20., 13., 16., 18.,  9.,  9.,
        30.,  9.,  8.,  4., 19., 85., 18.,  9., 78.,  4.,  5., 18.,  8.,
        50., 84.,  5., 24.,  4., 11.,  5., 18., 45., 11.,  4., 22., 19.,
        25., 18.,  8.,  9.,  8., 78.,  3.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.], dtype=float32),
 array([59.], dtype=float32),
 array([ 2., 18.,  9., 26.,  9., 22., 24., 12.,  9., 16.,  9., 23., 23.,
        78.,  4., 43., 22.,  4., 2

In [231]:
# Element 5 is the clean label. It differs between accesses of the same index
# because the synonym replacement is random.
for i in range(10):
    print(dataset_train[42][5])

Japan are now trying to release all our books 0.4 moment -In progresses
We are now trying to release all our books one month in first-place
We are nonetheless trying to release all our books one month in decline
We are now trying to release all our books one month in advance
We had now trying to release all our books one six-months in advance
We are now trying to launch all our books one month via advance
We are now trying to release Slowly our books one nights in advance
We are now trying to capture all Iranʼs books one month in advance
We are theoretically trying to release all our books one month in advance
We are now Attempting to release everybody Washingtonʼs books nobody month in advance


### Validation data: IAM dataset predictions

In [232]:
# Load the (label, modified) pairs; the context manager closes the file once parsed.
with open('dataset/typo/validating.json','r') as f:
    data = json.load(f)
# Keep only the pairs whose texts differ once surrounding whitespace is ignored.
data_ = [[label, modified] for label, modified in data if label.strip() != modified.strip()]
# transform(data, label) takes the modified text first and the label second.
val_modified = [modified for _, modified in data_]
val_labels = [label for label, _ in data_]
val_dataset_ft = gluon.data.ArrayDataset(val_modified, val_labels).transform(transform)

In [233]:
# Inspect one random encoded validation sample.
val_dataset_ft[random.randint(0, len(val_dataset_ft)-1)]

(array([ 9., 11.,  5., 22.,  8.,  4., 24., 12., 13., 23.,  4.,  5., 23.,
         4.,  5.,  4., 23.,  9., 10., 24., 25., 18., 13., 18., 11.,  4.,
        79.,  4., 25., 20.,  4., 11., 22., 19., 23., 23.,  4.,  5., 18.,
         8.,  3.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.], dtype=float32),
 array([41.], dtype=float32),
 array([ 2., 48.,  9., 11.,  5., 22.,  8.,  4., 24., 12., 13., 23.,  4.,
         5., 23.,  4.,  5.,  4., 2

### Training on GBW

In [234]:
# Stream over the 'train' segment of the GBW data: empty samples skipped, no BOS token, '<EOS>' as EOS token.
gbw_stream = nlp.data.GBWStream(segment='train', skip_empty=True, bos=None, eos='<EOS>')

In [235]:
# Build a dataset from the first corpus of the stream only (the loop breaks immediately);
# used below to preview a sample. Note that this also leaves `e` and `corpus` defined globally.
for e, corpus in enumerate(gbw_stream):
    dataset_gbw = NoisyTextDataset(gbw_corpus=corpus, data_type='GBW').transform(transform)
    break

In [239]:
# Inspect one encoded GBW sample.
dataset_gbw[7]

(array([12.,  9., 16., 75.,  8.,  4., 13., 18.,  4., 24., 67., 12.,  9.,
         4., 12., 19., 23., 20., 13., 24.,  5., 16.,  4., 22., 27., 13.,
        11.,  4., 29.,  5., 25.,  9., 79., 12., 13., 51., 11., 12.,  4.,
        66., 31., 16.,  6.,  5., 18., 14., 29.,  4., 14.,  5., 13., 16.,
        78.,  3.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.], dtype=float32),
 array([54.], dtype=float32),
 array([ 2., 12.,  9., 16.,  8.,  4., 13., 18.,  4., 24., 12.,  9.,  4.,
        12., 19., 23., 20., 13., 2

#### DataLoaders

In [240]:
def batchify_list(elem):
    """Batchify function for non-tensor fields: return the samples as a plain list.

    Parameters
    ----------
    elem : iterable
        The values of one field for every sample of the batch.

    Returns
    -------
    list
        One entry per sample, in the same order. (The previous implementation
        appended `elem` itself on every iteration, returning N copies of the
        whole batch instead of the N samples.)
    """
    return list(elem)
    
# One batchify function per field of the sample tuple
# (src, src_valid_length, tgt, tgt_valid_length, typo, label).
batchify = Tuple(Stack(), Stack(), Stack(), Stack(), batchify_list, batchify_list)
# Variant that pads the third field instead of stacking it; not used by the loaders in this notebook.
batchify_word = Tuple(Stack(), Stack(), Pad(), Stack(), batchify_list, batchify_list)

In [244]:
# NOTE(review): the worker count is hard-coded to 5 here although NUM_WORKERS = 8 is defined above.
train_data = gluon.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, last_batch='rollover', batchify_fn=batchify, num_workers=5)
test_data = gluon.data.DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=True, last_batch='keep', batchify_fn=batchify, num_workers=5)
# Validation pairs loaded from validating.json and fine-tuning text.
val_data_ft = gluon.data.DataLoader(val_dataset_ft, batch_size=BATCH_SIZE, shuffle=True, last_batch='keep', batchify_fn=batchify, num_workers=0)
train_data_ft = gluon.data.DataLoader(dataset_train_ft, batch_size=BATCH_SIZE, shuffle=True, last_batch='rollover', batchify_fn=batchify, num_workers=5)

## Helper functions for training

In [245]:
def evaluate(net, iterator):
    """Return the mean per-batch loss of `net` over `iterator` and print one example.

    Uses the module-level `ctx` and `loss_function_test`. After the loop, the first
    sample of the last batch is printed as typo / beam-search prediction / target.
    """
    running_loss = 0
    for batch_idx, batch in enumerate(iterator):
        src, src_valid_length, tgt, tgt_valid_length, typo, label = batch
        src, tgt = src.as_in_context(ctx), tgt.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx).squeeze()
        tgt_valid_length = tgt_valid_length.as_in_context(ctx).squeeze()

        # The decoder is fed the target without its last position and is scored
        # against the target shifted by one position.
        decoder_input, decoder_target = tgt[:,:-1], tgt[:,1:]
        output = net(src, decoder_input, src_valid_length, tgt_valid_length-1)
        # NOTE(review): the loss receives tgt_valid_length while the network gets
        # tgt_valid_length-1 -- confirm the extra position is intended.
        batch_loss = loss_function_test(output, decoder_target, tgt_valid_length).mean()
        running_loss += batch_loss.asscalar()

    typo_text = decode_char(src[0].asnumpy())
    print("[Test Typo     ] {}".format(typo_text))
    print("[Test Predicted] {}".format(get_sentence(net, typo_text)))
    print("[Test Correct  ] {}".format(decode_char(tgt[0].asnumpy())))
    return running_loss / (batch_idx+1)

In [246]:
def run_epoch(net, epoch, train_iterator, test_iterator, trainer):
    """Train `net` for one pass over `train_iterator` and return the final test loss.

    Parameters
    ----------
    net : the network being trained.
    epoch : int
        Index of the current epoch; used for the logging steps and messages.
    train_iterator : iterable of training batches (must support `len`).
    test_iterator : iterable of batches passed to `evaluate`.
    trainer : gluon.Trainer updating the parameters of `net`.

    Uses the module-level `ctx`, `label_smoothing`, `loss_function`,
    `send_every_n`, `sw` (summary writer) and `key` (tag of the logged curves).

    Returns
    -------
    float
        Loss returned by `evaluate(net, test_iterator)` at the end of the epoch.
    """
    loss = 0.
    for i, (src, src_valid_length, tgt, tgt_valid_length, typo, label) in enumerate(train_iterator):
        src = src.as_in_context(ctx)
        tgt = tgt.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx).squeeze()
        tgt_valid_length = tgt_valid_length.as_in_context(ctx).squeeze()
        
        with autograd.record():
            # The decoder is fed the target without its last position and is
            # trained against the target shifted by one position.
            output = net(src, tgt[:,:-1], src_valid_length, tgt_valid_length-1)
            smoothed_label = label_smoothing(tgt[:,1:])
            ls = loss_function(output, smoothed_label, tgt_valid_length).mean()
        
        ls.backward()
        trainer.step(src.shape[0])
        loss += ls.asscalar()
        
        if i % send_every_n == 0:
            val_loss = evaluate(net, test_iterator)
            # Use the `epoch` argument rather than whatever global `e` happens to hold.
            global_step = i+epoch*len(train_iterator)
            sw.add_scalar(tag='Val_Loss_it', value={key:val_loss}, global_step=global_step)
            sw.add_scalar(tag='Train_Loss_it', value={key:loss/(i+1)}, global_step=global_step)
            print("[Iteration {} Train] {}".format(i, loss / (i+1)))
            print("[Iteration {} Test ] {}".format(i, val_loss))
            print("[Train Typo        ] {}".format(decode_char(src[0].asnumpy())))
            print("[Train Predicted   ] {}".format(decode_char(output[0].asnumpy().argmax(axis=1))))
            print("[Train Correct     ] {}".format(decode_char(tgt[0].asnumpy())))
            print()
            sw.flush()

    test_loss = evaluate(net, test_iterator)
    print("Epoch [{}], Train Loss {:.4f}, Test Loss {:.4f}".format(epoch, loss/(i+1), test_loss))
    sw.add_scalar(tag='Train_Loss', value={key:loss/(i+1)}, global_step=epoch)
    sw.add_scalar(tag='Test_Loss', value={key:test_loss}, global_step=epoch)
    print()
    return test_loss

In [247]:
def get_sentence(net, sentence):
    """Denoise `sentence` with `net` using beam search and return the decoded string.

    The sentence is encoded as a source sequence, run through the encoder, and
    decoded with a beam of size 5 starting from BOS until EOS or 150 steps.
    """
    scorer = nlp.model.BeamSearchScorer(alpha=0, K=2, from_logits=False)
    beam_sampler = nlp.model.BeamSearchSampler(beam_size=5,
                                           decoder=net.decode_logprob,
                                           eos_id=EOS,
                                           scorer=scorer,
                                           max_length=150)
    src_seq, src_valid_length = encode_char(sentence)
    # Wrap in a list to add a batch dimension of size 1.
    src_seq = mx.nd.array([src_seq], ctx=ctx)
    src_valid_length = mx.nd.array(src_valid_length, ctx=ctx)
    encoder_outputs, _ = net.encode(src_seq, valid_length=src_valid_length)
    states = net.decoder.init_state_from_encoder(encoder_outputs, 
                                                      encoder_valid_length=src_valid_length)
    # Decoding starts from a single BOS token.
    inputs = mx.nd.full(shape=(1,), ctx=src_seq.context, dtype=np.float32, val=BOS)
    samples, scores, valid_lengths = beam_sampler(inputs, states)
    # Index 0 selects the only batch element.
    samples = samples[0].asnumpy()
    # `scores` and `valid_lengths` are converted but not used afterwards.
    scores = scores[0].asnumpy()
    valid_lengths = valid_lengths[0].asnumpy()
    # First beam of the batch element; presumably the best-scoring one -- TODO confirm ordering.
    return decode_char(samples[0])

## Network

In [248]:
# Model size, passed to Denoiser below.
num_heads = 16
embed_size = 256
num_layers = 2

epochs = 5
# Tag of the logged curves and base name of the checkpoint file.
key = 'language_denoising'
# Start from a very large value so the first evaluated loss always counts as an improvement.
best_test_loss = 10e20

# NOTE(review): not referenced by the Trainer cell below, which hard-codes its own learning rate.
learning_rate = 0.00004
# Evaluate and log every `send_every_n` training iterations.
send_every_n = 50

In [249]:
# Reset the best loss seen so far (re-run this cell to restart checkpoint selection).
best_test_loss = 10e20

In [250]:
log_dir = './logs/text_denoising'
checkpoint_dir = "model_checkpoint"
# Built once from the current value of `key`; reassigning `key` in later cells
# does not change the checkpoint file name.
checkpoint_name = key+".params"
# mxboard writer used by run_epoch to log the loss curves.
sw = mxboard.SummaryWriter(logdir=log_dir, flush_secs=1)

Creating network

In [251]:
# Character-level denoiser; source and target are both limited to FEATURE_LEN positions.
net = Denoiser(alphabet_size=len(ALPHABET), max_src_length=FEATURE_LEN, max_tgt_length=FEATURE_LEN, num_heads=num_heads, embed_size=embed_size, num_layers=num_layers)
net.initialize(mx.init.Xavier(), ctx)

Preparing the loss

In [252]:
output_dim = len(ALPHABET)
# Training uses smoothed (dense) labels; evaluation uses the raw (sparse) character indices.
label_smoothing = LabelSmoothing(epsilon=0.002, units=output_dim)
loss_function_test = SoftmaxCEMaskedLoss(sparse_label=True)
loss_function = SoftmaxCEMaskedLoss(sparse_label=False)

In [256]:
# Resume from the training checkpoint when it exists and use its validation loss as the baseline.
if (os.path.isfile(os.path.join(checkpoint_dir, checkpoint_name))):
    net.load_parameters(os.path.join(checkpoint_dir, checkpoint_name), ctx=ctx)    
    print("Loaded parameters")
    best_test_loss = evaluate(net, val_data_ft)
    print(best_test_loss)

Loaded parameters
[Test Typo     ] his #prescriptio pad, and
[Test Predicted] his prescription paid, and
[Test Correct  ] his # prescription pad, and
0.08434206157922745


In [254]:
# Load a saved model when present. This replaces any parameters loaded before
# and resets the baseline loss.
model_path = 'models/denoiser2.params'
if (os.path.isfile(model_path)):
    net.load_parameters(model_path, ctx=ctx)    
    print("Loaded parameters")
    best_test_loss = evaluate(net, val_data_ft)
    print(best_test_loss)

Loaded parameters
[Test Typo     ] nothing to tell me yet. But well be meeting
[Test Predicted] nothing to tell me yet.  But well be meeting
[Test Correct  ] nothing to tell me, yet. But we'll be meeting
0.09212780237197876


In [261]:
# Adam optimizer over all parameters of the network.
# NOTE(review): the rate is hard-coded here; the `learning_rate` variable defined earlier is not used.
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.00001})

## Training the network

Training on the public novel dataset

In [None]:
# Train on the novel dataset and keep the parameters with the lowest validation loss.
key = 'language_denoising'
for e in range(epochs):
    test_loss = run_epoch(net, e, train_data, val_data_ft, trainer)
    if test_loss < best_test_loss:
        print("Saving network, previous best test loss {:.6f}, current test loss {:.6f}".format(best_test_loss, test_loss))
        # The model trained above is `net`; no `denoiser` object is defined in this notebook.
        net.save_parameters(os.path.join(checkpoint_dir, checkpoint_name))
        best_test_loss = test_loss

Training on the GBW dataset

In [None]:
# One "epoch" per corpus yielded by the GBW stream: a new dataset and loader are built
# for each corpus and the best parameters on the validation set are kept.
key = 'language_denoising_gbw'
for e, corpus in enumerate(gbw_stream):
    dataset_gbw = NoisyTextDataset(gbw_corpus=corpus, data_type='GBW').transform(transform)
    train_data_gbw = gluon.data.DataLoader(dataset_gbw, batch_size=BATCH_SIZE, shuffle=True, last_batch='discard', batchify_fn=batchify, num_workers=5)
    test_loss = run_epoch(net, e, train_data_gbw, val_data_ft, trainer)
    if test_loss < best_test_loss:
        print("Saving network, previous best test loss {:.6f}, current test loss {:.6f}".format(best_test_loss, test_loss))
        net.save_parameters(os.path.join(checkpoint_dir, checkpoint_name))
        best_test_loss = test_loss

Fine-tuning on the text of the IAM training dataset

In [262]:
# Fine-tune on the IAM training text and keep the parameters with the lowest validation loss.
key = 'language_denoising_ft'
for e in range(epochs):
    test_loss = run_epoch(net, e, train_data_ft, val_data_ft, trainer)
    if test_loss < best_test_loss:
        print("Saving network, previous best test loss {:.6f}, current test loss {:.6f}".format(best_test_loss, test_loss))
        net.save_parameters(os.path.join(checkpoint_dir, checkpoint_name))
        best_test_loss = test_loss

[Test Typo     ] Company's Regulations." Of course not," agreed


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] Company's Regulations. "Of course not," agreed
[Test Correct  ] Company's Regulations." "Of course not," agreed
[Iteration 0 Train] 0.03644362464547157
[Iteration 0 Test ] 0.12242870777845383
[Train Typo        ] featherweight cont1est betwoen Chris Elliot -- but
[Train Predicted   ] featherwweight contest between Chris Elliot --but
[Train Correct     ] feather-weight contest between Chris Elliot --but

[Test Typo     ] world. You must help to lead our force.' The long


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] world. You must help to lead our force. 'The long
[Test Correct  ] World. You must help to lead our force.' The long
[Iteration 50 Train] 0.046560638237233255
[Iteration 50 Test ] 0.1202925142645836
[Train Typo        ] oft tke exac oLition of F on hhc side
[Train Predicted   ] off the exact polition of F on the side
[Train Correct     ] off the exact position of F on the side

[Test Typo     ] it dies when it changes into a tigerfly.' 'You are stil


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] it is when it changes into a tigerfly. '' 'You are still
[Test Correct  ] it dies when it changes into a tigerfly.' 'You are still
[Iteration 100 Train] 0.04639535585399902
[Iteration 100 Test ] 0.12085757941007615
[Train Typo        ] my job is to build upthe U.S. apparatus which
[Train Predicted   ] my job is to build up the U.S. apparatus which
[Train Correct     ] my job is to build up the U.S. apparatus which

[Test Typo     ] must present an unnffled appearance and carry onas


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] must content an unfunfled appearance and carry on as
[Test Correct  ] must present an unruffled appearance and carry on as
Epoch [0], Train Loss 0.0463, Test Loss 0.1215

[Test Typo     ] Camier knows what's going on. If he deesnit mind,


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] Can Cort knows what's going on. If he doesnt mind,
[Test Correct  ] Courier knows what's going on. If he doesn't mind,
[Iteration 0 Train] 0.0453956238925457
[Iteration 0 Test ] 0.12113679528236389
[Train Typo        ] the hnborn cild days -- as Amuch an to the
[Train Predicted   ] the unborn child days--as much as to the
[Train Correct     ] the unborn child days--as much as to the

[Test Typo     ] A light woind cafted the smoke of diesel exhaust in


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] At A light wound crafted the smoke of diesel exhaust in
[Test Correct  ] A light wind wafted the smoke of diesel exhaust in
[Iteration 50 Train] 0.045796049941404196
[Iteration 50 Test ] 0.1197916954755783
[Train Typo        ] jointhe Cabiuet ( 4p]resmbly feeling tlal a moosteche miyht enhance
[Train Predicted   ] join the Cabinet ( presumably feeling that a moostache might enhance
[Train Correct     ] join the Cabinet ( presumably feeling that a moustache might enhance

[Test Typo     ] have married a parson." Ite Rissed her. "Parsons


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] have married a parson. "Its Rissed her." Parsons
[Test Correct  ] have married a parson." He kissed her. "Parsons
[Iteration 100 Train] 0.04675204155616241
[Iteration 100 Test ] 0.11972822308540344
[Train Typo        ] swamped by inflated labour. Our industrKy, he sacd, would
[Train Predicted   ] swamped by inflated labour. Our industry, he said, would
[Train Correct     ] swamped by inflated labour. Our industry, he said, would

[Test Typo     ] The conductor wwuitched on the lights.


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] The conductor wretched on the lights.
[Test Correct  ] The conductor switched on the lights.
Epoch [1], Train Loss 0.0467, Test Loss 0.1190

[Test Typo     ] sally and of caurse Mrs Saptinus, for surely


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] salt sand of course Mrs Saptinus, for surely
[Test Correct  ] Sally and of course Mrs Septimus, for surely
[Iteration 0 Train] 0.05174433812499046
[Iteration 0 Test ] 0.11917565047740936
[Train Typo        ] atlg tvue. Many people wake p gruny of a)marninVnd Scat
[Train Predicted   ] aaltly true. Many people make ap grunp  of a marning and Scort
[Train Correct     ] partly true. Many people wake up grumpy of a morning and Scant

[Test Typo     ] Bawley?" "A pressman is always on the job""


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] Bawley? "" A pressman is always on the job "
[Test Correct  ] Bawley?" "A pressman is always on the job."
[Iteration 50 Train] 0.04474026782839906
[Iteration 50 Test ] 0.11612725615501404
[Train Typo        ] wrohe, 'is arrived, Who is a excitepneoure. Vosuvius
[Train Predicted   ] wrote, 'is arrived, Who is a exciteng nesource. Vosuvius
[Train Correct     ] wrote, 'is arrived, Who is a exciting resource. Vesuvius

[Test Typo     ] foud Fueno Buck, now on th


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] found Fueno Buck, now on the
[Test Correct  ] found Bueno Buck, now on the
[Iteration 100 Train] 0.04555345547966438
[Iteration 100 Test ] 0.11408302396535873
[Train Typo        ] The lat-er do nrt renponsubi@lities thems@elves as epert
[Train Predicted   ] The lateer do not responsibilities themselves as expert
[Train Correct     ] The latter do not responsibilities themselves as expert

[Test Typo     ] mearosis and not insome eruption from those


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] means and not in some eruption from those
[Test Correct  ] neurosis and not in some eruption from those
Epoch [2], Train Loss 0.0456, Test Loss 0.1140

[Test Typo     ] lrisses on the way back to the hakel.


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] tribes on the way back to the hotel.
[Test Correct  ] kisses on the way back to the hotel.
[Iteration 0 Train] 0.05650048330426216
[Iteration 0 Test ] 0.1137312388420105
[Train Typo        ] day-trip whil 6he whole tramsactionwas
[Train Predicted   ] day-trip while the whole transaction was
[Train Correct     ] day-trip while the whole transaction was

[Test Typo     ] to help her control her feeling.. " Come and


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] to help her control her feeling.. "Come and
[Test Correct  ] to help her control her feelings. "Come and
[Iteration 50 Train] 0.04672745067407103
[Iteration 50 Test ] 0.11305722117424011
[Train Typo        ] oaalle, and anrsH af The p8aper was yivon up
[Train Predicted   ] cablle, and a  aost of The paper was given up
[Train Correct     ] Gaulle, and an rest of The paper was given up

[Test Typo     ] should she tie? Beside, it wasu't fjist


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] should she tie? Besides, it wasn't first
[Test Correct  ] should she lie? Besides, it wasn't just
[Iteration 100 Train] 0.047202259154603035
[Iteration 100 Test ] 0.11213374257087708
[Train Typo        ] I wt lhese, in eve aster orbut ,when hal
[Train Predicted   ] I was there, in ever master or ut, when hhat
[Train Correct     ] I was there, in even faster orbit, when that

[Test Typo     ] ts get sut her compact and sow Grace's


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] to get out her compact and sow Grace's
[Test Correct  ] to get out her compact and saw Grace's
Epoch [3], Train Loss 0.0470, Test Loss 0.1119

[Test Typo     ] too. They're both here." "Idich't know she was


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] to. They're both here. "" I didn't know she was
[Test Correct  ] too. They're both here." "I didn't know she was
[Iteration 0 Train] 0.04367374628782272
[Iteration 0 Test ] 0.1116147756576538
[Train Typo        ] Tho jokcs weve a shufla f tke predintanle od
[Train Predicted   ] The jokes were a sufuuffla of the predictable odd
[Train Correct     ] The jokes were a reshuffle of the predictable old

[Test Typo     ] Around thast rauneled roely promuontory where the


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] Around that reuncled rarely prominatory where the
[Test Correct  ] Around that rounded rocky promontory where the
[Iteration 50 Train] 0.04615194299349598
[Iteration 50 Test ] 0.11349420785903931
[Train Typo        ] Saying a largFe blondyouth ot ?quite
[Train Predicted   ] Saying a large blond youth of quite
[Train Correct     ] Saying a large blond youth of quite

[Test Typo     ] Salits. "Prisl agin and call each other wallahs,


INFO:mxboard.event_file_writer:wrote 1 event to disk
INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] Salis. "Prise again and call each other wallas,
[Test Correct  ] habits. 'Drink gin and call each other wallahs,
[Iteration 100 Train] 0.045559374412687696
[Iteration 100 Test ] 0.11508634567260742
[Train Typo        ] crowd anc drove back to Vence.
[Train Predicted   ] crowd and drove back to Vence.
[Train Correct     ] crowd and drove back to Vence.

[Test Typo     ] minging of a doorbell was to him


INFO:mxboard.event_file_writer:wrote 1 event to disk


[Test Predicted] mingling of a doorbell was to him
[Test Correct  ] ringing of a doorbell was to him
Epoch [4], Train Loss 0.0456, Test Loss 0.1161



INFO:mxboard.event_file_writer:wrote 1 event to disk


## Manual Testing

In [None]:
# Example input with a deliberate typo ("eror").
sentence = "This sentence contains an eror"

In [None]:
# Run the denoiser on the example sentence (beam search decoding).
get_sentence(net, sentence)

## Appendix (maybe useful later)

#### Create text file with all vocab words

In [65]:
# Write every token of the pretrained language model's vocabulary to a text file, one per line.
model, vocab = nlp.model.big_rnn_lm_2048_512(dataset_name='gbw', pretrained=True, ctx=mx.cpu())
vocab_ = '\n'.join(vocab.idx_to_token)
# NOTE(review): no explicit encoding is given, so the platform default is used -- confirm it
# can represent every token.
with open('dataset/typo/vocab.txt', 'w') as f:
    f.write(vocab_)

#### Create KNN lookup for words