In [1]:
import numpy as np
import mxnet as mx
import difflib

from handwriting_line_recognition import Network as BiLSTMNetwork
from utils.noisy_forms_dataset import Noisy_forms_dataset
from utils.ngram_dataset import Ngram_dataset
from utils.iam_dataset import resize_image

In [2]:
line_image_size = (30, 400)
def handwriting_recognition_transform(image):
    """Prepare a line image for the handwriting recognition network.

    The image is resized with ``resize_image`` to ``line_image_size``,
    converted to an MXNet NDArray scaled by 1/255, shifted and scaled by two
    fixed constants, copied to ``ctx`` and given two extra leading axes.

    Parameters
    ----------
    image
        Image accepted by ``resize_image`` (presumably a 2-D grayscale
        array -- TODO confirm against ``utils.iam_dataset``).

    Returns
    -------
    mx.nd.NDArray
        The normalised image on ``ctx`` with two axes of size 1 prepended.
    """
    # resize_image returns a pair; only the first element is used here.
    resized, _ = resize_image(image, line_image_size)
    scaled = mx.nd.array(resized) / 255.
    # NOTE(review): the constants look like a dataset mean / std -- confirm.
    normalised = (scaled - 0.942532484060557) / 0.15926149044640417
    on_device = normalised.as_in_context(ctx)
    # Two expand_dims(0) calls prepend two singleton axes.
    return on_device.expand_dims(0).expand_dims(0)

def get_ns(train):
    """Build the ``OCR_noise2`` noisy-forms dataset driven by the line recogniser.

    A ``BiLSTMNetwork`` is created on ``ctx``, its weights are loaded from
    ``models/handwriting_line_recognition2.params``, and the network is
    wrapped in a transform handed to ``Noisy_forms_dataset``.

    Parameters
    ----------
    train
        Forwarded unchanged as the ``train`` argument of
        ``Noisy_forms_dataset``.

    Returns
    -------
    Noisy_forms_dataset
        Dataset whose noise source is the network's softmax output.
    """
    recogniser = BiLSTMNetwork(rnn_hidden_states=128, rnn_layers=2, ctx=ctx)
    recogniser.load_params("models/handwriting_line_recognition2.params", ctx=ctx)

    def noise_source_transform(image, text):
        # `text` is part of the callback signature but is not used here.
        net_input = handwriting_recognition_transform(image)
        return recogniser(net_input).softmax().asnumpy()

    return Noisy_forms_dataset(noise_source_transform, train=train, name="OCR_noise2")

In [3]:
# Device context read as a global by handwriting_recognition_transform and
# get_ns (network construction, parameter loading, input tensors): GPU 0.
ctx = mx.gpu(0)

In [4]:
# Noisy-forms dataset built with train=True, then wrapped in an Ngram_dataset
# with n=5 and output_type="word". Later cells unpack each item as a 4-tuple
# whose last two entries are (noisy, actual).
# NOTE(review): "word_5train" is presumably a cache/name key -- confirm in utils.ngram_dataset.
train_ns = get_ns(train=True)
ng_train_ds = Ngram_dataset(train_ns, "word_5train", output_type="word", n=5)



In [14]:
# Tally single-character edits between the noisy and the actual text of every
# sample, using difflib.ndiff. Only samples with exactly one or exactly two
# changed entries are counted; anything more complex is ignored.
insertions = []
deletions = []
substitutions = []

for sample_idx in range(len(ng_train_ds)):
    _, _, noisy, actual = ng_train_ds[sample_idx]

    # Keep only the ndiff entries flagged "+" (only in `actual`) or
    # "-" (only in `noisy`).
    changed = [entry for entry in difflib.ndiff(noisy, actual)
               if entry[0] in ("+", "-")]

    if len(changed) == 1:
        tag, char = changed[0][0], changed[0][-1]
        if tag == "+":
            insertions.append(char)
        elif tag == "-":
            deletions.append(char)
    elif len(changed) == 2:
        first, second = changed
        # One "+" and one "-" entry, in either order, is treated as a
        # substitution and recorded in both directions.
        if {first[0], second[0]} == {"+", "-"}:
            substitutions.append((first[-1], second[-1]))
            substitutions.append((second[-1], first[-1]))

In [None]:
# Alternative edit tally based on difflib.SequenceMatcher opcodes.
# For each (noisy, actual) pair:
#   * 'insert'  -> every element of the `actual` span is an insertion
#   * 'delete'  -> every element of the `noisy` span is a deletion
#   * 'replace' -> equal-length spans are paired element-wise as
#                  (noisy_char, actual_char) substitutions; unequal-length
#                  replacements are skipped
#   * 'equal'   -> carries no edit information and is ignored
# NOTE(review): unlike the ndiff-based cell, substitutions are recorded in one
# direction only (noisy -> actual) -- confirm which convention is intended.
insertions = []
deletions = []
substitutions = []

for i in range(len(ng_train_ds)):
    _, _, noisy, actual = ng_train_ds[i]
    seqm = difflib.SequenceMatcher(None, noisy, actual)
    for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
        if opcode == 'insert':
            insertions.extend(seqm.b[b0:b1])
        elif opcode == 'delete':
            deletions.extend(seqm.a[a0:a1])
        elif opcode == 'replace':
            # seqm.a[a0:a1] -> seqm.b[b0:b1]
            removed, added = seqm.a[a0:a1], seqm.b[b0:b1]
            if len(removed) == len(added):
                substitutions.extend(zip(removed, added))

In [15]:
# Count how many times each character occurs in `insertions`.
insertion_dict = {}
for inserted_char in insertions:
    insertion_dict[inserted_char] = insertion_dict.get(inserted_char, 0) + 1

# One cost per code point 0..127. Characters never seen keep cost 1; a seen
# character gets the reciprocal of its count.
# NOTE(review): a character with ord() >= 128 would raise IndexError here.
insertion_costs = np.ones(128, dtype=np.float64)
for inserted_char, count in insertion_dict.items():
    insertion_costs[ord(inserted_char)] = 1 / count
print(insertion_costs)
np.savetxt("models/insertion_costs.txt", insertion_costs, fmt='%4.6f')

[ 1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          0.05        1.          0.5
  1.          1.          0.04        1.          0.01785714  1.          1.
  0.33333333  1.          1.          1.          1.          1.          1.
  1.          0.5         0.5         1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  0.5         1.          1.          1.          1.          1.          0.5
  1.          1.          1.          1.          1.          1.          

In [16]:
# Count how many times each character occurs in `deletions`.
deletion_dict = {}
for deleted_char in deletions:
    deletion_dict[deleted_char] = deletion_dict.get(deleted_char, 0) + 1
print(deletion_dict)

# One cost per code point 0..127. Characters never seen keep cost 1; a seen
# character gets the reciprocal of its count.
# NOTE(review): a character with ord() >= 128 would raise IndexError here.
deletion_costs = np.ones(128, dtype=np.float64)
for deleted_char, count in deletion_dict.items():
    deletion_costs[ord(deleted_char)] = 1 / count
print(deletion_costs)
np.savetxt("models/deletion_costs.txt", deletion_costs, fmt='%4.6f')

{'o': 47, 'n': 65, 't': 53, 'i': 53, 'e': 44, 'u': 30, 'r': 50, 'g': 3, ';': 13, 'v': 4, 'w': 10, 'l': 33, 's': 45, 'c': 9, 'h': 7, 'd': 5, 'f': 9, '.': 5, 'y': 4, 'a': 13, 'm': 17, 'q': 2, 'M': 4, 'k': 1, 'p': 5, "'": 2, 'C': 1, '&': 1, 'L': 1, '3': 2, ',': 1, 'B': 1}
[ 1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          1.          1.          1.
  1.          1.          1.          1.          0.5         1.          1.
  1.          1.          1.          1.          0.2         1.          1.
  1.          1.          0.5         1.          1.          1.          1.
  1.          1.          1.          0.07692308  1.          1.          1.
  1.          1.          1.         

In [17]:
# Count how many times each (char, char) pair occurs in `substitutions`.
substitution_dict = {}
for pair in substitutions:
    substitution_dict[pair] = substitution_dict.get(pair, 0) + 1
print(substitution_dict)

# 128 x 128 cost table indexed by the code points of the pair. Pairs never
# seen keep cost 1; a seen pair gets the reciprocal of its count.
# NOTE(review): a character with ord() >= 128 would raise IndexError here.
substitute_costs = np.ones((128, 128), dtype=np.float64)
for (first_char, second_char), count in substitution_dict.items():
    substitute_costs[ord(first_char), ord(second_char)] = 1 / count
print(substitute_costs)
np.savetxt("models/substitute_costs.txt", substitute_costs, fmt='%4.6f')

{('o', 'a'): 278, ('a', 'o'): 278, ('r', 'e'): 6, ('e', 'r'): 6, ('n', 'm'): 38, ('m', 'n'): 38, ('c', 'r'): 7, ('r', 'c'): 7, ('g', 'f'): 4, ('f', 'g'): 4, ('c', 's'): 10, ('s', 'c'): 10, ('l', 'J'): 4, ('J', 'l'): 4, ('s', 'e'): 20, ('e', 's'): 20, ('U', 'L'): 1, ('L', 'U'): 1, ('u', 'e'): 19, ('e', 'u'): 19, ('o', 'e'): 94, ('e', 'o'): 94, ('n', 'l'): 3, ('l', 'n'): 3, ('t', 'f'): 32, ('f', 't'): 32, ('r', 'v'): 39, ('v', 'r'): 39, ('r', 'i'): 7, ('i', 'r'): 7, ('l', 'k'): 17, ('k', 'l'): 17, ('s', 'r'): 69, ('r', 's'): 69, ('a', 'e'): 129, ('e', 'a'): 129, ('n', 'a'): 19, ('a', 'n'): 19, ('y', 'f'): 1, ('f', 'y'): 1, ('i', 'e'): 7, ('e', 'i'): 7, ('o', 'c'): 34, ('c', 'o'): 34, ('W', 'K'): 1, ('K', 'W'): 1, ('r', 't'): 18, ('t', 'r'): 18, ('M', 'U'): 4, ('U', 'M'): 4, ('s', 'x'): 10, ('x', 's'): 10, ('n', 't'): 1, ('t', 'n'): 1, ('1', 'H'): 1, ('H', '1'): 1, ('e', 'l'): 12, ('l', 'e'): 12, ('d', 't'): 11, ('t', 'd'): 11, ('d', 'c'): 9, ('c', 'd'): 9, ('w', 'm'): 14, ('m', 'w'): 14,