# Model Distance between characters

In [1]:
import numpy as np
import mxnet as mx
import difflib

from handwriting_line_recognition import Network as BiLSTMNetwork
from utils.noisy_forms_dataset import Noisy_forms_dataset
from utils.ngram_dataset import Ngram_dataset
from utils.iam_dataset import resize_image

## Decode noisy forms

We want to find which characters are most likely to be confused with each other, so that we can build a distance model between them.

In [4]:
line_image_size = (60, 800)
def handwriting_recognition_transform(image):
    """Prepare a line image for the handwriting recognition network.

    The image is resized with ``resize_image`` to ``line_image_size``,
    divided by 255, shifted and scaled by two fixed constants, moved to
    ``ctx`` and finally given two extra leading axes.

    Returns the transformed ``mx.nd`` array.
    """
    # NOTE(review): presumably the mean and standard deviation of the
    # training pixels -- confirm against the model's training code.
    offset = 0.942532484060557
    scale = 0.15926149044640417

    resized, _ = resize_image(image, line_image_size)
    scaled = mx.nd.array(resized)/255.
    normalised = (scaled - offset) / scale
    on_device = normalised.as_in_context(ctx)
    # Two expand_dims(0) calls prepend two singleton axes.
    return on_device.expand_dims(0).expand_dims(0)

def get_ns(train):
    """Build a ``Noisy_forms_dataset`` whose noise comes from the line recogniser.

    A ``BiLSTMNetwork`` is created on ``ctx`` and its parameters are loaded
    from ``models/handwriting_line_sl_160_a_512_o_2.params``. The dataset is
    given a transform that runs each image through that network and returns
    the softmax of the network output as a NumPy array.

    Args:
        train: forwarded unchanged as the ``train`` argument of
            ``Noisy_forms_dataset``.

    Returns:
        The constructed ``Noisy_forms_dataset`` (named ``"OCR_noise2"``).
    """
    recogniser = BiLSTMNetwork(rnn_hidden_states=512, rnn_layers=2, max_seq_len=160, ctx=ctx)
    recogniser.load_params("models/handwriting_line_sl_160_a_512_o_2.params", ctx=ctx)

    def noise_source_transform(image, text):
        # ``text`` is accepted by the signature but is not used here.
        network_input = handwriting_recognition_transform(image)
        return recogniser(network_input).softmax().asnumpy()

    return Noisy_forms_dataset(noise_source_transform, train=train, name="OCR_noise2")

In [5]:
# Use the first GPU when MXNet reports at least one; otherwise fall back to
# the CPU so the notebook still runs on machines without a GPU.
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()

In [6]:
# Build the noisy-forms dataset with train=True; get_ns loads the recognition
# network and attaches it as the dataset's noise source.
train_ns = get_ns(train=True)
# Wrap the noisy dataset in an Ngram_dataset configured with
# output_type="word" and n=5 under the name "word_5train".
ng_train_ds = Ngram_dataset(train_ns, "word_5train", output_type="word", n=5)

51/967
52/967
53/967
54/967
55/967
56/967
57/967
58/967
59/967
60/967
61/967
62/967
63/967
64/967
65/967
66/967
67/967
68/967
69/967
70/967
71/967
72/967
73/967
74/967
75/967
76/967
77/967
78/967
79/967
80/967
81/967
82/967
83/967
84/967
85/967
86/967
87/967
88/967
89/967
90/967
91/967
92/967
93/967
94/967
95/967
96/967
97/967
98/967
99/967
100/967
101/967
102/967
103/967
104/967
105/967
106/967
107/967
108/967
109/967
110/967
111/967
112/967
113/967
114/967
115/967
116/967
117/967
118/967
119/967
120/967
121/967
122/967
123/967
124/967
125/967
126/967
127/967
128/967
129/967
130/967
131/967
132/967
133/967
134/967
135/967
136/967
137/967
138/967
139/967
140/967
141/967
142/967
143/967
144/967
145/967
146/967
147/967
148/967
149/967
150/967
151/967
152/967
153/967
154/967
155/967
156/967
157/967
158/967
159/967
160/967
161/967
162/967
163/967
164/967
165/967
166/967
167/967
168/967
169/967
170/967
171/967
172/967
173/967
174/967
175/967
176/967
177/967
178/967
179/967
180/967
181/967
1

#### Using ndiff to diff the expected result and the predicted results

In [9]:
# Collect isolated single-character edits between the noisy text and the
# expected text, using difflib.ndiff.
insertions, deletions, substitutions = [], [], []

for idx in range(len(ng_train_ds)):
    _, _, noisy, actual = ng_train_ds[idx]
    # Keep only the ndiff entries that mark an addition or a removal.
    diffs = [entry for entry in difflib.ndiff(noisy, actual) if entry[0] in ("+", "-")]

    if len(diffs) == 1:
        # Exactly one change: a lone insertion ("+") or a lone deletion ("-").
        marker, char = diffs[0][0], diffs[0][-1]
        if marker == "+":
            insertions.append(char)
        elif marker == "-":
            deletions.append(char)
    elif len(diffs) == 2:
        first, second = diffs
        # One "+" paired with one "-" is treated as a substitution, which is
        # recorded in both directions.
        if {first[0], second[0]} == {"+", "-"}:
            substitutions.append((first[-1], second[-1]))
            substitutions.append((second[-1], first[-1]))

#### Using SequenceMatcher to diff the expected result and the predicted results

In [16]:
# Collect character-level edits between the noisy text and the expected text
# from the opcodes of difflib.SequenceMatcher.
insertions, deletions, substitutions = [], [], []
output = []
for idx in range(len(ng_train_ds)):
    _, _, noisy, actual = ng_train_ds[idx]
    seqm = difflib.SequenceMatcher(None, noisy, actual)
    for tag, a_start, a_end, b_start, b_end in seqm.get_opcodes():
        noisy_part = seqm.a[a_start:a_end]
        actual_part = seqm.b[b_start:b_end]
        if tag == 'equal':
            output.append(noisy_part)
        elif tag == 'insert':
            # Present only in the expected text.
            insertions.extend(actual_part)
        elif tag == 'delete':
            # Present only in the noisy text.
            deletions.extend(noisy_part)
        elif tag == 'replace' and len(noisy_part) == len(actual_part):
            # Equal-length replacements are paired position by position;
            # replacements of differing length are not recorded.
            substitutions.extend(zip(noisy_part, actual_part))

In [17]:
# Count how many times each character appears in ``insertions``.
insertion_dict = {}
for char in insertions:
    insertion_dict[char] = insertion_dict.get(char, 0) + 1

# Every code point starts at cost 1; a character that was observed gets
# cost 1/count, so frequently inserted characters become cheaper.
# NOTE(review): the table has 128 entries, so a character with ord() >= 128
# would raise IndexError here -- confirm the data is ASCII only.
insertion_costs = np.ones(128, dtype=np.float64)
for char, count in insertion_dict.items():
    insertion_costs[ord(char)] = 1/count
print(insertion_costs)
np.savetxt("models/insertion_costs.txt", insertion_costs, fmt='%4.6f')

[1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.03030303 0.03448276 0.16666667 0.2
 1.         1.         0.02631579 1.         0.0075188  1.
 0.25       0.2        1.         1.         1.         1.
 1.         1.         1.         1.         0.09090909 0.02083333
 1.         1.         1.         1.         1.         0.14285714
 0.33333333 0.11111111 0.2        0.16666667 0.5        0.16666667
 0.08333333 0.05       1.         0.16666667 0.09090909 0.25
 0.33333333 0.33333333 1.         1.         0.2        0.04
 0.14285714 0.33333333 0.33333333 0.16666667 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.02       0.03

In [18]:
# Count how many times each character appears in ``deletions``.
deletion_dict = {}
for char in deletions:
    deletion_dict[char] = deletion_dict.get(char, 0) + 1
print(deletion_dict)

# Every code point starts at cost 1; a character that was observed gets
# cost 1/count, so frequently deleted characters become cheaper.
# NOTE(review): the table has 128 entries, so a character with ord() >= 128
# would raise IndexError here -- confirm the data is ASCII only.
deletion_costs = np.ones(128, dtype=np.float64)
for char, count in deletion_dict.items():
    deletion_costs[ord(char)] = 1/count
print(deletion_costs)
np.savetxt("models/deletion_costs.txt", deletion_costs, fmt='%4.6f')

{'h': 40, 'r': 22, 'i': 17, 'W': 3, 'y': 8, 't': 51, 'n': 21, 'l': 14, 'e': 39, 'a': 23, 'A': 7, 's': 24, '.': 8, 'H': 2, 'u': 6, 'o': 14, 'm': 13, 'p': 4, 'S': 2, 'w': 20, 'x': 1, 'F': 3, 'T': 9, '1': 12, '5': 11, 'c': 12, 'M': 5, 'f': 2, 'G': 2, 'b': 4, 'g': 1, 'd': 8, ',': 3, '0': 1, 'B': 2, 'C': 3, '"': 1, 'I': 1, 'v': 1}
[1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.33333333 1.         0.125      1.
 1.         0.08333333 1.         1.         1.         0.09090909
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.14285714
 0.5   

In [19]:
# Count how many times each (from, to) character pair appears in
# ``substitutions``.
substitution_dict = {}
for pair in substitutions:
    substitution_dict[pair] = substitution_dict.get(pair, 0) + 1
print(substitution_dict)

# Every pair starts at cost 1; an observed pair gets cost 1/count, so
# frequently confused characters end up closer together.
# NOTE(review): the table is 128x128, so a character with ord() >= 128
# would raise IndexError here -- confirm the data is ASCII only.
substitute_costs = np.ones((128, 128), dtype=np.float64)
for (source_char, target_char), count in substitution_dict.items():
    substitute_costs[ord(source_char), ord(target_char)] = 1/count
print(substitute_costs)
np.savetxt("models/substitute_costs.txt", substitute_costs, fmt='%4.6f')

{('r', 's'): 5, ('l', 't'): 8, ('t', 'h'): 5, ('t', 'l'): 13, ('n', 'm'): 18, ('M', 'U'): 1, ('f', 't'): 1, ('A', 'N'): 1, ('e', 'o'): 13, ('e', 'u'): 2, ('n', 'r'): 9, ('h', 'k'): 4, ('e', 'a'): 18, ('c', 'e'): 3, ('.', ','): 21, ('H', 'M'): 1, ('c', 'C'): 3, ('t', 'r'): 4, ('L', 'h'): 1, ('W', 'b'): 1, ('r', 'e'): 3, ('r', 'R'): 1, ('r', 'n'): 10, ('r', 'v'): 5, ('P', 'R'): 1, ('o', 'e'): 6, ('v', 'r'): 4, ('t', 'd'): 4, ('n', 'a'): 1, ('h', 'L'): 1, ('W', 'S'): 1, ('W', 'w'): 3, ('r', 'x'): 2, ('c', 't'): 3, ('C', 'G'): 1, ('L', 't'): 1, ('a', 'b'): 1, ('e', 'M'): 3, ('y', 'g'): 6, ('e', 'm'): 1, ('a', 'o'): 24, ('S', 'I'): 1, ('r', 'i'): 3, ('w', 's'): 2, ('j', 'S'): 1, ('e', 'E'): 4, ('k', 'l'): 2, ('n', 't'): 2, ('t', 'k'): 2, ('e', 'w'): 1, ('h', '"'): 1, ('t', 'M'): 1, ('"', "'"): 6, (',', '.'): 13, ('w', 'a'): 1, ('l', 'L'): 2, ('l', 'h'): 3, ('e', 'n'): 3, ('u', 'n'): 3, ('f', 'F'): 1, ('f', 'P'): 1, ('t', 'n'): 1, ('l', 'n'): 1, ('n', 'u'): 5, ('o', 'a'): 6, ('t', 'f'): 4, (

## Using Visual Distance with embedding built from classification

In [8]:
import zipfile

In [None]:
# Download the EMNIST matlab archive into dataset/distance.
# The package is imported as ``mx`` in this notebook; the bare name ``mxnet``
# is never bound, so the call has to go through ``mx``.
mx.test_utils.download('http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/matlab.zip', dirname='dataset/distance')