# Introduction

We tackle the problem of OCR post processing. In OCR, we map the image form of the document into the text domain. This is done first using an CNN+LSTM+CTC model, in our case based on tesseract. Since this output maps only image to text, we need something on top to validate and correct language semantics.

The idea is to build a language model, that takes the OCRed text and corrects it based on language knowledge. The langauge model could be:
- Char level: the aim is to capture the word morphology. In which case it's like a spelling correction system.
- Word level: the aim is to capture the sentence semnatics. But such systems suffer from the OOV problem.
- Fusion: to capture semantics and morphology language rules. The output has to be at char level, to avoid the OOV. However, the input can be char, word or both.

The fusion model target is to learn:

    p(char | char_context, word_context)

In this workbook we use seq2seq vanilla Keras implementation, adapted from the lstm_seq2seq example on Eng-Fra translation task. The adaptation involves:

- Adapt to spelling correction, on char level
- Pre-train on a noisy, medical sentences
- Fine tune a residual, to correct the mistakes of tesseract 
- Limit the input and output sequence lengths
- Enusre teacher forcing auto regressive model in the decoder
- Limit the padding per batch
- Learning rate schedule
- Bi-directional LSTM Encoder
- Bi-directional GRU Encoder


# Imports

In [1]:
from __future__ import print_function
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, GRU
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
from keras.models import load_model
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from autocorrect import spell
import re
%matplotlib inline

Using TensorFlow backend.


# Utility functions

In [2]:
# Limit gpu allocation. allow_growth, or gpu_fraction
def gpu_alloc():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

In [3]:
gpu_alloc()

In [4]:
def calculate_WER_sent(gt, pred):
    '''
    calculate_WER('calculating wer between two sentences', 'calculate wer between two sentences')
    '''
    gt_words = gt.lower().split(' ')
    pred_words = pred.lower().split(' ')
    d = np.zeros(((len(gt_words) + 1), (len(pred_words) + 1)), dtype=np.uint8)
    # d = d.reshape((len(gt_words)+1, len(pred_words)+1))

    # Initializing error matrix
    for i in range(len(gt_words) + 1):
        for j in range(len(pred_words) + 1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i

    # computation
    for i in range(1, len(gt_words) + 1):
        for j in range(1, len(pred_words) + 1):
            if gt_words[i - 1] == pred_words[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                substitution = d[i - 1][j - 1] + 1
                insertion = d[i][j - 1] + 1
                deletion = d[i - 1][j] + 1
                d[i][j] = min(substitution, insertion, deletion)
    return d[len(gt_words)][len(pred_words)]

In [5]:
def calculate_WER(gt, pred):
    '''

    :param gt: list of sentences of the ground truth
    :param pred: list of sentences of the predictions
    both lists must have the same length
    :return: accumulated WER
    '''
#    assert len(gt) == len(pred)
    WER = 0
    nb_w = 0
    for i in range(len(gt)):
        #print(gt[i])
        #print(pred[i])
        WER += calculate_WER_sent(gt[i], pred[i])
        nb_w += len(gt[i])

    return WER / nb_w

In [6]:
def load_data_with_gt(file_name, num_samples, max_sent_len, min_sent_len, delimiter='\t', gt_index=1, prediction_index=0):
    '''Load data from txt file, with each line has: <TXT><TAB><GT>. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    for row in open(file_name, encoding='utf8'):
        if cnt < num_samples :
            #print(row)
            sents = row.split(delimiter)
            input_text = sents[prediction_index]
            
            target_text = '\t' + sents[gt_index] + '\n'
            if len(input_text) > min_sent_len and len(input_text) < max_sent_len and len(target_text) > min_sent_len and len(target_text) < max_sent_len:
                cnt += 1
                
                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(sents[gt_index])
    return input_texts, target_texts, gt_texts

In [7]:
def load_data(file_name, num_samples, max_sent_len, min_sent_len):
    '''Load data from txt file, with each line has: <TXT><TAB><GT>. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []   
    
    #for row in open(file_name, encoding='utf8'):
    for row in open(file_name):
        if cnt < num_samples :            
            input_text = row           
            if len(input_text) > min_sent_len and len(input_text) < max_sent_len:
                cnt += 1                
                input_texts.append(input_text)
    return input_texts

In [8]:
def vectorize_data(input_texts, max_encoder_seq_length, num_encoder_tokens, vocab_to_int):
    
    if(len(input_texts) > max_encoder_seq_length):
        input_texts = input_texts[:max_encoder_seq_length]
    
    '''Prepares the input text and targets into the proper seq2seq numpy arrays'''
    encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length),
    dtype='float32')
    
    for i, input_text in enumerate(input_texts):
        for t, char in enumerate(input_text[:max_encoder_seq_length]):
            # c0..cn
            encoder_input_data[i, t] = vocab_to_int[char]
                
    return encoder_input_data

In [9]:
def decode_sequence(input_seq, encoder_model, decoder_model, num_decoder_tokens, max_decoder_seq_length, vocab_to_int, int_to_vocab):
    
    #print(max_decoder_seq_length)
    # Encode the input as state vectors.
    encoder_outputs, h, c  = encoder_model.predict(input_seq)
    states_value = [h,c]
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = vocab_to_int['\t']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    #print(input_seq)
    attention_density = []
    i = 0
    special_chars = ['\\', '/', '-', '—' , ':', '[', ']', ',', '.', '"', ';', '%', '~', '(', ')', '{', '}', '$', '#']
    #special_chars = []
    while not stop_condition:
        #print(target_seq)
        output_tokens, attention, h, c  = decoder_model.predict(
            [target_seq, encoder_outputs] + states_value)
        #print(attention.shape)
        attention_density.append(attention[0][0])# attention is max_sent_len x 1 since we have num_time_steps = 1 for the output
        # Sample a token
        #print(output_tokens.shape)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        
        #print(sampled_token_index)
        sampled_char = int_to_vocab[sampled_token_index]
        
        orig_char = int_to_vocab[int(input_seq[:,i][0])]
        
        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
            #print('End', sampled_char, 'Len ', len(decoded_sentence), 'Max len ', max_decoder_seq_length)
            sampled_char = ''
        
        # Copy digits as it, since the spelling corrector is not good at digit corrections
        
        if(orig_char.isdigit() or orig_char in special_chars):
            decoded_sentence += orig_char            
        else:
            if(sampled_char.isdigit() or sampled_char in special_chars):
                decoded_sentence += ''
            else:
                decoded_sentence += sampled_char
        
        #decoded_sentence += sampled_char


        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]
        
        i += 1
        if(i > 48):
            i = 0
    attention_density = np.array(attention_density)
    
    # Word level spell correct
    '''
    corrected_decoded_sentence = ''
    for w in decoded_sentence.split(' '):
        corrected_decoded_sentence += spell(w) + ' '
    decoded_sentence = corrected_decoded_sentence
    '''
    return decoded_sentence, attention_density


In [10]:
special_chars = ['\\', '\/', '\-', '\—' , '\:', '\[', '\]', '\,', '\.', '\"', '\;', '\%', '\~', '\(', '\)', '\{', '\}', '\$', '\&', '\#', '\☒', '\■', '\☐', '\□', '\☑', '\@']
special_chars_s = '[' + ''.join(special_chars) + ']'
def word_spell_correct(decoded_sentence):
    if(decoded_sentence == ''):
        return ''
    corrected_decoded_sentence = ''

    for w in decoded_sentence.split(' '):
        #print(w)
        #if((len(re.findall(r'\d+', w))==0) and not (w in special_chars)):
        if((len(re.findall(r'\d+', w))==0) and (len(re.findall(special_chars_s, w))==0)):
            corrected_decoded_sentence += spell(w) + ' '
        else:
            corrected_decoded_sentence += w + ' '
    return corrected_decoded_sentence

In [11]:
def clean_up_sentence(sentence, vocab):
    s = ''
    prev_char = ''
    for c in sentence.strip():
        if c not in vocab or (c == ' ' and prev_char == ' '):
            s += ''
        else:
            s += c
        prev_char = c
            
    return s

# Load data

# Load model params

In [12]:
data_path = '../../dat/'

In [13]:
max_sent_lengths = [50, 100]

In [14]:
vocab_file = {}
model_file = {}
encoder_model_file = {}
decoder_model_file = {}
model = {}
encoder_model = {}
decoder_model = {}
vocab = {}
vocab_to_int = {}
int_to_vocab = {}
max_sent_len = {}
min_sent_len = {}
num_decoder_tokens = {}
num_encoder_tokens = {}
max_encoder_seq_length = {}
max_decoder_seq_length = {}

In [15]:

for i in max_sent_lengths:
    vocab_file[i] = 'vocab-{}.npz'.format(i)
    model_file[i] = 'best_model-{}.hdf5'.format(i)
    encoder_model_file[i] = 'encoder_model-{}.hdf5'.format(i)
    decoder_model_file[i] = 'decoder_model-{}.hdf5'.format(i)
    
    vocab = np.load(file=vocab_file[i])
    vocab_to_int[i] = vocab['vocab_to_int'].item()
    int_to_vocab[i] = vocab['int_to_vocab'].item()
    max_sent_len[i] = vocab['max_sent_len']
    min_sent_len[i] = vocab['min_sent_len']
    input_characters = sorted(list(vocab_to_int))
    num_decoder_tokens[i] = num_encoder_tokens[i] = len(input_characters) #int(encoder_model.layers[0].input.shape[2])
    max_encoder_seq_length[i] = max_decoder_seq_length[i] = max_sent_len[i] - 1#max([len(txt) for txt in input_texts])
    
    model[i] = load_model(model_file[i])
    encoder_model[i] = load_model(encoder_model_file[i])
    decoder_model[i] = load_model(decoder_model_file[i])



In [16]:
num_samples = 1000000
#tess_correction_data = os.path.join(data_path, 'test_data.txt')
#input_texts = load_data(tess_correction_data, num_samples, max_sent_len, min_sent_len)

OCR_data = os.path.join(data_path, 'new_trained_data.txt')
#input_texts, target_texts, gt_texts = load_data_with_gt(OCR_data, num_samples, max_sent_len, min_sent_len, delimiter='|',gt_index=0, prediction_index=1)
input_texts, target_texts, gt_texts = load_data_with_gt(OCR_data, num_samples, max_sent_len=10000, min_sent_len=0)

In [17]:
# Sample data
print(len(input_texts))
for i in range(10):
    print(input_texts[i], '\n', target_texts[i])

1951
Me dieal Provider Roles: Treating  
 	Medical Provider Roles: Treating


Provider First Name: Christine  
 	Provider First Name: Christine


Provider Last Name: Nolen, MD  
 	Provider Last Name: Nolen, MD


Address Line 1 : 7 25 American Avenue  
 	Address Line 1 : 725 American Avenue


City. W’aukesha  
 	City: Waukesha


StatefProvinee: ‘WI  
 	State/Province: WI


Postal Code: 5 31 88  
 	Postal Code: 53188


Country". US  
 	Country:  US


Business Telephone: (2 62) 92 8- 1000  
 	Business Telephone: (262) 928- 1000


Date ot‘Pirst Visit: 1 2/01f20 17  
 	Date of First Visit: 12/01/2017




In [18]:
# Spell correct before inference
'''
input_texts_ = []
for sent in input_texts:
    sent_ = ''
    for word in sent.split(' '):
        sent_ += spell(word) + ' '
    input_texts_.append(sent_)
input_texts = input_texts_
input_texts_ = []
# Sample data
print(len(input_texts))
for i in range(10):
    print(input_texts[i], '\n', target_texts[i])
'''

"\ninput_texts_ = []\nfor sent in input_texts:\n    sent_ = ''\n    for word in sent.split(' '):\n        sent_ += spell(word) + ' '\n    input_texts_.append(sent_)\ninput_texts = input_texts_\ninput_texts_ = []\n# Sample data\nprint(len(input_texts))\nfor i in range(10):\n    print(input_texts[i], '\n', target_texts[i])\n"

In [19]:
decoded_sentences = []
corrected_sentences = []

#for seq_index in range(len(input_texts)):
results = open('RESULTS.md', 'w')
results.write('|OCR sentence|GT sentence|Char decoded sentence|Word decoded sentence|Sentence length (chars)|\n')
results.write('---------------|-----------|----------------|----------------|----------------|\n')
     

for i, input_text in enumerate(input_texts):
    #print(input_text)
    # Find the input length range to choose the proper model to use
    len_range = max_sent_lengths[-1] # Take the longest range
    for length in max_sent_lengths:
        if(len(input_text) < length):
            len_range = length
            break
    #print(len_range)
    
    input_text = clean_up_sentence(input_text, vocab_to_int[len_range])
    encoder_input_data = vectorize_data(input_texts=[input_text], max_encoder_seq_length=max_encoder_seq_length[len_range], num_encoder_tokens=num_encoder_tokens[len_range], vocab_to_int=vocab_to_int[len_range])
    
    

    target_text = gt_texts[i]
    
    input_seq = encoder_input_data
    #print(input_seq.shape)
    #print(max_decoder_seq_length[len_range])
    #print(max_decoder_seq_length)
    decoded_sentence,_  = decode_sequence(input_seq, encoder_model[len_range], decoder_model[len_range], num_decoder_tokens[len_range],  max_decoder_seq_length[len_range], vocab_to_int[len_range], int_to_vocab[len_range])
    corrected_sentence = word_spell_correct(input_text)
    print('-Lenght = ', len_range)
    print('Input sentence:', input_text)
    print('GT sentence:', target_text.strip())
    print('Char Decoded sentence:', decoded_sentence)   
    print('Word Decoded sentence:', corrected_sentence) 
    results.write(' | ' + input_text + ' | ' + target_text.strip() + ' | ' + decoded_sentence + ' | ' + corrected_sentence + ' | ' + str(len_range) + ' | \n')
    decoded_sentences.append(decoded_sentence)
    corrected_sentences.append(corrected_sentence)
results.close()    

    

-Lenght =  50
Input sentence: Me dieal Provider Roles: Treating
GT sentence: Medical Provider Roles: Treating
Char Decoded sentence: Medical Provider Roles:Treating
Word Decoded sentence: Me dieal Provider Roles: Treating 
-Lenght =  50
Input sentence: Provider First Name: Christine
GT sentence: Provider First Name: Christine
Char Decoded sentence: Provider First Name: Chirisin
Word Decoded sentence: Provider First Name: Christine 
-Lenght =  50
Input sentence: Provider Last Name: Nolen, MD
GT sentence: Provider Last Name: Nolen, MD
Char Decoded sentence: Provider Last Name: None,Me
Word Decoded sentence: Provider Last Name: Nolen, MD 
-Lenght =  50
Input sentence: Address Line 1 : 7 25 American Avenue
GT sentence: Address Line 1 : 725 American Avenue
Char Decoded sentence: Address Line 1:725 nemin Avent Avent
Word Decoded sentence: Address Line 1 : 7 25 American Avenue 
-Lenght =  50
Input sentence: City. W’aukesha
GT sentence: City: Waukesha
Char Decoded sentence: City. Worances
Word

-Lenght =  100
Input sentence: pAvMENTsm'E—I: STATEMENT DATE WILL NOT APPEAR ON THIS STATEMENT
GT sentence: PAYMENTS RECEIVED AFTER STATE DATE WILL NOT APPEAR ON THIS STATEMENT
Char Decoded sentence: ACTIVE LaU—A:SY STTAT LATE TipSU TAYATILANSTEALALDINDGTATE—
Word Decoded sentence: pAvMENTsm'E—I: STATEMENT DATE WILL NOT APPEAR ON THIS STATEMENT 
-Lenght =  50
Input sentence: DATE
GT sentence: DATE
Char Decoded sentence: DATE
Word Decoded sentence: DATE 
-Lenght =  50
Input sentence: DESCRIPTION
GT sentence: DESCRIPTION
Char Decoded sentence: DESCRIPTION
Word Decoded sentence: DESCRIPTION 
-Lenght =  50
Input sentence: _AYMENTS
GT sentence: PAYMENTS
Char Decoded sentence: PAAMENTS
Word Decoded sentence: PAYMENTS 
-Lenght =  50
Input sentence: _DJUSTMENTS
GT sentence: ADJUSTMENTS
Char Decoded sentence: ADJUT DANE
Word Decoded sentence: ADJUSTMENTS 
-Lenght =  50
Input sentence: PAT'ENT BALANCE
GT sentence: PATIENTS BALANCE
Char Decoded sentence: PATTET BAMAE
Word Decoded sentence: PATIEN

-Lenght =  100
Input sentence: To pay your bill on line with a credit card, log on to www.ebixinc.comlpayonline.html.
GT sentence: To pay your bill on line with a credit card, log on to www.ebixinc.com/payonline.html.
Char Decoded sentence: To ppedich ont old polland pollysiot onghes,ctionship pou hypent pon one ponkendent Fing o,lep
Word Decoded sentence: To pay your bill on line with a credit card, log on to www.ebixinc.comlpayonline.html. 
-Lenght =  50
Input sentence: ACCOUNT# EMA297232
GT sentence: ACCOUNT# EMA297232
Char Decoded sentence: ACCOUNT# CAE297232
Word Decoded sentence: ACCOUNT# EMA297232 
-Lenght =  100
Input sentence: PLACE OF SERVICE 11 Office 21 Inpatient 22 Outpatient Hospital 23 Emergency Room-Hospital
GT sentence: PLACE OF SERVICE 11 Office 21 Inpatient 22 Outpatient Hospital 23 Emergency Room-Hospital
Char Decoded sentence: PLINent Name For11y E PAT21TANT Ficate22Sprour Ccine Fing Unum 11e MIDe M21r Cound
Word Decoded sentence: PLACE OF SERVICE 11 Office 21 Inpa

-Lenght =  100
Input sentence: To disclose information, whether from before, during or after the date of this authorization, about my health, including HIV, AIDS or other disorders of the Immune system, use of drugs or alcohol, mental or phy5ica| histor , condition, advice or treatment (except this authorization does not authorize release of psychotherapy notesi, prescription drug history, earnings, financial or credit history, professional licenses, employment history, insurance claims and benefits, and all other claims and benefits, including Social Security claims and benefits (“My Information");
GT sentence: To disclose information, whether from before, during or after the date of this authorization, about my health, including HIV, AIDS or other disorders of the immune system, use of drugs or alcohol, mental or physical history, condition, advice or treatment (except this authorization does not authorize release of psychotherapy notes, prescription drug history, earnings, financial

-Lenght =  50
Input sentence: 03/14/2018 Date Signed
GT sentence: 03/14/2018 Date Signed
Char Decoded sentence: 03/14/2018Date Signed
Word Decoded sentence: 03/14/2018 Date Signed 
-Lenght =  50
Input sentence: Printed Name
GT sentence: Printed Name
Char Decoded sentence: Printed Name
Word Decoded sentence: Printed Name 
-Lenght =  50
Input sentence: Seeial Security Number
GT sentence: Social Security Number
Char Decoded sentence: Sebiial Security Number
Word Decoded sentence: Seeial Security Number 
-Lenght =  100
Input sentence: I signed on behalf of the Insured as (Relationship). If Power of Attorney Designee, Guardian, or Conservator, please attach a copy of the document granting authority.
GT sentence: I signed on behalf of the Insured as (Relationship). If Power of Attorney Designee, Guardian, or Conservator, please attach a copy of the document granting authority.
Char Decoded sentence: I signed on behalf of the Insured as (relationshiprohent Insunt If Nomenthe fol orent( the in

-Lenght =  100
Input sentence: ”Children are covered for the full Wellness Benefit.
GT sentence: **Children are covered for the full Wellness Benefit.
Char Decoded sentence: **Ceoffreveregn Chefion beneffitell orenent Chiffone pent Fored ank Effreneseredfionerent
Word Decoded sentence: Children are covered for the full Wellness Benefit. 
-Lenght =  50
Input sentence: Total Employee Benefit Amount: $20,000.00
GT sentence: Total Employee Benefit Amount: $20,000.00
Char Decoded sentence: Total Employee Benefit Johmen: $20,000.
Word Decoded sentence: Total Employee Benefit Amount: $20,000.00 
-Lenght =  50
Input sentence: Total Monthly Premium: $40.40
GT sentence: Total Monthly Premium: $40.40
Char Decoded sentence: Total Monthly Premium: $40.4
Word Decoded sentence: Total Monthly Premium: $40.40 
-Lenght =  100
Input sentence: Total Employee Semi-Monthly Payroll Deduction: $20.20
GT sentence: Total Employee Semi-Monthly Payroll Deduction: $20.20
Char Decoded sentence: Total Molly Name-  o

-Lenght =  100
Input sentence: sip rlght knee ACL repair, MCL repair utilizing internal bracing (DOS: 2/2/18)
GT sentence: s/p right knee ACL repair, MCL repair utilizing internal bracing (DOS: 2/2/18)
Char Decoded sentence: Spel Ticke Monge LDe , Fithint the  Ming the Dation The Re,tilnt
Word Decoded sentence: sip rlght knee ACL repair, MCL repair utilizing internal bracing (DOS: 2/2/18) 
-Lenght =  50
Input sentence: Post-Op
GT sentence: Post-Op
Char Decoded sentence: Post-Op
Word Decoded sentence: Post-Op 
-Lenght =  100
Input sentence: returns to clinic approximately 2 weeks status post right knee AOL repair with right knee MCL repair using internal brace. Overall, she reports doing quite well. Her pain is well controlled with over the counter medication as wall as Hydrocodone at night. She has been taking Aspirin twice daily as prescribed. She has been attending physical therapy and has been compliant with her touch down weight bearing status.
GT sentence: returns to clinic approx

-Lenght =  100
Input sentence: 4. Lateral compartment osseous contusions involving the posterior rim of the lateral tibial plateau and terminal sulcus of lateral femoral condyle.
GT sentence: 4. Lateral compartment osseous contusions involving the posterior rim of the lateral tibial plateau and terminal sulcus of lateral femoral condyle.
Char Decoded sentence: 4. Tyee Monfores No Workoreation Foristuensent No4.essmsess Lastionsho shapions Nor Mon Lastoonts4
Word Decoded sentence: 4. Lateral compartment osseous contusions involving the posterior rim of the lateral tibial plateau and terminal sulcus of lateral femoral condyle. 
-Lenght =  50
Input sentence: 5. Patellar apical grade 1-2 chondromalacia.
GT sentence: 5. Patellar apical grade 1-2 chondromalacia.
Char Decoded sentence: 5. Pate ar  achearal  1-2e arth ap ami.aal5
Word Decoded sentence: 5. Patellar apical grade 1-2 chondromalacia. 
-Lenght =  50
Input sentence: Diagnosis
GT sentence: Diagnosis
Char Decoded sentence: Diagnosis
W

-Lenght =  100
Input sentence: 15 minutes spent, over 50% of which was spent in discussion regarding diagnosis, treatment options and answering questions.
GT sentence: 15 minutes spent, over 50% of which was spent in discussion regarding diagnosis, treatment options and answering questions.
Char Decoded sentence: 15 thision cone ,nesent50%erits nomp nos ing ress15he pist ngsss ,heng p50% and costient ponsinsti15
Word Decoded sentence: 15 minutes spent, over 50% of which was spent in discussion regarding diagnosis, treatment options and answering questions. 
-Lenght =  50
Input sentence: Scribe - Statements
GT sentence: Scribe - Statements
Char Decoded sentence: Scribe - Statements
Word Decoded sentence: Scribe - Statements 
-Lenght =  100
Input sentence: Nate Graden is acting as scribe for Dr. Jason Holm
GT sentence: Nate Graden is acting as scribe for Dr. Jason Holm
Char Decoded sentence: Neth ofstat at age Ifovint thatimape.Summemprais
Word Decoded sentence: Nate Graden is acting as 

-Lenght =  50
Input sentence: Psychiatric: no psychiatric symptoms.
GT sentence: Psychiatric: no psychiatric symptoms.
Char Decoded sentence: Phychizatio:s nority no stom sympt.ms
Word Decoded sentence: Psychiatric: no psychiatric symptoms. 
-Lenght =  50
Input sentence: Respiratory: no respiratory symptoms.
GT sentence: Respiratory: no respiratory symptoms.
Char Decoded sentence: Respiratory: no respriptiors symptom.
Word Decoded sentence: Respiratory: no respiratory symptoms. 
-Lenght =  50
Input sentence: Active Problems
GT sentence: Active Problems
Char Decoded sentence: Accive Problems
Word Decoded sentence: Active Problems 
-Lenght =  50
Input sentence: 1. Knee injury (889.90XA)
GT sentence: 1. Knee injury (S89.90XA)
Char Decoded sentence: 1. Knee injury (889.90XA)
Word Decoded sentence: 1. Knee injury (889.90XA) 
-Lenght =  50
Input sentence: Past Medical History
GT sentence: Past Medical History
Char Decoded sentence: Past Medical History
Word Decoded sentence: Past Medical His

-Lenght =  50
Input sentence: Psychiatric: no psychiatric symptoms.
GT sentence: Psychiatric: no psychiatric symptoms.
Char Decoded sentence: Phychizatio:s nority no stom sympt.ms
Word Decoded sentence: Psychiatric: no psychiatric symptoms. 
-Lenght =  50
Input sentence: Respiratory: no respiratory symptoms.
GT sentence: Respiratory: no respiratory symptoms.
Char Decoded sentence: Respiratory: no respriptiors symptom.
Word Decoded sentence: Respiratory: no respiratory symptoms. 
-Lenght =  50
Input sentence: Active Problems
GT sentence: Active Problems
Char Decoded sentence: Accive Problems
Word Decoded sentence: Active Problems 
-Lenght =  50
Input sentence: 1. Knee injury
GT sentence: 1. Knee injury
Char Decoded sentence: 1. Knee injury
Word Decoded sentence: 1. Knee injury 
-Lenght =  50
Input sentence: Past Medical History
GT sentence: Past Medical History
Char Decoded sentence: Past Medical History
Word Decoded sentence: Past Medical History 
-Lenght =  50
Input sentence: I No sig

-Lenght =  50
Input sentence: 9 Family history of Cancer (C801)
GT sentence: • Family history of Cancer (C80.1)
Char Decoded sentence: 9 Family history of Chis C(801)
Word Decoded sentence: 9 Family history of Cancer (C801) 
-Lenght =  50
Input sentence: 0 Family history of other condition (284.89)
GT sentence: • Family history of other condition (Z84.89)
Char Decoded sentence: 0 Famisy of thor byhorth instruction(284.89)
Word Decoded sentence: 0 Family history of other condition (284.89) 
-Lenght =  50
Input sentence: 0 Family history of other condition (284.89)
GT sentence: • Family history of other condition (Z84.89)
Char Decoded sentence: 0 Famisy of thor byhorth instruction(284.89)
Word Decoded sentence: 0 Family history of other condition (284.89) 
-Lenght =  50
Input sentence: Social History
GT sentence: Social History
Char Decoded sentence: Social History
Word Decoded sentence: Social History 
-Lenght =  50
Input sentence: 0 Age reporting
GT sentence: • Age reporting
Char Decod

-Lenght =  100
Input sentence: X-rays were ordered, obtained and interpreted. Views: 3 views of the right knee. Findings: No apparent dislocation or fractures. Normal joint spaces and soft tissues. Norma: alignment and no bony lesion appreciated.
GT sentence: X-rays were ordered, obtained and interpreted. Views: 3 views of the right knee. Findings: No apparent dislocation or fractures. Normal joint spaces and soft tissues. Normal alignment and no bony lesion appreciated.
Char Decoded sentence: -pl youper thater ,o bon nent th ffinch the .rovi-ed sed to theredf,overainrisabed Thy wipren.ifio- 
Word Decoded sentence: X-rays were ordered, obtained and interpreted. Views: 3 views of the right knee. Findings: No apparent dislocation or fractures. Normal joint spaces and soft tissues. Norma: alignment and no bony lesion appreciated. 
-Lenght =  50
Input sentence: Diagnosis
GT sentence: Diagnosis
Char Decoded sentence: Diagnosis
Word Decoded sentence: Diagnosis 
-Lenght =  100
Input sentence:

-Lenght =  100
Input sentence: Performzln Ofﬁce; Due:26Jan201B; Last Updated By:Portnoy, Erin;Ordered: Foranee injury; Ordered BycFelvor, David;
GT sentence: Perform:In Office; Due:26Jan2018; Last Updated By:Portnoy, Erin;Ordered: For:Knee injury; Ordered By:Feivor, David;
Char Decoded sentence: Prectfor Fred In;fren:26nfo201I;es Ded torthe F:e Meatientienden;f N:26s r201t;on Freneffrof De:tedi
Word Decoded sentence: Performzln Ofﬁce; Due:26Jan201B; Last Updated By:Portnoy, Erin;Ordered: Foranee injury; Ordered BycFelvor, David; 
-Lenght =  100
Input sentence: MRI KNEE WIO CONTRST RT; Status:Need information - Financial Authorization; Requested for:21Jan2018:
GT sentence: MRI KNEE W/O CONTRST RT; Status:Need information - Financial Authorization; Requested for:21Jan2018;
Char Decoded sentence: Medical Provider Inform;tion  :hysician For Unes Frenting Prof Diting ;atikn P:onint Insunt Das
Word Decoded sentence: MRI KNEE WIO CONTRST RT; Status:Need information - Financial Authorization; 

-Lenght =  100
Input sentence: Jamie Birkelo, PA-C, was present and scrubbed throughout the case and his assistance was critical for patient positioning, assistance with prepping and draping, soft tissue retraction, leg manipulation, operating power equipment, wound closure, and dressing and brace application.
GT sentence: Jamie Birkelo, PA-C, was present and scrubbed throughout the case and his assistance was critical for patient positioning, assistance with prepping and draping, soft tissue retraction, leg manipulation, operating power equipment, wound closure, and dressing and brace application.
Char Decoded sentence: *ste torsate ,-, actident and carage and Pay nul costuss i,s r-s,tial acartions sabion and be 
Word Decoded sentence: Jamie Birkelo, PA-C, was present and scrubbed throughout the case and his assistance was critical for patient positioning, assistance with prepping and draping, soft tissue retraction, leg manipulation, operating power equipment, wound closure, and dres

-Lenght =  100
Input sentence: Musculoskeletal -. Gait and station: Abnormal. Normal muscle strength and tone.
GT sentence: Musculoskeletal -. Gait and station: Abnormal. Normal muscle strength and tone.
Char Decoded sentence: Mesit Fint Tel t-.ts ons Unkines: of the D.ted robilatidPytil-.olrnt pated
Word Decoded sentence: Musculoskeletal -. Gait and station: Abnormal. Normal muscle strength and tone. 
-Lenght =  100
Input sentence: Left Knee: no deformity, erythema, ecchymosis, edema, or tenderness, fiexion and extension within normal limits. strength within normal limits In all planes and no evidence of laxity or instability.
GT sentence: Left Knee: no deformity, erythema, ecchymosis, edema, or tenderness, flexion and extension within normal limits, strength within normal limits in all planes and no evidence of laxity or instability.
Char Decoded sentence: Instine I:ffre ffortifi,tle tient,er and redf,Inest Turmer: Tiffoer anfo,me the fo, at Inforer, MIrt
Word Decoded sentence: Left 

-Lenght =  50
Input sentence: Last name — Stokley
GT sentence: Last name - Stokley
Char Decoded sentence: Last name— Seclary
Word Decoded sentence: Last name — Stokley 
-Lenght =  50
Input sentence: Organization/Facility —
GT sentence: Organization/Facility -
Char Decoded sentence: Orrantifizat/on lizity—
Word Decoded sentence: Organization/Facility — 
-Lenght =  50
Input sentence: Address line 1 -
GT sentence: Address line 1 -
Char Decoded sentence: Address Line 1 -  
Word Decoded sentence: Address line 1 - 
-Lenght =  50
Input sentence: Address line 2 —
GT sentence: Address line 2 -
Char Decoded sentence: Address Line 2 —  
Word Decoded sentence: Address line 2 — 
-Lenght =  50
Input sentence: City -
GT sentence: City -
Char Decoded sentence: City-
Word Decoded sentence: City - 
-Lenght =  50
Input sentence: State - NC
GT sentence: State - NC
Char Decoded sentence: State - NC
Word Decoded sentence: State - NC 
-Lenght =  50
Input sentence: Speciality — PCP
GT sentence: Speciality - P

-Lenght =  50
Input sentence: Employee On & Off-Job Acc January 1, 2017
GT sentence: Employee On & Off-Job Acc January 1, 2017
Char Decoded sentence: Employee On & Off-Job Acc January 1, 2017
Word Decoded sentence: Employee On & Off-Job Acc January 1, 2017 
-Lenght =  50
Input sentence: Spouse On 8!. Off-Job Acc January 1, 2017
GT sentence: Spouse On & Off-Job Acc January 1, 2017
Char Decoded sentence: Spouse On 8 .ffJ-b Acc January 1,201
Word Decoded sentence: Spouse On 8!. Off-Job Acc January 1, 2017 
-Lenght =  100
Input sentence: EmpIOyee Sickness Hospital Confinement January 1, 2017
GT sentence: Employee Sickness Hospital Confinement January 1, 2017
Char Decoded sentence: Employee Mones Deas Name MIf ITerance  1, Mp
Word Decoded sentence: EmpIOyee Sickness Hospital Confinement January 1, 2017 
-Lenght =  50
Input sentence: Employee Wellness Benefit January 1, 2017
GT sentence: Employee Wellness Benefit January 1, 2017
Char Decoded sentence: Employee Wellness Benefit January 1, 201

-Lenght =  50
Input sentence: Any overtime? - no
GT sentence: Any overtime? - no
Char Decoded sentence: An over these - no
Word Decoded sentence: Any overtime - no 
-Lenght =  50
Input sentence: Lunch break —
GT sentence: Lunch break -
Char Decoded sentence: Lunch breaki—
Word Decoded sentence: Lunch break — 
-Lenght =  50
Input sentence: Total time absent —
GT sentence: Total time absent -
Char Decoded sentence: Total tmmbent abst— 
Word Decoded sentence: Total time absent — 
-Lenght =  50
Input sentence: Absence reason — episode
GT sentence: Absence reason - episode
Char Decoded sentence: Abseccee rescre—te 
Word Decoded sentence: Absence reason — episode 
-Lenght =  100
Input sentence: Event Dates Intermittent Comments — EE fell at work.
GT sentence: Event Dates Intermittent Comments - EE fell at work.
Char Decoded sentence: Eve thartit Inestreatient Destated—ertiontertithe tomente te te tienth thente tark —hent te te te ti
Word Decoded sentence: Event Dates Intermittent Comments — 

-Lenght =  50
Input sentence: Physical Address:
GT sentence: Physical Address:
Char Decoded sentence: Physical Address:
Word Decoded sentence: Physical Address: 
-Lenght =  50
Input sentence: Address Line 1:
GT sentence: Address Line 1:
Char Decoded sentence: Address Line 1:
Word Decoded sentence: Address Line 1: 
-Lenght =  50
Input sentence: Address Line 2:
GT sentence: Address Line 2:
Char Decoded sentence: Address Line 2:
Word Decoded sentence: Address Line 2: 
-Lenght =  50
Input sentence: City:
GT sentence: City:
Char Decoded sentence: City:
Word Decoded sentence: City: 
-Lenght =  50
Input sentence: State:
GT sentence: State:
Char Decoded sentence: State:
Word Decoded sentence: State: 
-Lenght =  50
Input sentence: Country:
GT sentence: Country:
Char Decoded sentence: Country:
Word Decoded sentence: Country: 
-Lenght =  50
Input sentence: ZIP:
GT sentence: ZIP:
Char Decoded sentence: ZAP:AG
Word Decoded sentence: ZIP: 
-Lenght =  50
Input sentence: Temporary Address:
GT sentence

-Lenght =  100
Input sentence: C. Information About the Patlont (if different {rem Insuroleollcyholdor) Check one: El Spouse El Domestic Partner El Dependent Child
GT sentence: C. Information About the Patient (If different from Insured/Policyholder) Check one:
Char Decoded sentence: C. ITer Informeducarentlour chinc(raificy thons{ou. Fienthe Fient orentithoustovim(loverencrinco{er.
Word Decoded sentence: C. Information About the Patlont (if different {rem Insuroleollcyholdor) Check one: El Spouse El Domestic Partner El Dependent Child 
-Lenght =  50
Input sentence: Last Name Suffix Flrst Name Ml
GT sentence: Last Name Suffix First Name MI
Char Decoded sentence: Last Name Suffix First Name MI
Word Decoded sentence: Last Name Suffix Flrst Name Ml 
-Lenght =  50
Input sentence: Date at Birth tmrnlddr’yy)
GT sentence: Date of Birth (mm/dd/yy)
Char Decoded sentence: Date ath Birth Rirged yo
Word Decoded sentence: Date at Birth tmrnlddr’yy) 
-Lenght =  50
Input sentence: Social Security Mem

-Lenght =  100
Input sentence: is this oondiijon the result ofan accidental injury? ee D No
GT sentence: Is this condition the result of an accidental injury? Yes No
Char Decoded sentence: Is Scinecinc Se fiteation of Torisity catcom on Une Dation the tons ind hoon
Word Decoded sentence: is this oondiijon the result ofan accidental injury ee D No 
-Lenght =  50
Input sentence: Ifyee‘ date of accident (mmlddiyy) Ell—I
GT sentence: If yes, date of accident (mm/dd/yy)
Char Decoded sentence: If yes mated mmddy (odeation)Ele
Word Decoded sentence: Ifyes date of accident (mmlddiyy) Ell—I 
-Lenght =  100
Input sentence: to this oondiiion the result ofhlsfher employment El Yea Efﬁe El Unknown
GT sentence: Is this condition the result of his/her employment Yes No Unknown
Char Decoded sentence: Sthes Tory free Mon working thene provint thenthe ine thene Incerion be Tyrent ond Yeshent work wise
Word Decoded sentence: to this oondiiion the result ofhlsfher employment El Yea Else El Unknown 
-Lengh

-Lenght =  50
Input sentence: Record number:
GT sentence: Record number:
Char Decoded sentence: Record number:
Word Decoded sentence: Record number: 
-Lenght =  50
Input sentence: Type:
GT sentence: Type:
Char Decoded sentence: Type:
Word Decoded sentence: Type: 
-Lenght =  50
Input sentence: Trace number:
GT sentence: Trace number:
Char Decoded sentence: Trace number:
Word Decoded sentence: Trace number: 
-Lenght =  50
Input sentence: A Account number:
GT sentence: Account number:
Char Decoded sentence: A cccon number
Word Decoded sentence: A Account number: 
-Lenght =  50
Input sentence: Transaction reference number:
GT sentence: Transaction reference number:
Char Decoded sentence: Transaction reference number:
Word Decoded sentence: Transaction reference number: 
-Lenght =  50
Input sentence: ‘ Cardholder name:
GT sentence: Cardholder name:
Char Decoded sentence: Cardholder name
Word Decoded sentence: a Cardholder name: 
-Lenght =  50
Input sentence: Transaction identiﬁer:
GT senten

-Lenght =  50
Input sentence: TIER 3 11101111111101 MOOP Max
GT sentence: TIER 3 Individual MOOP Max
Char Decoded sentence: TIER 311101111111101 PhO Mon
Word Decoded sentence: TIER 3 11101111111101 MOOP Max 
-Lenght =  50
Input sentence: URTHDATLANTA LLC FAYETI'EVI LLE
GT sentence: ORTHOATLANTA LLC FAYETTEVILLE
Char Decoded sentence: URTHAATLA LACLEALLA VIEWITE L
Word Decoded sentence: URTHDATLANTA LLC FAYETI'EVI LLE 
-Lenght =  50
Input sentence: Merchant I0:
GT sentence: Merchant ID:
Char Decoded sentence: Merchant I0:
Word Decoded sentence: Merchant I0: 
-Lenght =  50
Input sentence: Transaction typo:
GT sentence: Transaction type:
Char Decoded sentence: Transaction typo:
Word Decoded sentence: Transaction typo: 
-Lenght =  50
Input sentence: Approval code:
GT sentence: Approval code:
Char Decoded sentence: Approval code:
Word Decoded sentence: Approval code: 
-Lenght =  50
Input sentence: Dateltime:
GT sentence: Date/time:
Char Decoded sentence: Datetime:
Word Decoded sentence: Dat

-Lenght =  100
Input sentence: Optional Authorization to Disclose Information to Third Parties
GT sentence: Optional Authorization to Disclose Information to Third Parties
Char Decoded sentence: * tient Matonthiletoristion Provident onsimentionth tork perititestionshnstion Yes
Word Decoded sentence: Optional Authorization to Disclose Information to Third Parties 
-Lenght =  100
Input sentence: To assist in the evaluation or administration of my claim(s), I authorize Unum Group, its subsidiaries and duly authorized representatives (“Unum") to share personal health and ﬁnancial information relating to my ciaim with the family members, friends. andfor other third parties listed below:
GT sentence: To assist in the evaluation or administration of my claim(s), I authorize Unum Group, its subsidiaries and duly authorized representatives ("Unum") to share personal health and financial information relating to my claim with the family members, friends, and/or other third parties listed below:
C

-Lenght =  100
Input sentence: Unum is a registered lradsmark and marketing brand of Unum Group and its insuring subsidiaries.
GT sentence: Unum is a registered trademark and marketing brand of Unum Group and its insuring subsidiaries.
Char Decoded sentence: Unum paricaress rof terts rentiss ristronshishing tharserss rits ons ssuerisnd partiersins rightorss
Word Decoded sentence: Unum is a registered lradsmark and marketing brand of Unum Group and its insuring subsidiaries. 
-Lenght =  50
Input sentence: unum'i
GT sentence: Unum
Char Decoded sentence: unum
Word Decoded sentence: unumi 
-Lenght =  50
Input sentence: Confirmation of Coverage
GT sentence: Confirmation of Coverage
Char Decoded sentence: Confirmation of Coverage
Word Decoded sentence: Confirmation of Coverage 
-Lenght =  50
Input sentence: Customer #:
GT sentence: Customer #:
Char Decoded sentence: Customer #:
Word Decoded sentence: Customer #: 
-Lenght =  50
Input sentence: EE Name:
GT sentence: EE Name:
Char Decoded sente

-Lenght =  100
Input sentence: Technique: Multiplanar, multisequence imaging of the left knee was performed without the use of intravenous gadolinium.
GT sentence: Technique: Multiplanar, multisequence imaging of the left knee was performed without the use of intravenous gadolinium.
Char Decoded sentence: Then Pofi:he for Umane,re Fur chithe fon Surgentict Suffip:evintionship, Pher nertent onshnes forken
Word Decoded sentence: Technique: Multiplanar, multisequence imaging of the left knee was performed without the use of intravenous gadolinium. 
-Lenght =  50
Input sentence: FINDINGS:
GT sentence: FINDINGS:
Char Decoded sentence: FINDINGS:
Word Decoded sentence: FINDINGS: 
-Lenght =  100
Input sentence: On coronal sequence, there is a horizontal flap tearthroughout the posterior medial meniscal horn with focal radial tearwithin the posterior horn. Separate complex tear at the junction of the posterior medial meniscai horn and root. The medial meniscai body is extruded by 0.3 am. No dis

-Lenght =  50
Input sentence: p—IEDMONT HEALTHéARé“
GT sentence: PIEDMONT HEALTHCARE
Char Decoded sentence: P—IDENT HEALTHCARER
Word Decoded sentence: p—IEDMONT HEALTHéARé“ 
-Lenght =  50
Input sentence: Group No:
GT sentence: Group No:
Char Decoded sentence: Group No:
Word Decoded sentence: Group No: 
-Lenght =  50
Input sentence: Date:
GT sentence: Date:
Char Decoded sentence: Date:
Word Decoded sentence: Date: 
-Lenght =  50
Input sentence: Explanation of Benefits
GT sentence: Explanation of Benefits
Char Decoded sentence: Explaning on Benefits
Word Decoded sentence: Explanation of Benefits 
-Lenght =  50
Input sentence: Page I 012 (continued on back)
GT sentence: Page 1 of 2 (continued on back)
Char Decoded sentence: Page In012o(e Contined bnsubs)
Word Decoded sentence: Page I 012 (continued on back) 
-Lenght =  50
Input sentence: Provld-or: GEORGE STONE M0
GT sentence: Provider: GEORGE STONE MD
Char Decoded sentence: Provid-r:GORTH STONE SED0
Word Decoded sentence: Provld-or: GEOR

-Lenght =  100
Input sentence: I authorize the followin persons: health care professionals. hospitals, clinics, laboratories, pharmacies and all other medical or me really related providers, facilities or services, rehabilitation profesSionals, vocational evaluators. health plans, insurance companies, third party administrators, insurance producers, insurance service providers. consumer reporting agencres including credit bureaus, GEINEX Se‘mces, LLC, The Advocator Group and other Social Security advocacy vendors, professional licensing bodies, employers, attorneys. ﬁnancial institutions and/or banks, and governmental entities:
GT sentence: I authorize the following persons: health care professionals, hospitals, clinics, laboratories, pharmacies and all other medical or medically related providers, facilities or services, rehabilitation professionals, vocational evaluators. health plans, insurance companies, third party administrators, insurance producers, insurance service providers. 

-Lenght =  100
Input sentence: If I do not sign this authorization or if l_alter or revoke it, except as speciﬁed above, Unum may not be able to evaluate or administer my claim(s),_ which may lead to my claim(s) being denied. i may revoke this authorization at any time by sending written notice to the address above. I understand that revocation Will not apply to any informattﬁn thﬂf ”hi lm I'ﬂt‘tl IDQ‘I'Q nl" ”inﬁll-inn: I'H'ih?
GT sentence: If I do not sign this authorization or if I alter or revoke it, except as specified above, Unum may not be able to evaluate or administer my claim(s), which may lead to my claim(s) being denied. I may revoke this authorization at any time by sending written notice to the address above. I understand that revocation will not apply to any information that Unum requires or discloses
Char Decoded sentence: If knowrevere Yestr of an and ofyse theness chesed to tharke shink rofitestionshiprofithathe the ine
Word Decoded sentence: If I do not sign this aut

-Lenght =  100
Input sentence: Any person who knowingly and with the intent to injure. defraud or deceive an insurance company presents a false or fraudulent claim for payment of a loss or beneﬁt or knowingly presents false information in an application for insurance is guilty of a crime and may be subject to ﬁnes and confinement in prison.
GT sentence: Any person who knowingly and with the intent to injure, defraud or deceive an insurance company presents a false or fraudulent claim for payment of a loss or benefit or knowingly presents false information in an application for insurance is guilty of a crime and may be subject to fines and confinement in prison.
Char Decoded sentence: Aucial chicuredupite the dick ons forint the ponthe for your chinceraifite the inghiviventuristu ing
Word Decoded sentence: Any person who knowingly and with the intent to injure. defraud or deceive an insurance company presents a false or fraudulent claim for payment of a loss or benet or knowingly presen

-Lenght =  50
Input sentence: Medical Pl'oxitler Information — Hospitalization
GT sentence: Medical Provider Information - Hospitalization
Char Decoded sentence: Medical Provider Information  —ospitalization
Word Decoded sentence: Medical Pl'oxitler Information — Hospitalization 
-Lenght =  50
Input sentence: Hospital Name. Minnesota Valley Surgery Center
GT sentence: Hospital Name: Minnesota Valley Surgery Center
Char Decoded sentence: Hospital Name. Vilation  Huspotal Farmenter
Word Decoded sentence: Hospital Name. Minnesota Valley Surgery Center 
-Lenght =  50
Input sentence: Address Line 1: 1000 W 140th St #102
GT sentence: Address Line 1: 1000 W 140th St #102
Char Decoded sentence: Address Line 1: 1000 140 Ent#102
Word Decoded sentence: Address Line 1: 1000 W 140th St #102 
-Lenght =  50
Input sentence: City. Bm‘nsville
GT sentence: City: Burnsville
Char Decoded sentence: City. BmodE Vill
Word Decoded sentence: City. Bm‘nsville 
-Lenght =  50
Input sentence: Claim Tji'pe: VB Accid

KeyboardInterrupt: 

In [None]:
input_texts = ['SUBJECTIVE: This is a S-year-old +@W his left great toe with the handleh lacration.',
               'Thera was no handlebarthe lacration.',
               'Patiet last tet is needing this for school at this',
               'OBJECTIVE : The temp is 99.8, the f tha blood pressure 99/64, O2 sat 94 8/10 at this time.',
               'Left great toe the dorsl surface, extending ta th active hemorrhage at this time.',
               'Th anaathetized with a cotton ball sat Left this in place for 20 minutes.',
               'with Betadine again and injected th he tolerated very well.',
               'The wound sutures. Antibiotic eintment and g',
               'Patient tolerated very well. Pat',
               'IMPRESSION: Lacration te left grs',
               'PLAN: Patent is to do dressing ch advised as far as checking tha waurn it with soap and water.',
               'Sutures oy have any problems prior te that tim ona teaspoon three times a day rer Ibuprofan far pain, discomfort. Cg',
               'hite male who accidently dropped a bike onto ar end hitting the left great toe,'
               'causing a guard to the end of the bike, which caused anus shot is more that three years ago and time.',
               'nlse ef 105 and regular, resprations 286,% on room air.',
               'Patient rates hia pain at — there i15 noted a 3-om laceration across a lateral aspect of tha toe.',
               'There is no e toa ir cleansed with Betadine.',
               'It is then urated with 5 cu of 2% Hylocaine plain.',
               'We then cleansed ae toa with 3 cc of 2% Xylacaina plain',
               'which was then clesed with five 5-0 Prolene ressure dressing was then applied to the tos',
               'paient is given DPT 0.5 ee intramucular (IM).at toe.',
               'Kefylex 250 mg per 5 ml, the next seven days.',
               'He may use Tylenol or 11 if any problems.',
               'Unum Life Insurance Company of America 2211',               
               'Congress Street Portland, Maine 04122',
               'APPLICATION FOR GROUP CRITICAL LLNESS INSURANCE',
               'I Evidence of Insurability',
               '',
               'Application Type: @ New Enrollee Change to',
               'Existing Coverage  Reinstatement  Internal',
               'Replacement  Late Applicant  Rehire SECTION 1:',
               'Employee(Applicant) Information  Always',
               'Complete Employee Name(First, Middle, Last)',
               'Social Security Number Nikolas J Jones',
               '123 - 456 - 7890 Home Address(Street/ PO Box)',
               'Gender 1634 Stewert St  F  M City Date of Birth',
               '(mm / dd / yyyy) Seattle 06 / 15 / 1991 State Zip',
               'Code Home Phone # Washington 98101 854-555-1212',
               'Are you Actively at Work? Employee ID / Payroll #',
               ' Yes  No55624 a.Are you a U.S.Citizen or',
               'Canadian Citizen working in the U.S.? b.Are you',
               'legally authorized to work in  Yes  No(If No',
               'reply to part b) the U.S.?  Yes  No Employer',
               'Name Group Number Date of Hire(mm/ dd / yyyy)',
               'Facebook 11 - 555566 11 / 30 / 2016 Occupation',
               'Eligibility Class Software Engineer 7 Scheduled',
               'Number of Work Hours per Week Work Phone # 35',
               '854-555-6622 SECTION 2: Spouse Information ',
               'Complete Only if applying for Spouse coverage Name',
               '(First, Middle, Last) Social Security Number',
               'Gender Date of Birth(mm / dd / yyyy) Does the',
               '1019 - 07 - AZ 1',
              'if claint is for a child, please state your relationship 10 the child',
              'date of accident 3d _ time of accident ram. 0 p.m.',
              'have you slopped working? (of yes [1 no if yes, what was the last day that you worked? (mm/ddryy)_| —3 | —{% cnslamegs bil =']
               
for input_text in input_texts:
    len_range = max_sent_lengths[-1] # Take the longest range
    for length in max_sent_lengths:
        if(len(input_text) < length):
            len_range = length
            break
    #print(len_range)
    pre_corrected_sentence = word_spell_correct(input_text)
    input_text = clean_up_sentence(input_text, vocab_to_int[len_range])
    encoder_input_data = vectorize_data(input_texts=[input_text], max_encoder_seq_length=max_encoder_seq_length[len_range], num_encoder_tokens=num_encoder_tokens[len_range], vocab_to_int=vocab_to_int[len_range])



    target_text = gt_texts[i]

    input_seq = encoder_input_data
    #print(input_seq.shape)
    #print(max_decoder_seq_length[len_range])
    #print(max_decoder_seq_length)

    decoded_sentence,_  = decode_sequence(input_seq, encoder_model[len_range], decoder_model[len_range], num_decoder_tokens[len_range],  max_decoder_seq_length[len_range], vocab_to_int[len_range], int_to_vocab[len_range])
    corrected_sentence = word_spell_correct(input_text)
    #print('-Lenght = ', len_range)
    print('Input sentence:', input_text)
    #print('Spell Decoded sentence:', pre_corrected_sentence) 
    #print('Char Decoded sentence:', decoded_sentence)   
    print('Word Decoded sentence:', corrected_sentence) 
    print('\n')



In [None]:

input_texts = ['text',
'',
'',
'',
'',
' ',
'',
'',
'',
'Fai',
'10',
'7521509',
'(FISTDEOO)',
'at',
'11/3/2017',
'5:23:19',
'from',
'-9373834004',
'Req',
'IC',
'2017:1030525109:292E.',
'Page',
'4',
'of',
'5',
'(C)',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
'11/3/2017',
'FRI',
'8:26',
'FAX',
'2373834004',
'Kjooas00s',
'',
'',
'',
'as3-ursasy3',
'11:30:11',
'11/2/2017',
'vis',
'',
'',
'',
'®',
'®',
'&',
'ACCIDENT',
'CLAIM',
'FORM',
'',
'uu',
'num’',
'Tha',
'Benelits',
'Canter',
'',
'P.O.',
'Bax',
'100158,',
'Calumbin,',
'EC',
'20202-3150',
'',
'Tol-frea:',
'1-800-635-5587',
'Fax:',
'1-800-447-2488',
'',
'Gall',
'toll-free',
'Monday',
'through',
'Friday,',
'8',
'a.m.',
'lo',
'8',
'p.m,',
'Eagtarn',
'Time.',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
'[',
'ATTENDING',
'PHYSICIAN',
'STATEMENT',
']',
'',
'',
'IneurexiPolicyt',
'alcar',
'Hama',
'(Lael',
'Name,',
'Flis!',
'Nama,',
'MI,',
'Suffix)',
'Data',
'of',
'Risth',
'{msmidrfyy)',
'-',
'',
'',
'Faupi',
'Nana',
'{Laut',
'Hume,',
'Flial',
'Numa,',
'1',
'Sut)',
'Dats',
'al',
'Bln',
'rAvad)',
'Ul',
'_',
'',
'-[ECIpENT',
'DETAILS',
']',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
'a',
'thls',
'Gundilan',
'the',
'result',
'of',
'a',
'acddental',
'inury?',
'ves',
'O',
'No',
'if',
'yas,',
'dale',
'of',
'accident',
'qre/ddlyy)',
'[1',
'0]',
'[z]e',
'[=]',
'',
'',
'',
'Is',
'Mig',
'condition',
'Lhe',
'result',
'of',
'hefer',
'employment',
'£1',
'Yes',
'pNo',
'[1',
'Unknown',
'',
'',
'',
'Plaaze',
'verily',
'treatment',
'for',
'the',
'accident',
'lalad',
'above.',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
'Dalaw',
'of',
'Diagnosis',
'Diagncsis',
'Description',
'Prosadure',
'Procedure',
'Dascription',
'',
'Branden',
'(Including',
'|',
'Cudo',
'(GD)',
'ous',
'',
'Confinement)',
'eR',
'ap',
'HAS',
'TTT',
'',
'BEEF',
'eR',
'',
'wiz]',
'.',
'S33,5XxA',
'Hh',
'rioes',
'ey',
'race',
'Word',
'',
'awqd]',
'',
'weak',
'3',
'n',
'[aveny',
'[d',
'',
'wifi',
'Wl',
'',
'oa',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
'Has',
'lhe',
'pallet',
'bean',
'trastad',
'for',
'tha',
'same',
'ar',
'&',
'S(tilar',
'candillan',
'by',
'anolher',
'phyalelan',
'In',
'tha',
'past?',
'[1',
'Yen',
'Bho',
'',
'M',
'yor,',
'pioona',
'provid',
'tha',
'fares:',
'',
'',
'',
' ',
'',
'',
'',
'Diageosis:',
'Tramiment',
'Daten:',
'',
'',
'',
' ',
'',
'',
'',
' ',
'',
'',
'',
'id',
'ya.1',
'#dving',
'Lhe',
'patient',
'to',
'clap',
'working?',
'RECEIVED',
'',
'It',
'yes,',
'B8',
'of',
'what',
'cate?',
'(mmidkyy)',
'',
'',
'',
'[23]',
'[117]',
'',
'',
'',
'[Ih',
'cielih',
'fa',
'rotated',
'to',
'normal',
'prepnency,',
'please',
'grovida',
'tha',
'idliawing:',
'NOV',
'',
'Expecigd',
'Delivery',
'Dale',
'(mimicd/yy)',
'Aclual',
'Delivery',
'Dale',
'{mmiddlyy',
'',
'',
'',
' ',
'',
'',
'',
'Phyeiclan',
'informaiton',
'HUMAN',
'REGOURCITE',
'',
'',
'',
'FRAUD',
'NOTICE:',
'Any',
'person',
'wha',
'knowingly',
'files',
'&',
'statement',
'of',
'clalm',
'containing',
'FALSE',
'or',
'misleading',
'information',
'8',
'',
'subject',
'to',
'criminal',
'and',
'elvil',
'penallies.',
'This',
'includes',
'Attending',
'Physician',
'portions',
'of',
'the',
'claim',
'farm.',
'',
'',
'',
'CS',
'yma',
'SEAS',
'Ta',
'hve',
'glan',
'=',
'',
'The',
'above',
'statements',
'ara',
'trun',
'And',
'rompints',
'to',
'tho',
'bot',
'of',
'my',
'knowledge',
'and',
'bolluf.',
'',
'',
'',
'Physician',
'Name',
'(Lea!',
'Name,',
'Firat',
'Name,',
'MI,',
'Suita)',
'Plases',
'Print',
'Co',
'FHman',
'log',
'Mm',
'',
'/',
'‘',
'',
'',
'',
'Medical',
'Speclaty',
'[Tr',
'eactal-',
']',
'|',
'D',
'of',
'r',
'of',
'Ch',
'2',
'',
'2',
'Le',
'',
'',
'==',
'Zoi!',
'M',
'o',
'“Fanart',
'',
'',
'=',
'Balfrone',
'ie',
'2',
'Sle',
'iu',
'',
'il',
'HY',
'BY',
'1942',
'Fax',
'Number',
'yz—',
'43',
'-8',
'7775',
'Fhyalafans',
'Tax',
'ID',
'Number.',
'',
'',
'',
'Aro',
'you',
'refateq',
'to',
'hiv',
'pollen?',
'0',
'Yoe',
'LlMo',
'|',
'yes,',
'wal',
'iv',
'the',
'relelianshipT',
'',
'',
'',
' ',
'',
'',
'',
' ',
' ',
'',
'',
'',
'Physlclan',
'Slgnature',
'Date',
'',
'CL-1023',
'-2717',
'=',
'',
'',
'',
' ',
'',
'',
'',
'—']
               
for input_text in input_texts:
    len_range = max_sent_lengths[-1] # Take the longest range
    for length in max_sent_lengths:
        if(len(input_text) < length):
            len_range = length
            break
    #print(len_range)
    pre_corrected_sentence = word_spell_correct(input_text)
    input_text = clean_up_sentence(input_text, vocab_to_int[len_range])
    encoder_input_data = vectorize_data(input_texts=[input_text], max_encoder_seq_length=max_encoder_seq_length[len_range], num_encoder_tokens=num_encoder_tokens[len_range], vocab_to_int=vocab_to_int[len_range])



    target_text = gt_texts[i]

    input_seq = encoder_input_data
    #print(input_seq.shape)
    #print(max_decoder_seq_length[len_range])
    #print(max_decoder_seq_length)

    decoded_sentence,_  = decode_sequence(input_seq, encoder_model[len_range], decoder_model[len_range], num_decoder_tokens[len_range],  max_decoder_seq_length[len_range], vocab_to_int[len_range], int_to_vocab[len_range])
    corrected_sentence = word_spell_correct(input_text)
    #print('-Lenght = ', len_range)
    #print('Input sentence:', input_text)
    #print('Spell Decoded sentence:', pre_corrected_sentence) 
    #print('Char Decoded sentence:', decoded_sentence)   
    
    #print('Word Decoded sentence:', corrected_sentence) 
    print(corrected_sentence) 
    #print('\n')



In [20]:
input_texts = ['☑ @ New Enrollee ☐ Change to Existing Coverage ☐ Reinstatement', 
               'This claim is for: |☒ Self ☐ Spouse ☐ Domestic Partner ☐ Dependent Child',
               'Toll-free: 1-800-635-5597 Fax: 1-800-447-2498',
               'CLAIM FORW',
               'Sccial Security Number',
               'CLAIM FOR',
               'Nedical Information',
               'CLAIN FOR',
               'Contusion',]
for input_text in input_texts:
    len_range = max_sent_lengths[-1] # Take the longest range
    for length in max_sent_lengths:
        if(len(input_text) < length):
            len_range = length
            break
    #print(len_range)
    #print(input_text)
    pre_corrected_sentence = word_spell_correct(input_text)
    #print(pre_corrected_sentence)

    input_text_ = input_text
    input_text = clean_up_sentence(input_text, vocab_to_int[len_range])
    encoder_input_data = vectorize_data(input_texts=[input_text], max_encoder_seq_length=max_encoder_seq_length[len_range], num_encoder_tokens=num_encoder_tokens[len_range], vocab_to_int=vocab_to_int[len_range])



    target_text = gt_texts[i]

    input_seq = encoder_input_data
    #print(input_seq.shape)
    #print(max_decoder_seq_length[len_range])
    #print(max_decoder_seq_length)

    decoded_sentence,_  = decode_sequence(input_seq, encoder_model[len_range], decoder_model[len_range], num_decoder_tokens[len_range],  max_decoder_seq_length[len_range], vocab_to_int[len_range], int_to_vocab[len_range])
    corrected_sentence = word_spell_correct(input_text_)
    #print('-Lenght = ', len_range)
    print('Input sentence:', input_text_)
    #print('Spell Decoded sentence:', pre_corrected_sentence) 
    #print('Char Decoded sentence:', decoded_sentence)   
    
    print('Word Decoded sentence:', corrected_sentence) 
    #print(corrected_sentence) 
    #print('\n')



Input sentence: ☑ @ New Enrollee ☐ Change to Existing Coverage ☐ Reinstatement
Word Decoded sentence: ☑ @ New Enrollee ☐ Change to Existing Coverage ☐ Reinstatement 
Input sentence: This claim is for: |☒ Self ☐ Spouse ☐ Domestic Partner ☐ Dependent Child
Word Decoded sentence: This claim is for: |☒ Self ☐ Spouse ☐ Domestic Partner ☐ Dependent Child 
Input sentence: Toll-free: 1-800-635-5597 Fax: 1-800-447-2498
Word Decoded sentence: Toll-free: 1-800-635-5597 Fax: 1-800-447-2498 
Input sentence: CLAIM FORW
Word Decoded sentence: CLAIM FOR 
Input sentence: Sccial Security Number
Word Decoded sentence: Social Security Number 
Input sentence: CLAIM FOR
Word Decoded sentence: CLAIM FOR 
Input sentence: Nedical Information
Word Decoded sentence: Medical Information 
Input sentence: CLAIN FOR
Word Decoded sentence: PLAIN FOR 
Input sentence: Contusion
Word Decoded sentence: Contusion 


# Handwriting correction

In [None]:
num_samples = 1000000

OCR_data = os.path.join(data_path, 'handwritten_output.txt')
input_texts, target_texts, gt_texts = load_data_with_gt(OCR_data, num_samples, max_sent_len=10000, min_sent_len=0, delimiter='|', gt_index=0, prediction_index=1)

# Sample data
print(len(input_texts))
for i in range(100):
    print(input_texts[i], '\n', gt_texts[i])

In [None]:
#for seq_index in range(len(input_texts)):
results = open('RESULTS_HW.md', 'w')
results.write('|HW sentence|Corrected sentence|GT sentence|\n')
results.write('|---------------|-----------|----------------|\n')
decoded_sentences = []
corrected_sentences = []
for input_text, target_text in zip(input_texts, target_texts):
    len_range = max_sent_lengths[-1] # Take the longest range
    for length in max_sent_lengths:
        if(len(input_text) < length):
            len_range = length
            break
    #print(len_range)
    #print(input_text)
    pre_corrected_sentence = word_spell_correct(input_text)
    #print(pre_corrected_sentence)
    
    input_text = clean_up_sentence(input_text, vocab_to_int[len_range])
    encoder_input_data = vectorize_data(input_texts=[input_text], max_encoder_seq_length=max_encoder_seq_length[len_range], num_encoder_tokens=num_encoder_tokens[len_range], vocab_to_int=vocab_to_int[len_range])



    #target_text = target_texts[i]

    input_seq = encoder_input_data
    #print(input_seq.shape)
    #print(max_decoder_seq_length[len_range])
    #print(max_decoder_seq_length)

    decoded_sentence,_  = decode_sequence(input_seq, encoder_model[len_range], decoder_model[len_range], num_decoder_tokens[len_range],  max_decoder_seq_length[len_range], vocab_to_int[len_range], int_to_vocab[len_range])
    corrected_sentence = word_spell_correct(input_text)
    #print('-Lenght = ', len_range)
    print('Input sentence:', input_text)
    #print('Spell Decoded sentence:', pre_corrected_sentence) 
    #print('Char Decoded sentence:', decoded_sentence)   
    
    print('Word Decoded sentence:', corrected_sentence)
    print('Ground truth sentence:', target_text)
    results.write(' | ' + input_text + ' | ' + corrected_sentence + ' | '+ target_text.strip() + ' | \n')
    decoded_sentences.append(decoded_sentence)
    corrected_sentences.append(corrected_sentence)
    #print(corrected_sentence) 
    #print('\n')
    
results.close()


In [None]:
WER_spell_correction = calculate_WER(gt_texts, decoded_sentences)
print('WER_spell_correction |TEST= ', WER_spell_correction)

In [None]:
WER_spell_word_correction = calculate_WER(gt_texts, corrected_sentences)
print('WER_spell_word_correction |TEST= ', WER_spell_word_correction)

In [None]:
WER_OCR = calculate_WER(gt_texts, input_texts)
print('WER_OCR |TEST= ', WER_OCR)