In [1]:
from __future__ import print_function
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, GRU, Dot, TimeDistributed, Activation, Embedding
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk.tokenize import word_tokenize
import re
import os
import tarfile
%matplotlib inline

Using TensorFlow backend.


# Utility functions

In [2]:
# Limit gpu allocation. allow_growth, or gpu_fraction
def gpu_alloc():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

In [3]:
gpu_alloc()

In [4]:
# Artificial noisy spelling mistakes
def noise_maker(sentence, threshold):
    '''Relocate, remove, or add characters to create spelling mistakes'''
    letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m',
           'n','o','p','q','r','s','t','u','v','w','x','y','z',]
    noisy_sentence = []
    i = 0
    while i < len(sentence):
        random = np.random.uniform(0, 1, 1)
        # Most characters will be correct since the threshold value is high
        if random < threshold:
            noisy_sentence.append(sentence[i])
        else:
            new_random = np.random.uniform(0, 1, 1)
            # ~33% chance characters will swap locations
            if new_random > 0.67:
                if i == (len(sentence) - 1):
                    # If last character in sentence, it will not be typed
                    continue
                else:
                    # if any other character, swap order with following character
                    noisy_sentence.append(sentence[i + 1])
                    noisy_sentence.append(sentence[i])
                    i += 1
            # ~33% chance an extra lower case letter will be added to the sentence
            elif new_random < 0.33:
                random_letter = np.random.choice(letters, 1)[0]
                noisy_sentence.append(random_letter)
                noisy_sentence.append(sentence[i])
            # ~33% chance a character will not be typed
            else:
                pass
        i += 1

    return ''.join(noisy_sentence)

In [5]:
def load_data_with_gt(file_name, num_samples, max_sent_len, min_sent_len, delimiter='\t', gt_index=1, prediction_index=0):
    '''Load data from txt file, with each line has: <TXT><TAB><GT>. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    for row in open(file_name, encoding='utf8'):
        if cnt < num_samples :
            #print(row)
            sents = row.split(delimiter)
            if (len(sents) < 2):
                continue
            input_text = sents[prediction_index]
            
            target_text = '\t' + sents[gt_index] + '\n'
            if len(input_text) > min_sent_len and len(input_text) < max_sent_len and len(target_text) > min_sent_len and len(target_text) < max_sent_len:
                cnt += 1
                
                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(sents[gt_index])
    return input_texts, target_texts, gt_texts

In [6]:
def load_raw_data(file_name, num_samples, max_sent_len, min_sent_len):
    '''Load data from txt file, with each line has: <TXT><TAB><GT>. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    for row in open(file_name, encoding='utf8'):
        if cnt < num_samples :
            input_text = row
            if len(input_text) > min_sent_len and len(input_text) < max_sent_len:
                cnt += 1
                
                input_texts.append(input_text)
    return input_texts

In [7]:
def load_data_with_noise(file_name, num_samples, noise_threshold, max_sent_len, min_sent_len):
    '''Load data from txt file, with each line has: <TXT>. The GT is just a noisy version of TXT. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    while cnt < num_samples :
        for row in open(file_name, encoding='utf8'):
        #for row in open(file_name):
            if cnt < num_samples :
                sents = row.split("\t")
                input_text = noise_maker(sents[1], noise_threshold)
                input_text = input_text[:-1]

                target_text = '\t' + sents[1] + '\n'            
                if len(input_text) > min_sent_len and len(input_text) < max_sent_len and len(target_text) > min_sent_len and len(target_text) < max_sent_len:
                    cnt += 1
                    input_texts.append(input_text)
                    target_texts.append(target_text)
                    gt_texts.append(target_text[1:-1])
                    
    return input_texts, target_texts, gt_texts

In [8]:
def load_medical_terms_with_noise(json_file, num_samples, noise_threshold):
    with open(json_file) as f:
        med_terms_dict = json.load(f)
    med_terms = list(med_terms_dict.keys())
    input_texts = []
    gt_texts = []
    target_texts = []
    cnt = 0
    while cnt < num_samples:
        for term in med_terms:
            if cnt < num_samples :
                input_text = noise_maker(term, noise_threshold)
                input_text = input_text[:-1]   

                target_text = '\t' + term + '\n'

                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(target_text[1:-1])        
                cnt += 1
    return input_texts, target_texts, gt_texts, med_terms_dict

In [9]:
def load_accidents_terms_with_noise(file_name, limit, num_samples, noise_threshold):

    f = open(file_name, encoding='utf8')
    line = 0    
    med_terms = []
    try:
        for r in f:
            if(line < limit):

                med_terms.extend(r.split('|'))
                line += 1
    except:
        print('finished')
    input_texts = []
    gt_texts = []
    target_texts = []
    cnt = 0
    while cnt < num_samples:
        for term in med_terms:
            if cnt < num_samples :
                input_text = noise_maker(term, noise_threshold)
                input_text = input_text[:-1]   

                target_text = '\t' + term + '\n'

                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(target_text[1:-1])        
                cnt += 1
                
    return input_texts, target_texts, gt_texts

In [10]:
def load_procedures_tests_with_noise(file_name, num_samples, noise_threshold):
    '''Load data from txt file, with each line has: <TXT>. The GT is just a noisy version of TXT. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    while cnt < num_samples :
        for row in open(file_name, encoding='utf8'):
        #for row in open(file_name):
            if cnt < num_samples :
                
                input_text = noise_maker(row, noise_threshold)
                input_text = input_text[:-1]

                target_text = '\t' + row + '\n'            

                cnt += 1
                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(target_text[1:-1])
                    
    return input_texts, target_texts, gt_texts

In [11]:
def process_word(word):
    # Try to correct the word from known dict
    #word = spell(word)
    # Option 1: Replace special chars and digits
    #processed_word = re.sub(r'[\\\/\-\—\:\[\]\,\.\"\;\%\~\(\)\{\}\$\#\?\●\@\+\-\*\d]', r'', w.lower())
    
    # Option 2: skip all words with special chars or digits
    if(len(re.findall(r'[\\\/\-\—\:\[\]\,\.\"\;\%\~\(\)\{\}\$\#\?\●\@\+\-\*\d]', word.lower())) == 0):
        processed_word = word
    else:
        processed_word = 'UNK'

    # Skip stop words
    #stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]        
    stop_words = []        
    if processed_word in stop_words:
        processed_word = 'UNK'
        
    return processed_word

In [12]:
def build_vocab(all_texts):
    '''Build vocab dictionary to victorize chars into ints'''
    vocab_to_int = {}
    count = 0 # Start index for any char will be 1, as 0 is masked by the Embedding/Masking layer
    codes = ['UNK', ' ', '\t','\n']# Start 'UNK' at the first entry, to keep its index=0 to be masked
    for code in codes:
        if code not in vocab_to_int:
            vocab_to_int[code] = count
            count += 1    
    
    for sentence in all_texts:       
        for word in word_tokenize(sentence):
            word = process_word(word)
            if word not in vocab_to_int:
                vocab_to_int[word] = count
                count += 1


    '''''Build inverse translation from int to word'''
    int_to_vocab = {}
    for word, value in vocab_to_int.items():
        int_to_vocab[value] = word
        
    return vocab_to_int, int_to_vocab

# Load data

In [13]:
data_path = '../../dat/'

In [14]:
max_sent_len = 1000000
min_sent_len = -1

In [15]:
input_texts = []

# Load tesseract correction

In [16]:
'''
num_samples = 1000000

# Dont add noisy or input mistakes as known words
tess_correction_data = os.path.join(data_path, 'all_ocr_data_2.txt')
input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)
input_texts += target_texts_OCR

tess_correction_data = os.path.join(data_path, 'field_class_21.txt')
input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)
input_texts += target_texts_OCR

tess_correction_data = os.path.join(data_path, 'field_class_32.txt')
input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)
input_texts += target_texts_OCR

tess_correction_data = os.path.join(data_path, 'field_class_30.txt')
input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)
input_texts += target_texts_OCR
'''

"\nnum_samples = 1000000\n\n# Dont add noisy or input mistakes as known words\ntess_correction_data = os.path.join(data_path, 'all_ocr_data_2.txt')\ninput_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)\ninput_texts += target_texts_OCR\n\ntess_correction_data = os.path.join(data_path, 'field_class_21.txt')\ninput_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)\ninput_texts += target_texts_OCR\n\ntess_correction_data = os.path.join(data_path, 'field_class_32.txt')\ninput_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)\ninput_texts += target_texts_OCR\n\ntess_correction_data = os.path.join(data_path, 'field_class_30.txt')\ninput_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)\ninput_texts += target_texts_OCR\

In [17]:
num_samples = 1000000
files_list = ['all_ocr_data_2.txt', 'field_class_21.txt', 'field_class_22.txt', 'field_class_23.txt', 'field_class_24.txt', 'field_class_25.txt', 'field_class_26.txt', 'field_class_27.txt', 'field_class_28.txt', 'field_class_29.txt', 'field_class_30.txt', 'field_class_31.txt', 'field_class_32.txt', 'field_class_33.txt', 'field_class_34.txt', 'NL-14622714.txt', 'NL-14627449.txt', 'NL-14628986.txt', 'NL-14631911.txt', 'NL-14640007.txt']
#desired_file_sizes = [num_samples, num_samples, num_samples, num_samples]


#for file_name, num_file_samples in zip(files_list, desired_file_sizes):
for file_name in files_list:
    tess_correction_data = os.path.join(data_path, file_name)
    input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)

    input_texts += input_texts_OCR

# Load HW terms

In [18]:

hw_correction_data = os.path.join(data_path, 'handwritten_output.txt')
input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(hw_correction_data, num_samples, max_sent_len, min_sent_len, delimiter='|', gt_index=0, prediction_index=1)
input_texts += target_texts_OCR

# Load clean claims forms

In [19]:
num_samples = 10000
file_name = os.path.join(data_path, 'Test-Example-22.txt')
input_texts_CleanClaims = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)

In [20]:
input_texts += input_texts_CleanClaims

# Load Medical Terms dictionary

In [21]:
json_file = os.path.join(data_path, 'abbrevs.json')
threshold = 1.0
num_samples = 0
# We only need medical terms dict here to append it to the final dict at the end
input_texts_MedTerms, target_texts_MedTerms, _, med_terms_dict = load_medical_terms_with_noise(json_file, num_samples, threshold)

# Load Medical Instruction dictionary

In [22]:
num_samples = 10000
file_name = os.path.join(data_path, 'medical_instructions.txt')
input_texts_MedInstructions = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)


In [23]:
input_texts += input_texts_MedInstructions

# Load accident terms

In [24]:
num_samples = 10000
file_name = os.path.join(data_path, 'AccidentsL.txt')
#input_texts_AccTerms = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)

In [25]:
# Dont add noisy or input mistakes as known words
#input_texts += input_texts_AccTerms
#target_texts += target_texts_AccTerms
#input_texts += input_texts_AccTerms

# Load procedures and tests

In [26]:
file_name = os.path.join(data_path, 'procedures_tests.txt')
num_samples = 10000
input_texts_ProcTests = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)

In [27]:
# Dont add noisy or input mistakes as known words
#input_texts += input_texts_ProcTests
#target_texts += target_texts_ProcTests
input_texts += input_texts_ProcTests

In [28]:
# Sample data
print(len(input_texts))
for i in range(10):
    print(input_texts[i], '\n')

19316
Claim Type: VB Accident - Accidental Injury 

“Tho Th 9 Reported Even [ Happen ed To: EnployeefPolicyholder 

Pol inyhold elm-Chm er [11 form arlon 

First Name: 

Middle Nameﬂnitial: 

Last Name: 

Social S ecurity Number: 

Birth Date: 

Gender: 

Language Preference: 



## Build vocab

In [29]:
all_texts = input_texts
vocab_to_int, int_to_vocab = build_vocab(all_texts)
np.savez('vocab-words-all-terms', vocab_to_int=vocab_to_int, int_to_vocab=int_to_vocab, max_sent_len=max_sent_len, min_sent_len=min_sent_len )

In [30]:
input_characters = sorted(list(vocab_to_int))
target_characters = sorted(list(vocab_to_int))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])

In [31]:
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)

Number of samples: 19316
Number of unique input tokens: 16994
Number of unique output tokens: 16994
Max sequence length for inputs: 2439


In [32]:
vocab_to_int # Some special chars need to be removed TODO: Data cleaning

{'UNK': 0,
 ' ': 1,
 '\t': 2,
 '\n': 3,
 'Claim': 4,
 'Type': 5,
 'VB': 6,
 'Accident': 7,
 'Accidental': 8,
 'Injury': 9,
 '“': 10,
 'Tho': 11,
 'Th': 12,
 'Reported': 13,
 'Even': 14,
 'Happen': 15,
 'ed': 16,
 'To': 17,
 'EnployeefPolicyholder': 18,
 'Pol': 19,
 'inyhold': 20,
 'er': 21,
 'form': 22,
 'arlon': 23,
 'First': 24,
 'Name': 25,
 'Middle': 26,
 'Nameﬂnitial': 27,
 'Last': 28,
 'Social': 29,
 'S': 30,
 'ecurity': 31,
 'Number': 32,
 'Birth': 33,
 'Date': 34,
 'Gender': 35,
 'Language': 36,
 'Preference': 37,
 'Address': 38,
 'Line': 39,
 'StatefPrmince': 40,
 'Postal': 41,
 'Code': 42,
 'Comtry': 43,
 'Best': 44,
 'Phone': 45,
 'to': 46,
 'be': 47,
 'Reached': 48,
 'During': 49,
 'the': 50,
 'Day': 51,
 'Email': 52,
 'f': 53,
 '”': 54,
 'RADIOLOGY': 55,
 'REPORT': 56,
 'UNKNOWN': 57,
 'Techiqn': 58,
 'vim': 59,
 'left': 60,
 'mist': 61,
 'FINDINGS': 62,
 'Awmmossiﬁcdunﬁyuencﬁdalbmdmcﬁoﬂ': 63,
 'Nofoaal': 64,
 "abmmaﬁtyisseenhﬂ'n": 65,
 'Mountings': 66,
 'IMPRESSION': 67,


In [33]:
int_to_vocab

{0: 'UNK',
 1: ' ',
 2: '\t',
 3: '\n',
 4: 'Claim',
 5: 'Type',
 6: 'VB',
 7: 'Accident',
 8: 'Accidental',
 9: 'Injury',
 10: '“',
 11: 'Tho',
 12: 'Th',
 13: 'Reported',
 14: 'Even',
 15: 'Happen',
 16: 'ed',
 17: 'To',
 18: 'EnployeefPolicyholder',
 19: 'Pol',
 20: 'inyhold',
 21: 'er',
 22: 'form',
 23: 'arlon',
 24: 'First',
 25: 'Name',
 26: 'Middle',
 27: 'Nameﬂnitial',
 28: 'Last',
 29: 'Social',
 30: 'S',
 31: 'ecurity',
 32: 'Number',
 33: 'Birth',
 34: 'Date',
 35: 'Gender',
 36: 'Language',
 37: 'Preference',
 38: 'Address',
 39: 'Line',
 40: 'StatefPrmince',
 41: 'Postal',
 42: 'Code',
 43: 'Comtry',
 44: 'Best',
 45: 'Phone',
 46: 'to',
 47: 'be',
 48: 'Reached',
 49: 'During',
 50: 'the',
 51: 'Day',
 52: 'Email',
 53: 'f',
 54: '”',
 55: 'RADIOLOGY',
 56: 'REPORT',
 57: 'UNKNOWN',
 58: 'Techiqn',
 59: 'vim',
 60: 'left',
 61: 'mist',
 62: 'FINDINGS',
 63: 'Awmmossiﬁcdunﬁyuencﬁdalbmdmcﬁoﬂ',
 64: 'Nofoaal',
 65: "abmmaﬁtyisseenhﬂ'n",
 66: 'Mountings',
 67: 'IMPRESSION',


In [34]:
len(int_to_vocab)

16994

In [35]:
f = open('vocab.txt', 'w')
for term in vocab_to_int.keys():
    print(term)
    f.write(term + '\n')

UNK
 
	


Claim
Type
VB
Accident
Accidental
Injury
“
Tho
Th
Reported
Even
Happen
ed
To
EnployeefPolicyholder
Pol
inyhold
er
form
arlon
First
Name
Middle
Nameﬂnitial
Last
Social
S
ecurity
Number
Birth
Date
Gender
Language
Preference
Address
Line
StatefPrmince
Postal
Code
Comtry
Best
Phone
to
be
Reached
During
the
Day
Email
f
”
RADIOLOGY
REPORT
UNKNOWN
Techiqn
vim
left
mist
FINDINGS
Awmmossiﬁcdunﬁyuencﬁdalbmdmcﬁoﬂ
Nofoaal
abmmaﬁtyisseenhﬂ'n
Mountings
IMPRESSION
Noawemamliy
harmed
Daytime
Eran
r
Information
Stopped
‘
Workinng
Yes
Physically
at
Work
Hours
Worked
on
Scheduled
''
Explanation
ofChange
in
Schedule
I
was
light
duty
injury
of
wrist
but
last
date
worked
due
other
ofknee
is
What
put
me
out
ofwork
Missed
Returned
No
Description
Rolling
pt
over
during
adl
care
and
weight
hoyer
lift
CNA
with
lost
grip
While
side
pushed
back
hyperﬂexed
it
Cracked
ulna
torn
ligament
damage
Re
lated
Time
ofAccident
Diagnosis
lthiscopic
surgery
Surg
Inform
arion
Is
Surger
Required
Surgery
Irmatienthutpat

fli
rnmiddfyy
q
penalties
portions
beat
LIZAE
E'E'H
Edgewood
FACESHEE
MRN
Doe
Sex
Demographics
SSN
xxxexxvmocx
EEFU'I
Fho
ne
Empioyer
Verified
Renew
Admission
Intormation
Admitting
Srowder
Larkin
John
Admissaon
Elective
Incompiete
Area
ELIEABETH
SERVJCE
AREA
EDG
SC
CRESTWEW
Roomi
Eed
EDGSCCEEDGSCC
Discharged
Confirmedi
Acct
Class
Same
Bilied
Guarantor
Relation
Sei
SEH
amrly
PIC
Payori'ﬁlan
Precert
Subscriber
Operative
Notee
Brief
ELIZABETH
EUGEWOOD
OP
MFIN
Adm
DIG
ccntinued
SHEIZDW
continucdl
lljtrtia
bilr'nazn
fatitiﬁ
lei
Die
Lzl
Cele
Sigs'led
Ellizur
Elizabeth
Healthcare
DPEHATIVEIPHOCEDUFIE
NOTE
Body
mass
index
kgfmz
DIAGNOSIS
Bucket
handle
medial
meniscus
initial
encounter
PROCEDURES
LEFT
KNEE
AFITHHOSCCIPY
DEBFI'IDEMENT
MEDIAL
MEISCECTUMY
CHDNDRDPLASTY
PFC
SURGEONS
Surgeonts
Role
rimaryr
ANESTHESIA
SPEClMENS
specimens
log
ESTIMATED
BLOOD
LOSS
DISPUSI'I'iOhHPOST
PFIOC
COURSE
PACU
Lark'rn
Eilidbt
lh
EDGEonD
DP
Dos
arc
AM
butt
Orthopedio
tr
Ego
Physioan
Flea
Daletlucs
Edsur
CIF
OPERA

Namedugl
Medi
pecplty
relatu
Conhmuecj
un
toiiat
INSUREDIPOLFCYHOLDERIPATEENT
insureds
Mil
Tteating
Physrcian
Additionai
bills
vehicie
incidenﬂacctdent
medicat
considerations
poiicy
income
wrthom
recruited
andror
protection
Arizona
taw
requires
intent
injure
defraud
deceive
faise
fraudulent
toss
beneﬁt
application
guilty
crime
ﬁnes
prison
York
materialty
conceals
concerning
fact
thereto
commits
aiso
penalty
exceed
ﬁve
thousand
dollars
value
violation
lnsureleolicyholder
read
notices
shouid
reason
obligation
repay
overpayment
knowtedge
signature
consideration
requ'iréd
Optimal
ismtwwmomnormm
MERE
Marga
Mam
yum
Timei
authorizationto
Ponabiiity
hospitais
ctinilc
vocationai
evaiuators
GEINEX
Sewices
LLC
Sooal
protessronat
empioyers
aicohol
reiease
professonai
Tnfﬂuranfce
clatims
orma
Paut
inciudin
subse
uent
ﬁnancialmana
ement
whic
ever
osed
byAHiPAA
resistant
prowding
orany
Sepial
Secun
iength
otherwrse
atter
abte
evaiuate
claimts
feed
wiii
discioses
nsur
ure
éigneé
SOCIEET
security
Conse

preference
RTC
WORKING
SAFELY
UNABLE
ADVISE
PI
RATING
CAPACITY
PSI
REGULAR
DUTY
MODIFIED
YESINO
DESCRIBE
Rem
PERMANENT
IMPAIRMENT
EXPECTED
MMI
REACHED
Biddeford
Werkwell
SMMC
Drive
SIGNATURE
TELEPHONE
NARRATIVES
ATTACHED
wca
DISTRIBUTION
PRAcTn'IDNER
INSURANUE
SszFOI
EimloyeerPolicyholdei
Contlnued
CON
FINEMENTIINTENSIVE
WNO
mmi'ddiyy
Surge
ddlyy
lCD
mmI'ddI'yy
choose
rocedure
TICE
issub'ect
patierit
'II_
ocial
Binh
Descn'ptiotr
occured
participating
Codc
MCL
Meniscus
kncc
Irmatienthwpatient
llctlica
Pl'mitlcl
Jason
Prowider
Holm
Dually
Pl'oxitler
Minnesota
Valley
Produce
Ciotmtry
ot'Visit
AdmissiorL
ofDisehaJ'ge
Eulpl
artl'onie
Su
hmission
igncd
unumL
tolreceive
deSIgned
notesil
Spoial
SEND
UN'I
jzgffa
Visa
MasterCard
Exp
REMITTANCE
Forward
ARTHROSCOPICALLY
AIDED
ANTERIOR
Contractual
Write
REPAIR
TORN
LIGAMENT
INJECTION
ANESTHETIC
AGENT
FEM
ULTRASONIC
GUIDANCE
NEEDLE
OPPOSING
u_
PROSTHETIC_IMPLANT
OTHERWIS
étianﬁ
'K
Cantractual
Writc'cff
ACCOURw
QUEETIONS
UNITED
THAFK
Fatient
TWIN
rag

palient
¥saling
provillers
corvacl
infenation
nay
Andress
advisad
Ic
Tas
pno
Expeeted
retutn
Ful
Z
Functlenal
nof
behavicral
healit
RESTRIGTIONS
acliviies
nal
aclivities
cannat
co
mitial
understoed
prolanged
repatilive
tive
sporific
totafty
Plsase
restrictinns
mmvddfyy
ry
Irye
anc
camplete
heliof
Physicizn
ily
Z|
FLNG
relationsh
Physifian
GRAMERCY
PARK
PHYSICAT
RESABILITATION
ALL
GUY
Diplamaie
vf
MED
IANE
REBABILITATION
SUITE
FOREST
Chore
Aun
Ave
Phones
Fae
pslza
Fain
Level
nbove
presentd
Paticor
workingfnot
exercices
Ls
Bualualion
passive
ROB
Diffusely
render
dilfisefdarasin
HOM
Flexion
Rotation
Sperling
Diffuscly
diffs
irigper
poinis
Extension
Jo
fan
Flexton
_i
wou
UF
ric
testsi
por
___Lasepue
Iilateral
Flexior
BR
TR
deprees
Adduction
Atilucrtion
degreas
Impingement
____
Haskin
Neer
Brier
BErop
Yergason
Cross
adduweron
Formal
Spesial
tes
Mchurray
Acley
Patelfar
ARDM
MP
rp
Oryer
Antalgic
Meds
phin
ist
previnus
trestment
iest
qriered
€
Siatys
Totally
Disabiggd
lly
Disabled
Mot
Repardin

ming
lisl
confinemenl
li
lagt
sufix
narne
mmddyy
bl
willoch
bilis
additienal
mey
avaluate
conslderationg
beneflt
paymeanta
poly
inaame
amployer
benelits
sllualion
nave
yuestions
aboul
lax
silustion
signatura
guardian
berefits
partly
anlar
resicrac
parson
hiv
i'signed
las
mame
tmmiddfyy
falleal
marne
lael
nemes
sif
biith
bccidental
mw
aggidant
i=
amployment
b'no
unknowns
plaaee
treatmen
accidant
fisted
spove
riggriosls
diegnoals
pracedura
confinament
oee
fesser
anolher
pleaye
dlagnasis
daras
pry
idim
provige
follwing
expecied
aclual
dallvery
gc
slgnature
physiclan
beet
nama
wochtd
terr
medica
specially
ternal
mediune
medfx
cily
£ip
tee
physician'a
vez
wl
leeph¥eician
signature|
canter
fmla
autharize
tnsurey
mafe
tanad
printéd
medexpress
dob
norwin
huntingdon
pittsburgh
sax
picnic
alle
rgies
allargies
vitals
bp
mmhg
resp
bmi
lmp
pmp
willochell
tari
meadexpress
dor
lesser
nall
gat
plenty
recovering
hear
elevated
awake
againstthe
lv
avoid
frostbite
washcloth
elastic
layer
clothing
ica
meds

AMBULATORY
HLT
concem
Retum
sang
foloving
Jer
hf
pormangii
litticany
Keyboarding
excess
of____
Pounds
excels
reaching
VENTURACOUNTY
AGENCY
TATIONS
WHITE
Cnlo
YELLOW
Pallont
ANACAPA
SURGICAL
ooh
pestle
aT
mote
Lome
Roed
Cool
poh
pra
cbd
Lic
DEA
reo
Anirh
Addreet
zn
Rot
NR
Label
Imp
cd
koM
Harte
Bufix
|ACCIDENT
accidentat
mjury
H☒
emplayment
gb
abo
Desoription
trio
ano
Aefod
_cdpruio
Lofh
cOhsed
oct
melibind
pid
boltief
DMovCo
Speciaity
Toa
Begrelo
Zj
_c
Nahe
Pom
ralatad
ATTACH
RECORDS
THREE
MONTHS
PRESENT
Obstatrical
Cutrent
Oiher
Monthiy
arising
ayrnptoms
appaared
apeident
consulted
_FI'res
LMP
Lmp
hospitat
continuausly
Hao
ToD
Estimate
patiery
shauid
ablg
partially
Impairment
lrdtation
functional
capacity
capahln
Sight
frmietion
capabla
Glass
limitation
incapable
clerical
acitvlty
Savere
minimal
Remarks
defined
Foderal
Dictionary
Gecupational
Tiles
engage
interparsonal
relations
Ciasa
abla
sityations
limiter
intarperaonal
rly
gage
enly
interparsanal
moderato
lrnitations
siress
interpe

Jochai
Tanis
Israel
fulfil
WiFe
secular
Talbot
BONGANI
MOYO
ABove
ZIMBABWE
Interrupted
SD
Sally
cross
plywood
Tory
Polaris
Pkwy
EGGEWOOT
VINEI
BAILLIEPARK
POTCHEFSTROOM
GLENVISTA
anxious
Mistah
Piers
paralysed
trout
SIp
challenge
publishes
Madagascar
sigh
works
Penn
AVINASH
Grootbrakrivie
needy
tary
Ministry
Agriculture
Harleysville
fewer
sparked
installation
tool
elementary
composed
experienced
Duron
CJ
confessed
everything
coming
aides
sake
Roerber
Cheryl
Education
Freedom
Speech
Returning
PEDIATRIC
panel
lifelong
Cristiaan
Bouquet
Rosettenville
Bonald
Jacobus
Harris
Webb
Whitney
leadership
convicted
sabotage
BAREnID
POMONA
incidents
compulsory
savings
Exchequer
computed
swing
Roodezundtstraat
Unless
Ypsilanti
contribution
neutrals
hoped
MIDDELBURG
Ethridqe
unflinching
witness
Man
Dog
bite
demonstrators
shopgirls
overalls
Ares
outer
vertical
du
jour
complicates
moleboheng
yards
outside
affluence
seemed
Marianne
feeding
tray
slide
slackness
appeasement
kit
Probably
Biceps
Adele
Fourie

In [36]:
'''
#tar_path = os.path.abspath(os.path.dirname(__file__))
tar_path = os.getcwd() + '/autocorrect'
tar_file_name = os.path.join(tar_path, 'words.tar')
t = tarfile.open(tar_file_name, 'r:tar')
t.extractfile(tar_file_name)
'''

"\n#tar_path = os.path.abspath(os.path.dirname(__file__))\ntar_path = os.getcwd() + '/autocorrect'\ntar_file_name = os.path.join(tar_path, 'words.tar')\nt = tarfile.open(tar_file_name, 'r:tar')\nt.extractfile(tar_file_name)\n"

In [37]:
!tar -xvf autocorrect/words.tar

words/
words/en_US_GB_CA_mixed.txt
words/big_orig.txt
words/._big.txt
words/big.txt
words/en_US_GB_CA_lower.txt


In [38]:
f_big_orig = open('words/big_orig.txt', 'r')
f_vocab = open('vocab.txt', 'r')
f_big = open('words/big.txt', 'w')
for line in f_big_orig:
    f_big.write(line)
for line in f_vocab:
    f_big.write(line)
    

f_big_orig.close()
f_big.close()
f_vocab.close()

In [39]:
!tar -cvf autocorrect/words.tar words 

words/
words/en_US_GB_CA_mixed.txt
words/big_orig.txt
words/._big.txt
words/big.txt
words/en_US_GB_CA_lower.txt


In [40]:
!rm -rf words/