In [1]:
from __future__ import print_function
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, GRU, Dot, TimeDistributed, Activation, Embedding
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk.tokenize import word_tokenize
import re
import os
import tarfile
%matplotlib inline

Using TensorFlow backend.


# Utility functions

In [2]:
# Limit gpu allocation. allow_growth, or gpu_fraction
def gpu_alloc():
    '''Register a TensorFlow session with GPU allow_growth enabled as the Keras session.'''
    session_config = tf.ConfigProto()
    # allow_growth: let TensorFlow grow its GPU allocation as needed rather than reserving it up front
    session_config.gpu_options.allow_growth = True
    growth_session = tf.Session(config=session_config)
    set_session(growth_session)

In [3]:
# Install the allow_growth TensorFlow session as the Keras session
gpu_alloc()

In [4]:
# Artificial noisy spelling mistakes
def noise_maker(sentence, threshold):
    '''Relocate, remove, or add characters to create spelling mistakes.

    Args:
        sentence: text to corrupt; it is indexed character by character.
        threshold: each character is copied unchanged when a uniform draw
            from [0, 1) is below this value, so 1.0 disables the noise and
            lower values produce more mistakes.

    Returns:
        The noisy text as one string.
    '''
    letters = list('abcdefghijklmnopqrstuvwxyz')
    noisy_sentence = []
    last_index = len(sentence) - 1
    i = 0
    while i < len(sentence):
        keep_roll = np.random.uniform(0, 1, 1)
        # Most characters will be correct since the threshold value is high
        if keep_roll < threshold:
            noisy_sentence.append(sentence[i])
        else:
            mistake_roll = np.random.uniform(0, 1, 1)
            # ~33% chance characters will swap locations
            if mistake_roll > 0.67:
                if i < last_index:
                    # Swap order with the following character and skip over it
                    noisy_sentence.append(sentence[i + 1])
                    noisy_sentence.append(sentence[i])
                    i += 1
                # else: the last character has nothing to swap with, so it is
                # simply not typed. (The previous `continue` skipped the index
                # increment and re-rolled the same character instead.)
            # ~33% chance an extra lower case letter will be added to the sentence
            elif mistake_roll < 0.33:
                random_letter = np.random.choice(letters, 1)[0]
                noisy_sentence.append(random_letter)
                noisy_sentence.append(sentence[i])
            # otherwise (~33%) the character is not typed: nothing is appended
        i += 1

    return ''.join(noisy_sentence)

In [5]:
def load_data_with_gt(file_name, num_samples, max_sent_len, min_sent_len, delimiter='\t', gt_index=1, prediction_index=0):
    '''Load (input, ground truth) pairs from a delimited UTF-8 text file.

    Each line is split on `delimiter`; the field at `prediction_index` is the
    input text and the field at `gt_index` is the ground truth. The decoder
    target is the ground truth wrapped in a leading tab (start trigger) and a
    trailing newline (stop trigger). Fields are not stripped, so a ground
    truth taken from the last field of a line keeps that line's own newline.

    Args:
        file_name: path of the text file.
        num_samples: maximum number of pairs to return.
        max_sent_len: exclusive upper bound on len(input) and len(target).
        min_sent_len: exclusive lower bound on len(input) and len(target).
        delimiter: field separator.
        gt_index: index of the ground-truth field.
        prediction_index: index of the input field.

    Returns:
        (input_texts, target_texts, gt_texts), three lists of equal length.
    '''
    input_texts = []
    gt_texts = []
    target_texts = []
    # Keep the original "at least two fields" rule, and also skip lines that
    # are too short for the requested indices instead of raising IndexError.
    required_fields = max(2, gt_index + 1, prediction_index + 1)
    with open(file_name, encoding='utf8') as data_file:
        for row in data_file:
            if len(input_texts) >= num_samples:
                break  # quota reached; no need to scan the rest of the file
            sents = row.split(delimiter)
            if len(sents) < required_fields:
                continue
            input_text = sents[prediction_index]
            gt_text = sents[gt_index]
            target_text = '\t' + gt_text + '\n'
            if (min_sent_len < len(input_text) < max_sent_len
                    and min_sent_len < len(target_text) < max_sent_len):
                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(gt_text)
    return input_texts, target_texts, gt_texts

In [6]:
def load_raw_data(file_name, num_samples, max_sent_len, min_sent_len):
    '''Load raw lines from a UTF-8 text file, one sample per line.

    Lines are returned exactly as read (including their line terminator);
    no splitting, noise or start/stop triggers are applied.

    Args:
        file_name: path of the text file.
        num_samples: maximum number of lines to return.
        max_sent_len: exclusive upper bound on the line length.
        min_sent_len: exclusive lower bound on the line length.

    Returns:
        List of the accepted lines, in file order.
    '''
    input_texts = []
    with open(file_name, encoding='utf8') as data_file:
        for row in data_file:
            if len(input_texts) >= num_samples:
                break  # quota reached; stop reading
            if min_sent_len < len(row) < max_sent_len:
                input_texts.append(row)
    return input_texts

In [7]:
def load_data_with_noise(file_name, num_samples, noise_threshold, max_sent_len, min_sent_len):
    '''Build (noisy input, clean target) pairs from a tab-separated UTF-8 file.

    The clean text is the second tab-separated field of each line. The input
    is noise_maker() applied to that field with its last character removed;
    the decoder target is the clean field wrapped in a leading tab (start
    trigger) and a trailing newline (stop trigger). The file is re-read as
    many times as needed to reach num_samples, so lines can repeat with
    different noise.

    Args:
        file_name: path of the text file.
        num_samples: number of pairs wanted.
        noise_threshold: threshold forwarded to noise_maker().
        max_sent_len: exclusive upper bound on len(input) and len(target).
        min_sent_len: exclusive lower bound on len(input) and len(target).

    Returns:
        (input_texts, target_texts, gt_texts). Fewer than num_samples pairs
        are returned if a complete pass over the file accepts nothing.
    '''
    input_texts = []
    gt_texts = []
    target_texts = []
    while len(input_texts) < num_samples:
        accepted_before_pass = len(input_texts)
        with open(file_name, encoding='utf8') as data_file:
            for row in data_file:
                if len(input_texts) >= num_samples:
                    break
                sents = row.split("\t")
                if len(sents) < 2:
                    continue  # no second field on this line; skip rather than IndexError
                clean_text = sents[1]
                # Drop the final character of the noisy text (for the last field
                # of a line the clean text ends with the line's newline).
                input_text = noise_maker(clean_text, noise_threshold)[:-1]
                target_text = '\t' + clean_text + '\n'
                if (min_sent_len < len(input_text) < max_sent_len
                        and min_sent_len < len(target_text) < max_sent_len):
                    input_texts.append(input_text)
                    target_texts.append(target_text)
                    gt_texts.append(clean_text)
        if len(input_texts) == accepted_before_pass:
            # A whole pass produced nothing (empty file or no usable line):
            # stop instead of looping forever.
            break

    return input_texts, target_texts, gt_texts

In [8]:
def load_medical_terms_with_noise(json_file, num_samples, noise_threshold):
    '''Build (noisy input, clean target) pairs from the keys of a JSON object.

    The keys are cycled through repeatedly until num_samples pairs exist.
    The decoder target is the key wrapped in a leading tab (start trigger)
    and a trailing newline (stop trigger).

    Args:
        json_file: path of a JSON file whose top level is an object.
        num_samples: number of pairs to generate; 0 only loads the dict.
        noise_threshold: threshold forwarded to noise_maker().

    Returns:
        (input_texts, target_texts, gt_texts, med_terms_dict), where
        med_terms_dict is the parsed JSON object. The three lists are empty
        when the object has no keys.
    '''
    with open(json_file, encoding='utf8') as f:
        med_terms_dict = json.load(f)
    med_terms = list(med_terms_dict.keys())
    input_texts = []
    gt_texts = []
    target_texts = []
    cnt = 0
    # `and med_terms` stops the loop when there is nothing to cycle through;
    # without it an empty dict with num_samples > 0 never terminates.
    while cnt < num_samples and med_terms:
        for term in med_terms:
            if cnt >= num_samples:
                break
            input_text = noise_maker(term, noise_threshold)
            # NOTE(review): the term comes from a JSON key, not a file line, so
            # this slice most likely removes a real character rather than a
            # newline; it looks copied from the file-based loaders. Confirm.
            input_text = input_text[:-1]

            target_text = '\t' + term + '\n'

            input_texts.append(input_text)
            target_texts.append(target_text)
            gt_texts.append(term)
            cnt += 1
    return input_texts, target_texts, gt_texts, med_terms_dict

In [9]:
def load_accidents_terms_with_noise(file_name, limit, num_samples, noise_threshold):
    '''Build (noisy input, clean target) pairs from a '|'-separated terms file.

    The first `limit` lines of the file are split on '|' and every piece is a
    term. The terms are cycled through repeatedly until num_samples pairs
    exist. The decoder target is the term wrapped in a leading tab (start
    trigger) and a trailing newline (stop trigger).

    Args:
        file_name: path of the UTF-8 terms file.
        limit: maximum number of lines to read.
        num_samples: number of pairs to generate.
        noise_threshold: threshold forwarded to noise_maker().

    Returns:
        (input_texts, target_texts, gt_texts); all empty when no term was read.
    '''
    med_terms = []
    with open(file_name, encoding='utf8') as terms_file:
        try:
            for line_no, row in enumerate(terms_file):
                if line_no >= limit:
                    break
                med_terms.extend(row.split('|'))
        except (OSError, UnicodeDecodeError) as err:
            # Best effort as before: keep the terms read so far, but say why
            # reading stopped instead of hiding every possible exception.
            print('finished early while reading', file_name, '-', repr(err))
    input_texts = []
    gt_texts = []
    target_texts = []
    cnt = 0
    # `and med_terms` prevents an endless loop when no term could be read
    while cnt < num_samples and med_terms:
        for term in med_terms:
            if cnt >= num_samples:
                break
            input_text = noise_maker(term, noise_threshold)
            # NOTE(review): only the last piece of each line ends with a
            # newline; for the other pieces this slice drops a real character.
            # Confirm that is intended.
            input_text = input_text[:-1]

            target_text = '\t' + term + '\n'

            input_texts.append(input_text)
            target_texts.append(target_text)
            gt_texts.append(term)
            cnt += 1

    return input_texts, target_texts, gt_texts

In [10]:
def load_procedures_tests_with_noise(file_name, num_samples, noise_threshold):
    '''Build (noisy input, clean target) pairs from a UTF-8 file, one text per line.

    Every line is used whole. The input is noise_maker() applied to the line
    with its last character removed; the decoder target is the line wrapped
    in a leading tab (start trigger) and a trailing newline (stop trigger).
    The file is re-read as often as needed to reach num_samples, so lines can
    repeat with different noise.

    Args:
        file_name: path of the text file.
        num_samples: number of pairs to generate.
        noise_threshold: threshold forwarded to noise_maker().

    Returns:
        (input_texts, target_texts, gt_texts); all empty for an empty file.
    '''
    input_texts = []
    gt_texts = []
    target_texts = []
    while len(input_texts) < num_samples:
        produced_before_pass = len(input_texts)
        with open(file_name, encoding='utf8') as data_file:
            for row in data_file:
                if len(input_texts) >= num_samples:
                    break
                # Drop the final character of the noisy text (the clean line
                # ends with its newline unless it is an unterminated last line).
                input_text = noise_maker(row, noise_threshold)[:-1]
                target_text = '\t' + row + '\n'

                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(row)
        if len(input_texts) == produced_before_pass:
            # The file yielded no line at all: stop instead of looping forever
            break

    return input_texts, target_texts, gt_texts

In [11]:
def process_word(word):
    '''Return `word` unchanged, or 'UNK' if it contains a digit or one of the listed special characters.

    A word that is in the (currently empty) stop-word list is also mapped to 'UNK'.
    '''
    # Disabled alternatives kept from the original for reference:
    #   word = spell(word)   # try to correct the word from a known dict
    #   Option 1, strip the characters instead of rejecting the word:
    #   processed_word = re.sub(r'[\\\/\-\—\:\[\]\,\.\"\;\%\~\(\)\{\}\$\#\?\●\@\+\-\*\d]', r'', w.lower())

    # Option 2 (active): reject every word holding a special character or digit
    has_special = re.search(r'[\\\/\-\—\:\[\]\,\.\"\;\%\~\(\)\{\}\$\#\?\●\@\+\-\*\d]', word.lower()) is not None
    processed_word = 'UNK' if has_special else word

    # Stop-word filtering is switched off by the empty list; the candidate list was:
    #stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
    stop_words = []
    return 'UNK' if processed_word in stop_words else processed_word

In [12]:
def build_vocab(all_texts):
    '''Build the word <-> integer lookup tables used to vectorize texts.

    Four reserved codes get indices 0-3 in this order: 'UNK', space, tab,
    newline. Each text is then tokenized with word_tokenize, each token is
    passed through process_word, and every result not seen before receives
    the next free index.

    Returns:
        (vocab_to_int, int_to_vocab): the mapping and its inverse.
    '''
    # 'UNK' comes first so it keeps index 0, which (per the original note) is
    # the value masked by the Embedding/Masking layer; real entries start at 1.
    vocab_to_int = {}
    for code in ['UNK', ' ', '\t', '\n']:
        vocab_to_int.setdefault(code, len(vocab_to_int))

    for sentence in all_texts:
        for token in word_tokenize(sentence):
            # setdefault leaves known words alone; new ones get the next index
            vocab_to_int.setdefault(process_word(token), len(vocab_to_int))

    # Inverse translation from int to word
    int_to_vocab = {index: word for word, index in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

# Load data

In [13]:
# Base directory that every data file below is joined onto with os.path.join
data_path = '../../dat/'

In [14]:
# Length bounds handed to the loaders, which compare with strict > and <.
# min_sent_len = -1 admits even empty strings; only texts of 1,000,000 or more characters are rejected.
max_sent_len = 1000000
min_sent_len = -1

In [15]:
# Accumulator for every text that feeds the vocabulary; the loading cells below extend it
input_texts = []

# Load Tesseract OCR correction data

In [16]:
num_samples = 1000000

# Don't add noisy or input mistakes as known words: only the target texts
# (tab + ground truth + newline) of each file are appended to input_texts.
for ocr_file_name in ('all_ocr_data_2.txt', 'field_class_21.txt', 'field_class_32.txt', 'field_class_30.txt'):
    tess_correction_data = os.path.join(data_path, ocr_file_name)
    input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)
    input_texts += target_texts_OCR

# Load clean claims forms

In [17]:
# Read up to num_samples lines of the file as they are (no field splitting, no noise)
num_samples = 10000
file_name = os.path.join(data_path, 'Test-Example-22.txt')
input_texts_CleanClaims = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)

In [18]:
# Add the clean claim-form lines to the vocabulary source texts
input_texts += input_texts_CleanClaims

# Load Medical Terms dictionary

In [19]:
json_file = os.path.join(data_path, 'abbrevs.json')
# With num_samples = 0 the loader generates no pairs (noise_maker is never called, so the
# threshold value is irrelevant here): both text lists come back empty and only the parsed
# JSON dict is useful.
threshold = 1.0
num_samples = 0
# We only need medical terms dict here to append it to the final dict at the end
input_texts_MedTerms, target_texts_MedTerms, _, med_terms_dict = load_medical_terms_with_noise(json_file, num_samples, threshold)

# Load Medical Instruction dictionary

In [20]:
# Read up to num_samples lines of the file as they are (no field splitting, no noise)
num_samples = 10000
file_name = os.path.join(data_path, 'medical_instructions.txt')
input_texts_MedInstructions = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)


In [21]:
# Add the medical-instruction lines to the vocabulary source texts
input_texts += input_texts_MedInstructions

# Load accident terms

In [22]:
num_samples = 10000
file_name = os.path.join(data_path, 'AccidentsL.txt')
# The load call below is commented out, so nothing is read from AccidentsL.txt in this cell
#input_texts_AccTerms = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)

In [23]:
# Dont add noisy or input mistakes as known words
#input_texts += input_texts_AccTerms
#target_texts += target_texts_AccTerms
#input_texts += input_texts_AccTerms

# Load procedures and tests

In [24]:
# Read up to num_samples lines of the file as they are (no field splitting, no noise)
file_name = os.path.join(data_path, 'procedures_tests.txt')
num_samples = 10000
input_texts_ProcTests = load_raw_data(file_name, num_samples, max_sent_len, min_sent_len)

In [25]:
# Don't add noisy or input mistakes as known words: only the raw
# procedure/test lines are appended to the vocabulary source texts.
input_texts += input_texts_ProcTests

In [26]:
# Sample data
print(len(input_texts))
for i in range(10):
    print(input_texts[i], '\n')

9001
	Claim Type: VB Accident - Accidental Injury

 

	Who The Reported Event Happened To: Employee/Policyholder

 

	Policyholder/Owner Information

 

	First Name:

 

	Middle Name/Initial:

 

	Last Name:

 

	Social Security Number:

 

	Birth Date:

 

	Gender:

 

	Language Preference:

 



## Build vocab

In [27]:
# all_texts is a second name for the same list object as input_texts (no copy is made)
all_texts = input_texts
vocab_to_int, int_to_vocab = build_vocab(all_texts)
# Persist both lookup tables together with the length bounds they were built with.
# NOTE(review): the two dicts are not arrays, so np.savez presumably stores them as pickled
# object arrays; reading them back would then need np.load(..., allow_pickle=True) — confirm.
np.savez('vocab-words-all-terms', vocab_to_int=vocab_to_int, int_to_vocab=int_to_vocab, max_sent_len=max_sent_len, min_sent_len=min_sent_len )

In [28]:
# Encoder and decoder share one vocabulary; each gets its own sorted list of the entries.
# The *_characters names are historical: the entries are words produced by build_vocab.
input_characters = sorted(vocab_to_int)
target_characters = sorted(vocab_to_int)
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
# NOTE(review): this is the longest text measured in characters (len of each string),
# not in vocabulary tokens — confirm that is the unit the model expects.
max_encoder_seq_length = max(map(len, input_texts))

In [29]:
# Corpus and vocabulary summary. The "max sequence length" is a character count,
# because max_encoder_seq_length is computed from len() of each text string.
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)

Number of samples: 9001
Number of unique input tokens: 7194
Number of unique output tokens: 7194
Max sequence length for inputs: 2442


In [30]:
vocab_to_int  # Display the word -> index mapping. TODO: data cleaning, some special characters still need to be removed

{'UNK': 0,
 ' ': 1,
 '\t': 2,
 '\n': 3,
 'Claim': 4,
 'Type': 5,
 'VB': 6,
 'Accident': 7,
 'Accidental': 8,
 'Injury': 9,
 'Who': 10,
 'The': 11,
 'Reported': 12,
 'Event': 13,
 'Happened': 14,
 'To': 15,
 'Information': 16,
 'First': 17,
 'Name': 18,
 'Middle': 19,
 'Last': 20,
 'Social': 21,
 'Security': 22,
 'Number': 23,
 'Birth': 24,
 'Date': 25,
 'Gender': 26,
 'Language': 27,
 'Preference': 28,
 'Address': 29,
 'Line': 30,
 'City': 31,
 'Postal': 32,
 'Code': 33,
 'Country': 34,
 'Best': 35,
 'Phone': 36,
 'to': 37,
 'be': 38,
 'Reached': 39,
 'During': 40,
 'the': 41,
 'Day': 42,
 'Email': 43,
 'Page': 44,
 'of': 45,
 'RADIOLOGY': 46,
 'REPORT': 47,
 'Patient': 48,
 'MRN': 49,
 'Accession': 50,
 'No': 51,
 'Ref': 52,
 'Physician': 53,
 'UNKNOWN': 54,
 'Study': 55,
 'Hospital': 56,
 'DOB': 57,
 'Tech': 58,
 'Notes': 59,
 'LT': 60,
 'WRIST': 61,
 'WC': 62,
 'INITIAL': 63,
 'WORTMAN': 64,
 'Wrist': 65,
 'comp': 66,
 'More': 67,
 'View': 68,
 'UNCOL': 69,
 'Technique': 70,
 'views

In [31]:
# Display the inverse (index -> word) mapping
int_to_vocab

{0: 'UNK',
 1: ' ',
 2: '\t',
 3: '\n',
 4: 'Claim',
 5: 'Type',
 6: 'VB',
 7: 'Accident',
 8: 'Accidental',
 9: 'Injury',
 10: 'Who',
 11: 'The',
 12: 'Reported',
 13: 'Event',
 14: 'Happened',
 15: 'To',
 16: 'Information',
 17: 'First',
 18: 'Name',
 19: 'Middle',
 20: 'Last',
 21: 'Social',
 22: 'Security',
 23: 'Number',
 24: 'Birth',
 25: 'Date',
 26: 'Gender',
 27: 'Language',
 28: 'Preference',
 29: 'Address',
 30: 'Line',
 31: 'City',
 32: 'Postal',
 33: 'Code',
 34: 'Country',
 35: 'Best',
 36: 'Phone',
 37: 'to',
 38: 'be',
 39: 'Reached',
 40: 'During',
 41: 'the',
 42: 'Day',
 43: 'Email',
 44: 'Page',
 45: 'of',
 46: 'RADIOLOGY',
 47: 'REPORT',
 48: 'Patient',
 49: 'MRN',
 50: 'Accession',
 51: 'No',
 52: 'Ref',
 53: 'Physician',
 54: 'UNKNOWN',
 55: 'Study',
 56: 'Hospital',
 57: 'DOB',
 58: 'Tech',
 59: 'Notes',
 60: 'LT',
 61: 'WRIST',
 62: 'WC',
 63: 'INITIAL',
 64: 'WORTMAN',
 65: 'Wrist',
 66: 'comp',
 67: 'More',
 68: 'View',
 69: 'UNCOL',
 70: 'Technique',
 71: 'v

In [32]:
# Number of vocabulary entries, reserved codes included
len(int_to_vocab)

7194

In [33]:
# Write one vocabulary term per line. The context manager closes the file, so the
# data is flushed to disk (the bare open() left the handle open), and the explicit
# UTF-8 encoding keeps non-ASCII terms writable whatever the platform default is.
# NOTE(review): the reserved tab and newline terms are written verbatim, so
# vocab.txt cannot be parsed back line-for-line — confirm how it is consumed.
with open('vocab.txt', 'w', encoding='utf8') as f:
    for term in vocab_to_int.keys():
        print(term)
        f.write(term + '\n')

UNK
 
	


Claim
Type
VB
Accident
Accidental
Injury
Who
The
Reported
Event
Happened
To
Information
First
Name
Middle
Last
Social
Security
Number
Birth
Date
Gender
Language
Preference
Address
Line
City
Postal
Code
Country
Best
Phone
to
be
Reached
During
the
Day
Email
Page
of
RADIOLOGY
REPORT
Patient
MRN
Accession
No
Ref
Physician
UNKNOWN
Study
Hospital
DOB
Tech
Notes
LT
WRIST
WC
INITIAL
WORTMAN
Wrist
comp
More
View
UNCOL
Technique
views
left
wrist
Cormarison
None
availabie
Comparison
available
FINDINGS
A
ossific
density
seen
distal
ulnar
styloid
focal
abnormality
is
in
standing
soft
tissues
IMPRESSION
Possible
sequela
remote
trauma
or
unfused
ossification
center
acute
osseous
identified
Daytime
Stopped
Working
Yes
Physically
at
Work
Hours
Worked
on
Scheduled
Explanation
Change
Schedule
I
was
light
duty
injury
but
last
date
worked
due
other
knee
what
put
me
out
work
Missed
Returned
Description
Rolling
pt
over
during
adl
care
weight
hoyer
litt
CNA
with
lost
grip
while
side
and
pushed
back


inside
wear
outdoors
advance
tolerated
crutches
elevate
Continue
gentle
Frequent
pumping
sets
straight
leg
raises
orders
pleased
Ox
Rest
Amb
Sat
Timing
Method
Probe
Measured
Mariah
Larsen
MA
Screening
Kyphosis
Scoliosis
Paravertebral
Comments
Alert
oriented
apparent
distress
signs
Ambulates
limp
Transfers
table
positions
Straight
raise
negative
scoliosis
mobility
Minimal
Well
Musculoskeletal
HEAD
Tender
palpation
occiput
CERVICAL
THORACIC
Side
bent
LUMBAR
SACRUM
Sacroiliac
PSIS
increased
hamstring
Piriformis
DTRs
Start
Directions
PRN
Instruction
Stop
Omega
mg
N
Counseling
Educational
educational
factors
Finley
Kevin
INS
BAL
PAT
ITEM
ENCOUNTER
Immunization
Admin
Commercial
Payment
Flu
Vaccine
Quadrivalent
Split
And
OI
OSTEOPATHIC
MANIPULATION
BODY
REGIONS
allowing
serve
paymit
OVR
Reach
Centers
Bethel
male
haying
yrs
slipping
Aggrevated
radiation
hard
lift
Tools
Screenings
Instrument
Score
MDD
Classification
Questionnaire
Further
CORONARY
HEART
DISEASE
RISK
Completed
Order
Interpretatio

AGENT
FEM
ULTRASONIC
GUIDANCE
NEEDLE
OPPOSING
PROSTHETIC
IMPLANT
OTHERWIS
PYPATN
PYINSU
CWADDW
choosing
accept
Express
Discover
UNITED
THANK
TWIN
CITIES
ORTHOPEDICS
UPON
RCPT
Made
Adjusted
LWR
JOINT
SUMMARY
Inquiries
Croix
Orthopaedics
Twin
Cities
integrated
practice
Sufﬁx
Inches
lf
MINNESOTA
OPERATIVE
JASON
HOLM
DIAGNOSES
PROCEDURES
utilizing
internal
Jamie
Birkelo
ANESTHESIOLOGIST
mL
INTRAOPERATIVE
COMPLICATIONS
sustained
planting
both
nature
apposed
delaying
Risks
alternatives
elected
marked
again
answered
fashion
Preoperative
antibiotics
ensure
arthroscope
trochlea
plateau
peripheral
extend
overall
remnant
footprint
wall
notch
pivot
IIB
Lachman
III
opening
valgus
drawer
asymmetric
dial
secured
SutureTape
bundle
locking
anatomic
demarcated
guide
FlipCutter
socket
Multiple
puncture
holes
microfracture
augment
passed
TightRope
suture
preloaded
FiberTape
drawn
flipped
limbs
looped
loop
shortened
drawing
transtibial
exited
tightening
dissection
subcutaneous
SwiveLock
loaded
just
pullout

In [34]:
# Disabled snippet: the code is wrapped in a string literal, so it is never executed;
# as the cell's last expression the string is only echoed as the cell output.
'''
#tar_path = os.path.abspath(os.path.dirname(__file__))
tar_path = os.getcwd() + '/autocorrect'
tar_file_name = os.path.join(tar_path, 'words.tar')
t = tarfile.open(tar_file_name, 'r:tar')
t.extractfile(tar_file_name)
'''

"\n#tar_path = os.path.abspath(os.path.dirname(__file__))\ntar_path = os.getcwd() + '/autocorrect'\ntar_file_name = os.path.join(tar_path, 'words.tar')\nt = tarfile.open(tar_file_name, 'r:tar')\nt.extractfile(tar_file_name)\n"