In [1]:
from __future__ import print_function
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, GRU, Dot, TimeDistributed, Activation, Embedding
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk.tokenize import word_tokenize
%matplotlib inline

Using TensorFlow backend.


# Utility functions

In [2]:
# Limit gpu allocation. allow_growth, or gpu_fraction
def gpu_alloc():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

In [3]:
gpu_alloc()

In [4]:
# Artificial noisy spelling mistakes
def noise_maker(sentence, threshold):
    '''Relocate, remove, or add characters to create spelling mistakes'''
    letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m',
           'n','o','p','q','r','s','t','u','v','w','x','y','z',]
    noisy_sentence = []
    i = 0
    while i < len(sentence):
        random = np.random.uniform(0, 1, 1)
        # Most characters will be correct since the threshold value is high
        if random < threshold:
            noisy_sentence.append(sentence[i])
        else:
            new_random = np.random.uniform(0, 1, 1)
            # ~33% chance characters will swap locations
            if new_random > 0.67:
                if i == (len(sentence) - 1):
                    # If last character in sentence, it will not be typed
                    continue
                else:
                    # if any other character, swap order with following character
                    noisy_sentence.append(sentence[i + 1])
                    noisy_sentence.append(sentence[i])
                    i += 1
            # ~33% chance an extra lower case letter will be added to the sentence
            elif new_random < 0.33:
                random_letter = np.random.choice(letters, 1)[0]
                noisy_sentence.append(random_letter)
                noisy_sentence.append(sentence[i])
            # ~33% chance a character will not be typed
            else:
                pass
        i += 1

    return ''.join(noisy_sentence)

In [5]:
def load_data_with_gt(file_name, num_samples, max_sent_len, min_sent_len, delimiter='\t', gt_index=1, prediction_index=0):
    '''Load data from txt file, with each line has: <TXT><TAB><GT>. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    for row in open(file_name, encoding='utf8'):
        if cnt < num_samples :
            #print(row)
            sents = row.split(delimiter)
            input_text = sents[prediction_index]
            
            target_text = '\t' + sents[gt_index] + '\n'
            if len(input_text) > min_sent_len and len(input_text) < max_sent_len and len(target_text) > min_sent_len and len(target_text) < max_sent_len:
                cnt += 1
                
                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(sents[gt_index])
    return input_texts, target_texts, gt_texts

In [6]:
def load_data_with_noise(file_name, num_samples, noise_threshold, max_sent_len, min_sent_len):
    '''Load data from txt file, with each line has: <TXT>. The GT is just a noisy version of TXT. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    while cnt < num_samples :
        for row in open(file_name, encoding='utf8'):
        #for row in open(file_name):
            if cnt < num_samples :
                sents = row.split("\t")
                input_text = noise_maker(sents[1], noise_threshold)
                input_text = input_text[:-1]

                target_text = '\t' + sents[1] + '\n'            
                if len(input_text) > min_sent_len and len(input_text) < max_sent_len and len(target_text) > min_sent_len and len(target_text) < max_sent_len:
                    cnt += 1
                    input_texts.append(input_text)
                    target_texts.append(target_text)
                    gt_texts.append(target_text[1:-1])
                    
    return input_texts, target_texts, gt_texts

In [7]:
def load_medical_terms_with_noise(json_file, num_samples, noise_threshold):
    with open(json_file) as f:
        med_terms_dict = json.load(f)
    med_terms = list(med_terms_dict.keys())
    input_texts = []
    gt_texts = []
    target_texts = []
    cnt = 0
    while cnt < num_samples:
        for term in med_terms:
            if cnt < num_samples :
                input_text = noise_maker(term, noise_threshold)
                input_text = input_text[:-1]   

                target_text = '\t' + term + '\n'

                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(target_text[1:-1])        
                cnt += 1
    return input_texts, target_texts, gt_texts, med_terms_dict

In [8]:
def load_accidents_terms_with_noise(file_name, limit, num_samples, noise_threshold):

    f = open(file_name, encoding='utf8')
    line = 0    
    med_terms = []
    try:
        for r in f:
            if(line < limit):

                med_terms.extend(r.split('|'))
                line += 1
    except:
        print('finished')
    input_texts = []
    gt_texts = []
    target_texts = []
    cnt = 0
    while cnt < num_samples:
        for term in med_terms:
            if cnt < num_samples :
                input_text = noise_maker(term, noise_threshold)
                input_text = input_text[:-1]   

                target_text = '\t' + term + '\n'

                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(target_text[1:-1])        
                cnt += 1
                
    return input_texts, target_texts, gt_texts

In [9]:
def load_procedures_tests_with_noise(file_name, num_samples, noise_threshold):
    '''Load data from txt file, with each line has: <TXT>. The GT is just a noisy version of TXT. The  target to the decoder muxt have \t as the start trigger and \n as the stop trigger.'''
    cnt = 0  
    input_texts = []
    gt_texts = []
    target_texts = []
    while cnt < num_samples :
        for row in open(file_name, encoding='utf8'):
        #for row in open(file_name):
            if cnt < num_samples :
                
                input_text = noise_maker(row, noise_threshold)
                input_text = input_text[:-1]

                target_text = '\t' + row + '\n'            

                cnt += 1
                input_texts.append(input_text)
                target_texts.append(target_text)
                gt_texts.append(target_text[1:-1])
                    
    return input_texts, target_texts, gt_texts

In [10]:
def build_vocab(all_texts):
    '''Build vocab dictionary to victorize chars into ints'''
    vocab_to_int = {}
    count = 0 # Start index for any char will be 1, as 0 is masked by the Embedding/Masking layer
    codes = ['UNK', ' ', '\t','\n']# Start 'UNK' at the first entry, to keep its index=0 to be masked
    for code in codes:
        if code not in vocab_to_int:
            vocab_to_int[code] = count
            count += 1    
    
    for sentence in all_texts:       
        for word in word_tokenize(sentence):
            if word not in vocab_to_int:
                vocab_to_int[word] = count
                count += 1


    '''''Build inverse translation from int to word'''
    int_to_vocab = {}
    for word, value in vocab_to_int.items():
        int_to_vocab[value] = word
        
    return vocab_to_int, int_to_vocab

# Load data

In [11]:
data_path = '../../dat/'

In [12]:
max_sent_len = 1000000
min_sent_len = -1

## Results on tesseract correction

In [13]:
num_samples = 0
tess_correction_data = os.path.join(data_path, 'all_ocr_data_2.txt')
input_texts_OCR, target_texts_OCR, gt_OCR = load_data_with_gt(tess_correction_data, num_samples, max_sent_len, min_sent_len)

In [14]:
input_texts = input_texts_OCR
target_texts = target_texts_OCR

# Results on noisy tesseract corrections

In [15]:
num_samples = 0
tess_correction_data = os.path.join(data_path, 'all_ocr_data_2.txt')
threshold = 0.9
input_texts_noisy_OCR, target_texts_noisy_OCR, gt_noisy_OCR = load_data_with_noise(file_name=tess_correction_data, 
                                                                 num_samples=num_samples, 
                                                                 noise_threshold=threshold, 
                                                                 max_sent_len=max_sent_len, 
                                                                 min_sent_len=min_sent_len)

In [16]:
input_texts += input_texts_noisy_OCR
target_texts += target_texts_noisy_OCR

# Load Medical Terms dictionary

In [17]:
json_file = os.path.join(data_path, 'abbrevs.json')
threshold = 1.0
num_samples = 10000
input_texts_MedTerms, target_texts_MedTerms, _, med_terms_dict = load_medical_terms_with_noise(json_file, num_samples, threshold)

# Load accident terms

In [18]:
file_name = os.path.join(data_path, 'AccidentsL.txt')
threshold = 1.0
num_samples = 10000
limit = 100
input_texts_AccTerms, target_texts_AccTerms, _ = load_accidents_terms_with_noise(file_name, limit, num_samples, threshold)

finished


In [19]:
input_texts += input_texts_AccTerms
target_texts += target_texts_AccTerms

# Load procedures and tests

In [20]:
file_name = os.path.join(data_path, 'procedures_tests.txt')
threshold = 1.0
num_samples = 10000
input_texts_ProcTests, target_texts_ProcTests, _ = load_procedures_tests_with_noise(file_name, num_samples, threshold)

In [21]:
input_texts += input_texts_ProcTests
target_texts += target_texts_ProcTests

In [22]:
# Sample data
print(len(input_texts))
for i in range(10):
    print(input_texts[i], '\n', target_texts[i])

20000
MINE_I 
 	MINE_ID

CONTROLLER_I 
 	CONTROLLER_ID

CONTROLLER_NAM 
 	CONTROLLER_NAME

OPERATOR_I 
 	OPERATOR_ID

OPERATOR_NAM 
 	OPERATOR_NAME

CONTRACTOR_I 
 	CONTRACTOR_ID

DOCUMENT_N 
 	DOCUMENT_NO

SUBUNIT_C 
 	SUBUNIT_CD

SUBUNI 
 	SUBUNIT

ACCIDENT_D 
 	ACCIDENT_DT



## Build vocab

In [23]:
all_texts = target_texts + input_texts
vocab_to_int, int_to_vocab = build_vocab(all_texts)
np.savez('vocab-words-all-terms', vocab_to_int=vocab_to_int, int_to_vocab=int_to_vocab, max_sent_len=max_sent_len, min_sent_len=min_sent_len )

In [24]:
input_characters = sorted(list(vocab_to_int))
target_characters = sorted(list(vocab_to_int))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [25]:
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 20000
Number of unique input tokens: 3813
Number of unique output tokens: 3813
Max sequence length for inputs: 641
Max sequence length for outputs: 644


In [26]:
vocab_to_int # Some special chars need to be removed TODO: Data cleaning

{'UNK': 0,
 ' ': 1,
 '\t': 2,
 '\n': 3,
 'MINE_ID': 4,
 'CONTROLLER_ID': 5,
 'CONTROLLER_NAME': 6,
 'OPERATOR_ID': 7,
 'OPERATOR_NAME': 8,
 'CONTRACTOR_ID': 9,
 'DOCUMENT_NO': 10,
 'SUBUNIT_CD': 11,
 'SUBUNIT': 12,
 'ACCIDENT_DT': 13,
 'CAL_YR': 14,
 'CAL_QTR': 15,
 'FISCAL_YR': 16,
 'FISCAL_QTR': 17,
 'ACCIDENT_TIME': 18,
 'DEGREE_INJURY_CD': 19,
 'DEGREE_INJURY': 20,
 'FIPS_STATE_CD': 21,
 'UG_LOCATION_CD': 22,
 'UG_LOCATION': 23,
 'UG_MINING_METHOD_CD': 24,
 'UG_MINING_METHOD': 25,
 'MINING_EQUIP_CD': 26,
 'MINING_EQUIP': 27,
 'EQUIP_MFR_CD': 28,
 'EQUIP_MFR_NAME': 29,
 'EQUIP_MODEL_NO': 30,
 'SHIFT_BEGIN_TIME': 31,
 'CLASSIFICATION_CD': 32,
 'CLASSIFICATION': 33,
 'ACCIDENT_TYPE_CD': 34,
 'ACCIDENT_TYPE': 35,
 'NO_INJURIES': 36,
 'TOT_EXPER': 37,
 'MINE_EXPER': 38,
 'JOB_EXPER': 39,
 'OCCUPATION_CD': 40,
 'OCCUPATION': 41,
 'ACTIVITY_CD': 42,
 'ACTIVITY': 43,
 'INJURY_SOURCE_CD': 44,
 'INJURY_SOURCE': 45,
 'NATURE_INJURY_CD': 46,
 'NATURE_INJURY': 47,
 'INJ_BODY_PART_CD': 48,
 'INJ

In [27]:
int_to_vocab

{0: 'UNK',
 1: ' ',
 2: '\t',
 3: '\n',
 4: 'MINE_ID',
 5: 'CONTROLLER_ID',
 6: 'CONTROLLER_NAME',
 7: 'OPERATOR_ID',
 8: 'OPERATOR_NAME',
 9: 'CONTRACTOR_ID',
 10: 'DOCUMENT_NO',
 11: 'SUBUNIT_CD',
 12: 'SUBUNIT',
 13: 'ACCIDENT_DT',
 14: 'CAL_YR',
 15: 'CAL_QTR',
 16: 'FISCAL_YR',
 17: 'FISCAL_QTR',
 18: 'ACCIDENT_TIME',
 19: 'DEGREE_INJURY_CD',
 20: 'DEGREE_INJURY',
 21: 'FIPS_STATE_CD',
 22: 'UG_LOCATION_CD',
 23: 'UG_LOCATION',
 24: 'UG_MINING_METHOD_CD',
 25: 'UG_MINING_METHOD',
 26: 'MINING_EQUIP_CD',
 27: 'MINING_EQUIP',
 28: 'EQUIP_MFR_CD',
 29: 'EQUIP_MFR_NAME',
 30: 'EQUIP_MODEL_NO',
 31: 'SHIFT_BEGIN_TIME',
 32: 'CLASSIFICATION_CD',
 33: 'CLASSIFICATION',
 34: 'ACCIDENT_TYPE_CD',
 35: 'ACCIDENT_TYPE',
 36: 'NO_INJURIES',
 37: 'TOT_EXPER',
 38: 'MINE_EXPER',
 39: 'JOB_EXPER',
 40: 'OCCUPATION_CD',
 41: 'OCCUPATION',
 42: 'ACTIVITY_CD',
 43: 'ACTIVITY',
 44: 'INJURY_SOURCE_CD',
 45: 'INJURY_SOURCE',
 46: 'NATURE_INJURY_CD',
 47: 'NATURE_INJURY',
 48: 'INJ_BODY_PART_CD',
 49: 

In [28]:
len(int_to_vocab)

3813

In [29]:
f = open('vocab.txt', 'w')
for term in vocab_to_int.keys():
    print(term)
    f.write(term + '\n')

UNK
 
	


MINE_ID
CONTROLLER_ID
CONTROLLER_NAME
OPERATOR_ID
OPERATOR_NAME
CONTRACTOR_ID
DOCUMENT_NO
SUBUNIT_CD
SUBUNIT
ACCIDENT_DT
CAL_YR
CAL_QTR
FISCAL_YR
FISCAL_QTR
ACCIDENT_TIME
DEGREE_INJURY_CD
DEGREE_INJURY
FIPS_STATE_CD
UG_LOCATION_CD
UG_LOCATION
UG_MINING_METHOD_CD
UG_MINING_METHOD
MINING_EQUIP_CD
MINING_EQUIP
EQUIP_MFR_CD
EQUIP_MFR_NAME
EQUIP_MODEL_NO
SHIFT_BEGIN_TIME
CLASSIFICATION_CD
CLASSIFICATION
ACCIDENT_TYPE_CD
ACCIDENT_TYPE
NO_INJURIES
TOT_EXPER
MINE_EXPER
JOB_EXPER
OCCUPATION_CD
OCCUPATION
ACTIVITY_CD
ACTIVITY
INJURY_SOURCE_CD
INJURY_SOURCE
NATURE_INJURY_CD
NATURE_INJURY
INJ_BODY_PART_CD
INJ_BODY_PART
SCHEDULE_CHARGE
DAYS_RESTRICT
DAYS_LOST
TRANS_TERM
RETURN_TO_WORK_DT
IMMED_NOTIFY_CD
IMMED_NOTIFY
INVEST_BEGIN_DT
NARRATIVE
CLOSED_DOC_NO
COAL_METAL_IND
''
0100003
0041044
Lhoist
Group
L13586
North
America
of
Alabama
,
LLC
220032180028
30
MILL
OPERATION/PREPARATION
PLANT
07/26/2003
2003
3
4
2330
06
NO
DYS
AWY
FRM
WRK
RSTR
ACT
01
?
VALUE
FOUND
05
Bench
grinder
Drill
press
B

.06
149
Labor
Leadman
Section
Shift
boss
301
ELECT.ARC
BURN-NOT
CONTAC
12/03/2008
trouble
shooting
power
center
flash
occurred
320083440010
220102350016
08/14/2010
1610
LAST
CROSSCUT
1.46
430
CHEST
RIBS/BREAST
BONE/CHEST
ORGNS
08/23/2010
helping
change
structure
rib
brusing
ribs
320111110030
220112140027
07/31/2011
1.1
390
INJURY
05/01/2012
warned
watch
laying
220120690017
02/29/2012
STEPPING
KNEELING
OBJECT
462
Examiner
Fire
Pre-shift
examiner
Mine
113
BLOCKING
57
05/21/2012
gathering
belongings
load
block
covered
twisting
320121660004
220121230013
04/30/2012
7.81
come-along
hooked
cutter
hold
place
hook
forearm
320121660008
220122920036
10/12/2012
10/13/2012
nail
boot
220130450004
02/06/2013
1915
bdc-2oup
2.6
060
106
NARO
G
CR
MTR-UG
EQP
03/07/2013
hauling
scoop
flat
jarring
suffered
strain
320130670008
0121101
Virginia
Conservation
Fund
VCLF
220160360047
01/27/2016
54
Rock
roof
bolting
Pinning
Truss
bolter
Fletcher
DDo-13
7.58
079
Tramming
077
UNDERGRD
preparing
canopy
lowered
cylin

I
1.6
Urea
nitrogen
BUN
7–18
Uric
3.0–7.0
Vitamin
A§
30–65
White
WBC
4,300–10,800
/mL
*Blood
many
†Units
explained
Appendix
units
converted
international
conversion
factor
International
IU
identified
§Other
vitamins


Diagnostic
Procedures
Procedure
Area
Sample
Tested
Description
Amniocentesis
sac
surrounding
fetus
abnormality
Arteriography
artery
aorta
thin
catheter
threaded
studied
outline
highlight
defect
Audiometry
Assessment
hear
distinguish
sounds
specific
pitches
volumes
headphones
Auscultation
Listening
stethoscope
Barium
studies
Esophagus
ulcers
measurement
low
inflatable
wrapped
around
evaluate
organ
Bone
Hipbone
Bronchoscopy
Airways
tumor
Cardiac
catheterization
vessel
Chorionic
villus
sampling
Placenta
Chromosomal
sex
Colonoscopy
Colposcopy
magnifying
lens
Computed
Computer-enhanced
Cone
biopsy
cone-shaped
heated
loop
laser
Culture
any
Growth
identify
infection
bacteria
fungi
Dilation
curettage
D
uterine
sharp
instrument
curet
Dual
absorptiometry
DEXA
Skeleton
focusing
re