In [1]:
from __future__ import print_function
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, GRU, Dot, TimeDistributed, Activation, Embedding
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk.tokenize import word_tokenize
import re
import os
import tarfile
%matplotlib inline

Using TensorFlow backend.


# Utility functions

In [2]:
def load_data_with_gt(file_name, delimiter='\t', gt_index=1, prediction_index=0):
    input_texts = []
    gt_texts = []
    for row in open(file_name, encoding='utf8'):
        sents = row.split(delimiter)
        if (len(sents) < 2):
            continue
        input_text = sents[prediction_index]
        gt_texts.append(sents[gt_index])
    return input_texts, gt_texts

In [3]:
def load_raw_data(file_name):
    with open(file_name, 'r') as f:
        return(f.read())

In [4]:
def load_medical_terms(json_file):
    texts = []
    with open(json_file) as f:
        med_terms_dict = json.load(f)
    texts += list(med_terms_dict.keys())
    texts += list(med_terms_dict.values())
    return texts

In [5]:
def load_accidents_terms(file_name):

    f = open(file_name, encoding='utf8')
    line = 0  
    texts = []
    try:
        for r in f:
            for term in r.split('|'):
                    texts += term.replace('\"', '')
    except:
        print('finished')

                
    return texts

In [6]:
def process_word(word):
    # Try to correct the word from known dict
    #word = spell(word)
    # Option 1: Replace special chars and digits
    #processed_word = re.sub(r'[\\\/\-\—\:\[\]\,\.\"\;\%\~\(\)\{\}\$\#\?\●\@\+\-\*\d]', r'', w.lower())
    
    # Option 2: skip all words with special chars or digits
    if(len(re.findall(r'[\\\/\-\—\:\[\]\,\.\"\;\%\~\(\)\{\}\$\#\?\●\@\+\-\*\d]', word.lower())) == 0):
        processed_word = word
    else:
        processed_word = 'UNK'

    # Skip stop words
    #stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]        
    stop_words = []        
    if processed_word in stop_words:
        processed_word = 'UNK'
        
    return processed_word

# Load data

In [7]:
data_path = '../../../dat/'
texts = []

# Load tesseract correction

In [8]:

#files_list = ['all_ocr_data_2.txt', 'field_class_21.txt', 'field_class_22.txt', 'field_class_23.txt', 'field_class_24.txt', 'field_class_25.txt', 'field_class_26.txt', 'field_class_27.txt', 'field_class_28.txt', 'field_class_29.txt', 'field_class_30.txt', 'field_class_31.txt', 'field_class_32.txt', 'field_class_33.txt', 'field_class_34.txt', 'NL-14622714.txt', 'NL-14627449.txt', 'NL-14628986.txt', 'NL-14631911.txt', 'NL-14640007.txt']
files_list = ['field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt', 'field_class_21.txt']

for file_name in files_list:
    tess_correction_data = os.path.join(data_path, file_name)
    _, gt = load_data_with_gt(tess_correction_data)
    texts += gt

# Load HW terms

In [9]:

hw_correction_data = os.path.join(data_path, 'handwritten_output.txt')
_, gt = load_data_with_gt(hw_correction_data, delimiter='|', gt_index=0, prediction_index=1)
texts += gt

# Load clean claims forms

In [10]:
num_samples = 10000
file_name = os.path.join(data_path, 'claims.txt')
#texts += load_raw_data(file_name)


# Load Medical Terms dictionary

In [11]:
json_file = os.path.join(data_path, 'abbrevs.json')
texts += load_medical_terms(json_file)

# Load Medical Instruction dictionary

In [12]:
file_name = os.path.join(data_path, 'medical_instructions.txt')
texts += load_raw_data(file_name)


# Load accident terms

In [13]:

file_name = os.path.join(data_path, 'AccidentsL.txt')
texts += load_accidents_terms(file_name)

finished


# Load procedures and tests

In [14]:
file_name = os.path.join(data_path, 'procedures_tests.txt')
texts += load_raw_data(file_name)

In [15]:
# Sample data
print(len(texts))
for i in range(10):
    print(texts[i], '\n')

1081609
Claim Folder Contents
 

Claimant Name:
 

Claim Number:
 

Unauthorized access is strictly probihited
 

Print Date:
 

Claim Type: VB Accident - Accident Injury
 

Who The Reported Event Happened To: Employee/Policyholder
 

Policyholder/Owner Information
 

First Name:
 

Middle Name/Initial:
 



In [16]:
'''
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
texts += 'Gender\n'
'''

"\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\ntexts += 'Gender\n'\n"

In [17]:

with open('med.txt', 'w') as f:
    for text in texts:
        f.write(text)
f.close()

In [18]:
!tar -xvf autocorrect/words.tar

words/
words/en_US_GB_CA_mixed.txt
words/big_orig.txt
words/._big.txt
words/big.txt
words/en_US_GB_CA_lower.txt


In [19]:
#f_big_orig = open('words/big_orig.txt', 'r')
f_big_orig = open('med.txt', 'r')
f_med = open('med.txt', 'r')
f_big = open('words/big.txt', 'w')
for line in f_big_orig:
    f_big.write(line + '\n')
for line in f_med:
    f_big.write(line + '\n')
    

f_big_orig.close()
f_big.close()
f_med.close()

In [20]:
!tar -cvf autocorrect/words.tar words 

words/
words/en_US_GB_CA_mixed.txt
words/big_orig.txt
words/._big.txt
words/big.txt
words/en_US_GB_CA_lower.txt


In [21]:
!rm -rf words/