# Importing all the necessary libraries

In [1]:
import re  # To execute regex
import json # To read Json files
import spacy # NLP based model
import warnings
from spacy.training import Example
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span, Doc
from spacy import displacy
warnings.filterwarnings('ignore')
import difflib # To check for similarity between the OCR text and json parameters

# First, let's load the text and JSON files to get some insights

In [2]:
# Loading the json data
def load_data(file):
    """Parse the UTF-8 JSON document stored at ``file`` and return the result."""
    with open(file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [3]:
# saving data as json
def save_data(file, data):
    """Write ``data`` to ``file`` as UTF-8 JSON indented by four spaces.

    An existing file at that path is overwritten.
    """
    with open(file, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4)

In [4]:
# Function to load the OCR text input
def load_ocr_output(file_path):
    """Read the OCR text file at ``file_path`` and return it as one cleaned line.

    The content is stripped of surrounding whitespace; then newlines,
    semicolons and both quote characters become single spaces, and commas
    are removed.
    """
    # Single translation pass equivalent to the chain of one-character replacements.
    cleanup_table = str.maketrans({
        '\n': ' ',
        ',': None,
        ';': ' ',
        "'": ' ',
        '"': ' ',
    })
    with open(file_path, "r", encoding="utf-8") as source:
        raw = source.read()
    return raw.strip().translate(cleanup_table)

In [5]:
# Load parameter names flattening it to find matches, Creating a list of parameter terms
parameters_json = load_data('./OCR raw samples/X1.json')
# Each record of the JSON file has this structure:
# {'Abbreviation': '11DEOXY',
#   'Synonyms': ['11 deoxycortisol',
#    'Cortodoxin',
#    'Deoxycorticosterone',
#    '11-DOC',
#    '11-Deoxycortisol',
#    '11-Desoxycortisol',
#    'Cortodoxyl',
#    'Cortolox',
#    'Cortoloxin',
#    '11-Oxocortisol']}

# Flatten the records into one list: every non-blank abbreviation first,
# followed by every non-blank synonym (in file order).
parameter_terms = [
    entry['Abbreviation']
    for entry in parameters_json
    if entry['Abbreviation'].strip()
] + [
    synonym
    for entry in parameters_json
    for synonym in entry['Synonyms']
    if synonym.strip()
]

In [6]:
# Loading and cleaning the text for entity extraction.
# load_ocr_output already flattens newlines and removes commas, semicolons and
# quote characters, so `text` is a single line from here on.
text = load_ocr_output("OCR raw samples/0ab9800e-bc9a-4388-aaa2-d4fc05e7d111.txt")

# Some replacements are applied while loading (just the basic ones, such as removing special characters and newline characters)

# Regex method to extract information from the OCR output

In [7]:
# Function to check for similarity or partial matches using difflib
# Assuming a match ratio above 80% is enough to say it's a parameter name that was written incorrectly in the OCR text
def _split_with_offsets(text, sep=None):
    """Split ``text`` like ``text.split(sep)``, pairing each piece with its start index in ``text``."""
    located = []
    cursor = 0
    for piece in text.split(sep):
        if sep is None:
            # Whitespace split: pieces are never empty, so searching from the
            # cursor skips the separating whitespace and lands on this piece.
            cursor = text.index(piece, cursor)
        located.append((piece, cursor))
        # With an explicit separator, the next piece starts right after it.
        cursor += len(piece) + (0 if sep is None else len(sep))
    return located


def is_similar(test_name, parameters, threshold=0.8):
    """Find the part of ``test_name`` that fuzzily matches a known parameter.

    Args:
        test_name: Candidate parameter text captured from the OCR output.
        parameters: Known parameter names (abbreviations and synonyms).
        threshold: A ``difflib.SequenceMatcher`` ratio strictly above this
            value counts as a match. Defaults to 0.8.

    Returns:
        ``(matched_text, start_index)`` where ``start_index`` is the position
        of ``matched_text`` inside ``test_name``. The whole ``test_name`` is
        tried first (index 0); otherwise, for each parameter in order, its
        whitespace-separated tokens and then its ``'('``-separated pieces are
        tried. ``('Not Found', 0)`` is returned when nothing matches.
    """
    def close_enough(candidate, param):
        return difflib.SequenceMatcher(None, candidate, param).ratio() > threshold

    for param in parameters:
        if close_enough(test_name, param):
            return (test_name, 0)

    # Offsets are tracked while splitting: str.index() on the fragment would
    # report the first occurrence of that substring, which is wrong when the
    # fragment also appears earlier inside another token (e.g. "TT T").
    fragments = _split_with_offsets(test_name) + _split_with_offsets(test_name, '(')
    for param in parameters:
        for fragment, offset in fragments:
            if close_enough(fragment, param):
                return (fragment, offset)
    return ('Not Found', 0)

In [8]:
# The function will first extract the match, then check for similarity between the parameter caught in the match and the parameter name in the JSON.
def regex_match_from_OCR_output(text, parameters):
    """Extract the first (parameter, value, unit) reading per parameter from OCR text.

    Five layout patterns are applied in a fixed order (2, 3, 1, 4, 5). For each
    regex match, the captured name is resolved with ``is_similar`` against
    ``parameters``; the first reading seen for a resolved name is kept and
    later ones are ignored.

    Args:
        text: Cleaned OCR text.
        parameters: Known parameter names (abbreviations and synonyms).

    Returns:
        A list of ``{'parameter', 'value', 'unit'}`` dicts in discovery order.
    """
    pattern_1 = r"([A-Za-z\s]+)\s+(\d+\.?\d*)\s+([a-zA-Z%\/]+)" # parameter value unit
    pattern_2 = r"([A-Za-z0-9\s]+)\s*\(([\d\.\-]+)\)\s*([a-zA-Z%\/]+)\s+(\d+\.?\d*)" # parameter with number range unit value
    pattern_3 = r"([A-Za-z0-9\-\s\(\).]+)\s*\(([^)]+)\)\s*([a-zA-Z%/]+)\s*(\d+\.?\d*)" # parameter with brackets range unit value
    pattern_4 = r"([A-Za-z0-9\-\s\(\).]+)([a-zA-Z%/]+)(\d+\.?\d*)" # parameter unit value
    pattern_5 = r"([A-Za-z0-9\-\s\(\).]+)\s*\(([^)]+)\)\s*(\d+\.?\d*)\s*([a-zA-Z%/]+)" # parameter range value unit

    # (pattern, value group, unit group); the parameter name is always group 1.
    # The order matters because only the first reading per parameter is kept.
    extraction_rules = (
        (pattern_2, 4, 3),
        (pattern_3, 4, 3),
        (pattern_1, 2, 3),
        (pattern_4, 3, 2),
        (pattern_5, 3, 4),
    )

    results = []
    seen_parameters = set()
    for pattern, value_group, unit_group in extraction_rules:
        for match in re.finditer(pattern, text):
            similar_test_name = is_similar(match.group(1), parameters)[0].strip()
            if not similar_test_name or similar_test_name == 'Not Found':
                continue
            if similar_test_name in seen_parameters:
                continue  # keep only the first instance of each parameter
            seen_parameters.add(similar_test_name)
            results.append({
                'parameter': similar_test_name,
                'value': match.group(value_group),
                'unit': match.group(unit_group),
            })
    return results

In [9]:
# Run the regex extraction on the loaded OCR text; parameter_terms holds both
# the abbreviations and the synonyms from the JSON file.
regex_match = regex_match_from_OCR_output(text, parameter_terms)

In [10]:
regex_match
# The regex match generally performs well, though there are occasional errors. As per the key considerations, I have taken only the first instance.
# This approach initially increased the time complexity to O(n**2). To mitigate this, I have utilized a dictionary to reduce the complexity back to O(n).
# There are still a few errors scattered throughout,
# I have identified some of these errors, such as incorrect units and one or two incorrect parameter names.
# These issues could arise from two scenarios: 1. Inaccuracies in OCR output. 2. Presence of similar patterns.

[{'parameter': 'TSH', 'value': '0.73', 'unit': 'miIU/L'},
 {'parameter': 'FT4', 'value': '15', 'unit': 'pmol/L'},
 {'parameter': 'FT3', 'value': '4.8', 'unit': 'pmol/L'},
 {'parameter': 'Iron', 'value': '40', 'unit': 'umol/L'},
 {'parameter': 'T', 'value': '24', 'unit': 'umol/L'},
 {'parameter': 'Ferritin', 'value': '63', 'unit': 'ug/L'},
 {'parameter': 'iPTH', 'value': '8.3', 'unit': 'pmol/L'},
 {'parameter': 'B12', 'value': '252', 'unit': 'pmol/L'},
 {'parameter': 'Na', 'value': '140', 'unit': 'mmol/L'},
 {'parameter': 'K', 'value': '4.5', 'unit': 'mmol/L'},
 {'parameter': 'Cl', 'value': '106', 'unit': 'mmol/L'},
 {'parameter': 'HCO3', 'value': '24', 'unit': 'mmol/L'},
 {'parameter': 'Urea', 'value': '6.5', 'unit': 'mmol/L'},
 {'parameter': 'Urate', 'value': '0.28', 'unit': 'mmol/L'},
 {'parameter': 'Phos', 'value': '65', 'unit': 'U/L'},
 {'parameter': 'Protein', 'value': '69', 'unit': 'g/L'},
 {'parameter': 'Albumin', 'value': '41', 'unit': 'g/L'},
 {'parameter': 'Ca', 'value': '2.3

# Why Use SpaCy or Any Model?
- To save computational time.
- To avoid a hardcoded pattern recognition system.
- SpaCy provides a high level of customization and extensive community support.

# Approach
- First, let's create a blank SpaCy model and equip it with everything necessary to recognize all the identities in our text (Custom NER).
- Then, create an Entity Relationship (ER) model to establish connections between the entities.

In [11]:
# Initializing a blank spaCy model for English ("en"); the pipeline components
# (phrase matcher, NER, relation extractor) are added in the cells below.
nlp = spacy.blank("en")

In [12]:
# Setting up the NER pipeline component
# Reuse the existing "ner" component when the cell is re-run, otherwise append a new one.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

In [13]:
# Adding new labels to the entity recognizer
# These three labels are the same strings generate_training_data uses for its entity annotations.
labels = ["parameter", "value", "unit"]
for label in labels:
    ner.add_label(label)

In [14]:
# Integrate PhraseMatcher for rule-based matching
matcher = PhraseMatcher(nlp.vocab)
# Turn every known parameter term into a Doc pattern and register them all
# under the single match key "parameter".
patterns = list(map(nlp.make_doc, parameter_terms))
matcher.add("parameter", patterns)

In [15]:
# Adding our custom-built phrase matcher to the spaCy components
@spacy.Language.component("add_phrase_matcher")
def add_phrase_matcher(doc):
    """Pipeline component that labels known parameter terms as "parameter" entities.

    Runs the module-level ``matcher`` over ``doc``. Matches are accepted in the
    order the matcher returns them; a match is skipped if any of its tokens
    already belongs to an accepted match. Existing entities in ``doc.ents``
    that share a token with an accepted match are dropped, the rest are kept.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with ``doc.ents`` updated.
    """
    claimed_tokens = set()
    spans = []
    for _match_id, start, end in matcher(doc):
        token_ids = range(start, end)
        # Check every token of the candidate, not just its two ends: a longer
        # match can enclose an already accepted shorter one, and overlapping
        # spans cannot be assigned to doc.ents.
        if claimed_tokens.isdisjoint(token_ids):
            spans.append(Span(doc, start, end, label="parameter"))
            claimed_tokens.update(token_ids)
    kept_ents = [
        ent for ent in doc.ents
        if claimed_tokens.isdisjoint(range(ent.start, ent.end))
    ]
    doc.ents = kept_ents + spans
    return doc

# Adding the component to the pipeline before 'ner', so the phrase-matched
# "parameter" entities are already set on the doc when 'ner' runs
nlp.add_pipe("add_phrase_matcher", before="ner")

<function __main__.add_phrase_matcher(doc)>

In [16]:
# Registering the 'relations' extension attribute on the Doc class
# Guarded so that re-running the cell does not try to register the extension twice.
# The extract_relations component assigns a fresh list to doc._.relations on every doc.
if not Doc.has_extension("relations"):
    Doc.set_extension("relations", default=[])

In [17]:
# Custom-built relation extractor
def relation_extractor(nlp, name):
    """Factory for the ``extract_relations`` pipeline component.

    Args:
        nlp: The pipeline the component is being added to (unused).
        name: The component instance name (unused).

    Returns:
        A callable ``extract_relations(doc)`` that stores a list of
        ``(parameter, value, unit)`` tuples in ``doc._.relations`` and returns
        the doc. Every regex match whose name resolves through ``is_similar``
        against the module-level ``parameter_terms`` is recorded, so repeated
        readings are kept.
    """
    pattern_1 = r"([A-Za-z\s]+)\s+(\d+\.?\d*)\s+([a-zA-Z%\/]+)" # parameter value unit
    pattern_2 = r"([A-Za-z0-9\s]+)\s*\(([\d\.\-]+)\)\s*([a-zA-Z%\/]+)\s+(\d+\.?\d*)" # parameter with number range unit value
    pattern_3 = r"([A-Za-z0-9\-\s\(\).]+)\s*\(([^)]+)\)\s*([a-zA-Z%/]+)\s*(\d+\.?\d*)" # parameter with brackets range unit value
    pattern_4 = r"([A-Za-z0-9\-\s\(\).]+)([a-zA-Z%/]+)(\d+\.?\d*)" # parameter unit value
    pattern_5 = r"([A-Za-z0-9\-\s\(\).]+)\s*\(([^)]+)\)\s*(\d+\.?\d*)\s*([a-zA-Z%/]+)" # parameter range value unit

    # (pattern, value group, unit group); the parameter name is always group 1.
    extraction_rules = (
        (pattern_2, 4, 3),
        (pattern_3, 4, 3),
        (pattern_1, 2, 3),
        (pattern_4, 3, 2),
        (pattern_5, 3, 4),
    )

    def extract_relations(doc):
        # Always scan the text of the doc being processed. Reading the
        # notebook-level `text` variable here would return the training
        # document's relations for every input.
        source_text = doc.text
        relations = []
        for pattern, value_group, unit_group in extraction_rules:
            for match in re.finditer(pattern, source_text):
                test_name = match.group(1)
                para_name = is_similar(test_name.strip(), parameter_terms)[0].strip()
                if para_name and para_name != 'Not Found':
                    relations.append(
                        (para_name, match.group(value_group), match.group(unit_group))
                    )
        doc._.relations = relations
        return doc

    return extract_relations

# Register the factory function under the component name "extract_relations"
spacy.Language.factory("extract_relations", func=relation_extractor)

# Add the component to the pipeline after 'ner', unless it is already present
if "extract_relations" not in nlp.pipe_names:
    nlp.add_pipe("extract_relations", after="ner")

In [18]:
# Using the same regex pattern to generate training data for the SpaCy model.
# There are a few errors in the regex output; attempts to resolve them were unsuccessful due to OCR inaccuracies.
# Additional rules can be implemented on the regex output to further clean the training data.
# I abandoned this idea and am now exploring an alternative approach.
# Let's see what output SpaCy produces and determine our next steps.
# Function to generate training data: the format should be (text, {'entities': ()}) for SpaCy.

def generate_training_data(text, parameters):
    """Build one spaCy NER training sample from ``text`` using the regex patterns.

    Every regex match whose captured name resolves through ``is_similar``
    contributes three character-offset annotations: ``'parameter'``,
    ``'value'`` and ``'unit'``. All instances are used, not only the first per
    parameter. Annotations are then sorted by start offset, and any annotation
    that starts before the end of the previously kept one is dropped.

    Args:
        text: Cleaned OCR text.
        parameters: Known parameter names (abbreviations and synonyms).

    Returns:
        ``(text, {'entities': [(start, end, label), ...]})``.
    """
    pattern_1 = r"([A-Za-z\s]+)\s+(\d+\.?\d*)\s+([a-zA-Z%\/]+)" # parameter value unit
    pattern_2 = r"([A-Za-z0-9\s]+)\s*\(([\d\.\-]+)\)\s*([a-zA-Z%\/]+)\s+(\d+\.?\d*)" # parameter with number range unit value
    pattern_3 = r"([A-Za-z0-9\-\s\(\).]+)\s*\(([^)]+)\)\s*([a-zA-Z%/]+)\s*(\d+\.?\d*)" # parameter with brackets range unit value
    pattern_4 = r"([A-Za-z0-9\-\s\(\).]+)([a-zA-Z%/]+)(\d+\.?\d*)" # parameter unit value
    pattern_5 = r"([A-Za-z0-9\-\s\(\).]+)\s*\(([^)]+)\)\s*(\d+\.?\d*)\s*([a-zA-Z%/]+)" # parameter range value unit

    # (pattern, value group, unit group); the parameter name is always group 1.
    extraction_rules = (
        (pattern_2, 4, 3),
        (pattern_3, 4, 3),
        (pattern_1, 2, 3),
        (pattern_4, 3, 2),
        (pattern_5, 3, 4),
    )

    entities = []
    for pattern, value_group, unit_group in extraction_rules:
        for match in re.finditer(pattern, text):
            # One lookup per match gives both the matched name and its offset
            # inside group 1, so no pattern reuses an offset computed for a
            # different match.
            similar_test_name, name_offset = is_similar(match.group(1), parameters)
            trimmed_name = similar_test_name.strip()
            if similar_test_name == 'Not Found' or not trimmed_name:
                continue
            # Exclude surrounding whitespace from the span so it lines up with
            # the visible parameter text.
            leading_ws = len(similar_test_name) - len(similar_test_name.lstrip())
            name_start = match.start(1) + name_offset + leading_ws
            entities.append((name_start, name_start + len(trimmed_name), 'parameter'))
            entities.append((match.start(value_group), match.end(value_group), 'value'))
            entities.append((match.start(unit_group), match.end(unit_group), 'unit'))

    sorted_entities = sorted(entities, key=lambda x: x[0])  # Sort by starting index
    filtered = []
    last_end = 0
    # Checking for potential overlaps: keep an entity only if it starts at or
    # after the end of the last kept one.
    for entity in sorted_entities:
        if entity[0] >= last_end:  # No overlap
            filtered.append(entity)
            last_end = entity[1]
    return (text, {'entities': filtered})

In [19]:
# In the function definition, we have taken all instances instead of just the first to generate sufficient data.
# The first instance can also be extracted from the SpaCy output.
# Note: TRAIN_DATA is a single (text, {'entities': [...]}) sample, not a list of samples.
TRAIN_DATA = generate_training_data(text, parameter_terms)

In [20]:
# Saving the training data (written as 4-space-indented JSON by save_data).
save_data('TRAIN_DATA.json', TRAIN_DATA)

In [21]:
# Function to train the model.
# Not using dropout since overfitting shouldn't be an issue, as our entities won't change.
# Therefore, we are proceeding with 40 iterations.
def train_model(nlp, TRAIN_DATA, n_iter=40):
    """Train ``nlp`` on a single annotated sample.

    Args:
        nlp: The spaCy pipeline to train; it is initialized here, which
            resets the weights of its trainable components.
        TRAIN_DATA: One ``(text, annotations)`` pair, where ``annotations`` is
            a dict such as ``{'entities': [(start, end, label), ...]}``.
        n_iter: Number of update steps to run on the sample. Defaults to 40.

    Returns:
        The same ``nlp`` object after training.
    """
    # Initialize exactly once: a second call would only re-initialize the
    # weights again. `initialize` replaces the deprecated `begin_training`.
    optimizer = nlp.initialize()
    text, annotations = TRAIN_DATA
    for _ in range(n_iter):
        # Build a fresh Example for every step, as the original loop did.
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], sgd=optimizer)
    return nlp

In [22]:
# Training the model; train_model returns the same pipeline object, so `nlp` now refers to the trained pipeline
nlp = train_model(nlp, TRAIN_DATA)

In [23]:
# Evaluating the model
def evaluate_model(nlp, text):
    """Run ``nlp`` on ``text``, render its entities with displaCy and print the relations.

    Nothing is returned; the output is the notebook rendering plus one printed
    line with the content of ``doc._.relations``.
    """
    processed = nlp(text)
    displacy.render(processed, style="ent", jupyter=True)
    # Relations are filled in by the extract_relations pipeline component.
    extracted = processed._.relations
    print("Extracted Relations:", extracted)

# Evaluated on the same text the model was trained on, so this is a sanity check rather than a held-out evaluation.
evaluate_model(nlp, text)

Extracted Relations: [('TSH', '0.73', 'miIU/L'), ('FT4', '15', 'pmol/L'), ('FT3', '4.8', 'pmol/L'), ('Iron', '40', 'umol/L'), ('T', '24', 'umol/L'), ('Ferritin', '63', 'ug/L'), ('iPTH', '8.3', 'pmol/L'), ('B12', '252', 'pmol/L'), ('Na', '140', 'mmol/L'), ('K', '4.5', 'mmol/L'), ('Cl', '106', 'mmol/L'), ('HCO3', '24', 'mmol/L'), ('Urea', '6.5', 'mmol/L'), ('Urate', '0.28', 'mmol/L'), ('Phos', '65', 'U/L'), ('Protein', '69', 'g/L'), ('Albumin', '41', 'g/L'), ('Ca', '2.39', 'mmol/L'), ('Ca', '2.43', 'mmol/L'), ('PO4', '1.00', 'mmol/L'), ('Mg', '0.82', 'mmol/L'), ('Hb', '141', 'g/L'), ('MCH', '35', 'pg'), ('MCHC', '332', 'g/L'), ('TSH', '0.73', 'miIU/L'), ('FT4', '15', 'pmol/L'), ('FT3', '4.8', 'pmol/L'), ('Iron', '40', 'umol/L'), ('T', '24', 'umol/L'), ('Ferritin', '63', 'ug/L'), ('iPTH', '8.3', 'pmol/L'), ('CEA', '4.2', 'ug/L'), ('CA-19.9', '10', 'U/mL'), ('CA125', '17', 'U/mL'), ('B12', '252', 'pmol/L'), ('Folate', '23.1', 'nmol/L'), ('Na', '140', 'mmol/L'), ('K', '4.5', 'mmol/L'), ('Cl

In [24]:
# Saving the NER model (the whole pipeline) to ./medical_ner_model
# NOTE(review): the pipeline contains the custom "add_phrase_matcher" and "extract_relations"
# components; presumably these must be registered again before the saved model can be loaded - confirm.
nlp.to_disk('./medical_ner_model')