## Extract information with context

Installing amazon-textract-helper also pulls in other dependencies, such as the pretty-printer and the caller.

In [None]:
!python -m pip install amazon-textract-helper amazon-textract-geofinder

In [None]:
from textractgeofinder.ocrdb import AreaSelection
from textractgeofinder.tgeofinder import KeyValue, TGeoFinder, AreaSelection, SelectionElement
from textractprettyprinter.t_pretty_print import get_forms_string
from textractcaller import call_textract
from textractcaller.t_call import Textract_Features

import trp.trp2 as t2

In [None]:
# Sample intake form used throughout this notebook (path relative to the working directory).
image_filename = "./tests/data/patient_intake_form_sample.jpg"

This is the image we want to extract information from.

It has a form structure with repeating key names.

For example, each question has the keys "Yes" and "No" as answers, each of which is a checkbox.

The patient and emergency contact information use the same key names, such as "First Name:" or "Last Name:".

In [None]:
from IPython.display import Image

# Keep this as the last expression of the cell so the notebook renders the image inline.
Image(filename=image_filename)

Calling Amazon Textract with the textractcaller is easy.

In [None]:
# Send the image to Amazon Textract, requesting the FORMS feature;
# `j` holds the response that the following cells work on.
j = call_textract(
    input_document=image_filename,
    features=[Textract_Features.FORMS],
)

The key/value pairs include several with the same name. We need to identify the context for each of them.

In [None]:
# Show the key/value pairs of the unmodified Textract response.
forms_overview = get_forms_string(j)
print(forms_overview)

Initializing the GeoFinder

In [None]:
# Dimensions handed to TGeoFinder; doc_width is also used below as the
# right-hand edge (x) of every area selection.
doc_width = 1000
doc_height = 1000

# Deserialize the response into a TDocument, which later cells extend with new keys.
t_document = t2.TDocumentSchema().load(j)

# The GeoFinder works on the same Textract response.
geofinder_doc = TGeoFinder(j, doc_height=doc_height, doc_width=doc_width)

## Identify context specific information

`set_hierarchy_kv` is a small helper function that adds "virtual" keys, which we use to indicate context.

In [None]:
def set_hierarchy_kv(list_kv: list[KeyValue], t_document: t2.TDocument, page_block: t2.TBlock, prefix="BORROWER"):
    """Add a context-prefixed virtual key for every key/value pair in ``list_kv``.

    For each entry, a virtual key named ``<prefix>_<key text>`` is registered on
    ``t_document`` for the entry's existing key block.

    Args:
        list_kv: Key/value pairs whose keys should receive a prefixed alias.
        t_document: Document on which the virtual keys are added.
        page_block: Page block passed through to the virtual-key call.
        prefix: Context label put in front of each key text (defaults to "BORROWER").
    """
    for key_value in list_kv:
        source_key = key_value.key
        virtual_key_name = f"{prefix}_{source_key.text}"
        # Look up the block of the already existing key by its id.
        existing_key_block = t_document.get_block_by_id(source_key.id)
        t_document.add_virtual_key_for_existing_key(
            key_name=virtual_key_name,
            existing_key=existing_key_block,
            page_block=page_block,
        )

We find the relevant phrases in the document to specify the area of key value pairs related to the patient information.

In [None]:
# Patient information: take the first match of the two phrases that
# enclose the patient section of the form.
patient_information = geofinder_doc.find_phrase_on_page("patient information")[0]
emergency_contact_1 = geofinder_doc.find_phrase_on_page("emergency contact 1:", min_textdistance=0.99)[0]

# Full-width area from the bottom of the "patient information" phrase
# to the top of the "emergency contact 1:" phrase.
top_left = t2.TPoint(x=0, y=patient_information.ymax)
lower_right = t2.TPoint(x=doc_width, y=emergency_contact_1.ymin)
patient_area = AreaSelection(top_left=top_left, lower_right=lower_right)
form_fields = geofinder_doc.get_form_fields_in_area(area_selection=patient_area)

Then we use this information to add new key/value pairs to the Amazon Textract Response JSON schema.

In [None]:
# Add a PATIENT_-prefixed virtual key for every form field found in the patient area.
set_hierarchy_kv(
    list_kv=form_fields,
    t_document=t_document,
    page_block=t_document.pages[0],
    prefix="PATIENT",
)

In [None]:
# Serialize the extended document back to JSON and list its key/value pairs.
enriched_json = t2.TDocumentSchema().dump(t_document)
print(get_forms_string(enriched_json))

The same for emergency contacts

In [None]:
# Emergency contact 1: full-width area between the "emergency contact 1:"
# phrase (located in an earlier cell) and the "emergency contact 2:" phrase.
emergency_contact_2 = geofinder_doc.find_phrase_on_page("emergency contact 2:", min_textdistance=0.99)[0]
top_left = t2.TPoint(x=0, y=emergency_contact_1.ymax)
lower_right = t2.TPoint(x=doc_width, y=emergency_contact_2.ymin)
contact_1_area = AreaSelection(top_left=top_left, lower_right=lower_right)
form_fields = geofinder_doc.get_form_fields_in_area(area_selection=contact_1_area)
set_hierarchy_kv(
    list_kv=form_fields,
    t_document=t_document,
    page_block=t_document.pages[0],
    prefix="EMERGENCY_CONTACT_1",
)

# Emergency contact 2: full-width area between the "emergency contact 2:"
# phrase and the first health question.
fever_question = geofinder_doc.find_phrase_on_page("did you feel fever or feverish lately")[0]
top_left = t2.TPoint(x=0, y=emergency_contact_2.ymax)
lower_right = t2.TPoint(x=doc_width, y=fever_question.ymin)
contact_2_area = AreaSelection(top_left=top_left, lower_right=lower_right)
form_fields = geofinder_doc.get_form_fields_in_area(area_selection=contact_2_area)
set_hierarchy_kv(
    list_kv=form_fields,
    t_document=t_document,
    page_block=t_document.pages[0],
    prefix="EMERGENCY_CONTACT_2",
)


In [None]:
# Serialize the extended document back to JSON and list its key/value pairs,
# now including the emergency-contact keys.
enriched_json = t2.TDocumentSchema().dump(t_document)
print(get_forms_string(enriched_json))

This function makes it easier to add selection elements to the Amazon Textract Response JSON schema

In [None]:
def add_sel_elements(t_document: t2.TDocument, selection_values: list[SelectionElement], key_base_name: str,
                     page_block: t2.TBlock) -> t2.TDocument:
    """Add one key/value entry per selection element to ``t_document``.

    The key name is ``<key_base_name>-><KEY>``, where ``<KEY>`` is the upper-cased
    text of the element's key parts joined with underscores; the value is the
    block of the element's selection.

    Args:
        t_document: Document that receives the new key/value entries.
        selection_values: Selection elements to add.
        key_base_name: Context label placed before the ``->`` in each key name.
        page_block: Page block passed through to ``add_key_values``.

    Returns:
        The same ``t_document`` that was passed in.
    """
    for element in selection_values:
        # Key parts without text are left out of the key name.
        key_parts = [part.original_text.upper() for part in element.key if part.original_text]
        joined_key = "_".join(key_parts)

        # Skip elements that have no usable key text or no selection text.
        if not joined_key or not element.selection.original_text:
            continue

        selection_block = t_document.get_block_by_id(element.selection.id)
        t_document.add_key_values(page_block=page_block,
                                  key_name=f"{key_base_name}->{joined_key}",
                                  values=[selection_block])
    return t_document


Now we can identify the areas of the questions and add those to the JSON response schema as well

In [None]:
def add_question_answers(t_document: t2.TDocument, question, key_base_name: str, margin: int = 50) -> t2.TDocument:
    """Add the selection elements found around ``question`` to ``t_document``.

    The search area spans the full document width (x from 0 to ``doc_width``)
    and reaches ``margin`` above ``question.ymin`` and below ``question.ymax``.
    Every selection element found there is added via ``add_sel_elements`` under
    the key ``<key_base_name>-><KEY>`` on the first page of the document.

    Args:
        t_document: Document that receives the new key/value entries.
        question: Located question phrase; only its ``ymin``/``ymax`` are read.
        key_base_name: Context label for the generated keys, e.g. "FEVER".
        margin: Vertical padding around the question, in the same units as
            ``ymin``/``ymax``.

    Returns:
        The document returned by ``add_sel_elements``.
    """
    area = AreaSelection(top_left=t2.TPoint(y=question.ymin - margin, x=0),
                         lower_right=t2.TPoint(y=question.ymax + margin, x=doc_width))
    sel_values: list[SelectionElement] = geofinder_doc.get_selection_values_in_area(area_selection=area,
                                                                                     exclude_ids=[])
    return add_sel_elements(t_document=t_document,
                            selection_values=sel_values,
                            key_base_name=key_base_name,
                            page_block=t_document.pages[0])


# fever: the question phrase was already located in an earlier cell
t_document = add_question_answers(t_document, fever_question, "FEVER")

# shortness of breath
shortness_question = geofinder_doc.find_phrase_on_page("Are you having shortness of breath")[0]
t_document = add_question_answers(t_document, shortness_question, "SHORTNESS")

# Remaining questions as (key base name, phrase to locate), processed in this order.
# NOTE(review): "Where you in contact ..." is kept verbatim; presumably it
# matches the wording printed on the form - confirm before correcting it.
remaining_questions = [
    ("COUGH", "Do you have a cough"),
    ("LOSS_OF_TASTE", "Did you experience loss of taste or smell"),
    ("COVID_CONTACT", "Where you in contact with any confirmed"),
    ("TRAVEL", "Did you travel in the past 14 days"),
]
for key_base_name, phrase in remaining_questions:
    # Take the first match of the phrase.
    question = geofinder_doc.find_phrase_on_page(phrase)[0]
    t_document = add_question_answers(t_document, question, key_base_name)


All the keys now have a context which makes it possible to parse the response in downstream processes

In [None]:
# Serialize the extended document back to JSON and list all key/value pairs,
# including the context-specific ones added above.
enriched_json = t2.TDocumentSchema().dump(t_document)
print(get_forms_string(enriched_json))

Here is a little snippet showing how we can use the Amazon Textract FORMS information in a Pandas DataFrame.

In [None]:
!python -m pip install pandas

In [None]:
import pandas as pd

In [None]:
from textractprettyprinter.t_pretty_print import convert_form_to_list
from trp import Document

# Serialize the extended document and parse the JSON with the trp Document class.
enriched_json = t2.TDocumentSchema().dump(t_document)
tdoc = Document(enriched_json)

# One DataFrame per page, built from that page's form converted to a list.
dfs = [pd.DataFrame(convert_form_to_list(trp_form=page.form)) for page in tdoc.pages]

In [None]:
# Display the form DataFrame built for the first page.
dfs[0]