# Initialization

In [4]:
!pip install prettytable
from collections import defaultdict

import boto3
import numpy as np
import prettytable
import sagemaker
import scipy.cluster.hierarchy as hcluster
from sagemaker import get_execution_role
from sagemaker.mxnet import MXNetPredictor
from sagemaker.predictor import RealTimePredictor, json_serializer, json_deserializer

role = get_execution_role()
session = boto3.Session(region_name='us-west-2')
sagemaker_session = sagemaker.Session(boto_session=session)


def l2_distance(field, value):
    return np.linalg.norm((np.array(field["center"]) - np.array(value["center"])))


def get_center(bbox):  # {'top': 911, 'height': 31, 'width': 328, 'left': 961}
    return bbox['top'] + bbox['height'] / 2, bbox["left"] + bbox["width"] / 2


class JSONPredictor(RealTimePredictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(JSONPredictor, self).__init__(endpoint_name, sagemaker_session, json_serializer, json_deserializer)


loc_predictor = MXNetPredictor('localization-model-2019-01-11', sagemaker_session)
field_matching = JSONPredictor('field-match-2019-01-24-12-39-05-522', sagemaker_session)
ml_field_matching = MXNetPredictor("field-match-ml-2019-01-20")
print("predictors reference created")

[33mYou are using pip version 10.0.1, however version 19.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
predictors reference created


# calling the localizer

In [5]:
bucket = "unum-files"
file_name = "field-match-demo/Accident Claim small- 2_3.tiff"
loc_out = loc_predictor.predict({"url": "s3://{}/{}".format(bucket, file_name)})
print("localized")
print(loc_out)


localized
{'num_hw_crops': 26, 'hw_key': 'field-match-demo/Accident Claim small- 2_3/hand_written.pkl', 'hp_key': 'field-match-demo/Accident Claim small- 2_3/hand_printed.pkl', 'bucket_name': 'unum-files', 'num_hp_crops': 6}


# calling the field matching

In [6]:
data = {
    "hw_endpoint": 'pytorch-handwriting-ocr-2018-11-21-20-10-49-542',  
    "hp_endpoint": 'hand-printed-model-2018-12-10',#''  #  
    "sp_endpoint": "spell-corrector-2018-11-26-17-44-10-615",
    
    #"hw_endpoint":"pytorch-handwriting-ocr-2018-11-21-20-10-49-542",
    #"hp_endpoint":"sagemaker-mxnet-2018-11-03-23-32-01-918",
    #"sp_endpoint":"spell-corrector-2018-11-26-17-44-10-615",
    
    "field_names": [{"bucket": "ahmedb-test", "filename": "field_name_list.txt"},
                    {"bucket": "unum-files", "filename": "unum_field_names.txt"}],
    "field_names_ignore": [
        {"bucket": "ahmedb-test", "filename": "must_ignore.txt"},
        {"bucket": "unum-files", "filename": "unum_must_ignore_field_names.txt"}
    ],

    #"hw_pickle": {"bucket": "unum-files", "filename": "preprocessed/0a654812-21c5-4482-b054-9f0d2425df42$1/hand_written.pkl"},
    #"hp_pickle": {"bucket": "unum-files", "filename": "preprocessed/0a654812-21c5-4482-b054-9f0d2425df42$1/hand_printed.pkl"},
    #"page_image": {"bucket": "unum-files", "filename": "preprocessed/0a654812-21c5-4482-b054-9f0d2425df42$1.tiff"},

    "hw_pickle": {"bucket": loc_out['bucket_name'], "filename": loc_out['hw_key']},
    "hp_pickle": {"bucket": loc_out['bucket_name'], "filename": loc_out['hp_key']},
    "page_image": {"bucket": bucket, "filename": file_name},

}

fields = []
values = []
text_to_score={}
initial_matching = field_matching.predict(data)
original_match = prettytable.PrettyTable(["field", "values", "field score", "value score"])
for pair in initial_matching['field_match_output']:
    fields.append({"string": pair['field_name'], "bbox": pair['bbox'], "center": get_center(pair['bbox'])})
    
    text_to_score[pair['field_name']]= pair["confidence"]
    if pair["value"]['bbox'] != {'top': -1, 'height': -1, 'width': -1, 'left': -1}:
        values.append({"string": pair["value"]['field_value'], "bbox": pair["value"]['bbox'], "center": get_center(pair["value"]['bbox'])})
        text_to_score[pair["value"]['field_value']] = pair["value"]['confidence']

    # print({"strings": {"field": , "value": pair["value"]['field_value']},
    #        "bboxs": {"field": pair['bbox'], "value": pair["value"]['bbox']}})
    original_match.add_row([pair['field_name'], pair["confidence"],
                            pair["value"]['field_value'], pair["value"]['confidence']
                            ])
    
    
print(original_match)


+----------------------------------------------------------------------------------------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|                                         field                                          |       values       |                                                                                      field score                                                                                       |     value score     |
+----------------------------------------------------------------------------------------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|                             ATTENDING PHY

# rematching by nearest

In [7]:
predictions_act = prettytable.PrettyTable(["field", "field score", "values", "value score", "score"])
for field in fields:
    candidates = []
    for value in values:
        candidates.append((value, l2_distance(field, value)))

    nearest = list(map(lambda item: item[0]["string"], sorted(candidates, key=lambda item: item[1])[:5]))
    input_to_matching = {"field_names": [field["string"]], "field_values": nearest}
    results = ml_field_matching.predict(input_to_matching)  # siamese string field match
    for result in sorted(results, key=lambda item: -item["score"]):
        predictions_act.add_row([result["field"],
                                 text_to_score[result["field"]],
                                 result["value"],
                                    text_to_score[result["value"]],
                                 result["score"],
                                 ])

print(predictions_act)

+----------------------------------------------------------------------------------------+--------------------+---------------------------------------------+---------------------+----------------------+
|                                         field                                          |    field score     |                    values                   |     value score     |        score         |
+----------------------------------------------------------------------------------------+--------------------+---------------------------------------------+---------------------+----------------------+
|                             ATTENDING PHYSICIAN STATEMENT                              | 0.9559999999999998 |               JoNES, Miriam I               |  0.8693156807919513 |  0.5575166344642639  |
|             Insured/Policyholder Name (Last Name, First Name, MI, Suffix)              | 0.9439999999999997 |               JoNES, Miriam I               |  0.8693156807919513 |  0.54261

# rematching using hugarian alg

In [8]:
fields_strings = list(map(lambda item: item["string"], fields))
values_strings = list(map(lambda item: item["string"], values))

print(len(fields_strings))
print(len(values_strings))
data = {'field_names': fields_strings, 'field_values':values_strings}
results = ml_field_matching.predict(data)
for result in results:
    print(result)

39
19


ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (504) from model with message "<html>
<head><title>504 Gateway Time-out</title></head>
<body bgcolor="white">
<center><h1>504 Gateway Time-out</h1></center>
<hr><center>nginx/1.10.3 (Ubuntu)</center>
</body>
</html>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/field-match-ml-2019-01-20 in account 620580205565 for more information.

# rematching using hugarian alg + clustering

In [14]:
points_2d = []
for field in fields:
    points_2d.append(field["center"])
for value in values:
    points_2d.append(value["center"])

points_2d = np.array(points_2d)

# clustering
thresh = 250
clusters = hcluster.fclusterdata(points_2d, thresh, criterion="distance")

groupings = defaultdict(lambda: {'field_names': [], 'field_values': []})
for index, class_ in enumerate(clusters):
    if index >= len(fields):
        groupings[class_]["field_values"].append(values[index - len(fields)]["string"])
    else:
        groupings[class_]["field_names"].append(fields[index]["string"])

for cluster in [grouping for grouping in groupings.values()]:
    cluster = {"field_names": list(set(cluster["field_names"])), "field_values": list(set(cluster["field_values"]))}
    predictions_act= prettytable.PrettyTable(["field", "field score", "values", "value score", "score"])
        
    if cluster["field_names"] and cluster["field_values"]:
        import pprint
        pprint.pprint(cluster)
        results = ml_field_matching.predict(cluster)
        for result in sorted(results, key=lambda item: -item["score"]):
            predictions_act.add_row([result["field"],
                                     text_to_score[result["field"]],
                                     result["value"],
                                     text_to_score[result["value"]],
                                     result["score"],
                                     ])
        print(predictions_act)
        print("=" * 50)

{'field_names': ['Insured/Policyholder Name (Last Name, First Name, MI, '
                 'Suffix) ',
                 'ACCIDENT DETAILS ',
                 'ATTENDING PHYSICIAN STATEMENT ',
                 'Patient Name (Last Name, First Name, MI. Suffix) '],
 'field_values': ['JoNES, Miriam I ', 'JoNE,Miriam I ']}
+---------------------------------------------------+--------------------+------------------+--------------------+---------------------+
|                       field                       |    field score     |      values      |    value score     |        score        |
+---------------------------------------------------+--------------------+------------------+--------------------+---------------------+
| Patient Name (Last Name, First Name, MI. Suffix)  | 0.9426666666666667 | JoNES, Miriam I  | 0.8693156807919513 |  0.5482492446899414 |
|           ATTENDING PHYSICIAN STATEMENT           | 0.9559999999999998 |  JoNE,Miriam I   | 0.8768231259082366 | 0.448441386222839