# Initialization

In [2]:
!pip install prettytable
from collections import defaultdict

import boto3
import numpy as np
import prettytable
import sagemaker
import scipy.cluster.hierarchy as hcluster
from sagemaker import get_execution_role
from sagemaker.mxnet import MXNetPredictor
from sagemaker.predictor import RealTimePredictor, json_serializer, json_deserializer

role = get_execution_role()
session = boto3.Session(region_name='us-west-2')
sagemaker_session = sagemaker.Session(boto_session=session)


def l2_distance(field, value):
    return np.linalg.norm((np.array(field["center"]) - np.array(value["center"])))


def get_center(bbox):  # {'top': 911, 'height': 31, 'width': 328, 'left': 961}
    return bbox['top'] + bbox['height'] / 2, bbox["left"] + bbox["width"] / 2


class JSONPredictor(RealTimePredictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(JSONPredictor, self).__init__(endpoint_name, sagemaker_session, json_serializer, json_deserializer)


loc_predictor = MXNetPredictor('localization-model-2019-01-11', sagemaker_session)
field_matching = JSONPredictor('field-match-2019-01-24-12-39-05-522', sagemaker_session)
ml_field_matching = MXNetPredictor("field-match-ml-2019-01-20")
print("predictors reference created")

Collecting prettytable
  Downloading https://files.pythonhosted.org/packages/ef/30/4b0746848746ed5941f052479e7c23d2b56d174b82f4fd34a25e389831f5/prettytable-0.7.2.tar.bz2
Building wheels for collected packages: prettytable
  Running setup.py bdist_wheel for prettytable ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/80/34/1c/3967380d9676d162cb59513bd9dc862d0584e045a162095606
Successfully built prettytable
Installing collected packages: prettytable
Successfully installed prettytable-0.7.2
[33mYou are using pip version 10.0.1, however version 19.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
predictors reference created


# calling the localizer

In [3]:
bucket = "unum-files"
file_name = "field-match-demo/Accident Claim small- 2_3.tiff"
loc_out = loc_predictor.predict({"url": "s3://{}/{}".format(bucket, file_name)})
print("localized")
print(loc_out)


localized
{'num_hw_crops': 26, 'hw_key': 'field-match-demo/Accident Claim small- 2_3/hand_written.pkl', 'hp_key': 'field-match-demo/Accident Claim small- 2_3/hand_printed.pkl', 'bucket_name': 'unum-files', 'num_hp_crops': 6}


# calling the field matching

In [4]:
data = {
    "hw_endpoint": 'pytorch-handwriting-ocr-2018-11-21-20-10-49-542',  
    "hp_endpoint": 'hand-printed-model-2018-12-10',#''  #  
    "sp_endpoint": "spell-corrector-2018-11-26-17-44-10-615",
    
    #"hw_endpoint":"pytorch-handwriting-ocr-2018-11-21-20-10-49-542",
    #"hp_endpoint":"sagemaker-mxnet-2018-11-03-23-32-01-918",
    #"sp_endpoint":"spell-corrector-2018-11-26-17-44-10-615",
    
    "field_names": [{"bucket": "ahmedb-test", "filename": "field_name_list.txt"},
                    {"bucket": "unum-files", "filename": "unum_field_names.txt"}],
    "field_names_ignore": [
        {"bucket": "ahmedb-test", "filename": "must_ignore.txt"},
        {"bucket": "unum-files", "filename": "unum_must_ignore_field_names.txt"}
    ],

    #"hw_pickle": {"bucket": "unum-files", "filename": "preprocessed/0a654812-21c5-4482-b054-9f0d2425df42$1/hand_written.pkl"},
    #"hp_pickle": {"bucket": "unum-files", "filename": "preprocessed/0a654812-21c5-4482-b054-9f0d2425df42$1/hand_printed.pkl"},
    #"page_image": {"bucket": "unum-files", "filename": "preprocessed/0a654812-21c5-4482-b054-9f0d2425df42$1.tiff"},

    "hw_pickle": {"bucket": loc_out['bucket_name'], "filename": loc_out['hw_key']},
    "hp_pickle": {"bucket": loc_out['bucket_name'], "filename": loc_out['hp_key']},
    "page_image": {"bucket": bucket, "filename": file_name},

}

fields = []
values = []
text_to_score={}
initial_matching = field_matching.predict(data)
original_match = prettytable.PrettyTable(["field", "values", "field score", "value score"])
for pair in initial_matching['field_match_output']:
    fields.append({"string": pair['field_name'], "bbox": pair['bbox'], "center": get_center(pair['bbox'])})
    
    text_to_score[pair['field_name']]= pair["confidence"]
    if pair["value"]['bbox'] != {'top': -1, 'height': -1, 'width': -1, 'left': -1}:
        values.append({"string": pair["value"]['field_value'], "bbox": pair["value"]['bbox'], "center": get_center(pair["value"]['bbox'])})
        text_to_score[pair["value"]['field_value']] = pair["value"]['confidence']

    # print({"strings": {"field": , "value": pair["value"]['field_value']},
    #        "bboxs": {"field": pair['bbox'], "value": pair["value"]['bbox']}})
    original_match.add_row([pair['field_name'], pair["confidence"],
                            pair["value"]['field_value'], pair["value"]['confidence']
                            ])
    
    
print(original_match)


+----------------------------------------------------------------------------------------+--------------------+------------------+--------------------+
|                                         field                                          |       values       |   field score    |    value score     |
+----------------------------------------------------------------------------------------+--------------------+------------------+--------------------+
|                             ATTENDING PHYSICIAN STATEMENT                              | 0.9559999999999998 |                  |         0          |
|             Insured/Policyholder Name (Last Name, First Name, MI, Suffix)              | 0.9439999999999997 |  JoNE,Miriam I   | 0.8768231259082366 |
|                               Date of Birth (mm/dd/yy)                                 | 0.9542857142857143 |    05-01-75      | 0.9599531599599019 |
|                   Patient Name (Last Name, First Name, MI. Suffix)                    

# rematching by nearest

In [49]:
predictions_act = prettytable.PrettyTable(["field", "field score", "values", "value score", "score"])
dist_thresh = 100
for field in fields:
    print(field["string"])
    candidates = []
    for value in values:
        print(value["string"])
        l2_dist = l2_distance(field, value)
        print(str(l2_dist))
        if(l2_dist < dist_thresh):
            candidates.append((value, l2_dist))
            print('added')

    nearest = list(map(lambda item: item[0]["string"], sorted(candidates, key=lambda item: item[1])[:5]))
    input_to_matching = {"field_names": [field["string"]], "field_values": nearest}
    if(len(nearest) != 0):
        results = ml_field_matching.predict(input_to_matching)  # siamese string field match
    else:
        results = [{"field": field["string"], "value": '', "score": 0}]
        text_to_score[''] = ''
    for result in sorted(results, key=lambda item: -item["score"]):
        predictions_act.add_row([result["field"],
                                 text_to_score[result["field"]],
                                 result["value"],
                                 text_to_score[result["value"]],
                                 result["score"],
                                 ])

print(predictions_act)


ATTENDING PHYSICIAN STATEMENT 
JoNE,Miriam I 
152.70641767784352
05-01-75 
1584.9645106436926
JoNES, Miriam I 
261.55783299301135
05-01-75 
1602.4197327791492
08-03-18 
1337.736894908711
06-05-18 
1618.4189815990171
Insured/Policyholder Name (Last Name, First Name, MI, Suffix) 
JoNE,Miriam I 
66.66520831738246
added
05-01-75 
1521.9535144018032
JoNES, Miriam I 
185.52762597521695
05-01-75 
1534.8058509140496
08-03-18 
1262.4872276581652
06-05-18 
1560.2621574594443
Date of Birth (mm/dd/yy) 
JoNE,Miriam I 
1436.9777312122828
05-01-75 
81.6899626637202
added
JoNES, Miriam I 
1455.3038342559262
05-01-75 
191.8000260688199
08-03-18 
417.805277611473
06-05-18 
2269.1296238866566
Patient Name (Last Name, First Name, MI. Suffix) 
JoNE,Miriam I 
132.15332004910056
05-01-75 
1611.8552199251644
JoNES, Miriam I 
128.02441173463754
05-01-75 
1615.0613765426997
08-03-18 
1324.5777629116383
06-05-18 
1432.4246053457753
Date of Birth (mm/dd/yy) 
JoNE,Miriam I 
1437.086114329966
05-01-75 
83.852549156

# rematching using hugarian alg

In [6]:
fields_strings = list(map(lambda item: item["string"], fields))
values_strings = list(map(lambda item: item["string"], values))

print(len(fields_strings))
print(len(values_strings))
data = {'field_names': fields_strings, 'field_values':values_strings}
results = ml_field_matching.predict(data)
for result in results:
    print(result)

39
6
{'score': 0.9799382090568542, 'value': '08-03-18 ', 'field': 'ifyes, date of accident (mm/dd/yy) '}
{'score': 0.8004761338233948, 'value': 'JoNE,Miriam I ', 'field': 'uffix) Please '}
{'score': 0.9768258929252625, 'value': '06-05-18 ', 'field': 'If yes, as of what date (mm/dd/yy) '}
{'score': 0.5575166344642639, 'value': 'JoNES, Miriam I ', 'field': 'ATTENDING PHYSICIAN STATEMENT '}
{'score': 0.9869855046272278, 'value': '05-01-75 ', 'field': 'Actual Delivery Date (mm/dd/yy '}
{'score': 0.9762994050979614, 'value': '05-01-75 ', 'field': 'Date '}


# rematching using hugarian alg + clustering

In [21]:
points_2d = []
for field in fields:
    points_2d.append(field["center"])
for value in values:
    points_2d.append(value["center"])

points_2d = np.array(points_2d)

# clustering
thresh = 250
clusters = hcluster.fclusterdata(points_2d, thresh, criterion="distance")

groupings = defaultdict(lambda: {'field_names': [], 'field_values': []})
for index, class_ in enumerate(clusters):
    if index >= len(fields):
        groupings[class_]["field_values"].append(values[index - len(fields)]["string"])
    else:
        groupings[class_]["field_names"].append(fields[index]["string"])

for cluster in [grouping for grouping in groupings.values()]:
    cluster = {"field_names": list(set(cluster["field_names"])), "field_values": list(set(cluster["field_values"]))}
    predictions_act= prettytable.PrettyTable(["field", "field score", "values", "value score", "score"])
        
    if cluster["field_names"] and cluster["field_values"]:
        import pprint
        pprint.pprint(cluster)
        results = ml_field_matching.predict(cluster)
        for result in sorted(results, key=lambda item: -item["score"]):
            predictions_act.add_row([result["field"],
                                     text_to_score[result["field"]],
                                     result["value"],
                                     text_to_score[result["value"]],
                                     result["score"],
                                     ])
        print(predictions_act)
        print("=" * 50)

{'field_names': ['Patient Name (Last Name, First Name, MI. Suffix) ',
                 'ACCIDENT DETAILS ',
                 'ATTENDING PHYSICIAN STATEMENT ',
                 'Insured/Policyholder Name (Last Name, First Name, MI, '
                 'Suffix) '],
 'field_values': ['JoNE,Miriam I ', 'JoNES, Miriam I ']}
+---------------------------------------------------+--------------------+------------------+--------------------+---------------------+
|                       field                       |    field score     |      values      |    value score     |        score        |
+---------------------------------------------------+--------------------+------------------+--------------------+---------------------+
| Patient Name (Last Name, First Name, MI. Suffix)  | 0.9426666666666667 | JoNES, Miriam I  | 0.8693156807919513 |  0.5482492446899414 |
|           ATTENDING PHYSICIAN STATEMENT           | 0.9559999999999998 |  JoNE,Miriam I   | 0.8768231259082366 | 0.448441386222839

In [44]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

role = get_execution_role()

from sagemaker.mxnet.model import MXNetModel,MXNetPredictor

sagemaker_model = MXNetModel(model_data='s3://sagemaker-field-matching/elmo.h5.tar.gz',py_version="py3",
                             role=role, entry_point="global_endpoint.py",)


#predictor = sagemaker_model.deploy(initial_instance_count=1, instance_type='ml.t2.medium',endpoint_name="field-match-ml-2019-01-20") # , instance_type='local'

predictor = sagemaker_model.deploy(initial_instance_count=1, instance_type='ml.t2.medium',endpoint_name="ml-field-match-2019-01-31-5") # , instance_type='local'

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-620580205565
INFO:sagemaker:Creating model with name: sagemaker-mxnet-2019-01-31-09-08-24-515
INFO:sagemaker:Creating endpoint with name ml-field-match-2019-01-31-6


---------------------------------------------------------------------------!

In [45]:
import sagemaker
from sagemaker import get_execution_role
import json
import pandas as pd
sagemaker_session = sagemaker.Session()

role = get_execution_role()

from sagemaker.mxnet.model import MXNetModel, MXNetPredictor

predictor = MXNetPredictor("ml-field-match-2019-01-31-6") 


results = predictor.predict(initial_matching)
for result in results:
    print(result)

{'value_detection_score': '', 'value': '', 'field_detection_score': 0.9559999999999998, 'score': 0, 'field': 'ATTENDING PHYSICIAN STATEMENT '}
{'value_detection_score': 0.8768231259082366, 'value': 'JoNE,Miriam I ', 'field_detection_score': 0.9439999999999997, 'score': 0.4258142113685608, 'field': 'Insured/Policyholder Name (Last Name, First Name, MI, Suffix) '}
{'value_detection_score': 0.9991248331432823, 'value': '05-01-75 ', 'field_detection_score': 0.9514285714285713, 'score': 0.9655383229255676, 'field': 'Date of Birth (mm/dd/yy) '}
{'value_detection_score': '', 'value': '', 'field_detection_score': 0.9426666666666667, 'score': 0, 'field': 'Patient Name (Last Name, First Name, MI. Suffix) '}
{'value_detection_score': 0.9991248331432823, 'value': '05-01-75 ', 'field_detection_score': 0.9514285714285713, 'score': 0.9655397534370422, 'field': 'Date of Birth (mm/dd/yy) '}
{'value_detection_score': '', 'value': '', 'field_detection_score': 0.96, 'score': 0, 'field': 'ACCIDENT DETAILS 

In [47]:
predictions_act = prettytable.PrettyTable(["field", "value", "score", "field_detection_score", "value_detection_score"])
print(predictions_act) 

+-------+-------+-------+-----------------------+-----------------------+
| field | value | score | field_detection_score | value_detection_score |
+-------+-------+-------+-----------------------+-----------------------+
+-------+-------+-------+-----------------------+-----------------------+


In [48]:
predictions_act = prettytable.PrettyTable(["field", "value", "score", "field_detection_score", "value_detection_score"])
for result in results:
    predictions_act.add_row([result["field"],                             
                             result["value"],
                             result["score"],
                             result["field_detection_score"],
                             result["value_detection_score"],
                             ])
print(predictions_act)    

+----------------------------------------------------------------------------------------+----------------+--------------------+-----------------------+-----------------------+
|                                         field                                          |     value      |       score        | field_detection_score | value_detection_score |
+----------------------------------------------------------------------------------------+----------------+--------------------+-----------------------+-----------------------+
|                             ATTENDING PHYSICIAN STATEMENT                              |                |         0          |   0.9559999999999998  |                       |
|             Insured/Policyholder Name (Last Name, First Name, MI, Suffix)              | JoNE,Miriam I  | 0.4258142113685608 |   0.9439999999999997  |   0.8768231259082366  |
|                               Date of Birth (mm/dd/yy)                                 |   05-01-75     | 0.96553