In [33]:
import json

with open("/home/ubuntu/config.json") as file:
    config = json.load(file)

In [34]:
workspace_bucket = "applied-agi"
local_pdfs_folder_path = "data/"

s3_pdfs_folder_path = "ner-comprehend/pdfs"
source_docs_s3_uri = f"s3://{workspace_bucket}/{s3_pdfs_folder_path}/"

annotations_folder_path = "ner-comprehend/annotations"
annotations_data_s3_uri = f"s3://{workspace_bucket}/{annotations_folder_path}/"

manifest_folder_path="ner-comprehend/manifests"
output_manifest_file_name="output.manifest"
augmented_manifests_s3_uri = f"s3://{workspace_bucket}/{manifest_folder_path}/{output_manifest_file_name}"

comprehend_data_access_role_arn = config["comprehend_s3_role"]
annotation_job_name = "pdf-ner-annotation"

print(f"source_docs_s3_uri : {source_docs_s3_uri}")
print(f"annotations_data_s3_uri : {annotations_data_s3_uri}")
print(f"augmented_manifests_s3_uri : {augmented_manifests_s3_uri}")
print(f"comprehend_data_access_role_arn : {comprehend_data_access_role_arn}")

source_docs_s3_uri : s3://applied-agi/ner-comprehend/pdfs/
annotations_data_s3_uri : s3://applied-agi/ner-comprehend/annotations/
augmented_manifests_s3_uri : s3://applied-agi/ner-comprehend/manifests/output.manifest
comprehend_data_access_role_arn : arn:aws:iam::324622400514:role/applied-agi-comprehend-s3-access


In [35]:
from utils.entities import entity_names_list

entity_types = [{"Type": entity_name} for entity_name in entity_names_list]
# entity_types

In [36]:
import boto3

comprehend_client = boto3.client("comprehend")

response = comprehend_client.create_entity_recognizer(
    RecognizerName="pdf-ner",
    VersionName="v3",
    LanguageCode="en",
    DataAccessRoleArn=comprehend_data_access_role_arn,
    InputDataConfig={
        "DataFormat": "AUGMENTED_MANIFEST",
        'EntityTypes': entity_types,
        'AugmentedManifests': [
            {
                'S3Uri': augmented_manifests_s3_uri,
                'Split': 'TRAIN',
                'AttributeNames':[
                    annotation_job_name
                ],
                'AnnotationDataS3Uri': annotations_data_s3_uri,
                'SourceDocumentsS3Uri': source_docs_s3_uri,
                'DocumentType': 'SEMI_STRUCTURED_DOCUMENT'
            },
        ]
    }
)

In [37]:
# All recognizers
recognizers = comprehend_client.list_entity_recognizers()
# View the last submitted job
recognizers['EntityRecognizerPropertiesList'][-1]

{'EntityRecognizerArn': 'arn:aws:comprehend:us-east-1:324622400514:entity-recognizer/pdf-ner/version/v3',
 'LanguageCode': 'en',
 'Status': 'SUBMITTED',
 'SubmitTime': datetime.datetime(2024, 3, 23, 16, 58, 31, 281000, tzinfo=tzlocal()),
 'InputDataConfig': {'DataFormat': 'AUGMENTED_MANIFEST',
  'EntityTypes': [{'Type': 'INVOICE_NUMBER'},
   {'Type': 'INVOICE_DATE'},
   {'Type': 'COMPANY_NAME'},
   {'Type': 'INVOICE_AMOUNT'},
   {'Type': 'CURRENCY'},
   {'Type': 'REMIT_TO_ADDRESS_STREET'},
   {'Type': 'REMIT_TO_ADDRESS_CITY'},
   {'Type': 'REMIT_TO_ADDRESS_STATE'},
   {'Type': 'REMIT_TO_ADDRESS_COUNTRY'},
   {'Type': 'REMIT_TO_ADDRESS_ZIP_CODE'},
   {'Type': 'REMIT_TO_ADDRESS_OTHERS'},
   {'Type': 'SHIP_TO_ADDRESS_STREET'},
   {'Type': 'SHIP_TO_ADDRESS_CITY'},
   {'Type': 'SHIP_TO_ADDRESS_STATE'},
   {'Type': 'SHIP_TO_ADDRESS_COUNTRY'},
   {'Type': 'SHIP_TO_ADDRESS_ZIP_CODE'},
   {'Type': 'SHIP_TO_ADDRESS_OTHERS'}],
  'AugmentedManifests': [{'S3Uri': 's3://applied-agi/ner-comprehend/ma

In [38]:
# To monitor the status of the training job, you can use the describe_entity_recognizer API.
# Check status of custom model training periodically until complete.
import time

recognizer_arn = recognizers['EntityRecognizerPropertiesList'][-1]['EntityRecognizerArn']

while True:
    response = comprehend_client.describe_entity_recognizer(
        EntityRecognizerArn=recognizer_arn
    )

    status = response["EntityRecognizerProperties"]["Status"]
    if "IN_ERROR" == status:
        print('TRAINING ERROR')
        break
    if "TRAINED" == status:
        print('TRAINING COMPLETE')
        break
    print(status)
    time.sleep(60)

SUBMITTED


SUBMITTED
SUBMITTED
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING
TRAINING COMPLETE


In [41]:
# Per entity metrics
entity_metrics = response['EntityRecognizerProperties']['RecognizerMetadata']['EntityTypes']
for entity in entity_metrics:
    print(entity['Type'])
    print(entity['EvaluationMetrics'])
    print()

COMPANY_NAME
{'Precision': 0.2631578947368421, 'Recall': 0.40540540540540543, 'F1Score': 0.3191489361702128}

CURRENCY
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

INVOICE_AMOUNT
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

INVOICE_DATE
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

INVOICE_NUMBER
{'Precision': 0.8888888888888888, 'Recall': 0.8888888888888888, 'F1Score': 0.8888888888888888}

REMIT_TO_ADDRESS_CITY
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

REMIT_TO_ADDRESS_COUNTRY
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

REMIT_TO_ADDRESS_OTHERS
{'Precision': 1.0, 'Recall': 0.13333333333333333, 'F1Score': 0.23529411764705882}

REMIT_TO_ADDRESS_STREET
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

REMIT_TO_ADDRESS_ZIP_CODE
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

SHIP_TO_ADDRESS_CITY
{'Precision': 1.0, 'Recall': 0.9565217391304348, 'F1Score': 0.9777777777777777}

SHIP_TO_ADDRESS_COUNTRY
{'Precision': 1.0, 'Recall': 1.0, 'F1Score': 1.0}

SHIP

In [42]:
%%time
endpoint_name = "pdf-ner-v3"

endpoint_response = comprehend_client.create_endpoint(
    EndpointName= endpoint_name,
    ModelArn= recognizer_arn,
    DesiredInferenceUnits=1,
    ClientRequestToken='string',
    Tags=[
        {
            'Key': 'name',
            'Value': 'invoice_endpoint'
        },
    ],
    DataAccessRoleArn= comprehend_data_access_role_arn
)
print(json.dumps(endpoint_response, indent=4, sort_keys=True))

{
    "EndpointArn": "arn:aws:comprehend:us-east-1:324622400514:entity-recognizer-endpoint/pdf-ner-v3",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "97",
            "content-type": "application/x-amz-json-1.1",
            "date": "Sat, 23 Mar 2024 17:32:41 GMT",
            "x-amzn-requestid": "cd8ebaa4-2eed-491c-86e8-6b3aee62ee62"
        },
        "HTTPStatusCode": 200,
        "RequestId": "cd8ebaa4-2eed-491c-86e8-6b3aee62ee62",
        "RetryAttempts": 0
    }
}
CPU times: user 14 ms, sys: 4.54 ms, total: 18.6 ms
Wall time: 136 ms


In [43]:
EndpointArn= endpoint_response["EndpointArn"]

ep_response = comprehend_client.describe_endpoint(
    EndpointArn= EndpointArn
)

ep_status = (ep_response["EndpointProperties"]["Status"])

while ep_status != "IN_SERVICE":
    ep_response = comprehend_client.describe_endpoint(EndpointArn= EndpointArn)
    ep_status = ep_response["EndpointProperties"]["Status"]
    print(ep_status)
    time.sleep(60)

print(f"your endpoint is : {ep_status} ")


CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
CREATING
IN_SERVICE


KeyboardInterrupt: 

### inference

In [54]:
import base64
import boto3
import json
from botocore.exceptions import ClientError

comprehend_client = boto3.client('comprehend')

key = f'{s3_pdfs_folder_path}/invoice1.pdf'
obj = boto3.resource("s3").Object(workspace_bucket, key)
document_bytes = obj.get()['Body'].read()
# document_bytes

In [55]:
try:
    response = comprehend_client.detect_entities(
        Bytes = document_bytes, 
        DocumentReaderConfig={
            # "DocumentReadAction": "TEXTRACT_ANALYZE_DOCUMENT",
            "DocumentReadAction": "TEXTRACT_DETECT_DOCUMENT_TEXT",
            "DocumentReadMode": "SERVICE_DEFAULT"
        },
        EndpointArn=EndpointArn
    )
    # print(json.dumps(response, indent=4, sort_keys=True))

except ClientError as e:
    print(e)
    print("Error", e.response['Reason'], e.response['Detail']['Reason'])

In [56]:
extracted_chars = 0
for page_detail in response["DocumentMetadata"]["ExtractedCharacters"]:
    extracted_chars = extracted_chars + page_detail["Count"]
print ("Number of pages in this document :  " + str(page_detail["Page"]) + " and characters extracted count is : " + str(extracted_chars))


for results in response["Entities"]:
    print("Entity Type : " + str(results["Type"]) +"        Entity : "+str(results["Text"])+  "        Score : "+ str(round(results["Score"]*100, 2)))


Number of pages in this document :  1 and characters extracted count is : 948
Entity Type : COMPANY_NAME        Entity : BROUR        Score : 99.97
Entity Type : REMIT_TO_ADDRESS_STREET        Entity : 12 Commonwealth Drive        Score : 99.95
Entity Type : INVOICE_NUMBER        Entity : 124567AB        Score : 99.88
Entity Type : SHIP_TO_ADDRESS_COUNTRY        Entity : Singapore        Score : 99.95
Entity Type : REMIT_TO_ADDRESS_ZIP_CODE        Entity : S143023        Score : 99.54
Entity Type : INVOICE_DATE        Entity : 04/05/2022        Score : 99.97
Entity Type : SHIP_TO_ADDRESS_OTHERS        Entity : Phone:        Score : 99.95
Entity Type : SHIP_TO_ADDRESS_STREET        Entity : WeCan Halt Road #28-01        Score : 99.97
Entity Type : SHIP_TO_ADDRESS_ZIP_CODE        Entity : S123456        Score : 99.92
Entity Type : SHIP_TO_ADDRESS_OTHERS        Entity : 123445        Score : 99.77
Entity Type : INVOICE_AMOUNT        Entity : 7000        Score : 99.99


In [57]:
response["Entities"]

[{'Score': 0.9996871948242188,
  'Type': 'COMPANY_NAME',
  'Text': 'BROUR',
  'BlockReferences': [{'BlockId': '6546aec1-2668-445c-811e-c4840b74d2f0',
    'BeginOffset': 0,
    'EndOffset': 5,
    'ChildBlocks': [{'ChildBlockId': 'a030a342-9c29-44ce-8eb3-f41aa6c5a937',
      'BeginOffset': 0,
      'EndOffset': 5}]}]},
 {'Score': 0.9995077848434448,
  'Type': 'REMIT_TO_ADDRESS_STREET',
  'Text': '12 Commonwealth Drive',
  'BlockReferences': [{'BlockId': '01d01f7f-11e0-4ada-872d-4056d36cb231',
    'BeginOffset': 0,
    'EndOffset': 21,
    'ChildBlocks': [{'ChildBlockId': 'bc3b46f7-58a3-4aaa-a311-c915c4414025',
      'BeginOffset': 0,
      'EndOffset': 2},
     {'ChildBlockId': 'e6f175d6-06e5-48bc-a367-6d98d923476e',
      'BeginOffset': 0,
      'EndOffset': 12},
     {'ChildBlockId': '6447b834-8bf7-4f4e-ad65-6ddddd8d4c65',
      'BeginOffset': 0,
      'EndOffset': 5}]}]},
 {'Score': 0.9987666606903076,
  'Type': 'INVOICE_NUMBER',
  'Text': '124567AB',
  'BlockReferences': [{'BlockId'