# DocAI Invoice Parser Demo

**This notebook assumes that the Document AI (DocAI) API is enabled for your project and that the required security (authentication and permissions) is already set up.**

In [278]:
import json
import base64
from google.cloud import documentai_v1beta3 as documentai

### Replace with your own project and location.

In [274]:
# Project and location used to build the processor resource name.
# Replace both with your own values.
PROJECT_ID = "google.com:ml-baguette-demos"
LOCATION = "us"

### Helper Functions

In [267]:
def get_parsed_entities(parsed_document_response):
    """
    Collect the entities of a parsed document into a plain dict.

    Each entity in ``parsed_document_response.entities`` contributes one
    item mapping its ``type_`` to its ``mention_text``. When several
    entities share the same type, the last one iterated wins.
    """
    return {
        item.type_: item.mention_text
        for item in parsed_document_response.entities
    }

In [268]:
def get_parsed_entities_json(parsed_document_dict):
    """
    Returns a dict of parsed entities from a document loaded from JSON.

    Args:
        parsed_document_dict: dict form of a document. Entities are read
            from its 'entities' list, using each entry's 'type' and
            'mentionText' keys.

    Returns:
        dict mapping entity type to mention text. When several entities
        share the same type, the last one in the list wins.
    """
    parsed_entities = {}
    # Keys may be missing from the JSON (proto3 JSON output omits fields that
    # hold their default value), so fall back to empty values instead of
    # raising KeyError. The proto-based helper yields '' in the same case.
    for entity in parsed_document_dict.get('entities') or []:
        parsed_entities[entity.get('type', '')] = entity.get('mentionText', '')

    return parsed_entities

### Process the document and print the long-running operation (LRO) if human-in-the-loop (HITL) review is triggered

In [269]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str = "application/pdf",
):
    """
    Process a document and print its entities, plus the LRO if HITL triggered.

    Relies on ``documentai`` being imported in the notebook's first cell and
    on the ``get_parsed_entities`` helper defined above.

    Args:
        project_id: Project that owns the processor.
        location: Processor location, e.g. "us". Also used to pick the
            regional API endpoint.
        processor_id: ID of a processor that already exists; processors must
            be created in the Cloud Console first.
        file_path: Local path of the file to send.
        mime_type: MIME type of the file. Defaults to "application/pdf",
            which was previously hard-coded.

    Returns:
        None. Results are printed.

    Raises:
        OSError: if ``file_path`` cannot be opened or read.
        Any error raised by the Document AI client is propagated unchanged.
    """
    # Processors are served from a location-specific endpoint; with the
    # default endpoint, requests for processors outside "us" fail.
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Configure the process request
    raw_document = {"content": document_content, "mime_type": mime_type}
    request = {"name": name, "document": raw_document}

    # Recognizes text entities in the document
    result = client.process_document(request=request)

    print("Document processing complete.\n")
    entities = get_parsed_entities(result.document)

    print("Parsed Entities:\n")

    for entity_type, mention_text in entities.items():
        print("{} : {}\n ".format(entity_type, mention_text))

    # Only set when the response carries a human review operation name.
    if result.human_review_operation:
        print("Triggered HITL operation: {}".format(result.human_review_operation))


In [280]:
# Processor to call and the local document to send to it.
# Replace both with your own values.
PROCESSOR_ID = "e46132dfddbdc8f2"
DOCUMENT_PATH = "docai-demos/test-files/acme-sample-invoice.pdf"

In [277]:
# Send the sample document to the processor; prints the parsed entities and,
# if one was triggered, the HITL operation name.
process_document_sample(PROJECT_ID, LOCATION, PROCESSOR_ID, DOCUMENT_PATH)

Document processing complete.

Parsed Entities:

net_amount : $331.00
 
due_date : 12/13/2019
 
purchase_order : CC-2342
 
payment_terms : Net 30
 
supplier_name : Acme Pest Control LLC
 
receiver_name : Carrie Webb
 
total_tax_amount : 29.79
 
supplier_address : 145 Corporate Ln
Northbrook, IL 60062
555-314-1888
United States
 
total_amount : 372.24
 
invoice_id : 126935
 
receiver_address : 909 Kuvalis Fort Apt. 525
Bannockburn, IL 60015
United States
 
invoice_date : 11/27/2019
 
currency : $
 
supplier_phone : 555-314-1888
 
freight_amount : $11.45
 
vat/amount : $331.00
 
line_item : 4 Labor $42.00 $168.00
 
Triggered HITL operation: projects/660199673046/locations/us/operations/13689527806918032121


**Update `lro_id` in the cell below with the HITL operation name printed above:**

In [281]:
# Full operation name, as printed after "Triggered HITL operation:" by
# process_document_sample.
lro_id = "projects/660199673046/locations/us/operations/13839970306437524071"

In [282]:
from google.cloud import documentai_v1beta3 as documentai

client = documentai.DocumentProcessorServiceClient()

# Look up the human-review long-running operation by its full name, using the
# client's public transport accessor rather than the private attribute.
operation = client.transport.operations_client.get_operation(lro_id)

if not operation.done:
    print('Waiting on human review.')
elif operation.HasField("error"):
    # A finished operation carries either an error or a response.
    print("Human review failed: {}".format(operation.error.message))
else:
    # Decode the packed response message instead of slicing the repr of its
    # raw bytes: the slice only worked while the serialized prefix happened
    # to be a fixed number of printable characters.
    review_response = documentai.ReviewDocumentResponse.deserialize(
        operation.response.value
    )
    print("HITL location: {} ".format(review_response.gcs_destination))

HITL location: gs://ml-baguette-dai-sample-docs-us/receipt-output/13839970306437524071/data-00001-of-00001.json 


**Replace the GCS URL in the command below with the HITL location printed above:**

In [283]:
!gsutil cp "gs://ml-baguette-dai-sample-docs-us/receipt-output/13839970306437524071/data-00001-of-00001.json" response.json

Copying gs://ml-baguette-dai-sample-docs-us/receipt-output/13839970306437524071/data-00001-of-00001.json...
/ [1 files][437.9 KiB/437.9 KiB]                                                
Operation completed over 1 objects/437.9 KiB.                                    


In [284]:
# Load the reviewed document copied to response.json by the gsutil cell.
# JSON text is UTF-8, so name the encoding explicitly rather than relying on
# the platform default. `json` is imported in the notebook's first cell.
with open("response.json", "r", encoding="utf-8") as response_file:
    data = json.load(response_file)

# The file is fully parsed at this point, so the rest runs after it is closed.
entities = get_parsed_entities_json(data)

for entity_type, mention_text in entities.items():
    print("{} : {}\n ".format(entity_type, mention_text))

net_amount : $331.00
 
line_item : 4 Labor $42.00 $168.00
 
supplier_name : Acme Pest Control LLC
 
supplier_address : 145 Corporate Ln
Northbrook, IL 60062
555-314-1888
United States
 
invoice_id : 126935
 
supplier_phone : 555-314-1888
 
invoice_date : 11/27/2019
 
receiver_name : Carrie Webb
 
receiver_address : 909 Kuvalis Fort Apt. 525
Bannockburn, IL 60015
United States
 
due_date : 12/13/2019
 
payment_terms : Net 30
 
purchase_order : CC-2342
 
vat/amount : $331.00
 
total_tax_amount : 29.79
 
freight_amount : $11.45
 
total_amount : 372.24
 
currency : $
 
