In [6]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage

In [9]:
storage_client = storage.Client()
bucket = storage_client.get_bucket('jayadeep-etl-platform')
blob = bucket.get_blob('docuai/demo-doc.pdf')
content = blob.download_as_string()

In [4]:
project_id = "jayadeep-etl-platform"
location = "us"
processor_id = "900a4ac66b3ab487"

In [10]:
doc_client = documentai.DocumentProcessorServiceClient()
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
document = {"content": content, "mime_type": "application/pdf"}
request = {"name": name, "document": document}
result = doc_client.process_document(request=request)
document = result.document

In [11]:
document_pages = document.pages

In [16]:
def get_text(doc_element: dict, document: dict):
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.
    """
    response = ""
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in doc_element.text_anchor.text_segments:
        start_index = (
            int(segment.start_index)
            if segment in doc_element.text_anchor.text_segments
            else 0
        )
        end_index = int(segment.end_index)
        response += document.text[start_index:end_index]
    return response.replace('\n','')

In [13]:
for page in document_pages:
    paragraphs = page.paragraphs
    for paragraph in paragraphs:
        paragraph_text = get_text(paragraph.layout, document)
        print(f"Paragraph text: {paragraph_text}")

Paragraph text: Sample document:

Paragraph text: Description: This text needs to be extracted

Paragraph text: Strength: This text needs to be extracted

Paragraph text: Name: ABC

Paragraph text: Quantity: 250

Paragraph text: Issue No: PQ-43-97A

Paragraph text: Date: 01-01-2020

Paragraph text: Sequence

Paragraph text: Frames

Paragraph text: Tracking without Tracking with
distance cue P/R/F distance cue P/R/F

Paragraph text: UAV1

Paragraph text: 616

Paragraph text: 0.90/0.90/0.90

Paragraph text: 0.94/0.94/0.94

Paragraph text: UAV 2

Paragraph text: 1833

Paragraph text: 0.92/0.88/0.90

Paragraph text: 0.96/0.99/0.97

Paragraph text: Traffic 1

Paragraph text: 156

Paragraph text: 0.88/0.91/0.89

Paragraph text: 0.96/0.98/0.97

Paragraph text: Traffic 2

Paragraph text: 227

Paragraph text: 0.85/0.90/0.87

Paragraph text: 0.97/0.97/0.97

Paragraph text: Pedestrian 2

Paragraph text: 338

Paragraph text: 0.93/0.95/0.94

Paragraph text: 1.00/1.00/1.00



In [17]:
for page in document.pages:
    print("Page number: {}".format(page.page_number))
    for table_num, table in enumerate(page.tables):
        print("Table {}: ".format(table_num))
        for row_num, row in enumerate(table.header_rows):
            cells = "\t".join([get_text(cell.layout,document) for cell in row.cells])
            print("Header Row {}: {}".format(row_num, cells))
        for row_num, row in enumerate(table.body_rows):
            #print (row)
            cells = "\t".join([get_text(cell.layout,document) for cell in row.cells])
            #print (cells)
            print("Row {}: {}".format(row_num, cells))

Page number: 1
Table 0: 
Header Row 0: Strength: This text needs to be extracted	
Row 0: Name: ABC	Quantity: 250
Row 1: Issue No: PQ-43-97A	Date: 01-01-2020
Table 1: 
Header Row 0: Sequence	Frames	Tracking without distance cue P/R/F 	Tracking withdistance cue P/R/F
Row 0: UAV1	616	0.90/0.90/0.90	0.94/0.94/0.94
Row 1: UAV 2	1833	0.92/0.88/0.90	0.96/0.99/0.97
Row 2: Traffic 1	156	0.88/0.91/0.89	0.96/0.98/0.97
Row 3: Traffic 2	227	0.85/0.90/0.87	0.97/0.97/0.97
Row 4: Pedestrian 2	338	0.93/0.95/0.94	1.00/1.00/1.00


In [None]:
# Extract Key Value Pairs

In [21]:
for page in document.pages:
    form_fields = page.form_fields
    for form_field in form_fields:
        field_name = get_text(form_field.field_name, document)
        field_value = get_text(form_field.field_value, document)

        print(f"Key Value Pairs: {field_name, field_value}")

Key Value Pairs: ('Date: ', '01-01-2020')
Key Value Pairs: ('Quantity: ', '250')
Key Value Pairs: ('Issue No: ', 'PQ-43-97A')
Key Value Pairs: ('Name: ', 'ABC')
Key Value Pairs: ('Strength: ', 'This text needs to be extracted')
Key Value Pairs: ('Description: ', 'This text needs to be extracted')
