In [1]:
import os
import six
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/ajay/Downloads/mykey.json"

### Small file online processing of PDF

In [2]:
from google.cloud import documentai_v1beta2 as documentai


def parse_form(project_id,input_uri,mime_type):
    """Parse a form"""

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type= mime_type)

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(key='Emergency Contact',
                                          value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Doc AI identifies form fields by their offsets
        in document text. This function converts offsets
        to text snippets.
        """
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            start_index = segment.start_index
            end_index = segment.end_index
            response += document.text[start_index:end_index]
        return response

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            print('Field Name: {}\tConfidence: {}'.format(
                _get_text(form_field.field_name),
                form_field.field_name.confidence))
            print('Field Value: {}\tConfidence: {}'.format(
                _get_text(form_field.field_value),
                form_field.field_value.confidence))

In [3]:
parse_form(project_id='gcpbabysteps',input_uri='gs://buck910/documentAI/AMIE-Filled.pdf',mime_type='application/pdf')

Page number: 1
Field Name: Mobile 	Confidence: 0.9999887347221375
Field Value: No.019897 526035 	Confidence: 0.9999887347221375
Field Name: Telephone No. 	Confidence: 0.9999879598617554
Field Value: 01212664 371
	Confidence: 0.9999879598617554
Field Name: NAME (in capital letters)
	Confidence: 0.9999828934669495
Field Value: RAJEEV KUMAR RANJANI
	Confidence: 0.9999828934669495
Field Name: MEMBERSHIP NUMBER
	Confidence: 0.9999669194221497
Field Value: 517320
	Confidence: 0.9999669194221497
Field Name: Full Signature of Candidate with Date
	Confidence: 0.9999066591262817
Field Value: Rajen Raleni
	Confidence: 0.9999066591262817
Field Name: * For example, 	Confidence: 0.9997069835662842
Field Value: CV 413	Confidence: 0.9997069835662842
Field Name: 2. I wish to appear from Examination Centre Code No. 	Confidence: 0.9989848136901855
Field Value: 247
	Confidence: 0.9989848136901855
Field Name: Scheme Code
(AN=2, AD=3, B=4)
	Confidence: 0.9971093535423279
Field Value: 3
	Confidence: 0.997109

### Online processing of tiff format

In [4]:
parse_form(project_id='gcpbabysteps',
           input_uri='gs://buck910/documentAI/AMIE-converted.tiff',
           mime_type='image/tiff')

Page number: 1
Field Name: NAME (in capital letters)
	Confidence: 0.9999972581863403
Field Value: RAJEEV KUMARI RANJANI
	Confidence: 0.9999972581863403
Field Name: Full Signature of Candidate with Date
	Confidence: 0.9999831914901733
Field Value: Rojen Rafoni
	Confidence: 0.9999831914901733
Field Name: Telephone No. 	Confidence: 0.99997478723526
Field Value: 012/112661437
	Confidence: 0.99997478723526
Field Name: Mobile No. 	Confidence: 0.9999509453773499
Field Value: 1918/9/7|5/2/610315 	Confidence: 0.9999509453773499
Field Name: * For example, 	Confidence: 0.9999120831489563
Field Value: CV 413	Confidence: 0.9999120831489563
Field Name: MEMBERSHIP NUMBER
	Confidence: 0.9998144507408142
Field Value: 517132 	Confidence: 0.9998144507408142
Field Name: 2. I wish to appear from Examination Centre Code No. 	Confidence: 0.9996899366378784
Field Value: 247
	Confidence: 0.9996899366378784
Field Name: E-mail : 	Confidence: 0.9949889779090881
Field Value: rajeey ranjan - 123@yahoo.co.in
	Confid

### Online processing of gif format

In [5]:
parse_form(project_id='gcpbabysteps',
           input_uri='gs://buck910/documentAI/AMIE-gif.gif',
           mime_type = 'image/gif')

Page number: 1
Field Name: Telephone No. 	Confidence: 0.9999881386756897
Field Value: 01210/26161437||
	Confidence: 0.9999881386756897
Field Name: Full Signature of Candidate with Date
	Confidence: 0.9999846816062927
Field Value: Rayen Rafoni
	Confidence: 0.9999846816062927
Field Name: NAME (in capital letters)
	Confidence: 0.999968409538269
Field Value: RAJEEV KUMARI RANJANI
	Confidence: 0.999968409538269
Field Name: Mobile No. 	Confidence: 0.9999464750289917
Field Value: 1918/9/7|5/21610315 	Confidence: 0.9999464750289917
Field Name: * For example, 	Confidence: 0.9999246001243591
Field Value: CV 413	Confidence: 0.9999246001243591
Field Name: 2. I wish to appear from Examination Centre Code No. 	Confidence: 0.9997371435165405
Field Value: 247
	Confidence: 0.9997371435165405
Field Name: E-mail : 	Confidence: 0.9997329115867615
Field Value: rajeey ranjan - 123@yahoo.co.in
	Confidence: 0.9997329115867615
Field Name: MEMBERSHIP NUMBER
	Confidence: 0.9995642304420471
Field Value: 517320-5


### Large PDF file offline processing

In [None]:
import re

from google.cloud import documentai_v1beta2 as documentai
from google.cloud import storage


def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Parse a form"""

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = []
    requests.append(request)

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS. Use a regex to find
    # output files
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)


In [14]:
batch_parse_form(project_id='gcpbabysteps',
        input_uri='gs://buck910/documentAI/decease_claim_set.pdf',
        destination_uri='gs://buck910/documentAI/')

Output files:
documentAI/
documentAI/AMIE-Filled-Application-Form.jpg
documentAI/AMIE-Filled.pdf
documentAI/AMIE-converted.tiff
documentAI/AMIE-gif.gif
documentAI/NoticeForm.gif
documentAI/NoticeForm.pdf
documentAI/NoticeForm.tiff
documentAI/decease_claim_-1-2.pdf
documentAI/decease_claim_set.pdf
documentAI/img020.jpg
documentAI/img_gif.gif
documentAI/output-page-1-to-1.json
documentAI/output-page-10-to-10.json
documentAI/output-page-11-to-11.json
documentAI/output-page-12-to-12.json
documentAI/output-page-13-to-13.json
documentAI/output-page-14-to-14.json
documentAI/output-page-15-to-15.json
documentAI/output-page-2-to-2.json
documentAI/output-page-3-to-3.json
documentAI/output-page-4-to-4.json
documentAI/output-page-5-to-5.json
documentAI/output-page-6-to-6.json
documentAI/output-page-7-to-7.json
documentAI/output-page-8-to-8.json
documentAI/output-page-9-to-9.json
documentAI/prs_out.json-output-page-1-to-1.json
documentAI/prs_out.json-output-page-10-to-10.json
documentAI/prs_out.jso

### Parsing documents containing tables

In [8]:
from google.cloud import documentai_v1beta2 as documentai


def parse_table(project_id,input_uri,mime_type):
    """Parse a form"""

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertice coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=0
                    ),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=0
                    ),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=1
                    ),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=1
                    )
                ]
            )
        )
    ]

    # Setting enabled=True enables form extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            start_index = segment.start_index
            end_index = segment.end_index
            response += document.text[start_index:end_index]
        return response

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for table_num, table in enumerate(page.tables):
            print('Table {}: '.format(table_num))
            for row_num, row in enumerate(table.header_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Header Row {}: {}'.format(row_num, cells))
            for row_num, row in enumerate(table.body_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Row {}: {}'.format(row_num, cells))


In [9]:
prs_tab = parse_table(project_id='gcpbabysteps',
                      input_uri='gs://buck910/documentAI/NoticeForm.pdf',
                      mime_type = 'application/pdf')

Page number: 1
Table 0: 
Header Row 0: S. 	Examination
	Current Last 	Revised/Extended 	Email for any

Row 0: No
		Date
	Last Date*
	Clarification

Row 1: 01
	Indira Gandhi National Open University
(IGNOU) Ph.D. and OPENMAT (MBA) Entrance Examination-2020
	15.06.2020 	30.06.2020
	ignou@nta.ac.in

Row 2: 02
	Indian Council of Agricultural Research
(ICAR) AIEEA 2020
	15.06.2020 	30.06.2020
	icar@nta.ac.in

Row 3: 03
	Jawaharlal Nehru University Entrance
Examination (JNUEE)-2020
	15.06.2020 	30.06.2020
	jnuanta.ac.in

Row 4: 04
	UGC-National Eligibility Test(UGC-NET)-
	15.06.2020 	30.06.2020
	ugcnet@nta.ac.in

Row 5: 	June 2020
			
Row 6: 05
	Joint CSIR-UGC NET Examination (CSIR-
UGC NET)-June 2020
	15.06.2020 	30.06.2020
	csirneta nta.ac.in

Row 7: 06
	All India AYUSH Post Graduate Entrance
	15.06.2020
	30.06.2020
	aiapget@nta.ac.in

Row 8: 	Test (AIAPGET)-2020
			


In [10]:
parse_table(project_id='gcpbabysteps',
            input_uri='gs://buck910/documentAI/NoticeForm.gif',
            mime_type = 'image/gif')

Page number: 1
Table 0: 
Header Row 0: S. 	Examination
	Current Last 	Revised/Extended 	Email for any

Row 0: No
		Date
	Last Date*
	Clarification

Row 1: 	Indira Gandhi National Open University
			ignou@nta.ac.in

Row 2: 01
	(IGNOU) Ph.D. and OPENMAT (MBA) Entrance Examination-2020
	15.06.2020 	30.06.2020
	
Row 3: 02 	Indian Council of Agricultural Research
(ICAR) AIEEA 2020
	15.06.2020 	30.06.2020
	icaranta.ac.in

Row 4: 03
	Jawaharlal Nehru University Entrance
Examination (JNUEE)-2020
	15.06.2020 	30.06.2020
	jnuanta.ac.in

Row 5: 04
	UGC-National Eligibility Test(UGC-NET)-June 2020
	15.06.2020 	30.06.2020
	ugcnet@nta.ac.in

Row 6: 	Joint CSIR-UGC NET Examination (CSIR-
			csirnetanta.ac.in

Row 7: 05
	UGC NET)-June 2020
	15.06.2020 	30.06.2020
	
Row 8: 06
	All India AYUSH Post Graduate Entrance
Test (AIAPGET)-2020
	15.06.2020
	30.06.2020
	aiapget@nta.ac.in



### Large Doc file offline processing

In [11]:
import re

from google.cloud import documentai_v1beta2 as documentai
from google.cloud import storage


def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Parse a form"""

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = []
    requests.append(request)

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS. Use a regex to find
    # output files
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)


In [12]:
batch_parse_form(project_id='gcpbabysteps',
        input_uri='gs://buck910/documentAI/decease_claim_set.pdf',
        destination_uri='gs://buck910/documentAI/prs_out.json')

Output files:
documentAI/prs_out.json-output-page-1-to-1.json
documentAI/prs_out.json-output-page-10-to-10.json
documentAI/prs_out.json-output-page-11-to-11.json
documentAI/prs_out.json-output-page-12-to-12.json
documentAI/prs_out.json-output-page-13-to-13.json
documentAI/prs_out.json-output-page-14-to-14.json
documentAI/prs_out.json-output-page-15-to-15.json
documentAI/prs_out.json-output-page-2-to-2.json
documentAI/prs_out.json-output-page-3-to-3.json
documentAI/prs_out.json-output-page-4-to-4.json
documentAI/prs_out.json-output-page-5-to-5.json
documentAI/prs_out.json-output-page-6-to-6.json
documentAI/prs_out.json-output-page-7-to-7.json
documentAI/prs_out.json-output-page-8-to-8.json
documentAI/prs_out.json-output-page-9-to-9.json


### Converting extracted pdf table to a dataframe/csv

In [15]:
from google.cloud import documentai_v1beta2 as documentai
import pandas as pd 

project_id='gcpbabysteps'
input_uri='gs://buck910/documentAI/table1.pdf'
"""Parse a form"""

client = documentai.DocumentUnderstandingServiceClient()
gcs_source = documentai.types.GcsSource(uri=input_uri)

# mime_type can be application/pdf, image/tiff,
# and image/gif, or application/json
input_config = documentai.types.InputConfig(
    gcs_source=gcs_source, mime_type='application/pdf')

# Improve table parsing results by providing bounding boxes
# specifying where the box appears in the document (optional)
table_bound_hints = [
    documentai.types.TableBoundHint(
        page_number=1,
        bounding_box=documentai.types.BoundingPoly(
            # Define a polygon around tables to detect
            # Each vertice coordinate must be a number between 0 and 1
            normalized_vertices=[
                # Top left
                documentai.types.geometry.NormalizedVertex(
                    x=0,
                    y=0
                ),
                # Top right
                documentai.types.geometry.NormalizedVertex(
                    x=1,
                    y=0
                ),
                # Bottom right
                documentai.types.geometry.NormalizedVertex(
                    x=1,
                    y=1
                ),
                # Bottom left
                documentai.types.geometry.NormalizedVertex(
                    x=0,
                    y=1
                )
            ]
        )
    )
]

 

# Setting enabled=True enables form extraction
table_extraction_params = documentai.types.TableExtractionParams(
    enabled=True, table_bound_hints=table_bound_hints)

# Location can be 'us' or 'eu'
parent = 'projects/{}/locations/us'.format(project_id)
request = documentai.types.ProcessDocumentRequest(
    parent=parent,
    input_config=input_config,
    table_extraction_params=table_extraction_params)
document = client.process_document(request=request)
def _get_text(el):
    """Convert text offset indexes into text snippets.
    """
    response = ''
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in el.text_anchor.text_segments:
        start_index = segment.start_index
        end_index = segment.end_index
        response += document.text[start_index:end_index]
    return response
for page in document.pages:
#     print('Page number: {}'.format(page.page_number))
    for table_num, table in enumerate(page.tables):
#         print('Table {}: '.format(table_num))
        for row_num, row in enumerate(table.header_rows):
            cells_header = [_get_text(cell.layout) for cell in row.cells]
            cells_header1 = [x.replace('\n',' ').strip() for x in cells_header]
            df = pd.DataFrame(columns = cells_header1)
#             print(cells_header1)
        for row_num, row in enumerate(table.body_rows):
            row1 = [_get_text(cell.layout) for cell in row.cells]
            row1 = [x.replace('\n',' ').strip() for x in row1]
            df.loc[len(df)] = row1   
#             print(row1)
print(df)


                                    Response options        Paula  \
0                              By giving a mark only     00 (00%)   
1                By suggesting error correction only   04 (05.88)   
2  By providing written comments on the content only     00 (00%)   
3   By giving a mark and suggesting error correction  03 (04.41%)   
4  By giving a mark and providing written comment...  02 (02.94%)   
5  By suggesting error correction and providing w...  10 (14.70%)   
6  By giving a mark, suggesting error correction,...  49 (72.05%)   
7                                              Total    68 (100%)   

          Mike        Total  
0     00 (00%)     00 (00%)  
1  08 (12.50%)  12 (09.09%)  
2  02 (03.12%)  02 (01.51%)  
3   04(06.25%)  07 (05.30%)  
4  05 (07.81%)  07 (05.30%)  
5  02 (03.12%)  12 (09.09%)  
6  43 (67.19%)  92 (69.70%)  
7    64 (100%)   132 (100%)  


In [18]:
df.to_csv('C:\\Users\\ajay\\Desktop\\docAI\\table1.csv')

In [19]:
from google.cloud import documentai_v1beta2 as documentai
import pandas as pd 

project_id='gcpbabysteps'
input_uri='gs://buck910/documentAI/table1.tiff'
"""Parse a form"""

client = documentai.DocumentUnderstandingServiceClient()
gcs_source = documentai.types.GcsSource(uri=input_uri)

# mime_type can be application/pdf, image/tiff,
# and image/gif, or application/json
input_config = documentai.types.InputConfig(
    gcs_source=gcs_source, mime_type='image/tiff')

# Improve table parsing results by providing bounding boxes
# specifying where the box appears in the document (optional)
table_bound_hints = [
    documentai.types.TableBoundHint(
        page_number=1,
        bounding_box=documentai.types.BoundingPoly(
            # Define a polygon around tables to detect
            # Each vertice coordinate must be a number between 0 and 1
            normalized_vertices=[
                # Top left
                documentai.types.geometry.NormalizedVertex(
                    x=0,
                    y=0
                ),
                # Top right
                documentai.types.geometry.NormalizedVertex(
                    x=1,
                    y=0
                ),
                # Bottom right
                documentai.types.geometry.NormalizedVertex(
                    x=1,
                    y=1
                ),
                # Bottom left
                documentai.types.geometry.NormalizedVertex(
                    x=0,
                    y=1
                )
            ]
        )
    )
]

 

# Setting enabled=True enables form extraction
table_extraction_params = documentai.types.TableExtractionParams(
    enabled=True, table_bound_hints=table_bound_hints)

# Location can be 'us' or 'eu'
parent = 'projects/{}/locations/us'.format(project_id)
request = documentai.types.ProcessDocumentRequest(
    parent=parent,
    input_config=input_config,
    table_extraction_params=table_extraction_params)
document = client.process_document(request=request)
def _get_text(el):
    """Convert text offset indexes into text snippets.
    """
    response = ''
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in el.text_anchor.text_segments:
        start_index = segment.start_index
        end_index = segment.end_index
        response += document.text[start_index:end_index]
    return response
for page in document.pages:
#     print('Page number: {}'.format(page.page_number))
    for table_num, table in enumerate(page.tables):
#         print('Table {}: '.format(table_num))
        for row_num, row in enumerate(table.header_rows):
            cells_header = [_get_text(cell.layout) for cell in row.cells]
            cells_header1 = [x.replace('\n',' ').strip() for x in cells_header]
            df = pd.DataFrame(columns = cells_header1)
#             print(cells_header1)
        for row_num, row in enumerate(table.body_rows):
            row1 = [_get_text(cell.layout) for cell in row.cells]
            row1 = [x.replace('\n',' ').strip() for x in row1]
            df.loc[len(df)] = row1   
#             print(row1)
print(df)


                                    Response options        Paula  \
0                              By giving a mark only     00 (00%)   
1                By suggesting error correction only   04 (05.88)   
2  By providing written comments on the content only     00 (00%)   
3   By giving a mark and suggesting error correction  03 (04.41%)   
4  By giving a mark and providing written comment...  02 (02.94%)   
5                 By suggesting error correction and  10 (14.70%)   
6          providing written comments on the content                
7  By giving a mark, suggesting error correction,...  49 (72.05%)   
8                                              Total    68 (100%)   

          Mike        Total  
0     00 (00%)     00 (00%)  
1  08 (12.50%)  12 (09.09%)  
2  02 (03.12%)  02 (01.51%)  
3   04(06.25%)  07 (05.30%)  
4  05 (07.81%)  07 (05.30%)  
5  02 (03.12%)  12 (09.09%)  
6                            
7  43 (67.19%)  92 (69.70%)  
8    64 (100%)   132 (100%)  
