# Table Parsing
* Created: 2024-11-10 (Sat)
* Updated: 2024-11-11 (Sun)

See https://codelabs.developers.google.com/codelabs/docai-form-parser-v1-python?hl=ko#5
source: https://github.com/GoogleCloudPlatform/document-ai-samples/blob/main/community/codelabs/docai-form-parser/table_parsing.py

In [1]:
!pip3 install --upgrade -q pandas

In [2]:
!pip3 install --upgrade -q google-cloud-documentai

In [None]:
!gcloud storage cp gs://cloud-samples-data/documentai/codelabs/form-parser/form_with_tables.pdf .

In [3]:
"""
Uses Document AI online processing to call a form parser processor
Extracts the tables and data in the document.
"""
from os.path import splitext
from typing import List, Sequence

import pandas as pd
import google.cloud.documentai as docai
#from google.cloud import documentai as docai

def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> docai.Document:
    """
    Processes a document using the Document AI Online Processing API.

    Args:
        project_id: Google Cloud project ID used to build the processor path.
        location: Processor location; also used as the prefix of the API
            endpoint host name.
        processor_id: ID of the processor to call.
        file_path: Path of the local file whose bytes are sent for processing.
        mime_type: MIME type of the file, passed through to the request.

    Returns:
        The ``document`` field of the processing response.
    """

    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Instantiates a client
    docai_client = docai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    resource_name = docai_client.processor_path(project_id, location, processor_id)

    # Read the file into memory; the handle is closed as soon as the bytes are
    # loaded so it is not held open for the duration of the API request.
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = docai.RawDocument(content=image_content, mime_type=mime_type)

    # Configure the process request
    request = docai.ProcessRequest(name=resource_name, raw_document=raw_document)

    # Use the Document AI client to process the sample form
    result = docai_client.process_document(request=request)

    return result.document

def get_table_data(
    rows: Sequence[docai.Document.Page.Table.TableRow], text: str
) -> List[List[str]]:
    """
    Convert table rows into a list of rows, each a list of cell strings.

    Every cell's text anchor is resolved against ``text`` (the full document
    text) via ``text_anchor_to_text``.
    """
    return [
        [text_anchor_to_text(cell.layout.text_anchor, text) for cell in row.cells]
        for row in rows
    ]

def text_anchor_to_text(text_anchor: docai.Document.TextAnchor, text: str) -> str:
    """
    Resolve a text anchor to the text it refers to.

    Table data is referenced by start/end offsets into the full document
    text. The slices for all segments are concatenated in order, surrounding
    whitespace is stripped, and remaining newlines become spaces.
    """
    # Text that spans several lines is stored as several segments,
    # so collect one slice per segment and join them once.
    pieces = [
        text[int(segment.start_index):int(segment.end_index)]
        for segment in text_anchor.text_segments
    ]
    joined = "".join(pieces)
    return joined.strip().replace("\n", " ")

In [4]:
# TODO: Replace these with the values for your own project and processor.
# Previously used project ID: "docai-sandbox-439006"
PROJECT_ID = "qwiklabs-gcp-00-be8e83390131"
LOCATION = "us"  # Processor location: "us" or "eu"
PROCESSOR_ID = "fcf4ce1a697a0f44"

In [9]:
# Local input file, relative to the current working directory.
# Previously used file: "form_with_tables.pdf"
FILE_PATH = "sample-form-with-table.pdf"
# MIME type of the input file. Supported types are listed at
# https://cloud.google.com/document-ai/docs/processors-list
MIME_TYPE = "application/pdf"

In [10]:
# Send the local file to the processor and get the parsed document back.
document = online_process(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    file_path=FILE_PATH,
    mime_type=MIME_TYPE,
)

# Reassigned for every table found below.
header_row_values: List[List[str]] = []
body_row_values: List[List[str]] = []

# Input Filename without extension
output_file_prefix = splitext(FILE_PATH)[0]

for page in document.pages:
    for index, table in enumerate(page.tables):
        header_row_values = get_table_data(table.header_rows, document.text)
        body_row_values = get_table_data(table.body_rows, document.text)

        # A table without header rows produces an empty list, which
        # pd.MultiIndex.from_arrays rejects with a ValueError; fall back to
        # pandas' default column labels in that case.
        columns = (
            pd.MultiIndex.from_arrays(header_row_values)
            if header_row_values
            else None
        )

        # Create a Pandas Dataframe to print the values in tabular format.
        df = pd.DataFrame(data=body_row_values, columns=columns)

        print(f"Page {page.page_number} - Table {index}")
        print(df)

        # Save each table as a CSV file
        output_filename = f"{output_file_prefix}_pg{page.page_number}_tb{index}.csv"
        df.to_csv(output_filename, index=False)

Page 1 - Table 0
     Item    Description
0  Item 1  Description 1
1  Item 2  Description 2
2  Item 3  Description 3


Extract the table's header rows and body rows, then save the table data to a Cloud Storage bucket under the filename sample-form-with-table-tb0.csv.

In [11]:
# TODO: Set this to your own bucket name, without the leading gs://
# Previously used bucket: "thekim-cepf-documentai"
bucket_name = "qwiklabs-gcp-00-be8e83390131-cepf-documentai"
# File name used for both the local CSV and the uploaded object.
input_file_name = "sample-form-with-table-tb0.csv"

In [12]:
from google.cloud import storage

def upload_to_bucket(bucket_name, input_file_name):
    """Upload a local file to a bucket, using the same name for the blob.

    ``input_file_name`` is used both as the local path to read from and as
    the blob name inside ``bucket_name``. A confirmation line is printed
    after the upload call returns.
    """
    client = storage.Client()
    target_blob = client.bucket(bucket_name).blob(input_file_name)
    target_blob.upload_from_filename(input_file_name)

    print(f"File {input_file_name} uploaded to {bucket_name}.")

# Write the DataFrame to a local CSV file, then upload that file to the bucket.
df.to_csv(path_or_buf=input_file_name, index=False)
upload_to_bucket(bucket_name=bucket_name, input_file_name=input_file_name)

File sample-form-with-table-tb0.csv uploaded to qwiklabs-gcp-00-be8e83390131-cepf-documentai.
