# Extract specific key/value pairs returned from the Form Parser API
- Created: 2024-11-10 (Sat)
- Updated: 2024-11-11 (Sun)

In [1]:
!pip3 install --upgrade -q pandas

In [2]:
!pip3 install --upgrade -q google-cloud-documentai

In [3]:
import pandas as pd
import google.cloud.documentai as docai
#from google.cloud import documentai_v1 as documentai

def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> docai.Document:
    """
    Processes a document using the Document AI Online Processing API.

    Args:
        project_id: Project ID used to build the processor resource name.
        location: Processor location; also selects the regional endpoint
            "<location>-documentai.googleapis.com".
        processor_id: ID of the processor to call. Processors must be
            created in the Cloud Console first.
        file_path: Path of the local file to send for processing.
        mime_type: MIME type of the file, e.g. "application/pdf".

    Returns:
        The ``document`` attribute of the process response.
    """

    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Instantiates a client
    docai_client = docai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    resource_name = docai_client.processor_path(project_id, location, processor_id)

    # Read the file into memory. Only the read happens inside the `with`
    # block so the file handle is released before the network call below.
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = docai.RawDocument(content=image_content, mime_type=mime_type)

    # Configure the process request
    request = docai.ProcessRequest(name=resource_name, raw_document=raw_document)

    # Use the Document AI client to process the sample form
    result = docai_client.process_document(request=request)

    return result.document

def trim_text(text: str) -> str:
    """
    Remove extra space characters from text (blank, newline, tab, etc.)

    Leading and trailing whitespace is dropped, and every internal run of
    whitespace characters is collapsed into a single blank.
    """
    # str.split() without a separator splits on any run of whitespace and
    # discards empty pieces, so re-joining with " " normalizes newlines,
    # tabs, carriage returns and repeated blanks in one pass.
    return " ".join(text.split())

In [7]:
# TODO: Change these variables
# Alternative: PROJECT_ID = "docai-sandbox-439006"
PROJECT_ID = "qwiklabs-gcp-00-be8e83390131"
LOCATION = "us"  # "us" or "eu"
PROCESSOR_ID = "fcf4ce1a697a0f44"

In [8]:
# The local file in your current working directory
# Alternative: FILE_PATH = "intake-form.pdf"
FILE_PATH = "sample-intake-form.pdf"
# Supported MIME types are listed per processor at
# https://cloud.google.com/document-ai/docs/processors-list
MIME_TYPE = "application/pdf"

In [9]:
# Send the local file to the processor and keep the returned document.
document = online_process(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    file_path=FILE_PATH,
    mime_type=MIME_TYPE,
)

# Flatten the form fields of every page into a single list, in page order.
form_fields = [
    form_field
    for doc_page in document.pages
    for form_field in doc_page.form_fields
]

# Extracted field names and values, cleaned up with trim_text
names = [trim_text(f.field_name.text_anchor.content) for f in form_fields]
values = [trim_text(f.field_value.text_anchor.content) for f in form_fields]

# Confidence - How "sure" the Model is that the text is correct
name_confidence = [f.field_name.confidence for f in form_fields]
value_confidence = [f.field_value.confidence for f in form_fields]

# Create a Pandas Dataframe to print the values in tabular format.
df = pd.DataFrame(
    {
        "Field Name": names,
        "Field Name Confidence": name_confidence,
        "Field Value": values,
        "Field Value Confidence": value_confidence,
    }
)

print(df)

                                           Field Name  Field Name Confidence  \
0   Are you currently taking any medication? (If y...               0.943337   
1                                           _Phone #:               0.932336   
2                                                Zip:               0.914201   
3                                               City:               0.900499   
4                                              State:               0.893907   
5                                                DOB:               0.885175   
6                                             Gender:               0.882370   
7                                               Name:               0.872789   
8                                     Marital Status:               0.852380   
9   Describe your medical concerns (symptoms, diag...               0.843905   
10                                              Date:               0.829963   
11                                      

Save the extracted form fields to a CSV file named cepf_form_parser.csv and upload it to a Cloud Storage bucket.

In [10]:
# TODO: Change it. Don't use the leading gs://
# Alternative: bucket_name = "thekim-cepf-documentai"
bucket_name = "qwiklabs-gcp-00-be8e83390131-cepf-documentai"
input_file_name = "cepf_form_parser.csv"

In [11]:
from google.cloud import storage

def upload_to_bucket(bucket_name, input_file_name):
    """Upload a local file to a Cloud Storage bucket.

    The object is created in ``bucket_name`` under the same name as the
    local file, ``input_file_name``, and a confirmation line is printed
    once the upload call returns.
    """
    # Client -> bucket -> blob handle, named after the local file
    target_blob = storage.Client().bucket(bucket_name).blob(input_file_name)
    target_blob.upload_from_filename(input_file_name)

    print(f"File {input_file_name} uploaded to {bucket_name}.")

# Save the DataFrame to a CSV file. Write to input_file_name rather than a
# hard-coded path so the file written is always the file that gets uploaded,
# even after input_file_name has been changed.
df.to_csv(input_file_name, index=False)
upload_to_bucket(bucket_name, input_file_name)

File cepf_form_parser.csv uploaded to qwiklabs-gcp-00-be8e83390131-cepf-documentai.
