# Document AI Form Parser Visualizer
This notebook shows you how to analyze a PDF using the Google Cloud Document AI API.

In [None]:
# Install necessary Python libraries
!pip install google-cloud-documentai
!pip install wand
!pip install pillo
!pip install tabulate

!apt-get update
!apt-get install libmagickwand-dev

In [1]:
from google.cloud import documentai_v1beta3 as documentai
from wand.image import Image as WImage
from PIL import Image, ImageDraw

import os
import pandas as pd
from tabulate import tabulate

ImportError: cannot import name 'documentai_v1beta3' from 'google.cloud' (unknown location)

## Download our sample PDF from GCS

In [None]:
# Cloud Storage URI of the sample PDF form; it is copied locally to ./form.pdf in the next cell.
PDF_URI = "gs://cesummit_workshop_data/form.pdf" #@param {type: "string"}

In [None]:
# Copy the sample PDF from Cloud Storage into the working directory as form.pdf
# (IPython substitutes the Python variable PDF_URI into the shell command).
!gsutil cp $PDF_URI ./form.pdf

## Set your Processor Variables 

In [None]:
# These three values are combined into the processor resource name:
#   projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}
# Replace them with the values for your own project and processor.
PROJECT_ID = "google.com:ml-baguette-demos"
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = "e46132dfddbdc8f2"  # Create processor in Cloud Console

The following code calls the synchronous API and parses the form fields and values.

In [None]:
def process_document_sample(file_path: str = "form.pdf", mime_type: str = "application/pdf"):
    """
    Send a local file to the Document AI processor and print its form fields.

    The file is read into memory, submitted through the synchronous
    `process_document` API to the processor identified by the notebook-level
    PROJECT_ID / LOCATION / PROCESSOR_ID variables, and every form field found
    on every page is printed as a table of key, key confidence, value and
    value confidence.

    Args:
        file_path: Path of the local file to process. Defaults to the sample
            'form.pdf' downloaded earlier in the notebook.
        mime_type: MIME type sent along with the file content.

    Returns:
        The `document` attribute of the processor response.
    """
    # Processors outside 'us' are served from a regional endpoint, so the
    # default client endpoint only works when LOCATION is 'us'.
    client_options = None
    if LOCATION != "us":
        client_options = {"api_endpoint": f"{LOCATION}-documentai.googleapis.com"}

    # Instantiates a client
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"

    # Read the file into memory
    with open(file_path, "rb") as input_file:
        file_content = input_file.read()

    # Configure the process request
    input_document = {"content": file_content, "mime_type": mime_type}
    request = {"name": name, "document": input_document}

    # Recognizes text entities in the document
    result = client.process_document(request=request)
    document = result.document
    print("Document processing complete.\n\n")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    # Grab each key/value pair and their corresponding confidence scores.
    rows = []
    for page in document.pages:
        for form_field in page.form_fields:
            # Colons are stripped from both the key and the value text.
            field_name = get_text(form_field.field_name, document).replace(':', '')
            field_value = get_text(form_field.field_value, document).replace(':', '')
            rows.append((
                field_name,
                round(form_field.field_name.confidence, 4),
                field_value,
                round(form_field.field_value.confidence, 4),
            ))

    # Create a Pandas Dataframe to print the values in tabular format.
    # Passing `columns` keeps the headers even when no form fields were found.
    df = pd.DataFrame(rows, columns=['Key', 'Key Conf', 'Value', 'Value Conf'])
    print(tabulate(df, headers='keys', tablefmt='psql'))

    return document

def get_text(doc_element: "documentai.Document.Page.Layout", document: "documentai.Document") -> str:
    """
    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Object exposing `text_anchor.text_segments`, where each
            segment has `start_index` and `end_index` offsets (for example a
            form field's `field_name` or `field_value`).
        document: Object exposing the full text as `document.text`.

    Returns:
        The concatenation of `document.text[start_index:end_index]` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )



In [None]:
# Run the processor on the downloaded form and keep the returned document
# for the bounding-box drawing step later in the notebook.
doc = process_document_sample()

## Draw the bounding boxes
We will now download a JPG version of the PDF above and use the spatial data to draw boxes around the form field names (red) and values (blue).

In [None]:
# Cloud Storage URI of the JPG image of the form; it is copied locally to ./form.jpg in the next cell.
JPG_URI = "gs://cesummit_workshop_data/form.jpg" #@param {type: "string"}

In [None]:
# Copy the JPG image of the form from Cloud Storage into the working directory as form.jpg
# (IPython substitutes the Python variable JPG_URI into the shell command).
!gsutil cp $JPG_URI ./form.jpg

In [None]:
def draw_layout_outline(draw, layout, image_size, color):
    """
    Outline a layout's bounding polygon on the image being drawn.

    Args:
        draw: The ImageDraw object to draw with.
        layout: Object exposing `bounding_poly.normalized_vertices`, each
            vertex having `x` and `y` attributes (e.g. a form field's
            `field_name` or `field_value`).
        image_size: (width, height) of the image in pixels.
        color: Outline color passed to `draw.polygon`.
    """
    width, height = image_size
    # Normalized vertices are scaled by the image dimensions to get pixel coordinates.
    points = [
        (vertex.x * width, vertex.y * height)
        for vertex in layout.bounding_poly.normalized_vertices
    ]
    # Nothing to outline when the element carries no usable polygon
    # (indexing four fixed vertices would raise IndexError here).
    if len(points) < 3:
        return
    draw.polygon(points, outline=color)


im = Image.open('form.jpg')
draw = ImageDraw.Draw(im)
# Only the first page is drawn, since form.jpg is a single image.
for form_field in doc.pages[0].form_fields:
    # Field names are outlined in red, field values in blue.
    draw_layout_outline(draw, form_field.field_name, im.size, 'red')
    draw_layout_outline(draw, form_field.field_value, im.size, 'blue')

In [None]:
# Show the annotated image as the cell's output.
im