## Summarize using DocAI

#### Imports 

In [1]:
from google.api_core.client_options import ClientOptions
from concurrent.futures import ThreadPoolExecutor
from google.cloud import documentai
from pypdf import PdfWriter
from pypdf import PdfReader 
from tqdm import tqdm
import tenacity
import vertexai
import logging
import os


##### Setup logging

In [2]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

#### Essentials

In [3]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './../credentials/vai-key.json'

PROJECT_ID = 'arun-genai-bb'
LOCATION = 'us-central1'  # not us-central1

In [4]:
client_options = ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=client_options)
parent = client.common_location_path(PROJECT_ID, LOCATION)

In [5]:
DOCAI_PROCESSOR_NAME = 'projects/390991481152/locations/us/processors/86e3109e1fe09371'  # copy from notebook 00
vertexai.init(project=PROJECT_ID, location=LOCATION)
client_options = ClientOptions(api_endpoint=f'us-documentai.googleapis.com')
docai_client = documentai.DocumentProcessorServiceClient(client_options=client_options)

In [6]:
LOCAL_INPUT_DIR = './DATA/INPUT'
LOCAL_OUTPUT_DIR = './DATA/OUTPUT'
FILE_NAME = 'file-2'

In [7]:
reader = PdfReader(f'{LOCAL_INPUT_DIR}/{FILE_NAME}.pdf') 
pages = {}

for i, page in enumerate(reader.pages):
    pages[i] = page

In [8]:
n = len(reader.pages)
d = 15  # docai has a current constraint of 15 pages per document 
for i in range(0, n, d):
    writer = PdfWriter()
    for j in range(i, i+d):
        if j < n:
            writer.add_page(pages[j])
    os.makedirs(f'{LOCAL_INPUT_DIR}/{FILE_NAME}/PARTS/', exist_ok=True)
    with open(f'{LOCAL_INPUT_DIR}/{FILE_NAME}/PARTS/{FILE_NAME}_{i+1}-{i+d}.pdf', 'wb') as f:
        writer.write(f)

In [9]:
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will be stored in different text segments.
    return ''.join(text[int(segment.start_index): int(segment.end_index)] for segment in layout.text_anchor.text_segments)

In [10]:
def get_file_paths(dir_name: str) -> list:
    file_paths = []
    for file_name in os.listdir(dir_name):
        if os.path.isfile(os.path.join(dir_name, file_name)):
            file_path = os.path.join(dir_name, file_name)
            file_paths.append(file_path)
    return file_paths

In [11]:
@tenacity.retry(reraise=True, wait=tenacity.wait_fixed(5) + tenacity.wait_random_exponential(min=3, max=20), stop=tenacity.stop_after_attempt(5))
def ocr_docai(file_path: str) -> list:
    summary = ''
    with open(file_path, 'rb') as f:
        pdf = f.read()
        try:
            raw_document = documentai.RawDocument(content=pdf, mime_type='application/pdf')
            request = documentai.ProcessRequest(name=DOCAI_PROCESSOR_NAME, raw_document=raw_document)
            response = docai_client.process_document(request=request)
            entities = response.document.entities
            summary = entities.pop().normalized_value.text.strip()
        except Exception as e:
            logger.error(file_path)
            logger.error(e)
    return summary

In [12]:
%%time 

input_dir = f'./DATA/INPUT/{FILE_NAME}/PARTS/'
file_paths = get_file_paths(input_dir)

with ThreadPoolExecutor(max_workers=6) as executor:  
    summaries = list(tqdm(executor.map(ocr_docai, file_paths)))

concatenated_summaries = '\n'.join(summaries)
logger.info(concatenated_summaries)

11it [00:22,  1.25s/it]./DATA/INPUT/file-2/PARTS/file-2_601-615.pdf
500 Internal error encountered.
12it [00:26,  2.01s/it]./DATA/INPUT/file-2/PARTS/file-2_991-1005.pdf
500 Internal error encountered.
27it [00:51,  1.43s/it]./DATA/INPUT/file-2/PARTS/file-2_166-180.pdf
500 Internal error encountered.
59it [01:48,  2.16s/it]./DATA/INPUT/file-2/PARTS/file-2_586-600.pdf
500 Internal error encountered.
73it [02:04,  1.70s/it]
This rule implements the capital requirements for large banking organizations under the Basel III framework. The rule requires national banks and federal savings associations with total consolidated assets of $50 billion or more to make public disclosures about their risk-weighted assets and capital ratios. 
The rule also establishes four categories of national banks and federal savings associations based on their size and cross-jurisdictional activity. The category a bank or savings association falls into determines the level of public disclosure required. 
The rule a

CPU times: user 2.64 s, sys: 3.72 s, total: 6.36 s
Wall time: 2min 4s


In [14]:
os.makedirs(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/', exist_ok=True)
with open(f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/summary.txt', 'w') as out:
    out.write(concatenated_summaries)