In [None]:
# installs
! pip install -U -q --user google-cloud-documentai
! pip install -U -q --user google-cloud-aiplatform

In [None]:
# restart kernel: top bar -> "Kernel" -> "Restart Kernel"

In [None]:
import os, json

In [None]:
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models

In [None]:
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
REGION = "us-central1"
PROJECT_ID

In [None]:
PN = ! gcloud projects list --filter="$(gcloud config get-value project)" --format="value(PROJECT_NUMBER)"
PROJECT_NUMBER = PN[0]
PROJECT_NUMBER

In [None]:
######################################################################################
#
# helper functions
#
######################################################################################

In [None]:
# Document AI settings shared by the cells below; adjust these values for your project before running.
processor_display_name = "conductiv_ocr_processor_1"  # Display name given to the processor created below
location = "us" # Format is "us" or "eu"
mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
field_mask = "text,pages.pageNumber"  # Optional. The fields to return in the Document object.
processor_type = "OCR_PROCESSOR"  # Processor type passed to create_processor

In [None]:
######################################################################################
#
# create an ocr processor
#
######################################################################################

In [None]:
def create_processor_sample(
    project_id: str, location: str, processor_display_name: str, processor_type: str
) -> str:
    """Create a Document AI processor and return its full resource name.

    Args:
        project_id: Google Cloud project that will own the processor.
        location: Document AI location; used for both the API endpoint and
            the parent resource path.
        processor_display_name: Display name for the new processor.
        processor_type: Processor type forwarded to the API, e.g. ``"OCR_PROCESSOR"``.

    Returns:
        The ``name`` field of the created processor. Its last ``/``-separated
        segment is the processor ID.

    Note:
        Every call issues a new ``create_processor`` request; nothing here
        checks whether a processor with the same display name already exists.
    """
    # The endpoint is derived from `location`, so non-"us" locations work too.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the location
    # e.g.: projects/project_id/locations/location
    parent = client.common_location_path(project_id, location)

    # Create a processor
    processor = client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_=processor_type
        ),
    )

    # Print the processor information
    print(f"Processor Name: {processor.name}")
    print(f"Processor Display Name: {processor.display_name}")
    print(f"Processor Type: {processor.type_}")
    return processor.name

In [None]:
# Create the processor and keep its full resource name for the next cell.
cn = create_processor_sample(
    project_id=PROJECT_ID,
    location=location,
    processor_display_name=processor_display_name,
    processor_type=processor_type,
)
cn

In [None]:
# The processor ID is the final path segment of the resource name.
processor_id = cn.rsplit("/", 1)[-1]
processor_id

In [None]:
######################################################################################
#
# ocr some pdfs
#
######################################################################################

In [None]:
def get_pdf_filenames(directory: str = ".") -> list:
    """Return the PDF files found directly inside ``directory``, sorted by name.

    Args:
        directory: Folder to scan (not recursive). Defaults to the current
            working directory.

    Returns:
        A sorted list of paths that can be passed straight to ``open``. For the
        default directory these are bare filenames; for any other directory
        they are prefixed with ``directory``.
    """
    pdf_files = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        # Accept ".pdf" in any letter case, and skip directories whose name
        # merely ends in ".pdf" (they cannot be opened as documents).
        if filename.lower().endswith(".pdf") and os.path.isfile(path):
            # normpath drops the leading "./" so the default call yields bare names.
            pdf_files.append(os.path.normpath(path))

    # os.listdir returns entries in arbitrary order; sort so that the position
    # of each document is the same on every run.
    return sorted(pdf_files)

In [None]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> str:
    """Run a local file through a Document AI processor and return its text.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Document AI location; used for the API endpoint and the
            processor resource path.
        processor_id: ID of the processor to call.
        file_path: Path of the local file to upload; it is read fully into memory.
        mime_type: MIME type of the file, e.g. ``"application/pdf"``.
        field_mask: Optional comma-separated list of ``Document`` fields to return.
        processor_version_id: Optional processor version; when omitted the
            request is addressed to the processor itself.

    Returns:
        The ``text`` field of the processed document.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}`
        name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory (any supported type, not only images)
    with open(file_path, "rb") as source_file:
        file_content = source_file.read()

    # Load binary data
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
    # Optional: Additional configurations for processing.
    # NOTE(review): only `document.text` is returned below, so the add-on
    # outputs requested here are not used by this function; confirm they are needed.
    process_options = documentai.ProcessOptions(
        ocr_config=documentai.OcrConfig(
            enable_native_pdf_parsing=True,
            enable_image_quality_scores=True,
            enable_symbol=True,
            # OCR Add Ons https://cloud.google.com/document-ai/docs/ocr-add-ons
            premium_features=documentai.OcrConfig.PremiumFeatures(
                compute_style_info=True,
                enable_math_ocr=False,  # Enable to use Math OCR Model
                enable_selection_mark_detection=True,
            ),
        )
    )

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Return the text recognition output from the processor
    return document.text

In [None]:
# Collect the PDF filenames in the current working directory; these are the OCR inputs for the next cell.
pdf_filenames = get_pdf_filenames()
pdf_filenames

In [None]:
# OCR every PDF in turn; results are stored in the same order as pdf_filenames.
document_texts = []

for pdf_filename in pdf_filenames:
    document_text = process_document_sample(
        project_id=PROJECT_NUMBER,
        location=location,
        processor_id=processor_id,
        file_path=pdf_filename,
        mime_type=mime_type,
        field_mask=field_mask,
    )
    document_texts.append(document_text)

# Preview the first 50 characters of each result as a quick sanity check.
separator = '-' * 30
for document_text in document_texts:
    print(document_text[:50])
    print(separator)

In [None]:
######################################################################################
#
# setup gemini
#
######################################################################################

In [None]:
# Initialise the Vertex AI SDK with the project and region resolved earlier in the notebook.
vertexai.init(project = PROJECT_ID, location = REGION)

In [None]:
# Gemini model ID passed to GenerativeModel; the trailing comment keeps an alternative ID for quick swapping.
model_name = "gemini-1.5-pro-001" # "gemini-1.5-flash-preview-0514"

In [None]:
# Sampling parameters sent with every chat message.
generation_config = dict(
    max_output_tokens=8192,
    temperature=1,
    top_p=0.95,
)

# Apply the same BLOCK_NONE threshold to each harm category listed here.
safety_settings = {
    category: generative_models.HarmBlockThreshold.BLOCK_NONE
    for category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

In [None]:
# System prompt that sets the model's role for the whole chat session.
system_instruction = "You are a research assistant capable of analyzing complex legal documents"

In [None]:
# Build the model handle; the system instruction is supplied as a one-element list.
model = GenerativeModel(
    model_name,
    # tools=tools,
    system_instruction=[system_instruction],
)

In [None]:
######################################################################################
#
# build your use case
#
######################################################################################

In [None]:
# Task for the model. Newlines are flattened to spaces so a multi-line edit of this text still ends up on one line in the prompt.
instructions = """Compare and contrast the documents below""".replace("\n", " ")

In [None]:
# Wrap each OCR result in its own numbered tag; json.dumps quotes and escapes the text
# so that every document occupies a single line.
document_blocks = []
for idx, document_text in enumerate(document_texts):
    document_blocks.append(
        f"<DOCUMENT-{idx}>{json.dumps(document_text)}</DOCUMENT-{idx}>"
    )

# One entry per line, enclosed in <DOCUMENTS> ... </DOCUMENTS>, with a trailing newline.
context_documents = "\n".join(["<DOCUMENTS>", *document_blocks, "</DOCUMENTS>"]) + "\n"

In [None]:
# Final prompt: the instructions first, a blank line, then the tagged documents.
prompt = (
    f"<INSTRUCTIONS>{instructions}</INSTRUCTIONS> \n"
    "\n"
    f"<CONTEXT>{context_documents}</CONTEXT>\n"
)

In [None]:
# Open a chat session and send the assembled prompt as the first turn.
chat = model.start_chat()
r = chat.send_message(
    [prompt],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

# Show the text of the first part of the first candidate.
first_candidate = r.candidates[0]
print(first_candidate.content.parts[0].text)

In [None]:
# Continue the same chat session with a follow-up question; replace the placeholder text below.
chat_continuance = "your follow up question would go here"

r = chat.send_message(
    [chat_continuance],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

# Show the text of the first part of the first candidate.
first_candidate = r.candidates[0]
print(first_candidate.content.parts[0].text)