#### This Notebook use Doc AI and converts all PDF document to text so that down the line process can work

#### Author: Saurabh Mangal (saurabhmangal@google.com)
#### Editor / Reviewer: Wan Qi, Jing Le
##### Date: 21st Feb
##### Description: This notebook contains part 1 of lab

 Copyright (c) [2024] [saurabhmangal@] -- 
 This notebook is licensed under the Commercial License.

In [None]:
!pip install --quiet google-cloud-discoveryengine

In [None]:
#Please enter your name/initials (no spaces or special characters allowed), ensure that it is unique
UNIQUE_PREFIX="jingletest2"

In [None]:
import re

PROJECT_ID = !(gcloud config get-value core/project)
project_id = PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(int(re.search(r'\d+', SVC_ACC).group()))

In [None]:
# Please enter your name/initials (no spaces or special characters allowed), ensure that it is unique
UNIQUE_PREFIX="asia-notebooks" ### PLEASE UPDATE THIS

GCS_BUCKET_LOCATION = LOCATION = "asia-southeast1"

GCS_BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}"
GCS_BUCKET_URI = f"gs://{GCS_BUCKET_NAME}"

gcs_uri = GCS_BUCKET_URI

# Create a Cloud Storage Bucket
!gcloud storage buckets create $GCS_BUCKET_URI --location=$GCS_BUCKET_LOCATION

# Upload the PDFs located in the books/ directory into the GCS bucket that you created
!gsutil cp -r ./books/* $GCS_BUCKET_URI

# Verify that all Books 1 to 7 are uploaded to the GCS bucket (8 files in total, 2 for Part 1)
!gsutil ls $GCS_BUCKET_URI


In [None]:
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine

#TODO(developer): Uncomment these variables before running the sample.
# project_id = "jingle-project-414801"


location = "global" # Values: "global"
#data_store_id = "YOUR_DATA_STORE_ID"

#Must specify either `gcs_uri` or (`bigquery_dataset` and `bigquery_table`)
#Format: `gs://bucket/directory/object.json` or `gs://bucket/directory/*.json`
# gcs_uri = "gs://jingle-project-414801-test"
#bigquery_dataset = "YOUR_BIGQUERY_DATASET"
#bigquery_table = "YOUR_BIGQUERY_TABLE"


def import_documents_sample(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: Optional[str] = None,
    bigquery_dataset: Optional[str] = None,
    bigquery_table: Optional[str] = None,
) -> str:
    #  For more information, refer to:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Create a client
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    if gcs_uri:
        request = discoveryengine.ImportDocumentsRequest(
            parent=parent,
            gcs_source=discoveryengine.GcsSource(
                input_uris=[gcs_uri], data_schema="custom"
            ),
            # Options: `FULL`, `INCREMENTAL`
            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
        )
    else:
        request = discoveryengine.ImportDocumentsRequest(
            parent=parent,
            bigquery_source=discoveryengine.BigQuerySource(
                project_id=project_id,
                dataset_id=bigquery_dataset,
                table_id=bigquery_table,
                data_schema="custom",
            ),
            # Options: `FULL`, `INCREMENTAL`
            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
        )

    # Make the request
    operation = client.import_documents(request=request)

    print(f"Waiting for operation to complete: {operation.operation.name}")
    response = operation.result()

    # Once the operation is complete,
    # get information from operation metadata
    metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

    # Handle the response
    print(response)
    print(metadata)

    return operation.operation.name


#### We try both open source pdf option as well as DOC AI 

In [None]:
# !pip install PyMuPDF Pillow pytesseract

In [None]:
# !pip install langchain

In [None]:
%pip install PyPDF2
%pip install pdfreader

In [None]:
from pdfreader import PDFDocument, SimplePDFViewer

In [None]:
# Load the PDF document
pdf_file = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"


In [None]:
fd = open(pdf_file, "rb")
doc = PDFDocument(fd)

In [None]:
from io import BytesIO
with open(pdf_file, "rb") as f:
    stream = BytesIO(f.read())
doc2 = PDFDocument(stream)

In [None]:
doc2

In [None]:
page_one = next(doc.pages())

In [None]:
# Extract text from the page
page_one.Contents

In [None]:
all_pages = [p for p in doc.pages()]
len(all_pages)

### Doc ai - Easiest and faster solution for all PDF documents 

In [None]:
!pip3 install --upgrade google-cloud-documentai
!pip3 install --upgrade google-cloud-storage
!pip3 install --upgrade google-cloud-documentai-toolbox

In [None]:
!ls

In [None]:
! $GCS_BUCKET_URI

In [None]:
# !rm ./books
!mkdir ./books
!mkdir ./matchingengine
!mkdir ./matchingengine/embeddings

GCS_BUCKET_URI_books = GCS_BUCKET_URI+ '/matchingengine/books'

!gsutil cp -R ./books/* $GCS_BUCKET_URI_books

### Define helper functions

In [None]:
# Function to create Document AI Processor
def create_processor(project_id, location, processor_display_name, processor_type):
    # You must set the api_endpoint if you use a location other than 'us'.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the location
    # e.g.: projects/project_id/locations/location
    parent = client.common_location_path(project_id, location)

    # Create a processor
    processor = client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_=processor_type
        ),
    )
    
    processor_id = processor.name.split('/')[-1]

    # Print the processor information
    print(f"Processor Name: {processor.name}")
    print(f"Processor Display Name: {processor.display_name}")
    print(f"Processor ID: {processor_id}")
    print(f"Processor Type: {processor.type_}")
    
    
    return processor, processor_id

### Import Document AI libraries and set variables


In [None]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

GCP_PROJECT = PROJECT_ID #'cloud-llm-preview1'
LOCATION = "us"  # Format is 'us' or 'eu'
GCP_REGION='us-central1'

# Variables for Document AI OCR Processor
PROCESSOR_DISPLAY_NAME = UNIQUE_PREFIX + '-ocr-processor' # Must be unique per project, e.g.: 'My Processor'
PROCESSOR_TYPE = 'OCR_PROCESSOR' # Use fetch_processor_types to get available processor types

### Create Document AI Document OCR Processor


In [None]:
PROCESSOR, PROCESSOR_ID = create_processor(PROJECT_ID, LOCATION,PROCESSOR_DISPLAY_NAME, PROCESSOR_TYPE)


In [None]:
# def list_processors_sample(project_id: str, location: str) -> None:
#     # You must set the api_endpoint if you use a location other than 'us'.
#     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

#     client = documentai.DocumentProcessorServiceClient(client_options=opts)

#     # The full resource name of the location
#     # e.g.: projects/project_id/locations/location
#     parent = client.common_location_path(project_id, location)

#     # Make ListProcessors request
#     processor_list = client.list_processors(parent=parent)

#     # Print the processor information
#     for processor in processor_list:
#         print(f"Processor Name: {processor.name}")
#         print(f"Processor Display Name: {processor.display_name}")
#         print(f"Processor Type: {processor.type_}")
#         print("")
        
# list_processors_sample(PROJECT_ID,LOCATION)

# #REPLACE WITH YOUR PROCESSOR_ID
# PROCESSOR_ID="2bfcbd882151f885"

In [None]:
%pip install --upgrade --quiet  google-cloud-documentai
%pip install --upgrade --quiet  google-cloud-documentai-toolbox

In [None]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai


# PROJECT_ID = "YOUR_PROJECT_ID"
GCP_PROJECT= PROJECT_ID #'cloud-llm-preview1'
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = PROCESSOR_ID  # Create processor in Cloud Console
GCP_REGION='us-central1'

# The local file in your current working directory
FILE_PATH = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types
MIME_TYPE = "application/pdf"

# Instantiates a client
docai_client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
)

# The full resource name of the processor, e.g.:
# projects/project-id/locations/location/processor/processor-id
# You must create new processors in the Cloud Console first
RESOURCE_NAME = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)

# Read the file into memory
with open(FILE_PATH, "rb") as image:
    image_content = image.read()

# Load Binary Data into Document AI RawDocument Object
raw_document = documentai.RawDocument(content=image_content, mime_type=MIME_TYPE)

# Configure the process request
request = documentai.ProcessRequest(name=RESOURCE_NAME, raw_document=raw_document)

# Use the Document AI client to process the sample form
result = docai_client.process_document(request=request)

document_object = result.document
print("Document processing complete.")
print(f"Text: {document_object.text}")

page_text =document_object.text

### Running though the batch mode for procssing the full Books

In [None]:
import re
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai  # type: ignore
from google.cloud import storage

def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    gcs_input_uri: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    gcs_input_prefix: Optional[str] = None,
    field_mask: Optional[str] = None,
    timeout: int = 40000000,
) -> None:
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if gcs_input_uri:
        # Specify specific GCS URIs to process individual documents
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        # Load GCS Input URI into a List of document files
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
    else:
        # Specify a GCS URI Prefix to process an entire directory
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Cloud Storage URI for the Output Directory
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Continually polls the operation until it is complete.
    # This could take some time for larger files
    # Format: projects/{project_id}/locations/{location}/operations/{operation_id}
    try:
        print(f"Waiting for operation {operation.operation.name} to complete...")
        operation.result(timeout=timeout)
    # Catch exception when operation doesn't finish before timeout
    except (RetryError, InternalServerError) as e:
        print(e.message)

    # NOTE: Can also use callbacks for asynchronous processing
    #
    # def my_callback(future):
    #   result = future.result()
    #
    # operation.add_done_callback(my_callback)

    # Once the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    print("Output files:")
    # One process per Input Document
    for process in list(metadata.individual_process_statuses):
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS
            if blob.content_type != "application/json":
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document
            
            # Read the text recognition output from the processor 
            print("The document contains the following text:")
            print(document.text)
            
        return(document.text)


In [None]:
import asyncio
import time
import re

async def my_async_function():
    # Your asynchronous code here
    await asyncio.sleep(5)  # Placeholder for some asynchronous task
    print("Async function completed")

# Trigger the function asynchronously
async def trigger_async_function():
    await my_async_function()

asyncio.create_task(trigger_async_function())


def save_text_to_file(text, filename):
    pattern = r".*/([^/.]+)\.pdf"

    # Extract the filename
    match = re.search(pattern, filename)
    if match:
        filename = match.group(1)
        print(filename + " has been processed successfully.\n")  # Output ex: Book4_The_Goblet_of_Fire
    else:
        print("No match found")
    
    filename_txt = './results/' + filename + ".txt"
    with open(filename_txt, 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# PROJECT_ID = "YOUR_PROJECT_ID"
GCP_PROJECT= PROJECT_ID #'cloud-llm-preview1'
LOCATION = location = "us"  # Format is 'us' or 'eu'
processor_id=PROCESSOR_ID
GCP_REGION='us-central1'

# The local file in your current working directory
FILE_PATH = "./books/Book1_The_Sorcerers_Stone.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types

# TODO(developer): Uncomment these variables before running the sample.
gcs_output_uri = GCS_BUCKET_URI_books # Must end with a trailing slash `/`. Format: gs://bucket/directory/subdirectory/
processor_version_id = "" # Optional. Example: pretrained-ocr-v1.0-2020-09-23

# TODO(developer): You must specify either `gcs_input_uri` and `mime_type` or `gcs_input_prefix`
gcs_input_uri = f"{GCS_BUCKET_URI_books}/Book1_The_Sorcerers_Stone.pdf" # Format: gs://bucket/directory/file.pdf
MIME_TYPE = input_mime_type = "application/pdf"

gcs_input_prefix = f"{GCS_BUCKET_URI_books}/matchingengine/" # Format: gs://bucket/directory/
field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
timeout = 400000

book_list = [f"{GCS_BUCKET_URI_books}/Book1_The_Sorcerers_Stone.pdf",
             f"{GCS_BUCKET_URI_books}/Book2_The_Chamber_of_Secrets.pdf",
             f"{GCS_BUCKET_URI_books}/Book3_The_Prisoner_of_Azkaban.pdf",
            f"{GCS_BUCKET_URI_books}/Book4_The_Goblet_of_Fire.pdf",
            f"{GCS_BUCKET_URI_books}/Book5_The_Order_of_the_Phoenix.pdf",
            f"{GCS_BUCKET_URI_books}/Book6_The_HalfBlood_Prince.pdf",
            f"{GCS_BUCKET_URI_books}/Book7_The_Deathly_Hallows.pdf"]

# book_list = ["gs://my-project-0004-346516/matchingengine/books/certificate2.pdf"]


for i in range(0,len(book_list)): 
    gcs_input_uri = book_list[i]
    print(gcs_input_uri + ":\n")
    
    page_text_batch = batch_process_documents(    project_id,
        location,
        processor_id,
        gcs_output_uri,
        None,
        gcs_input_uri,
        input_mime_type,
        gcs_input_prefix,
        field_mask,
        timeout)
    
    # Example usage: my_text = "This is the text from your OCR process."
    
    save_text_to_file(page_text_batch, gcs_input_uri)


    time.sleep(60)
    

#### Here is the documenation for DOC ai https://cloud.google.com/document-ai/docs/samples/documentai-batch-process-document?hl=en


### Setting up embeddings API and testing

In [None]:
# %pip install google-cloud-vertex-ai
%pip install --upgrade google-cloud-aiplatform -q
%pip install tqdm -q
%pip install langchain -q

In [None]:
# init the vertexai package
import vertexai
LOCATION="us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Load the text embeddings model
from vertexai.preview.language_models import TextEmbeddingModel 
embedding_model ="textembedding-gecko@003"
model = TextEmbeddingModel.from_pretrained(embedding_model)

In [None]:
import time
import tqdm  # to show a progress bar
import os
# get embeddings for a list of texts
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs[0]

def text_embedding(text) -> list:
    """Text embedding with a Large Language Model."""
    model = TextEmbeddingModel.from_pretrained(embedding_model)
    embeddings = model.get_embeddings(text)
    for embedding in embeddings:
        vector = embedding.values
        print(f"Length of Embedding Vector: {len(vector)}")
    return vector

In [None]:
# get embeddings for the question titles and add them as "embedding" column
# df = get_embeddings_wrapper(page_text)
# print(df)

trail_text = [page_text]

text_embedding(trail_text)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    separators= ["/,", "##", ">", "-"],#'\n\n', '\n'],
    length_function=len,
    is_separator_regex=False,
)

splitted_texts = text_splitter.create_documents([page_text])
print("one \n",splitted_texts[0])
print("two \n",splitted_texts[1])

In [None]:
splitted_texts_list = text_splitter.split_text(page_text)#[:2]

In [None]:
import pandas as pd

# Create a DataFrame from the splitted texts
df = pd.DataFrame({'splitted_texts': splitted_texts_list})

# Add a row number column
df['id'] = df.index + 1

# Print the DataFrame
print(df)

In [None]:
get_embeddings_wrapper(["hello","apple"])
# print(x)


In [None]:
list(df.splitted_texts)

### Part 2

In [None]:
!pip install --upgrade --quiet  langchain-google-genai

In [None]:
project_id = PROJECT_ID
location = 'us' 
processor_id = PROCESSOR_ID
processor_version = 'rc' 
# file_path = "filepath.pdf" 
# mime_type = 'application/pdf'

GCP_PROJECT= PROJECT_ID
LOCATION = location = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = processor_id  # Create processor in Cloud Console
FILE_PATH =file_path = pdf_file = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"

# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types
MIME_TYPE =mime_type = "application/pdf"

In [None]:
def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    A function to process a document online using Google Document AI.
    """

    # Define an options dictionary, which includes the API's URL. This is used to connect to Google's Document AI service
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Create a Document AI client, think of it as our bridge for communicating with Google's services
    documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Generate the complete name of the processor
    # You need to first create a processor in the Google Cloud console
    resource_name = documentai_client.processor_path(project_id, location, processor_id)

    # Read in the document you want to analyze (like an image or PDF), and store it in the variable image_content
    with open(file_path, "rb") as image:
        image_content = image.read()

        # Convert the read document into a format that Google Document AI can understand, i.e., a RawDocument object
        raw_document = documentai.RawDocument(
            content=image_content, mime_type=mime_type
        )
        # Create a request, which includes the name of the processor and the document we want to analyze
        request = documentai.ProcessRequest(
            name=resource_name, raw_document=raw_document
        )
        # Send our request and receive the analysis results
        result = documentai_client.process_document(request=request)
        
        print("Document processing complete.")
        # print(f"Text: {document_object.text}")
        
        # Return this analysis result
        return result.document

In [None]:
def trim_text(text: str): 
    """ Removes spaces and newline characters. """ 
    return text.strip().replace("\n", " ")

### https://cloud.google.com/document-ai/docs/handle-response

In [None]:
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:
    print(f"    Width: {str(dimension.width)}")
    print(f"    Height: {str(dimension.height)}")


def print_detected_langauges(
    detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],
) -> None:
    print("    Detected languages:")
    for lang in detected_languages:
        print(f"        {lang.language_code} ({lang.confidence:.1%} confidence)")


def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:
    print(f"    {len(blocks)} blocks detected:")
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f"        First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f"        Last text block: {repr(last_block_text)}")


def print_paragraphs(
    paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str
) -> None:
    print(f"    {len(paragraphs)} paragraphs detected:")
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f"        First paragraph text: {repr(first_paragraph_text)}")

    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f"        Last paragraph text: {repr(last_paragraph_text)}")


def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:
    print(f"    {len(lines)} lines detected:")
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f"        First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f"        Last line text: {repr(last_line_text)}")


def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:
    print(f"    {len(tokens)} tokens detected:")
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f"        First token text: {repr(first_token_text)}")
    print(f"        First token break type: {repr(first_token_break_type)}")
    if tokens[0].style_info:
        print_style_info(tokens[0].style_info)

    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f"        Last token text: {repr(last_token_text)}")
    print(f"        Last token break type: {repr(last_token_break_type)}")
    if tokens[-1].style_info:
        print_style_info(tokens[-1].style_info)


def print_symbols(
    symbols: Sequence[documentai.Document.Page.Symbol], text: str
) -> None:
    print(f"    {len(symbols)} symbols detected:")
    first_symbol_text = layout_to_text(symbols[0].layout, text)
    print(f"        First symbol text: {repr(first_symbol_text)}")
    last_symbol_text = layout_to_text(symbols[-1].layout, text)
    print(f"        Last symbol text: {repr(last_symbol_text)}")


def print_image_quality_scores(
    image_quality_scores: documentai.Document.Page.ImageQualityScores,
) -> None:
    print(f"    Quality score: {image_quality_scores.quality_score:.1%}")
    print("    Detected defects:")

    for detected_defect in image_quality_scores.detected_defects:
        print(f"        {detected_defect.type_}: {detected_defect.confidence:.1%}")


def print_style_info(style_info: documentai.Document.Page.Token.StyleInfo) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    print(f"           Font Size: {style_info.font_size}pt")
    print(f"           Font Type: {style_info.font_type}")
    print(f"           Bold: {style_info.bold}")
    print(f"           Italic: {style_info.italic}")
    print(f"           Underlined: {style_info.underlined}")
    print(f"           Handwritten: {style_info.handwritten}")
    print(
        f"           Text Color (RGBa): {style_info.text_color.red}, {style_info.text_color.green}, {style_info.text_color.blue}, {style_info.text_color.alpha}"
    )


def print_visual_elements(
    visual_elements: Sequence[documentai.Document.Page.VisualElement], text: str
) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    checkboxes = [x for x in visual_elements if "checkbox" in x.type]
    math_symbols = [x for x in visual_elements if x.type == "math_formula"]

    if checkboxes:
        print(f"    {len(checkboxes)} checkboxes detected:")
        print(f"        First checkbox: {repr(checkboxes[0].type)}")
        print(f"        Last checkbox: {repr(checkboxes[-1].type)}")

    if math_symbols:
        print(f"    {len(math_symbols)} math symbols detected:")
        first_math_symbol_text = layout_to_text(math_symbols[0].layout, text)
        print(f"        First math symbol: {repr(first_math_symbol_text)}")


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


In [None]:
document = online_process(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    file_path=file_path,
    mime_type=mime_type,
)

names = []
name_confidence = []
values = []
value_confidence = []

## Using langchain for Doc AI on GCP

In [None]:
%pip install --upgrade --quiet  google-cloud-documentai
%pip install --upgrade --quiet  google-cloud-documentai-toolbox

In [None]:
# GCS_OUTPUT_PATH = "gs://BUCKET_NAME/FOLDER_PATH"

GCS_OUTPUT_PATH= GCS_BUCKET_URI#"gs://tianhaoz-test/saurabh"

# PROCESSOR_NAME = "projects/cloud-llm-preview1/locations/us-central1/processors/96c7b8734e4ddaba"

# PROCESSOR_NAME = 'projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID'

PROCESSOR_NAME = f'projects/{PROJECT_NUMBER}/locations/us/processors/{PROCESSOR_ID}'

# endpoint="projects/cloud-llm-preview1/locations/us-central1/publishers/google/models/medlm-large", instances=instances, parameters=parameters, safety_settings=[]


In [None]:
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import DocAIParser

In [None]:
parser = DocAIParser(
    location="us", processor_name=PROCESSOR_NAME, gcs_output_path=GCS_OUTPUT_PATH
)

In [None]:
# pdf_file = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"

!gsutil cp ./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf "$GCS_BUCKET_URI_books"

In [None]:
blob = Blob(
    path=f"{GCS_BUCKET_URI_books}/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"
)

### This below would take a lot of time (about 2-3 mins for a small PDF)

In [None]:
docs = list(parser.lazy_parse(blob))
print(len(docs))



In [None]:
page_contents = [doc.page_content for doc in docs]


In [None]:
# docs[0].page_content
page_contents

In [None]:
import pandas as pd

# Create a DataFrame from the splitted texts
df = pd.DataFrame({'pagewise_texts': page_contents})

# Add a row number column
df['page_id'] = df.index + 1
# df['pagewise_texts'] = df['pagewise_texts'].page_content


# Print the DataFrame
print(df)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size=100
chunk_overlap=20

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators= ["/,", "##", ">", "We"],#'\n\n', '\n'],
    length_function=len,
    is_separator_regex=False,
)

# splitted_texts = text_splitter.create_documents([page_text])
# print("one \n",splitted_texts[0])
# print("two \n",splitted_texts[1])

def split_text_chunks(text, chunk_size):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]


In [None]:
splitted_texts = []

for index, row in df.iterrows():
    splitted_texts += text_splitter.create_documents(row['pagewise_texts'])
    splitted_texts2 = text_splitter.split_text(row['pagewise_texts'])
    print(splitted_texts2)

    # df['splitted_texts'] = splitted_texts2
    
    df['splitted_texts'] = df['pagewise_texts'].apply(lambda x: text_splitter.split_text(x))


In [None]:
df.head()

In [None]:
df_exploded = df.explode('splitted_texts')


In [None]:
df_exploded.head()

In [None]:
len(splitted_texts)

In [None]:
df_exploded['splitted_texts_chunks'] = df_exploded['splitted_texts'].apply(lambda x: split_text_chunks(x,chunk_size))


In [None]:
df_exploded_2 = df_exploded.explode('splitted_texts_chunks')


In [None]:
df_exploded_2.head(10)

In [None]:
# df_exploded_2 = df_exploded_2.rename(columns={'page_id': 'id'})

df_exploded_2x = df_exploded_2.copy()

df_exploded_2x = df_exploded_2x.reindex()
df_exploded_2x = df_exploded_2x.reset_index()

df_exploded_2x['id'] = df_exploded_2x.index
df_exploded_2x['id'] = df_exploded_2x['id'].astype(str)

# df_exploded_2a = df_exploded_2a.rename(columns={'splitted_texts_chunks_emb': 'embedding'})

df_exploded_2 = df_exploded_2x.copy()

In [None]:
df_exploded_2.head()

In [None]:
%pip install --upgrade --quiet  langchain-core langchain-google-vertexai

In [None]:
# from langchain_google_vertexai import VertexAIEmbeddings

# embeddings = VertexAIEmbeddings(model="models/embedding-003")

# text = "This is a test document."

# query_result = embeddings.embed_query(text)
# # print(query_result)

#### Convert text columns to embeddings

### Add embedding to the splitted texts chunk column
### Convert the dataframe into json files required

In [None]:
# load the BQ Table into a Pandas Dataframe
import pandas as pd
from google.cloud import bigquery

QUESTIONS_SIZE = df_exploded_2.shape[0] #1000

bq_client = bigquery.Client(project=PROJECT_ID)
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions`
        where Score > 0 ORDER BY View_Count desc) AS q
        LIMIT {limit} ;
        """
query = QUERY_TEMPLATE.format(limit=QUESTIONS_SIZE)
query_job = bq_client.query(query)
rows = query_job.result()
# df = rows.to_dataframe()

# examine the data
# df.head()

In [None]:
len(df_exploded_2.splitted_texts_chunks)

In [None]:
df_exploded_2.head()

In [None]:
import time
import tqdm  # to show a progress bar
from vertexai.language_models import TextEmbeddingModel

# get embeddings for a list of texts
BATCH_SIZE = 5


model_ai="textembedding-gecko@003"

model = TextEmbeddingModel.from_pretrained(model_ai)

def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs
# The following code will get embedding for the question titles and add them as a new column embedding to the DataFrame. This will take a few minutes.

# get embeddings for the question titles and add them as "embedding" column
df = df_exploded_2.assign(embedding=get_embeddings_wrapper(list(df_exploded_2.splitted_texts_chunks)))
df.head()

In [None]:
df_exploded_2.to_csv('df_exploded_2.csv', index=False)


In [None]:
# save id and embedding as a json file
jsonl_string = df[["id",'splitted_texts_chunks', "embedding"]].to_json(orient="records", lines=True)
with open("questions_test.json", "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 questions_test.json

In [None]:
! head -n 3 product-embs.json

### Upload the Json File to matching engine

In [None]:
# generate an unique id for this session
from datetime import datetime

UID = datetime.now().strftime("%m%d%H%M")

BUCKET_URI_ME=f"{GCS_BUCKET_URI}/matchingengine/embedding"
LOCATION = 'asia-southeast1'

from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
! gsutil cp questions_test.json {BUCKET_URI_ME}

In [56]:
! gsutil ls {BUCKET_URI_ME}

CommandException: "ls" command does not support "file://" URLs. Did you mean to use a gs:// URL?


In [None]:
# create Index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"vs-feature-index-{UID}",
    contents_delta_uri=BUCKET_URI_ME,
    dimensions=768,
    approximate_neighbors_count=10,
    project = PROJECT_ID
)

#### Create Index Endpoint and deploy the Index
To use the Index, you need to create an Index Endpoint. It works as a server instance accepting query requests for your Index.


In [None]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"vs-feature-index-endpoint-{UID}", public_endpoint_enabled=True
)

In [None]:
DEPLOYED_INDEX_ID = f"vs_feature_deployed_{UID}"
# deploy the Index to the Index Endpoint
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

from vertexai.language_models import TextEmbeddingModel### Go to you vertex AI console and check the index is CREATED successfully 

## PART 3

#### This Notebook use Vector Search and store Embedding into a vector store along with Indexing

#### Author: Saurabh Mangal (saurabhmangal@google.com)
##### Date: 21st Feb
##### Description: This notebook contains part 1 of lab

 Copyright (c) [2024] [saurabhmangal@] -- 
 This notebook is licensed under the Commercial License.

### Querying a created index

In [None]:
import json

# build dicts for product names and embs
product_names = {}
product_embs = {}
product_text = {}
with open("questions_test.json") as f:
    for l in f.readlines():
        p = json.loads(l)
        id = p["id"]
        product_names[id] = p["id"]
        product_text[id] = p['splitted_texts_chunks']
        product_embs[id] = p["embedding"]

In [None]:
# get the embedding for ID 6523 "cloudveil women's excursion short"
# you can also try with other IDs such as 12711, 18090, 19536 and 11863
query_emb = product_embs["0"]

In [None]:
# run query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID, queries=[query_emb], num_neighbors=3
)

# show the results
for idx, neighbor in enumerate(response[0]):
    print(f"{neighbor.distance:.2f} {product_names[neighbor.id]} {product_text[neighbor.id]}")

### Run Query
Finally it's ready to use Vector Search. In the following code, it creates an embedding for a test question, and find similar question with the Vector Search.

In [40]:
import time
import tqdm  # to show a progress bar

# get embeddings for a list of texts
BATCH_SIZE = 5

# Load the text embeddings model
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs

In [41]:

df = pd.read_csv('df_exploded_2.csv')


test_embeddings = get_embeddings_wrapper(["Who is the best help to Harry Potter?"])
# Test query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=20,
)

# show the result
import numpy as np

for idx, neighbor in enumerate(response[0]):
    id = np.int64(neighbor.id)
    similar = df.query("id == @id", engine="python")
    print(f"{neighbor.distance:.4f} {similar.splitted_texts_chunks.values[0]}")


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


NameError: name 'my_index_endpoint' is not defined

### Get an existing Index
To get an index object that already exists, replace the following [your-index-id] with the index ID and run the cell. You can check the ID on the Vector Search Console > INDEXES tab.


In [None]:
%pip install --upgrade google-cloud-aiplatform -q

In [None]:
from google.cloud import aiplatform

REGION = LOCATION = "asia-southeast1"

aiplatform.init(project=PROJECT_ID, location=LOCATION)
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Update all this information below 

#### this setting is obtained from matching engine end point
##### https://console.cloud.google.com/vertex-ai/locations/us-central1/index-endpoints/3345510418113101824/deployed-indexes/vs_quickstart_deployed_02060053?


In [None]:
# my_index_id = "vs-quickstart-index-endpoint-02051523"  # @param {type:"string"}
# my_index = aiplatform.MatchingEngineIndex(my_index_id)
# del(my_index)

my_index = aiplatform.MatchingEngineIndex(
    index_name=f'projects/{PROJECT_NUMBER}/locations/{LOCATION}/indexes/3789128175548628992'
)

my_index_endpoint_id = f"projects/{PROJECT_NUMBER}/locations/{LOCATION}/indexEndpoints/80712949571780608"


# my_index_endpoint_id = "[your-index-endpoint-id]"  # @param {type:"string"}
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(my_index_endpoint_id)

#### Querying the earlier created index

In [None]:
# %pip install --upgrade --quiet langchain langchain-google-vertexai

In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

embeddings = VertexAIEmbeddings(model="textembedding-gecko@003")

text = "This is a test document."

query_result = embeddings.embed_query(text)


In [None]:
# this is embedding vector (should be created by calling the embeddings models)

text = "harry potter owl and the green colur boy."

test_embeddings = embeddings.embed_query(text)
print("preview embeddings",test_embeddings[0:2])

In [None]:
# test_embeddings[0]

### Update the information below

In [51]:
# this setting is obtained from matching ending https://console.cloud.google.com/vertex-ai/locations/asia-southeast1/index-endpoints/3366088877738557440/deployed-indexes/vs_quickstart_deployed_02060053?project=jingle-project-414801

from google.cloud import aiplatform_v1

# Set variables for the current deployed index.
API_ENDPOINT="1357861364.asia-southeast1-777458322107.vdb.vertexai.goog"

# INDEX_ENDPOINT = my_index_endpoint_id 
INDEX_ENDPOINT = f"projects/{PROJECT_NUMBER}/locations/{LOCATION}/indexEndpoints/80712949571780608"

# DEPLOYED_INDEX_ID="vs_feature_deployed_02290700"
neighbor_count = 10


# API_ENDPOINT="393815653.asia-southeast1-255766800726.vdb.vertexai.goog"
# INDEX_ENDPOINT="projects/255766800726/locations/asia-southeast1/indexEndpoints/80712949571780608"
# DEPLOYED_INDEX_ID="vs_feature_deployed_03040407"


# Set variables for the current deployed index.
API_ENDPOINT="741446896.asia-southeast1-255766800726.vdb.vertexai.goog"
INDEX_ENDPOINT="projects/255766800726/locations/asia-southeast1/indexEndpoints/6015823939748495360"
DEPLOYED_INDEX_ID="deployed_index_id_unique"

In [52]:
# %pip install -U google-cloud-aiplatform
import pandas as pd

In [54]:
# Configure Vector Search client
client_options = {
  "api_endpoint": API_ENDPOINT
}
vector_search_client = aiplatform_v1.MatchServiceClient(
  client_options=client_options,
)

# Build FindNeighborsRequest object
datapoint = aiplatform_v1.IndexDatapoint(
  feature_vector=test_embeddings[0]
)
query = aiplatform_v1.FindNeighborsRequest.Query(
  datapoint=datapoint,
  # The number of nearest neighbors to be retrieved
  neighbor_count=neighbor_count
)

# print('query', query)

request = aiplatform_v1.FindNeighborsRequest(
  index_endpoint=INDEX_ENDPOINT,
  deployed_index_id=DEPLOYED_INDEX_ID,
  # Request can have multiple queries
  queries=[query],
  return_full_datapoint=True,
)

# Execute the request
response = vector_search_client.find_neighbors(request)

# Handle the response
print(response)

df_new = pd.DataFrame()
print('neighbor_count', neighbor_count)
for i in range(0,neighbor_count):
    x=response.nearest_neighbors[0]
    print('x',x)
    print('id',x.neighbors[i].datapoint.datapoint_id, 'type', type(x.neighbors[i].datapoint.datapoint_id), 'distance',x.neighbors[i].distance)
    
    df_match = df.loc[df['id'] == int(x.neighbors[i].datapoint.datapoint_id) ]

    # Append the matching rows to the new DataFrame
    df_new = pd.concat([df_new, df_match])

# Print the new DataFrame
print(df_new)

nearest_neighbors {
}

neighbor_count 10
x 


IndexError: list index (0) out of range

In [None]:
test_embeddings_2 = [test_embeddings, test_embeddings]

In [55]:
def vector_search_find_neighbors(
    project: str,
    location: str,
    index_endpoint_name: str,
    deployed_index_id: str,
    queries,
    num_neighbors: int,
) -> None:
    """Query the vector search index.

    Args:
        project (str): Required. Project ID
        location (str): Required. The region name
        index_endpoint_name (str): Required. Index endpoint to run the query
        against.
        deployed_index_id (str): Required. The ID of the DeployedIndex to run
        the queries against.
        queries (List[List[float]]): Required. A list of queries. Each query is
        a list of floats, representing a single embedding.
        num_neighbors (int): Required. The number of neighbors to return.
    """
    # Initialize the Vertex AI client
    aiplatform.init(project=project, location=location)

    # Create the index endpoint instance from an existing endpoint.
    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
        index_endpoint_name=index_endpoint_name
    )

    # Query the index endpoint for the nearest neighbors.
    resp = my_index_endpoint.find_neighbors(
        deployed_index_id=deployed_index_id,
        queries=queries,
        num_neighbors=num_neighbors,
    )
    print(resp)

    
# vector_search_find_neighbors(
#     PROJECT_ID,
#     LOCATION,
#     API_ENDPOINT,
#     INDEX_ENDPOINT,
#     test_embeddings[0],
#     10,
# ) 

from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Create the index endpoint instance from an existing endpoint.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name=INDEX_ENDPOINT
)

    
resp = my_index_endpoint.find_neighbors(
    deployed_index_id="vs_feature_deployed_03040407",
    queries=test_embeddings_2,
    num_neighbors=3,
)

resp

[[], []]

In [None]:
def get_id_with_embedding_matching(test_embeddings) :
    
    datapoint = aiplatform_v1.IndexDatapoint(
      feature_vector=test_embeddings
    )
    query = aiplatform_v1.FindNeighborsRequest.Query(
      datapoint=datapoint,
      # The number of nearest neighbors to be retrieved
      neighbor_count=neighbor_count
    )
    request = aiplatform_v1.FindNeighborsRequest(
      index_endpoint=INDEX_ENDPOINT,
      deployed_index_id=DEPLOYED_INDEX_ID,
      # Request can have multiple queries
      queries=[query],
      return_full_datapoint=False,
    )

    # Execute the request
    response = vector_search_client.find_neighbors(request)
    
    df_new = pd.DataFrame()

    for i in range(0,neighbor_count):
        x=response.nearest_neighbors[0]
        # print('id',x.neighbors[i].datapoint.datapoint_id, 'distance',x.neighbors[i].distance)

        df_match = df.loc[df['id'] == int(x.neighbors[i].datapoint.datapoint_id) ]

        # Append the matching rows to the new DataFrame
        df_new = pd.concat([df_new, df_match])

    # Print the new DataFrame
    # print(df_new)
    
    i,j,k = df_new.index[0:3]
    print(i,j,k)
    
    pagewise_texts_v1 = df_new.loc[i, 'pagewise_texts']
    pagewise_texts_v2 = df_new.loc[j, 'pagewise_texts']
    pagewise_texts_v3 = df_new.loc[k, 'pagewise_texts']
    
    splitted_texts_v1 = df_new.loc[i, 'splitted_texts']
    splitted_texts_v2 = df_new.loc[j, 'pagewise_texts']
    splitted_texts_v3 = df_new.loc[k, 'pagewise_texts']
    
    splitted_texts_chunks_v1 = df_new.loc[i, 'splitted_texts_chunks']
    splitted_texts_chunks_v2 = df_new.loc[j, 'splitted_texts_chunks']
    splitted_texts_chunks_v3 = df_new.loc[k, 'splitted_texts_chunks']
    
    page_id_v1 = df_new.loc[i, 'page_id'] 
    page_id_v2 = df_new.loc[j, 'page_id'] 
    page_id_v3 = df_new.loc[k, 'page_id'] 
    
    return(pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,
           splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,
           splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,
        page_id_v1,page_id_v2,page_id_v3,i,j,k)

In [None]:
import pandas as pd
filename = "./harry_potte_qa.csv"
df_qa = pd.read_csv(filename, sep ="|")

df_qa.head()

In [None]:
df_qa.columns

In [None]:
# for i in range(0, len(df_qa)):
#     df_qa.loc[i, "Question_emb"] = embeddings.embed_query( df_qa.loc[i, "Question"])
#     # print("preview embeddings",test_embeddings[0:2])
    
import csv
import csv

with open('harry_potte_qa.csv', 'r') as input_file, open('harry_potte_qa_output.csv', 'w', newline='') as output_file:

  # Create CSV reader and writer objects
  reader = csv.reader(input_file, delimiter='|')
  writer = csv.writer(output_file, delimiter='|')

  # Read and write the header row
  header = next(reader) + ['i','j','k','pagewise_texts_v1','pagewise_texts_v2','pagewise_texts_v3','splitted_texts_v1','splitted_texts_v2','splitted_texts_v3','splitted_texts_chunks_v1','splitted_texts_chunks_v2','splitted_texts_chunks_v3','page_id_v1','page_id_v2','page_id_v3']
  writer.writerow(header)

  # Loop through the remaining rows
  for i, row in enumerate(reader):
    question = row[0].split('|')[0]  # Use 'i' to access the correct element in the row
    question_emb = embeddings.embed_query( question )
    pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,page_id_v1,page_id_v2,page_id_v3,i,j,k = get_id_with_embedding_matching(question_emb) 
    
    # print( i , question)
    row_out = row + [i,j,k,pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,page_id_v1,page_id_v2,page_id_v3]
    
    # Write the row to the output file
    writer.writerow(row_out)
get_id_with_embedding_matching
# Usage example:
! head -n 2 harry_potte_qa_output.csv

In [None]:
import pandas as pd
filename = "./harry_potte_qa_output.csv"
df_qa = pd.read_csv(filename, sep ="|")

df_qa.head()

### Installation of required libs for Gemini and PaLM


In [None]:
# !pip install streamlit

In [None]:
!pip install --upgrade google-cloud-aiplatform

In [None]:
!pip install --upgrade google-cloud-aiplatform
GCP_PROJECT= PROJECT_ID=PROJECT_ID=project_id
LOCATION = REGION = 'asia-southeast1'

### Vertex AI setup

In [None]:
### Defining PaLM Functions

In [None]:
import os

# import streamlit as st
import vertexai
from vertexai.preview.language_models import TextGenerationModel

vertexai.init(project=PROJECT_ID, location=LOCATION)


# @st.cache_resource
def get_model():
    generation_model = TextGenerationModel.from_pretrained("text-bison@002")
    return generation_model


def get_text_generation(prompt="", **parameters):
    generation_model = get_model()
    response = generation_model.predict(prompt=prompt, **parameters)

    return response.text

### Defining Gemini Functions

In [None]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part

# input_prompt = """can you give me details of paracetamol"""

def generate(input_prompt):
    model = GenerativeModel("gemini-ultra")
    responses = model.generate_content(
        input_prompt ,
    generation_config={
        "max_output_tokens": 2048,
        "temperature": 0.2,
        "top_p": 1,
        "top_k": 32
    },
        safety_settings=[],
        stream=True,
    )
    
    all_response  = []
    
    for response in responses:
        # print(response.text, end="")
        all_response.append(response.text)
    
    # print (all_response)
    
    return(" ".join(all_response))
    

def generate_pro(input_prompt):
    model = GenerativeModel("gemini-pro")
    responses = model.generate_content(
    input_prompt,
    generation_config={
        "max_output_tokens": 2048,
        "temperature": 0.2,
        "top_p": 1
    },stream=True,)
    
    all_response  = []
    
    for response in responses:
        all_response.append(response.text)
    
    # print (all_response)
    
    return(" ".join(all_response))


In [None]:
import vertexai
from vertexai.language_models import TextGenerationModel

vertexai.init(project=PROJECT_ID, location=LOCATION)
parameters = {
    "candidate_count": 1,
    "max_output_tokens": 1024,
    "temperature": 1,
    "top_k": 40
}

def generate_palm_unicorn_v1(input_prompt):
    
    model = TextGenerationModel.from_pretrained("text-unicorn@001")

    response = model.predict(
        input_prompt,
        **parameters
    )
    print(f"Response from Model: {response.text}")
    
    return(response.text)

def generate_palm_bison32k(input_prompt):
    
    model = TextGenerationModel.from_pretrained("text-bison-32k")

    response = model.predict(
        input_prompt,
        **parameters
    )
    print(f"Response from Model: {response.text}")
    
    return(response.text)


### Read the Q&A file

#### This uses the file from Matching Engine which has questions and retrieved document results

In [None]:
import pandas as pd
filename = "./harry_potte_qa_output.csv"
df_qa = pd.read_csv(filename, sep ="|")

# print(df_qa.head(1))
System_Prompts = """ You are an expert in reading harry potter books, but only provide evidences from the information provide and do not use an other information
so here are some search results : 
"""

Question_Prompts = """ -- Based on information above help to answer following user question
"""

df_qa['combine_prompt_RAG1'] = System_Prompts + ' ' +df_qa['pagewise_texts_v1'] + ' Please answers the Question : '+ df_qa['Question'] 
df_qa['combine_prompt_RAG2'] = System_Prompts + ' ' +df_qa['pagewise_texts_v2'] + ' Please answers the Questio : '+ df_qa['Question'] 
df_qa['combine_prompt_RAG3'] = System_Prompts + ' ' +df_qa['pagewise_texts_v3'] + ' Please answers the Question : '+ df_qa['Question'] 


# print(df['System Prompts'], df['RAG Results'] ,df['User Question'] )
# print(selected_column[0])

In [None]:
import re

In [None]:
for i in range(0, len(df_qa)):


    clean_text1 = re.sub(r'[^\w\s;]', '', df_qa.loc[i,'combine_prompt_RAG1'])
    clean_text2 = re.sub(r'[^\w\s;]', '', df_qa.loc[i,'combine_prompt_RAG2'])
    clean_text3 = re.sub(r'[^\w\s;]', '', df_qa.loc[i,'combine_prompt_RAG3'])

    if i<=1000:
        # df['Gemini_ultra_model_output'][i] = generate(df['combine_prompt'][i])
        print("iteration #", i, "test")
        if i==32 : 
            print("iteration #", i, "test", clean_text1, clean_text2, clean_text3)
    
    try:
        df_qa.loc[i, "Gemini_pro_model_output_v1"] = generate_pro(clean_text1)
        df_qa.loc[i, "Gemini_pro_model_output_v2"] = generate_pro(clean_text2)
        df_qa.loc[i, "Gemini_pro_model_output_v3"] = generate_pro(clean_text3)
    except :
        print("Prompt error at gemini ", i)
        df_qa.loc[i, "Gemini_pro_model_output_v1"] = "Prompt failed "
        df_qa.loc[i, "Gemini_pro_model_output_v2"] = "Prompt failed "
        df_qa.loc[i, "Gemini_pro_model_output_v3"] = "Prompt failed "

    try:
        df_qa.loc[i, "palm_bison32k_output_v1"] = generate_palm_bison32k(df_qa.loc[i,'combine_prompt_RAG1'])
        df_qa.loc[i, "palm_bison32k_output_v2"] = generate_palm_bison32k(df_qa.loc[i,'combine_prompt_RAG2'])
        df_qa.loc[i, "palm_bison32k_output_v3"] = generate_palm_bison32k(df_qa.loc[i,'combine_prompt_RAG3'])
    except ZeroDivisionError:
        print("Prompt error at palm ", i)
        df_qa.loc[i, "palm_bison32k_output_v1"] = "Prompt failed "
        df_qa.loc[i, "palm_bison32k_output_v2"] = "Prompt failed "
        df_qa.loc[i, "palm_bison32k_output_v3"] = "Prompt failed "
    

# generate_medllms_v1(input_prompt)
# generate_palm_unicorn_v1(input_prompt)
# input_prompt = "What are the symptoms of influenza?" 
# generate_medlpalm(input_prompt)    
    
# print( "/n output here ::" , df['Gemini_ultra_model_output'][i])
# df = df.assign(Gemini_ultra_model_output=generate(df.combine_prompt))
# df['combine_prompt'].head(3)

# df['Gemini_ultra_model_output'].head(3)


In [None]:
df_qa.columns

In [None]:

# Delete the 'col2' column
df_qa = df_qa.drop('combine_prompt_RAG1', axis=1)
df_qa = df_qa.drop('combine_prompt_RAG2', axis=1)
df_qa = df_qa.drop('combine_prompt_RAG3', axis=1)

output1 = "./results/harry_potte_qa_model_out.csv"

df_qa.to_csv(output1)


In [None]:
df_qa.head(1)

#### As we can see in the output above the poor search setup give a bad response 