#### This Notebook use Doc AI and converts all PDF document to text so that down the line process can work

#### Author: Saurabh Mangal (saurabhmangal@google.com)
#### Editor / Reviewer: Wan Qi, Jing Le
##### Date: 21st Feb
##### Description: This notebook contains part 1 of lab

 Copyright (c) [2024] [saurabhmangal@] -- 
 This notebook is licensed under the Commercial License.

In [208]:
!pip install --quiet google-cloud-discoveryengine
!pip install --upgrade --quiet google-cloud-storage

In [6]:
#Please enter your name/initials (no spaces or special characters allowed), ensure that it is unique
UNIQUE_PREFIX="jingletest"

In [174]:
import re

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(re.search(r'\d+', SVC_ACC).group())

LOCATION="asia-southeast1"

In [211]:
from google.cloud import storage

client = storage.Client()

GCS_BUCKET_LOCATION = "asia-southeast1"

GCS_BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}"
GCS_BUCKET_URI = f"gs://{GCS_BUCKET_NAME}"

bucket = storage.Bucket(client, GCS_BUCKET_NAME)

if bucket.exists()==False:
    # Create a Cloud Storage Bucket
    !gcloud storage buckets create $GCS_BUCKET_URI --location=$GCS_BUCKET_LOCATION

    # Upload the PDFs located in the books/ directory into the GCS bucket that you created
    !gsutil cp -r ./books/* $GCS_BUCKET_URI

    # Verify that all Books 1 to 7 are uploaded to the GCS bucket (8 files in total, 2 for Part 1)
    !gsutil ls $GCS_BUCKET_URI
else:
    print(f"{GCS_BUCKET_NAME} already exists. Please proceed as usual.")

jingle-project-414801-jingletest already exists. Please proceed as usual.


In [8]:
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine


#Must specify either `gcs_uri` or (`bigquery_dataset` and `bigquery_table`)
#Format: `gs://bucket/directory/object.json` or `gs://bucket/directory/*.json`
gcs_uri = GCS_BUCKET_URI


def import_documents_sample(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: Optional[str] = None,
    bigquery_dataset: Optional[str] = None,
    bigquery_table: Optional[str] = None,
) -> str:
    #  For more information, refer to:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Create a client
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    if gcs_uri:
        request = discoveryengine.ImportDocumentsRequest(
            parent=parent,
            gcs_source=discoveryengine.GcsSource(
                input_uris=[gcs_uri], data_schema="custom"
            ),
            # Options: `FULL`, `INCREMENTAL`
            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
        )
    else:
        request = discoveryengine.ImportDocumentsRequest(
            parent=parent,
            bigquery_source=discoveryengine.BigQuerySource(
                project_id=project_id,
                dataset_id=bigquery_dataset,
                table_id=bigquery_table,
                data_schema="custom",
            ),
            # Options: `FULL`, `INCREMENTAL`
            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
        )

    # Make the request
    operation = client.import_documents(request=request)

    print(f"Waiting for operation to complete: {operation.operation.name}")
    response = operation.result()

    # Once the operation is complete,
    # get information from operation metadata
    metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

    # Handle the response
    print(response)
    print(metadata)

    return operation.operation.name


#### We try both open source pdf option as well as DOC AI 

In [9]:
# !pip install PyMuPDF Pillow pytesseract

In [10]:
# !pip install langchain

In [11]:
%pip install PyPDF2
%pip install pdfreader

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [12]:
from pdfreader import PDFDocument, SimplePDFViewer

In [13]:
# Load the PDF document
pdf_file = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"


In [14]:
fd = open(pdf_file, "rb")
doc = PDFDocument(fd)

In [15]:
from io import BytesIO
with open(pdf_file, "rb") as f:
    stream = BytesIO(f.read())
doc2 = PDFDocument(stream)

In [16]:
doc2

<pdfreader.document.PDFDocument at 0x7f741652ac20>

In [17]:
page_one = next(doc.pages())

In [18]:
# Extract text from the page
page_one.Contents

<Stream:len=3699,data=b'x\x9c\xdd]K\x8b\xe4\xc8\x11\xbe\xf7\xaf\xd0\xd9`\xad\xdeR\x811\xcc\xf4\xce\x18| ...'>

In [19]:
all_pages = [p for p in doc.pages()]
len(all_pages)

15

### Doc ai - Easiest and faster solution for all PDF documents 

In [20]:
!pip3 install --upgrade google-cloud-documentai
!pip3 install --upgrade google-cloud-storage
!pip3 install --upgrade google-cloud-documentai-toolbox



In [21]:
!ls

'$BUCKET_URI'				      my_video_harry_music_v1.mp4
 Imagen_on_questions.ipynb		     'part1_reading pdf.ipynb'
 backup_folder				     'part2_reading pdf.ipynb'
 books					      part3_RAG_VectorSearch.ipynb
 chkpt.mp3				      part4_QA_Gemini.ipynb
 df_exploded_2.csv			      part5_RAG_VertexSearch.ipynb
 harry_potte_qa.csv			      part6_Gemin_on_video.ipynb
 harry_potte_qa_output.csv		      questions.json
 harry_potter_RAG_Answers_VertexSearch.csv    questions_test.json
 harry_potter_qa_model_RAG_VertexSearch.csv   requirement2.txt
 images					      requirements.txt
 matchingengine				      results
 my_video.avi				      stable_diffusion_2_1.ipynb
 my_video_harry.mp4			      web-app
 my_video_harry_music.mp4


In [22]:

!mkdir ./books
!mkdir ./matchingengine
!mkdir ./matchingengine/embeddings

rm: cannot remove './books': Is a directory
mkdir: cannot create directory ‘./books’: File exists
mkdir: cannot create directory ‘./matchingengine’: File exists
mkdir: cannot create directory ‘./matchingengine/embeddings’: File exists


### Define helper functions

In [23]:
# Function to create Document AI Processor
def create_processor(project_id, location, processor_display_name, processor_type):
    # You must set the api_endpoint if you use a location other than 'us'.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the location
    # e.g.: projects/project_id/locations/location
    parent = client.common_location_path(project_id, location)

    # Create a processor
    processor = client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_=processor_type
        ),
    )
    
    processor_id = processor.name.split('/')[-1]

    # Print the processor information
    print(f"Processor Name: {processor.name}")
    print(f"Processor Display Name: {processor.display_name}")
    print(f"Processor ID: {processor_id}")
    print(f"Processor Type: {processor.type_}")
    
    
    return processor, processor_id

#Function to retrieve list of existing processors
def list_processors(project_id: str, location: str) -> None:
    processorID_list=[]
    
    # You must set the api_endpoint if you use a location other than 'us'.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the location
    # e.g.: projects/project_id/locations/location
    parent = client.common_location_path(PROJECT_ID, LOCATION)

    # Make ListProcessors request
    processor_list = client.list_processors(parent=parent)

    # Print the processor information
    for processor in processor_list:
        processor_id = processor.name.split('/')[-1]
        processorID_list.append(processor_id)
        
    return processorID_list

### Import Document AI libraries and set variables


In [24]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

GCP_PROJECT = PROJECT_ID #'cloud-llm-preview1'
GCP_REGION='asia-southeast1'

# Variables for Document AI OCR Processor
PROCESSOR_DISPLAY_NAME = UNIQUE_PREFIX + '-ocr-processor' # Must be unique per project, e.g.: 'My Processor'
PROCESSOR_TYPE = 'OCR_PROCESSOR' # Use fetch_processor_types to get available processor types

### Create Document AI Document OCR Processor


In [29]:
processorList=list_processors(PROJECT_ID,LOCATION)

#Ensure that only 1 processor can be created & used under this project
if processorList==None:
    PROCESSOR, PROCESSOR_ID = create_processor(PROJECT_ID, LOCATION,PROCESSOR_DISPLAY_NAME, PROCESSOR_TYPE)
else:
    PROCESSOR_ID=processorList[0]
    
print(PROCESSOR_ID)

f1ade36c4f061463


In [26]:
%pip install --upgrade --quiet  google-cloud-documentai
%pip install --upgrade --quiet  google-cloud-documentai-toolbox

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [33]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai


# PROJECT_ID = "YOUR_PROJECT_ID"
GCP_PROJECT= PROJECT_ID #'cloud-llm-preview1'
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = PROCESSOR_ID  # Create processor in Cloud Console
GCP_REGION="asia-southeast1"

# The local file in your current working directory
FILE_PATH = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types
MIME_TYPE = "application/pdf"

# Instantiates a client
docai_client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
)

# The full resource name of the processor, e.g.:
# projects/project-id/locations/location/processor/processor-id
# You must create new processors in the Cloud Console first
RESOURCE_NAME = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)

# Read the file into memory
with open(FILE_PATH, "rb") as image:
    image_content = image.read()

# Load Binary Data into Document AI RawDocument Object
raw_document = documentai.RawDocument(content=image_content, mime_type=MIME_TYPE)

# Configure the process request
request = documentai.ProcessRequest(name=RESOURCE_NAME, raw_document=raw_document)

# Use the Document AI client to process the sample form
result = docai_client.process_document(request=request)

document_object = result.document
print("Document processing complete.")
print(f"Text: {document_object.text}")

page_text =document_object.text

Document processing complete.
Text: CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last people
you'd expect to be involved in anything strange or mysterious, because they just
didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did have a
very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the
usual amount of neck, which came in very useful as she spent so much of her
time craning over garden fences, spying on the neighbors. The Dursleys had a
small son called Dudley and in their opinion there was no finer boy anywhere.
The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't think they
could bear it if anyone found out about the Potters. Mrs. Potte

### Running though the batch mode for procssing the full Books

In [31]:
import re
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai  # type: ignore
from google.cloud import storage

def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    gcs_input_uri: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    gcs_input_prefix: Optional[str] = None,
    field_mask: Optional[str] = None,
    timeout: int = 40000000,
) -> None:
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if gcs_input_uri:
        # Specify specific GCS URIs to process individual documents
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        # Load GCS Input URI into a List of document files
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
    else:
        # Specify a GCS URI Prefix to process an entire directory
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Cloud Storage URI for the Output Directory
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Continually polls the operation until it is complete.
    # This could take some time for larger files
    # Format: projects/{project_id}/locations/{location}/operations/{operation_id}
    try:
        print(f"Waiting for operation {operation.operation.name} to complete...")
        operation.result(timeout=timeout)
    # Catch exception when operation doesn't finish before timeout
    except (RetryError, InternalServerError) as e:
        print(e.message)

    # Once the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    print("Output files:")
    # One process per Input Document
    for process in list(metadata.individual_process_statuses):
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS
            if blob.content_type != "application/json":
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document
            
            # Read the text recognition output from the processor 
            print("The document contains the following text:")
            print(document.text)
            
        return(document.text)


In [32]:
import asyncio
import time
import re

async def my_async_function():
    # Your asynchronous code here
    await asyncio.sleep(5)  # Placeholder for some asynchronous task
    print("Async function completed")

# Trigger the function asynchronously
async def trigger_async_function():
    await my_async_function()

asyncio.create_task(trigger_async_function())


def save_text_to_file(text, filename):
    pattern = r".*/([^/.]+)\.pdf"

    # Extract the filename
    match = re.search(pattern, filename)
    if match:
        filename = match.group(1)
        print(filename + " has been processed successfully.\n")  # Output ex: Book4_The_Goblet_of_Fire
    else:
        print("No match found")
    
    filename_txt = './results/' + filename + ".txt"
    with open(filename_txt, 'w', encoding='utf-8') as f:
        f.write(text)

Async function completed


In [None]:
# PROJECT_ID = "YOUR_PROJECT_ID"
GCP_PROJECT= PROJECT_ID #'cloud-llm-preview1'
LOCATION = location = "us"  # Format is 'us' or 'eu'
processor_id=PROCESSOR_ID

# The local file in your current working directory
FILE_PATH = "./books/Book1_The_Sorcerers_Stone.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types

# TODO(developer): Uncomment these variables before running the sample.
gcs_output_uri = GCS_BUCKET_URI # Must end with a trailing slash `/`. Format: gs://bucket/directory/subdirectory/
processor_version_id = "" # Optional. Example: pretrained-ocr-v1.0-2020-09-23

# TODO(developer): You must specify either `gcs_input_uri` and `mime_type` or `gcs_input_prefix`
gcs_input_uri = f"{GCS_BUCKET_URI}/Book1_The_Sorcerers_Stone.pdf" # Format: gs://bucket/directory/file.pdf
MIME_TYPE = input_mime_type = "application/pdf"

gcs_input_prefix = f"{GCS_BUCKET_URI}/matchingengine/" # Format: gs://bucket/directory/
field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
timeout = 400000

book_list = [f"{GCS_BUCKET_URI}/Book1_The_Sorcerers_Stone.pdf",
             f"{GCS_BUCKET_URI}/Book2_The_Chamber_of_Secrets.pdf",
             f"{GCS_BUCKET_URI}/Book3_The_Prisoner_of_Azkaban.pdf",
             f"{GCS_BUCKET_URI}/Book4_The_Goblet_of_Fire.pdf",
             f"{GCS_BUCKET_URI}/Book5_The_Order_of_the_Phoenix.pdf",
             f"{GCS_BUCKET_URI}/Book6_The_HalfBlood_Prince.pdf",
             f"{GCS_BUCKET_URI}/Book7_The_Deathly_Hallows.pdf",]

for i in range(0,len(book_list)): 
    gcs_input_uri = book_list[i]
    print(gcs_input_uri + ":\n")
    
    page_text_batch = batch_process_documents(    project_id,
        location,
        processor_id,
        gcs_output_uri,
        None,
        gcs_input_uri,
        input_mime_type,
        gcs_input_prefix,
        field_mask,
        timeout)
    
    # Example usage: my_text = "This is the text from your OCR process."
    
    save_text_to_file(page_text_batch, gcs_input_uri)


    time.sleep(60)

gs://jingle-project-414801-jingletest/Book1_The_Sorcerers_Stone.pdf:

Waiting for operation projects/777458322107/locations/us/operations/5535580615225040522 to complete...


#### Here is the documenation for DOC ai https://cloud.google.com/document-ai/docs/samples/documentai-batch-process-document?hl=en


### Setting up embeddings API and testing

In [1]:
# %pip install google-cloud-vertex-ai
%pip install --upgrade google-cloud-aiplatform -q
%pip install tqdm -q
%pip install langchain -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [34]:
# init the vertexai package
import vertexai
LOCATION="us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [35]:
# Load the text embeddings model
from vertexai.preview.language_models import TextEmbeddingModel 
embedding_model ="textembedding-gecko@003"
model = TextEmbeddingModel.from_pretrained(embedding_model)

In [36]:
import time
import tqdm  # to show a progress bar
import os
# get embeddings for a list of texts
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs[0]

def text_embedding(text) -> list:
    """Text embedding with a Large Language Model."""
    model = TextEmbeddingModel.from_pretrained(embedding_model)
    embeddings = model.get_embeddings(text)
    for embedding in embeddings:
        vector = embedding.values
        print(f"Length of Embedding Vector: {len(vector)}")
    return vector

In [37]:
# get embeddings for the question titles and add them as "embedding" column
# df = get_embeddings_wrapper(page_text)
# print(df)

trail_text = [page_text]

text_embedding(trail_text)

Length of Embedding Vector: 768


[-0.007328844163566828,
 -0.029501991346478462,
 -0.035514626652002335,
 -0.016409659758210182,
 0.05693705379962921,
 0.012381366454064846,
 -0.01103196106851101,
 -0.000720357580576092,
 -0.004275532905012369,
 -0.0019124862737953663,
 0.003067644080147147,
 0.04406840726733208,
 0.009394695982336998,
 -0.008999423123896122,
 0.022097976878285408,
 0.016643699258565903,
 0.01131175085902214,
 0.04096732288599014,
 -0.007793920114636421,
 -0.07194190472364426,
 -0.02405644580721855,
 0.02122521586716175,
 -0.024772852659225464,
 0.00952995102852583,
 -0.00817031878978014,
 -0.06946851313114166,
 0.04419953376054764,
 -0.07594290375709534,
 0.00040734646609053016,
 0.0008518076501786709,
 -0.021139085292816162,
 0.028713060542941093,
 -0.03671541064977646,
 -0.017625100910663605,
 -0.017690062522888184,
 -0.05329732969403267,
 0.036659881472587585,
 0.05227610468864441,
 -0.024477336555719376,
 0.028329545632004738,
 0.020053138956427574,
 -0.04024481028318405,
 -0.008188542909920216,


In [38]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    separators= ["/,", "##", ">", "-"],#'\n\n', '\n'],
    length_function=len,
    is_separator_regex=False,
)

splitted_texts = text_splitter.create_documents([page_text])
print("one \n",splitted_texts[0])
print("two \n",splitted_texts[1])

one 
 page_content="CHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last people\nyou'd expect to be involved in anything strange or mysterious, because they just\ndidn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did have a\nvery large mustache. Mrs. Dursley was thin and blonde and had nearly twice the\nusual amount of neck, which came in very useful as she spent so much of her\ntime craning over garden fences, spying on the neighbors. The Dursleys had a\nsmall son called Dudley and in their opinion there was no finer boy anywhere.\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn't think they\ncould bear it if anyone found out about the Potters. Mrs. Potter 

In [39]:
splitted_texts_list = text_splitter.split_text(page_text)#[:2]

In [40]:
import pandas as pd

# Create a DataFrame from the splitted texts
df = pd.DataFrame({'splitted_texts': splitted_texts_list})

# Add a row number column
df['id'] = df.index + 1

# Print the DataFrame
print(df)

                                       splitted_texts  id
0   CHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. D...   1
1                                                -for   2
2   -nothing\nhusband were as unDursleyish as it w...   3
3   -bye but missed, because\nDudley was now havin...   4
4                                                   -   5
..                                                ...  ..
65  - how could he have forgotten? Harry got slowl...  66
66  -six," he said, looking up at his mother and f...  67
67  -seven then," said Dudley, going red in the fa...  68
68  -nine, sweetums," said Aunt Petunia.\n"Oh." Du...  69
69                -her-name, your friend\n_\nYvonne?"  70

[70 rows x 2 columns]


In [41]:
get_embeddings_wrapper(["hello","apple"])
# print(x)


100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


[0.017048263922333717,
 -0.009294979274272919,
 -0.011405007913708687,
 -0.04494941607117653,
 0.03010178543627262,
 -0.002902369014918804,
 0.017613200470805168,
 -0.007482072338461876,
 0.007690255995839834,
 0.017567314207553864,
 0.02621765062212944,
 0.034840889275074005,
 -0.046389926224946976,
 -0.06721099466085434,
 -0.00016969171701930463,
 -0.0008868438890203834,
 0.0009989439276978374,
 0.02182297222316265,
 -0.0010395842837169766,
 -0.029905909672379494,
 0.02649543620646,
 0.024655284360051155,
 -0.024191144853830338,
 -0.0031523345969617367,
 0.019972553476691246,
 -0.019207755103707314,
 0.016916699707508087,
 -0.04727542772889137,
 -0.016310982406139374,
 0.05416519194841385,
 -0.04295298829674721,
 0.014468264766037464,
 -0.04375581070780754,
 0.023770900443196297,
 0.04904501140117645,
 -0.06196769326925278,
 0.02642875351011753,
 0.01772790215909481,
 -0.002258715918287635,
 0.01409941166639328,
 0.005793617572635412,
 -0.069440558552742,
 -0.045242372900247574,
 -0.

In [42]:
list(df.splitted_texts)

["CHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last people\nyou'd expect to be involved in anything strange or mysterious, because they just\ndidn't hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did have a\nvery large mustache. Mrs. Dursley was thin and blonde and had nearly twice the\nusual amount of neck, which came in very useful as she spent so much of her\ntime craning over garden fences, spying on the neighbors. The Dursleys had a\nsmall son called Dudley and in their opinion there was no finer boy anywhere.\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn't think they\ncould bear it if anyone found out about the Potters. Mrs. Potter was Mrs.\nDursley'

Part 2

In [39]:
!pip install --upgrade --quiet  langchain-google-genai

In [43]:
project_id = PROJECT_ID
location = 'us' 
processor_id = PROCESSOR_ID
processor_version = 'rc' 
# file_path = "filepath.pdf" 
# mime_type = 'application/pdf'

GCP_PROJECT= PROJECT_ID
LOCATION = location = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = processor_id  # Create processor in Cloud Console
FILE_PATH =file_path = pdf_file = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"

# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types
MIME_TYPE =mime_type = "application/pdf"

In [44]:
def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    A function to process a document online using Google Document AI.
    """

    # Define an options dictionary, which includes the API's URL. This is used to connect to Google's Document AI service
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Create a Document AI client, think of it as our bridge for communicating with Google's services
    documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Generate the complete name of the processor
    # You need to first create a processor in the Google Cloud console
    resource_name = documentai_client.processor_path(project_id, location, processor_id)

    # Read in the document you want to analyze (like an image or PDF), and store it in the variable image_content
    with open(file_path, "rb") as image:
        image_content = image.read()

        # Convert the read document into a format that Google Document AI can understand, i.e., a RawDocument object
        raw_document = documentai.RawDocument(
            content=image_content, mime_type=mime_type
        )
        # Create a request, which includes the name of the processor and the document we want to analyze
        request = documentai.ProcessRequest(
            name=resource_name, raw_document=raw_document
        )
        # Send our request and receive the analysis results
        result = documentai_client.process_document(request=request)
        
        print("Document processing complete.")
        # print(f"Text: {document_object.text}")
        
        # Return this analysis result
        return result.document

In [45]:
def trim_text(text: str): 
    """ Removes spaces and newline characters. """ 
    return text.strip().replace("\n", " ")

### https://cloud.google.com/document-ai/docs/handle-response

In [46]:
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:
    print(f"    Width: {str(dimension.width)}")
    print(f"    Height: {str(dimension.height)}")


def print_detected_langauges(
    detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],
) -> None:
    print("    Detected languages:")
    for lang in detected_languages:
        print(f"        {lang.language_code} ({lang.confidence:.1%} confidence)")


def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:
    print(f"    {len(blocks)} blocks detected:")
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f"        First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f"        Last text block: {repr(last_block_text)}")


def print_paragraphs(
    paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str
) -> None:
    print(f"    {len(paragraphs)} paragraphs detected:")
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f"        First paragraph text: {repr(first_paragraph_text)}")

    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f"        Last paragraph text: {repr(last_paragraph_text)}")


def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:
    print(f"    {len(lines)} lines detected:")
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f"        First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f"        Last line text: {repr(last_line_text)}")


def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:
    print(f"    {len(tokens)} tokens detected:")
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f"        First token text: {repr(first_token_text)}")
    print(f"        First token break type: {repr(first_token_break_type)}")
    if tokens[0].style_info:
        print_style_info(tokens[0].style_info)

    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f"        Last token text: {repr(last_token_text)}")
    print(f"        Last token break type: {repr(last_token_break_type)}")
    if tokens[-1].style_info:
        print_style_info(tokens[-1].style_info)


def print_symbols(
    symbols: Sequence[documentai.Document.Page.Symbol], text: str
) -> None:
    print(f"    {len(symbols)} symbols detected:")
    first_symbol_text = layout_to_text(symbols[0].layout, text)
    print(f"        First symbol text: {repr(first_symbol_text)}")
    last_symbol_text = layout_to_text(symbols[-1].layout, text)
    print(f"        Last symbol text: {repr(last_symbol_text)}")


def print_image_quality_scores(
    image_quality_scores: documentai.Document.Page.ImageQualityScores,
) -> None:
    print(f"    Quality score: {image_quality_scores.quality_score:.1%}")
    print("    Detected defects:")

    for detected_defect in image_quality_scores.detected_defects:
        print(f"        {detected_defect.type_}: {detected_defect.confidence:.1%}")


def print_style_info(style_info: documentai.Document.Page.Token.StyleInfo) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    print(f"           Font Size: {style_info.font_size}pt")
    print(f"           Font Type: {style_info.font_type}")
    print(f"           Bold: {style_info.bold}")
    print(f"           Italic: {style_info.italic}")
    print(f"           Underlined: {style_info.underlined}")
    print(f"           Handwritten: {style_info.handwritten}")
    print(
        f"           Text Color (RGBa): {style_info.text_color.red}, {style_info.text_color.green}, {style_info.text_color.blue}, {style_info.text_color.alpha}"
    )


def print_visual_elements(
    visual_elements: Sequence[documentai.Document.Page.VisualElement], text: str
) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    checkboxes = [x for x in visual_elements if "checkbox" in x.type]
    math_symbols = [x for x in visual_elements if x.type == "math_formula"]

    if checkboxes:
        print(f"    {len(checkboxes)} checkboxes detected:")
        print(f"        First checkbox: {repr(checkboxes[0].type)}")
        print(f"        Last checkbox: {repr(checkboxes[-1].type)}")

    if math_symbols:
        print(f"    {len(math_symbols)} math symbols detected:")
        first_math_symbol_text = layout_to_text(math_symbols[0].layout, text)
        print(f"        First math symbol: {repr(first_math_symbol_text)}")


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


In [47]:
document = online_process(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    file_path=file_path,
    mime_type=mime_type,
)

names = []
name_confidence = []
values = []
value_confidence = []

Document processing complete.


## Using langchain for Doc AI on GCP

In [48]:
%pip install --upgrade --quiet  google-cloud-documentai
%pip install --upgrade --quiet  google-cloud-documentai-toolbox

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [49]:
GCS_OUTPUT_PATH= GCS_BUCKET_URI#"gs://tianhaoz-test/saurabh"

PROCESSOR_NAME = f'projects/{PROJECT_NUMBER}/locations/us/processors/{PROCESSOR_ID}'

In [50]:
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import DocAIParser

In [51]:
parser = DocAIParser(
    location="us", processor_name=PROCESSOR_NAME, gcs_output_path=GCS_OUTPUT_PATH
)

In [52]:
# pdf_file = "./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"

!gsutil cp ./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf "$GCS_BUCKET_URI"

Copying file://./books/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf [Content-Type=application/pdf]...
/ [1 files][112.1 KiB/112.1 KiB]                                                
Operation completed over 1 objects/112.1 KiB.                                    


In [53]:
blob = Blob(
    path=f"{GCS_BUCKET_URI}/Book1_HarryPotter_and_the_Sorcerers_Stone_pg15.pdf"
)

### This below would take a lot of time (about 2-3 mins for a small PDF)

In [54]:
docs = list(parser.lazy_parse(blob))
print(len(docs))

15


In [55]:
page_contents = [doc.page_content for doc in docs]

In [56]:
# docs[0].page_content
page_contents

['CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last people\nyou’d expect to be involved in anything strange or mysterious, because they just\ndidn’t hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did have a\nvery large mustache. Mrs. Dursley was thin and blonde and had nearly twice the\nusual amount of neck, which came in very useful as she spent so much of her\ntime craning over garden fences, spying on the neighbors. The Dursleys had a\nsmall\nson called Dudley and in their opinion there was no finer boy anywhere.\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn’t think they\ncould bear it if anyone found out about the Potters. Mrs. Potter was Mrs.\nDursle

In [57]:
import pandas as pd

# Create a DataFrame from the splitted texts
df = pd.DataFrame({'pagewise_texts': page_contents})

# Add a row number column
df['page_id'] = df.index + 1

# Print the DataFrame
print(df)

                                       pagewise_texts  page_id
0   CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...        1
1   something peculiar — a cat reading a map. For ...        2
2   “\n— yes, their son, Harry —”·\nMr. Dursley st...        3
3   learned a new word (“Won’t!”). Mr. Dursley tri...        4
4   agree.”\nHe didn’t say another word on the sub...        5
5   nearest street lamp went out with a little pop...        6
6   “It certainly seems so,” said Dumbledore. “We ...        7
7   Professor McGonagall’s voice trembled as she w...        8
8   see how much better off he’ll be, growing up a...        9
9   “Even if I could, I wouldn’t. Scars can come i...       10
10  happen. Harry Potter rolled over inside his bl...       11
11    HP 1 - Harry Potter and the\nSorcerer's Stone\n       12
12  CHAPTER TWO\nTHE VANISHING GLASS\nN early ten ...       13
13  pulling a spider off one of them, put them on....       14
14  “Darling, you haven’t counted Auntie Marge’s p...  

In [58]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size=100
chunk_overlap=20

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators= ["/,", "##", ">", "We"],#'\n\n', '\n'],
    length_function=len,
    is_separator_regex=False,
)

def split_text_chunks(text, chunk_size):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]


In [59]:
splitted_texts = []

for index, row in df.iterrows():
    splitted_texts += text_splitter.create_documents(row['pagewise_texts'])
    splitted_texts2 = text_splitter.split_text(row['pagewise_texts'])
    print(splitted_texts2)
    
    df['splitted_texts'] = df['pagewise_texts'].apply(lambda x: text_splitter.split_text(x))


['CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last people\nyou’d expect to be involved in anything strange or mysterious, because they just\ndidn’t hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did have a\nvery large mustache. Mrs. Dursley was thin and blonde and had nearly twice the\nusual amount of neck, which came in very useful as she spent so much of her\ntime craning over garden fences, spying on the neighbors. The Dursleys had a\nsmall\nson called Dudley and in their opinion there was no finer boy anywhere.\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn’t think they\ncould bear it if anyone found out about the Potters. Mrs. Potter was Mrs.\nDursle

In [60]:
df.head()

Unnamed: 0,pagewise_texts,page_id,splitted_texts
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,[CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs....
1,something peculiar — a cat reading a map. For ...,2,[something peculiar — a cat reading a map. For...
2,"“\n— yes, their son, Harry —”·\nMr. Dursley st...",3,"[“\n— yes, their son, Harry —”·\nMr. Dursley s..."
3,learned a new word (“Won’t!”). Mr. Dursley tri...,4,[learned a new word (“Won’t!”). Mr. Dursley tr...
4,agree.”\nHe didn’t say another word on the sub...,5,[agree.”\nHe didn’t say another word on the su...


In [61]:
df_exploded = df.explode('splitted_texts')


In [62]:
df_exploded.head()

Unnamed: 0,pagewise_texts,page_id,splitted_texts
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...
1,something peculiar — a cat reading a map. For ...,2,something peculiar — a cat reading a map. For ...
2,"“\n— yes, their son, Harry —”·\nMr. Dursley st...",3,"“\n— yes, their son, Harry —”·\nMr. Dursley st..."
3,learned a new word (“Won’t!”). Mr. Dursley tri...,4,learned a new word (“Won’t!”). Mr. Dursley tri...
3,learned a new word (“Won’t!”). Mr. Dursley tri...,4,"Well, Ted,” said the weatherman, “I don’t know..."


In [63]:
len(splitted_texts)

26857

In [64]:
df_exploded['splitted_texts_chunks'] = df_exploded['splitted_texts'].apply(lambda x: split_text_chunks(x,chunk_size))


In [65]:
df_exploded_2 = df_exploded.explode('splitted_texts_chunks')


In [66]:
df_exploded_2.head(10)

Unnamed: 0,pagewise_texts,page_id,splitted_texts,splitted_texts_chunks
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"\nthat they were perfectly normal, thank you v..."
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"involved in anything strange or mysterious, be..."
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,rsley was the director of a firm called Grunni...
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"ardly any neck, although he did have a\nvery l..."
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"nearly twice the\nusual amount of neck, which..."
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"raning over garden fences, spying on the neigh..."
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,n their opinion there was no finer boy anywher...
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"lso had a secret, and\ntheir greatest fear was..."
0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,y\ncould bear it if anyone found out about the...


In [67]:
df_exploded_2x = df_exploded_2.copy()

df_exploded_2x = df_exploded_2x.reindex()
df_exploded_2x = df_exploded_2x.reset_index()

df_exploded_2x['id'] = df_exploded_2x.index
df_exploded_2x['id'] = df_exploded_2x['id'].astype(str)

df_exploded_2 = df_exploded_2x.copy()

In [68]:
df_exploded_2.head()

Unnamed: 0,index,pagewise_texts,page_id,splitted_texts,splitted_texts_chunks,id
0,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,0
1,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"\nthat they were perfectly normal, thank you v...",1
2,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"involved in anything strange or mysterious, be...",2
3,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,rsley was the director of a firm called Grunni...,3
4,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"ardly any neck, although he did have a\nvery l...",4


In [69]:
%pip install --upgrade --quiet  langchain-core langchain-google-vertexai

Note: you may need to restart the kernel to use updated packages.


In [70]:
from langchain_google_vertexai import VertexAIEmbeddings

embeddings = VertexAIEmbeddings(model="models/embedding-003")

text = "This is a test document."

query_result = embeddings.embed_query(text)



#### Convert text columns to embeddings

### Add embedding to the splitted texts chunk column
### Convert the dataframe into json files required

In [71]:
# load the BQ Table into a Pandas Dataframe
import pandas as pd
from google.cloud import bigquery

QUESTIONS_SIZE = df_exploded_2.shape[0] #1000

bq_client = bigquery.Client(project=PROJECT_ID)
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions`
        where Score > 0 ORDER BY View_Count desc) AS q
        LIMIT {limit} ;
        """
query = QUERY_TEMPLATE.format(limit=QUESTIONS_SIZE)
query_job = bq_client.query(query)
rows = query_job.result()

In [72]:
len(df_exploded_2.splitted_texts_chunks)

343

In [73]:
df_exploded_2.head()

Unnamed: 0,index,pagewise_texts,page_id,splitted_texts,splitted_texts_chunks,id
0,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,0
1,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"\nthat they were perfectly normal, thank you v...",1
2,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"involved in anything strange or mysterious, be...",2
3,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,rsley was the director of a firm called Grunni...,3
4,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"ardly any neck, although he did have a\nvery l...",4


In [74]:
import time
import tqdm  # to show a progress bar
from vertexai.language_models import TextEmbeddingModel

# get embeddings for a list of texts
BATCH_SIZE = 5


model_ai="textembedding-gecko@003"

model = TextEmbeddingModel.from_pretrained(model_ai)

def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs
# The following code will get embedding for the question titles and add them as a new column embedding to the DataFrame. This will take a few minutes.

# get embeddings for the question titles and add them as "embedding" column
df = df_exploded_2.assign(embedding=get_embeddings_wrapper(list(df_exploded_2.splitted_texts_chunks)))
df.head()

100%|██████████| 69/69 [01:34<00:00,  1.37s/it]


Unnamed: 0,index,pagewise_texts,page_id,splitted_texts,splitted_texts_chunks,id,embedding
0,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,0,"[0.003317659953609109, -0.04790826886892319, -..."
1,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"\nthat they were perfectly normal, thank you v...",1,"[0.013395305722951889, -0.08007742464542389, -..."
2,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"involved in anything strange or mysterious, be...",2,"[0.020153993740677834, -0.03600302338600159, -..."
3,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,rsley was the director of a firm called Grunni...,3,"[0.016535211354494095, -0.014058639295399189, ..."
4,0,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,1,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,"ardly any neck, although he did have a\nvery l...",4,"[0.008282989263534546, -0.046268247067928314, ..."


In [75]:
df_exploded_2.to_csv('df_exploded_2.csv', index=False)


In [76]:
# save id and embedding as a json file
jsonl_string = df[["id",'splitted_texts_chunks', "embedding"]].to_json(orient="records", lines=True)
with open("questions_test.json", "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 questions_test.json

{"id":"0","splitted_texts_chunks":"CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. Dursley, of number four, Privet Drive, were proud to say","embedding":[0.00331766,-0.0479082689,-0.0160974599,0.0116275474,0.0583026372,0.0035153672,0.010107493,-0.0088840397,0.0022986734,0.0063093845,0.0147030614,0.0444023684,0.0337760076,-0.0061602024,0.0160618052,0.0424549431,0.0199845526,0.0161699541,-0.0075055822,-0.0430095233,-0.0260914303,0.0205040853,-0.0246986803,0.0146389743,0.0188306961,-0.0562081747,0.030479867,-0.1032175571,-0.0202636141,0.020874992,-0.0294418689,0.0250715278,-0.0764802545,-0.0115942517,0.0315131135,-0.0704317316,0.0127692958,0.0468112305,-0.0004749426,0.0534682795,0.0005832114,-0.004775404,-0.0113134282,0.0188628975,-0.0156529769,0.0383772999,-0.0436384566,0.0189775527,0.0181709286,-0.0362215303,0.0535730608,0.012083495,0.0190793332,-0.0438452885,-0.0129255848,-0.0609461665,0.0359720252,-0.0106597738,-0.0882806331,0.0098421164,-0.0236698724,0.0302701108,-0.0531803966,0.045661

In [77]:
! head -n 3 product-embs.json

head: cannot open 'product-embs.json' for reading: No such file or directory


### Upload the Json File to matching engine

In [169]:
# generate an unique id for this session
from datetime import datetime

UID = datetime.now().strftime("%m%d%H%M")

BUCKET_URI_ME=f"{GCS_BUCKET_URI}/matchingengine/embedding/"
LOCATION = 'asia-southeast1'

from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

gs://jingle-project-414801-jingletest/matchingengine/embedding/


In [167]:
! gsutil cp questions_test.json {BUCKET_URI_ME}

Copying file://questions_test.json [Content-Type=application/json]...
/ [1 files][  3.4 MiB/  3.4 MiB]                                                
Operation completed over 1 objects/3.4 MiB.                                      


In [170]:
! gsutil ls {BUCKET_URI_ME}

gs://jingle-project-414801-jingletest/matchingengine/embedding/questions_test.json


In [171]:
# create Index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"vs-feature-index-{UID}",
    contents_delta_uri=BUCKET_URI_ME,
    dimensions=768,
    approximate_neighbors_count=10,
    project = PROJECT_ID
)

Creating MatchingEngineIndex


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Creating MatchingEngineIndex


Create MatchingEngineIndex backing LRO: projects/777458322107/locations/asia-southeast1/indexes/8681163270779830272/operations/8415568540509143040


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Create MatchingEngineIndex backing LRO: projects/777458322107/locations/asia-southeast1/indexes/8681163270779830272/operations/8415568540509143040


MatchingEngineIndex created. Resource name: projects/777458322107/locations/asia-southeast1/indexes/8681163270779830272


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:MatchingEngineIndex created. Resource name: projects/777458322107/locations/asia-southeast1/indexes/8681163270779830272


To use this MatchingEngineIndex in another session:


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:To use this MatchingEngineIndex in another session:


index = aiplatform.MatchingEngineIndex('projects/777458322107/locations/asia-southeast1/indexes/8681163270779830272')


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:index = aiplatform.MatchingEngineIndex('projects/777458322107/locations/asia-southeast1/indexes/8681163270779830272')


#### Create Index Endpoint and deploy the Index
To use the Index, you need to create an Index Endpoint. It works as a server instance accepting query requests for your Index.


In [172]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"vs-feature-index-endpoint-{UID}", public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Creating MatchingEngineIndexEndpoint


Create MatchingEngineIndexEndpoint backing LRO: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432/operations/5249538002467684352


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Create MatchingEngineIndexEndpoint backing LRO: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432/operations/5249538002467684352


MatchingEngineIndexEndpoint created. Resource name: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint created. Resource name: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432


To use this MatchingEngineIndexEndpoint in another session:


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:To use this MatchingEngineIndexEndpoint in another session:


index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432')


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432')


In [173]:
DEPLOYED_INDEX_ID = f"vs_feature_deployed_{UID}"
# deploy the Index to the Index Endpoint
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432


Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432/operations/2133047060327301120


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432/operations/2133047060327301120


MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7f73ea6d9570> 
resource name: projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432

from vertexai.language_models import TextEmbeddingModel### Go to you vertex AI console and check the index is CREATED successfully 

PART 3

### Querying a created index

In [175]:
import json

# build dicts for product names and embs
product_names = {}
product_embs = {}
product_text = {}
with open("questions_test.json") as f:
    for l in f.readlines():
        p = json.loads(l)
        id = p["id"]
        product_names[id] = p["id"]
        product_text[id] = p['splitted_texts_chunks']
        product_embs[id] = p["embedding"]

In [176]:
# get the embedding for ID 6523 "cloudveil women's excursion short"
# you can also try with other IDs such as 12711, 18090, 19536 and 11863
query_emb = product_embs["0"]

In [177]:
# run query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID, queries=[query_emb], num_neighbors=3
)

# show the results
for idx, neighbor in enumerate(response[0]):
    print(f"{neighbor.distance:.2f} {product_names[neighbor.id]} {product_text[neighbor.id]}")

1.00 0 CHAPTER ONE
THE BOY WHO LIVED
M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say
0.83 270 CHAPTER TWO
THE VANISHING GLASS
N early ten years had passed since the Dursleys had woken up to find
0.81 53 “
— yes, their son, Harry —”·
Mr. Dursley stopped dead. Fear flooded him. He looked back at the
whis


### Run Query
Finally it's ready to use Vector Search. In the following code, it creates an embedding for a test question, and find similar question with the Vector Search.

In [178]:
import time
import tqdm  # to show a progress bar

# get embeddings for a list of texts
BATCH_SIZE = 5

# Load the text embeddings model
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs

In [179]:

df = pd.read_csv('df_exploded_2.csv')


test_embeddings = get_embeddings_wrapper(["Who is the best help to Harry Potter?"])
# Test query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=20,
)

# show the result
import numpy as np

for idx, neighbor in enumerate(response[0]):
    id = np.int64(neighbor.id)
    similar = df.query("id == @id", engine="python")
    print(f"{neighbor.distance:.4f} {similar.splitted_texts_chunks.values[0]}")


100%|██████████| 1/1 [00:01<00:00,  1.31s/it]

0.0592 the way up the street, screaming for sweets. Harry Potter come
and live here!”
“It’s the best place 
0.0421  last place you would expect astonishing things to

0.0290 heavily.

0.0286  calmly. “Voldemort had powers I will
never have.”
“Only because you’re too — well — noble to use th
0.0278 y way here.”
Professor McGonagall sniffed angrily.
“Oh yes, I’ve celebrating, all right,” she said i
0.0273 id Professor McGonagall. “And I don’t suppose you’re going to
tell me why you’re here, of all places
0.0272 long robes, a purple cloak that
swept the ground, and high-heeled, buckled boots. His blue eyes were
0.0269 Well, I just thought…maybe…it was something to do with…you
know…her crowd.”
Mrs. Dursley sipped her 
0.0268 cked Mrs.
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because
Dudley was now
0.0263 er sister and her good-for-nothing
husband were as unDursleyish as it was possible to be. The Dursle
0.0263 We’ve had precious
little to celebrate for eleven yea




### Get an existing Index
To get an index object that already exists, replace the following [your-index-id] with the index ID and run the cell. You can check the ID on the Vector Search Console > INDEXES tab.


In [180]:
%pip install --upgrade google-cloud-aiplatform -q

Note: you may need to restart the kernel to use updated packages.


In [181]:
from google.cloud import aiplatform

REGION = LOCATION = "asia-southeast1"

aiplatform.init(project=PROJECT_ID, location=LOCATION)
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Update all this information below 

#### this setting is obtained from matching engine end point
##### https://console.cloud.google.com/vertex-ai/locations/us-central1/index-endpoints/3345510418113101824/deployed-indexes/vs_quickstart_deployed_02060053?


In [182]:
my_index_name = my_index._gca_resource.name
my_index_display_name = my_index.display_name
my_index_id = my_index.name.split('/')[-1]

my_index_endpoint_name = my_index_endpoint._gca_resource.name
my_index_endpoint_display_name = my_index_endpoint.display_name
my_index_endpoint_id = my_index_endpoint.name.split('/')[-1]
my_index_endpoint_public_domain = my_index_endpoint.public_endpoint_domain_name

my_index = aiplatform.MatchingEngineIndex(my_index_name)

my_index_endpoint_id = my_index_endpoint_id

# my_index_endpoint_id = "[your-index-endpoint-id]"  # @param {type:"string"}
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(my_index_endpoint_id)

#### Querying the earlier created index

In [183]:
from langchain_google_vertexai import VertexAIEmbeddings

embeddings = VertexAIEmbeddings(model="textembedding-gecko@001")



In [184]:
# this is embedding vector (should be created by calling the embeddings models)

text = "harry potter owl and the green colour boy hello"

test_embeddings = embeddings.embed_query(text)
print("preview embeddings",test_embeddings[0:2])

preview embeddings [-0.012529210187494755, 0.027635833248496056]


### Update the information below

In [185]:
# this setting is obtained from matching ending https://console.cloud.google.com/vertex-ai/locations/asia-southeast1/index-endpoints/3366088877738557440/deployed-indexes/vs_quickstart_deployed_02060053?project=jingle-project-414801


from google.cloud import aiplatform_v1

# Set variables for the current deployed index.
API_ENDPOINT=my_index_endpoint_public_domain
INDEX_ENDPOINT=my_index_endpoint_name

indexendpoint_id=str(re.search(r'\d+', my_index_endpoint_display_name).group())

DEPLOYED_INDEX_ID="vs_feature_deployed_" + indexendpoint_id
neighbor_count = 3

print(API_ENDPOINT)
print(INDEX_ENDPOINT)
print(DEPLOYED_INDEX_ID)

1150799651.asia-southeast1-777458322107.vdb.vertexai.goog
projects/777458322107/locations/asia-southeast1/indexEndpoints/8723120634495762432
vs_feature_deployed_03040744


In [187]:
# Configure Vector Search client
client_options = {
  "api_endpoint": API_ENDPOINT
}
vector_search_client = aiplatform_v1.MatchServiceClient(
  client_options=client_options,
)
# Build FindNeighborsRequest object
datapoint = aiplatform_v1.IndexDatapoint(
  feature_vector=test_embeddings
)

query = aiplatform_v1.FindNeighborsRequest.Query(
  datapoint=datapoint,
  # The number of nearest neighbors to be retrieved
  neighbor_count=neighbor_count
)

request = aiplatform_v1.FindNeighborsRequest(
  index_endpoint=INDEX_ENDPOINT,
  deployed_index_id=DEPLOYED_INDEX_ID,
  # Request can have multiple queries
  queries=[query],
  return_full_datapoint=False,
)

# Execute the request
response = vector_search_client.find_neighbors(request)

df_new = pd.DataFrame()
print('neighbor_count', neighbor_count)
for i in range(0,neighbor_count):
    x=response.nearest_neighbors[0]
    
    df_match = df.loc[df['id'] == int(x.neighbors[i].datapoint.datapoint_id) ]

    # Append the matching rows to the new DataFrame
    df_new = pd.concat([df_new, df_match])

# Print the new DataFrame
print(df_new)

neighbor_count 3
     index                                     pagewise_texts  page_id  \
46       1  something peculiar — a cat reading a map. For ...        2   
124      4  agree.”\nHe didn’t say another word on the sub...        5   
147      5  nearest street lamp went out with a little pop...        6   

                                        splitted_texts  \
46   something peculiar — a cat reading a map. For ...   
124  agree.”\nHe didn’t say another word on the sub...   
147  nearest street lamp went out with a little pop...   

                                 splitted_texts_chunks   id  
46   five\ndifferent people. He made several import...   46  
124  long robes, a purple cloak that\nswept the gro...  124  
147  y way here.”\nProfessor McGonagall sniffed ang...  147  


In [188]:
def get_id_with_embedding_matching(test_embeddings) :
    
    datapoint = aiplatform_v1.IndexDatapoint(
      feature_vector=test_embeddings
    )
    query = aiplatform_v1.FindNeighborsRequest.Query(
      datapoint=datapoint,
      # The number of nearest neighbors to be retrieved
      neighbor_count=neighbor_count
    )
    request = aiplatform_v1.FindNeighborsRequest(
      index_endpoint=INDEX_ENDPOINT,
      deployed_index_id=DEPLOYED_INDEX_ID,
      # Request can have multiple queries
      queries=[query],
      return_full_datapoint=False,
    )

    # Execute the request
    response = vector_search_client.find_neighbors(request)
    
    df_new = pd.DataFrame()

    for i in range(0,neighbor_count):
        x=response.nearest_neighbors[0]

        df_match = df.loc[df['id'] == int(x.neighbors[i].datapoint.datapoint_id) ]

        # Append the matching rows to the new DataFrame
        df_new = pd.concat([df_new, df_match])
    
    i,j,k = df_new.index[0:3]
    print(i,j,k)
    
    pagewise_texts_v1 = df_new.loc[i, 'pagewise_texts']
    pagewise_texts_v2 = df_new.loc[j, 'pagewise_texts']
    pagewise_texts_v3 = df_new.loc[k, 'pagewise_texts']
    
    splitted_texts_v1 = df_new.loc[i, 'splitted_texts']
    splitted_texts_v2 = df_new.loc[j, 'pagewise_texts']
    splitted_texts_v3 = df_new.loc[k, 'pagewise_texts']
    
    splitted_texts_chunks_v1 = df_new.loc[i, 'splitted_texts_chunks']
    splitted_texts_chunks_v2 = df_new.loc[j, 'splitted_texts_chunks']
    splitted_texts_chunks_v3 = df_new.loc[k, 'splitted_texts_chunks']
    
    page_id_v1 = df_new.loc[i, 'page_id'] 
    page_id_v2 = df_new.loc[j, 'page_id'] 
    page_id_v3 = df_new.loc[k, 'page_id'] 
    
    return(pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,
           splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,
           splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,
        page_id_v1,page_id_v2,page_id_v3,i,j,k)

In [189]:
import pandas as pd
filename = "./harry_potte_qa.csv"
df_qa = pd.read_csv(filename, sep ="|")

df_qa.head()

Unnamed: 0,Question,Answer
0,What is the name of the magical creature that ...,Thestral
1,What is the name of the school newspaper at Ho...,The Daily Prophet
2,What is the name of the magical map that shows...,Marauder's Map
3,Which Hogwarts house does Luna Lovegood belong...,Ravenclaw
4,What magical creature is known for guarding Gr...,Ukrainian Ironbelly (a dragon)


In [190]:
df_qa.columns

Index(['Question', 'Answer'], dtype='object')

In [191]:
import csv
import csv

with open('harry_potte_qa.csv', 'r') as input_file, open('harry_potte_qa_output.csv', 'w', newline='') as output_file:

  # Create CSV reader and writer objects
  reader = csv.reader(input_file, delimiter='|')
  writer = csv.writer(output_file, delimiter='|')

  # Read and write the header row
  header = next(reader) + ['i','j','k','pagewise_texts_v1','pagewise_texts_v2','pagewise_texts_v3','splitted_texts_v1','splitted_texts_v2','splitted_texts_v3','splitted_texts_chunks_v1','splitted_texts_chunks_v2','splitted_texts_chunks_v3','page_id_v1','page_id_v2','page_id_v3']
  writer.writerow(header)

  # Loop through the remaining rows
  for i, row in enumerate(reader):
    question = row[0].split('|')[0]  # Use 'i' to access the correct element in the row
    question_emb = embeddings.embed_query( question )
    pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,page_id_v1,page_id_v2,page_id_v3,i,j,k = get_id_with_embedding_matching(question_emb) 
    
    # print( i , question)
    row_out = row + [i,j,k,pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,page_id_v1,page_id_v2,page_id_v3]
    
    # Write the row to the output file
    writer.writerow(row_out)

# Usage example:
! head -n 2 harry_potte_qa_output.csv

202 11 147
46 124 292
11 97 292
202 11 46
9 11 46
202 261 9
183 226 238
161 46 11
202 24 9
161 202 289
11 202 273
124 202 292
202 261 161
167 202 124
292 273 11
202 124 11
11 20 311
202 147 247
202 144 46
124 202 147
24 202 9
11 261 202
202 46 146
124 161 143
11 202 9
124 147 146
46 11 161
11 327 311
161 202 124
282 298 11
202 147 46
11 202 325
202 11 161
46 11 202
202 161 11
46 124 161
11 161 202
202 171 161
11 292 46
202 247 276
11 24 311
46 261 1
46 161 202
46 171 141
261 11 202
11 202 327
143 124 289
11 46 9
202 20 11
46 202 261
46 202 247
24 202 141
202 11 100
202 171 11
46 249 24
202 46 141
11 161 261
202 261 282
202 11 292
202 161 46
202 46 261
124 161 166
261 161 46
5 202 124
11 97 202
46 171 5
311 46 202
46 202 11
202 171 46
11 161 124
202 1 141
261 46 289
202 161 289
11 292 124
46 261 184
202 46 261
171 261 96
11 20 38
202 5 124
202 261 11
11 202 161
202 11 261
202 16 12
20 327 238
202 171 24
4 20 46
11 273 4
161 261 289
11 9 161
4 11 124
4 9 11
11 161 46
24 124 161
202 46 11

In [193]:
import pandas as pd
filename = "./harry_potte_qa_output.csv"
df_qa = pd.read_csv(filename, sep ="|")

df_qa.head()

Unnamed: 0,Question,Answer,i,j,k,pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,splitted_texts_v1,splitted_texts_v2,splitted_texts_v3,splitted_texts_chunks_v1,splitted_texts_chunks_v2,splitted_texts_chunks_v3,page_id_v1,page_id_v2,page_id_v3
0,What is the name of the magical creature that ...,Thestral,202,11,147,Professor McGonagall’s voice trembled as she w...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,nearest street lamp went out with a little pop...,We may never know.”\nProfessor McGonagall pull...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,nearest street lamp went out with a little pop...,"the way up the street, screaming for sweets. H...",er sister and her good-for-nothing\nhusband we...,y way here.”\nProfessor McGonagall sniffed ang...,8,1,6
1,What is the name of the school newspaper at Ho...,The Daily Prophet,46,124,292,something peculiar — a cat reading a map. For ...,agree.”\nHe didn’t say another word on the sub...,"pulling a spider off one of them, put them on....",something peculiar — a cat reading a map. For ...,agree.”\nHe didn’t say another word on the sub...,"pulling a spider off one of them, put them on....",five\ndifferent people. He made several import...,"long robes, a purple cloak that\nswept the gro...","the stairs was full of them, and that was whe...",2,5,14
2,What is the name of the magical map that shows...,Marauder's Map,11,97,292,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,learned a new word (“Won’t!”). Mr. Dursley tri...,"pulling a spider off one of them, put them on....",CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,learned a new word (“Won’t!”). Mr. Dursley tri...,"pulling a spider off one of them, put them on....",er sister and her good-for-nothing\nhusband we...,"After all,\nthey normally pretended she didn’...","the stairs was full of them, and that was whe...",1,4,14
3,Which Hogwarts house does Luna Lovegood belong...,Ravenclaw,202,11,46,Professor McGonagall’s voice trembled as she w...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,something peculiar — a cat reading a map. For ...,We may never know.”\nProfessor McGonagall pull...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,something peculiar — a cat reading a map. For ...,"the way up the street, screaming for sweets. H...",er sister and her good-for-nothing\nhusband we...,five\ndifferent people. He made several import...,8,1,2
4,What magical creature is known for guarding Gr...,Ukrainian Ironbelly (a dragon),9,11,46,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,something peculiar — a cat reading a map. For ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,something peculiar — a cat reading a map. For ...,y\ncould bear it if anyone found out about the...,er sister and her good-for-nothing\nhusband we...,five\ndifferent people. He made several import...,1,1,2


### Installation of required libs for Gemini and PaLM


In [195]:
!pip install --upgrade google-cloud-aiplatform



In [196]:
!pip install --upgrade google-cloud-aiplatform
GCP_PROJECT=PROJECT_ID
LOCATION = REGION = 'asia-southeast1'



### Vertex AI setup

In [198]:
import os

# import streamlit as st
import vertexai
from vertexai.preview.language_models import TextGenerationModel

vertexai.init(project=PROJECT_ID, location=LOCATION)


# @st.cache_resource
def get_model():
    generation_model = TextGenerationModel.from_pretrained("text-bison@002")
    return generation_model


def get_text_generation(prompt="", **parameters):
    generation_model = get_model()
    response = generation_model.predict(prompt=prompt, **parameters)

    return response.text

### Defining Gemini Functions

In [199]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part

def generate(input_prompt):
    model = GenerativeModel("gemini-ultra")
    responses = model.generate_content(
        input_prompt ,
    generation_config={
        "max_output_tokens": 2048,
        "temperature": 0.2,
        "top_p": 1,
        "top_k": 32
    },
        safety_settings=[],
        stream=True,
    )
    
    all_response  = []
    
    for response in responses:
        # print(response.text, end="")
        all_response.append(response.text)
    
    # print (all_response)
    
    return(" ".join(all_response))
    

def generate_pro(input_prompt):
    model = GenerativeModel("gemini-pro")
    responses = model.generate_content(
    input_prompt,
    generation_config={
        "max_output_tokens": 2048,
        "temperature": 0.2,
        "top_p": 1
    },stream=True,)
    
    all_response  = []
    
    for response in responses:
        all_response.append(response.text)
    
    # print (all_response)
    
    return(" ".join(all_response))


In [200]:
import vertexai
from vertexai.language_models import TextGenerationModel

vertexai.init(project=PROJECT_ID, location=LOCATION)
parameters = {
    "candidate_count": 1,
    "max_output_tokens": 1024,
    "temperature": 1,
    "top_k": 40
}

def generate_palm_unicorn_v1(input_prompt):
    
    model = TextGenerationModel.from_pretrained("text-unicorn@001")

    response = model.predict(
        input_prompt,
        **parameters
    )
    print(f"Response from Model: {response.text}")
    
    return(response.text)

def generate_palm_bison32k(input_prompt):
    
    model = TextGenerationModel.from_pretrained("text-bison-32k")

    response = model.predict(
        input_prompt,
        **parameters
    )
    print(f"Response from Model: {response.text}")
    
    return(response.text)


### Read the Q&A file

#### This uses the file from Matching Engine which has questions and retrieved document results

In [207]:
import pandas as pd
filename = "./harry_potte_qa_output.csv"
df_qa = pd.read_csv(filename, sep ="|")

System_Prompts = """ You are an expert in reading harry potter books, but only provide evidences from the information provided and do not use any other information
so here are some search results : 
"""

Question_Prompts = """ -- Based on information above help to answer following user question
"""

df_qa['combine_prompt_RAG1'] = System_Prompts + ' ' +df_qa['pagewise_texts_v1'] + ' Please answers the Question : '+ df_qa['Question'] 
df_qa['combine_prompt_RAG2'] = System_Prompts + ' ' +df_qa['pagewise_texts_v2'] + ' Please answers the Question : '+ df_qa['Question'] 
df_qa['combine_prompt_RAG3'] = System_Prompts + ' ' +df_qa['pagewise_texts_v3'] + ' Please answers the Question : '+ df_qa['Question'] 


In [202]:
import re

In [203]:
for i in range(0, len(df_qa)):


    clean_text1 = re.sub(r'[^\w\s;]', '', df_qa.loc[i,'combine_prompt_RAG1'])
    clean_text2 = re.sub(r'[^\w\s;]', '', df_qa.loc[i,'combine_prompt_RAG2'])
    clean_text3 = re.sub(r'[^\w\s;]', '', df_qa.loc[i,'combine_prompt_RAG3'])

    if i<=1000:
        # df['Gemini_ultra_model_output'][i] = generate(df['combine_prompt'][i])
        print("iteration #", i, "test")
        if i==32 : 
            print("iteration #", i, "test", clean_text1, clean_text2, clean_text3)
    
    try:
        df_qa.loc[i, "Gemini_pro_model_output_v1"] = generate_pro(clean_text1)
        df_qa.loc[i, "Gemini_pro_model_output_v2"] = generate_pro(clean_text2)
        df_qa.loc[i, "Gemini_pro_model_output_v3"] = generate_pro(clean_text3)
    except :
        print("Prompt error at gemini ", i)
        df_qa.loc[i, "Gemini_pro_model_output_v1"] = "Prompt failed "
        df_qa.loc[i, "Gemini_pro_model_output_v2"] = "Prompt failed "
        df_qa.loc[i, "Gemini_pro_model_output_v3"] = "Prompt failed "

    try:
        df_qa.loc[i, "palm_bison32k_output_v1"] = generate_palm_bison32k(df_qa.loc[i,'combine_prompt_RAG1'])
        df_qa.loc[i, "palm_bison32k_output_v2"] = generate_palm_bison32k(df_qa.loc[i,'combine_prompt_RAG2'])
        df_qa.loc[i, "palm_bison32k_output_v3"] = generate_palm_bison32k(df_qa.loc[i,'combine_prompt_RAG3'])
    except ZeroDivisionError:
        print("Prompt error at palm ", i)
        df_qa.loc[i, "palm_bison32k_output_v1"] = "Prompt failed "
        df_qa.loc[i, "palm_bison32k_output_v2"] = "Prompt failed "
        df_qa.loc[i, "palm_bison32k_output_v3"] = "Prompt failed "


iteration # 0 test
Response from Model:  This question is unanswerable from the given text because there is no mention of any magical creature that can only be seen by those who have witnessed death.
Response from Model:  This context does not mention anything about a magical creature that can only be seen by those who have witnessed death, so I cannot answer this question from the provided context.
Response from Model:  This text does not mention any magical creature that can only be seen by those who have witnessed death.
iteration # 1 test
Response from Model:  The provided text does not mention the name of the school newspaper at Hogwarts.
Response from Model:  The text provided does not mention the name of the school newspaper at Hogwarts.
Response from Model:  The provided text does not mention the name of the school newspaper at Hogwarts.
iteration # 2 test
Response from Model:  The provided text does not contain any information about the name of the magical map that shows the e

In [204]:
df_qa.columns

Index(['Question', 'Answer', 'i', 'j', 'k', 'pagewise_texts_v1',
       'pagewise_texts_v2', 'pagewise_texts_v3', 'splitted_texts_v1',
       'splitted_texts_v2', 'splitted_texts_v3', 'splitted_texts_chunks_v1',
       'splitted_texts_chunks_v2', 'splitted_texts_chunks_v3', 'page_id_v1',
       'page_id_v2', 'page_id_v3', 'combine_prompt_RAG1',
       'combine_prompt_RAG2', 'combine_prompt_RAG3',
       'Gemini_pro_model_output_v1', 'Gemini_pro_model_output_v2',
       'Gemini_pro_model_output_v3', 'palm_bison32k_output_v1',
       'palm_bison32k_output_v2', 'palm_bison32k_output_v3'],
      dtype='object')

In [205]:

# Delete the 'col2' column
df_qa = df_qa.drop('combine_prompt_RAG1', axis=1)
df_qa = df_qa.drop('combine_prompt_RAG2', axis=1)
df_qa = df_qa.drop('combine_prompt_RAG3', axis=1)

output1 = "./results/harry_potte_qa_model_out.csv"

df_qa.to_csv(output1)


In [206]:
df_qa.head(1)

Unnamed: 0,Question,Answer,i,j,k,pagewise_texts_v1,pagewise_texts_v2,pagewise_texts_v3,splitted_texts_v1,splitted_texts_v2,...,splitted_texts_chunks_v3,page_id_v1,page_id_v2,page_id_v3,Gemini_pro_model_output_v1,Gemini_pro_model_output_v2,Gemini_pro_model_output_v3,palm_bison32k_output_v1,palm_bison32k_output_v2,palm_bison32k_output_v3
0,What is the name of the magical creature that ...,Thestral,202,11,147,Professor McGonagall’s voice trembled as she w...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,nearest street lamp went out with a little pop...,We may never know.”\nProfessor McGonagall pull...,CHAPTER ONE\nTHE BOY WHO LIVED\nM r. and Mrs. ...,...,y way here.”\nProfessor McGonagall sniffed ang...,8,1,6,The provided text does not mention anything ab...,The provided text does not contain any informa...,The provided text does not mention the name of...,This question is unanswerable from the given ...,This context does not mention anything about ...,This text does not mention any magical creatu...


#### As we can see in the output above the poor search setup gave a bad response 