# In [None]:
# The first function returns the bucket URI, which the second function
# expects as its "BUCKET_URI" request parameter.

#1st cloud function
import pandas as pd
import vertexai
import time
import tqdm
from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from vertexai.preview.language_models import TextEmbeddingModel
from datetime import datetime



def chunking_and_embedding(request):
  """OCR a PDF with Document AI, embed every paragraph and upload the embeddings.

  The PDF at ``file_path`` is sent to a Document AI OCR processor, the
  recognised text is split into paragraphs, each paragraph is embedded with
  ``textembedding-gecko@001`` and the ``id``/``embedding`` pairs are written as
  JSON Lines to ``questions.json`` inside a newly created Cloud Storage bucket.

  Args:
    request: HTTP request object exposing ``get_json(silent=True)`` and
      ``args``. ``file_path`` is looked up in the JSON body first, then in the
      query arguments.

  Returns:
    ``"Error: Missing FilePath"`` when no ``file_path`` was supplied, otherwise
    the ``gs://`` URI of the bucket holding ``questions.json``.

  Raises:
    subprocess.CalledProcessError: if creating the bucket or uploading fails.
  """
  # Function-scope imports: only needed by the upload step that replaces the
  # notebook-only `! gsutil ...` shell magics.
  import os
  import subprocess
  import tempfile

  #declare the required variables
  project_id = "Enter your project id"
  location = "us" # Document AI location. Format is "us" or "eu"
  processor_id = "2fa558c697497e1b" # Create processor before running sample
  processor_version = "pretrained-ocr-v2.0-2023-06-02" # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
  mime_type = "application/pdf"
  # Vertex AI and the bucket use a regional location; the Document AI
  # multi-region ("us"/"eu") is not a Vertex AI region.
  vertex_location = "us-central1"

  #extract file_path from the request
  request_json = request.get_json(silent=True)
  request_args = request.args
  if request_json and "file_path" in request_json:
    file_path = request_json["file_path"]
  elif request_args and "file_path" in request_args:
    file_path = request_args["file_path"]
  else:
    return "Error: Missing FilePath"

  process_options = documentai.ProcessOptions(
    ocr_config=documentai.OcrConfig(
      enable_native_pdf_parsing=True,
      enable_image_quality_scores=True,
      enable_symbol=True,
      # OCR Add Ons https://cloud.google.com/document-ai/docs/ocr-add-ons
      premium_features=documentai.OcrConfig.PremiumFeatures(
          compute_style_info=True,
          enable_math_ocr=False,  # Enable to use Math OCR Model
          enable_selection_mark_detection=True,
      ),
    )
  )

  client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(
      api_endpoint=f"{location}-documentai.googleapis.com"
    )
  )
  name = client.processor_version_path(
    project_id, location, processor_id, processor_version
  )

  # Read the file into memory.
  # NOTE(review): file_path comes straight from the request, so a caller can
  # make this function read any local file it has access to.
  with open(file_path, "rb") as pdf_file:
    pdf_content = pdf_file.read()

  # Configure the process request (named so it does not shadow the incoming
  # HTTP `request`).
  process_request = documentai.ProcessRequest(
    name=name,
    raw_document=documentai.RawDocument(content=pdf_content, mime_type=mime_type),
    # Only supported for Document OCR processor
    process_options=process_options,
  )
  document = client.process_document(request=process_request).document
  text = document.text

  #for converting the chunks
  def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )

  # One chunk per paragraph, with line breaks removed.
  para = [
    layout_to_text(p.layout, text).replace("\n", "")
    for page in document.pages
    for p in page.paragraphs
  ]

  #create a dataframe with one row per paragraph and 1-based ids
  df = pd.DataFrame({"title": para})
  df["id"] = df.index + 1

  #for embedding generation
  vertexai.init(project=project_id, location=vertex_location)
  model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
  BATCH_SIZE = 1

  def get_embeddings_wrapper(texts):
    """Embed `texts` in batches of BATCH_SIZE and return the list of vectors."""
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
      time.sleep(1)  # to avoid the quota error
      result = model.get_embeddings(texts[i : i + BATCH_SIZE])
      embs.extend(e.values for e in result)
    return embs

  df = df.assign(embedding=get_embeddings_wrapper(list(df.title)))
  jsonl_string = df[["id", "embedding"]].to_json(orient="records", lines=True)

  UID = datetime.now().strftime("%m%d%H%M")
  BUCKET_URI = f"gs://{project_id}-{UID}"

  # Stage the file in a temporary directory so the function neither depends on
  # a writable working directory nor leaves a stale file behind.
  with tempfile.TemporaryDirectory() as tmp_dir:
    local_path = os.path.join(tmp_dir, "questions.json")
    with open(local_path, "w") as f:
      f.write(jsonl_string)
    # Argument lists (no shell) and check=True so a failed gsutil call raises
    # instead of being silently ignored.
    subprocess.run(
      ["gsutil", "mb", "-l", vertex_location, "-p", project_id, BUCKET_URI],
      check=True,
    )
    subprocess.run(["gsutil", "cp", local_path, BUCKET_URI], check=True)

  #return the bucket uri as response
  return BUCKET_URI





#2nd cloud function
import numpy as np
import vertexai
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel
from google.cloud import discoveryengine_v1alpha as discoveryengine
from google.api_core.future.polling import DEFAULT_POLLING

def inferencing(request):
  """Answer a prompt by retrieving similar chunks and summarising them with Gemini.

  Builds a Vector Search index from the embeddings under ``BUCKET_URI``,
  deploys it to a new public endpoint, embeds ``prompt`` with
  ``textembedding-gecko@001``, looks up the nearest neighbours and asks
  ``gemini-1.5-flash-001`` to summarise the text of the top five matches.

  Args:
    request: HTTP request object exposing ``get_json(silent=True)`` and
      ``args``. ``prompt`` and ``BUCKET_URI`` are each looked up in the JSON
      body first, then in the query arguments.

  Returns:
    The generated summary text, or one of the strings
    ``"Error: Missing Prompt"``, ``"Error: Missing BUCKET_URI"`` or
    ``"Error: No matching content found"``.

  NOTE(review): an index and an endpoint are created and deployed on every
  call. NOTE(review): the lookup below needs a ``title`` column in
  ``questions.json`` - confirm the producer writes it.
  """
  LOCATION = "us-central1"

  request_json = request.get_json(silent=True)
  request_args = request.args

  #extract prompt from the request
  if request_json and "prompt" in request_json:
    prompt = request_json["prompt"]
  elif request_args and "prompt" in request_args:
    prompt = request_args["prompt"]
  else:
    return "Error: Missing Prompt"

  #extract the bucket uri of vector datastore
  if request_json and "BUCKET_URI" in request_json:
    BUCKET_URI = request_json["BUCKET_URI"]
  elif request_args and "BUCKET_URI" in request_args:
    BUCKET_URI = request_args["BUCKET_URI"]
  else:
    return "Error: Missing BUCKET_URI"

  # Read only after the request is validated.
  # NOTE(review): PROJECT_ID is not defined in the visible part of this file;
  # it is presumably a module-level constant - confirm.
  project_id = PROJECT_ID

  #index creation and deployment
  aiplatform.init(project=project_id, location=LOCATION)
  my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="test_index_7634",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=20,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
  )
  my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="test_index_7634",
    public_endpoint_enabled=True,
  )
  DEPLOYED_INDEX_ID = "test_index_7634"
  #for timeout issues (this mutates the shared default polling object)
  DEFAULT_POLLING._timeout=3600
  # deploy the Index to the Index Endpoint
  my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

  # Embed the prompt with the same model used to build the index. A dedicated
  # name is used because the summarisation model is only created further down.
  vertexai.init(project=project_id, location=LOCATION)
  embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
  test_embeddings = [e.values for e in embedding_model.get_embeddings([prompt])]

  #query the index
  response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=20,
  )

  #read the datastore as dataframe; the embeddings are uploaded to the bucket
  #as "questions.json", so read that object rather than the bucket root
  df = pd.read_json(f"{BUCKET_URI.rstrip('/')}/questions.json", lines=True)

  content = []
  for neighbor in response[0]:
    neighbor_id = np.int64(neighbor.id)
    similar = df.query("id == @neighbor_id", engine="python")
    if similar.empty:
      # Skip neighbours whose id has no row in the datastore.
      continue
    content.append(similar.title.values[0])

  if not content:
    return "Error: No matching content found"

  #summarization using gemini: up to the five closest chunks, one per line
  top_chunks = "\n".join(f"  {chunk}" for chunk in content[:5])
  query = f"\n{top_chunks}\n  Summarize the above points.\n  "

  model = GenerativeModel("gemini-1.5-flash-001")
  response = model.generate_content(query)

  return response.text
