In [None]:
import os
import time
import uuid
from datetime import datetime

import fitz
import pandas as pd

from google.cloud import aiplatform
from PIL import Image as PIL_Image
from vertexai.generative_models import GenerativeModel, Image
from vertexai.language_models import TextEmbeddingModel
from google.cloud import storage

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader

In [None]:
# Google Cloud project and region for this notebook. Both values are
# placeholders and must be replaced before running.
# NOTE(review): no cell visible in this notebook passes these to an SDK init
# call (e.g. aiplatform.init); PROJECT_ID is only used to build resource names
# and LOCATION is not referenced again -- confirm the SDK is configured elsewhere.
PROJECT_ID = "{Project name}" 
LOCATION = "{Region name}"  

In [None]:
# Generative model used for page extraction, question answering and for
# classifying whether an answer was found.
multimodal_model = GenerativeModel('gemini-1.5-pro-preview-0409')

# Embedding model used for both document chunks and user queries.
text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

In [None]:
# Folder that receives one rendered image per PDF page.
Image_Path = "./Images/"
# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(Image_Path, exist_ok=True)

In [None]:
# Path of the PDF whose pages are rendered to images and indexed.
PDF_FILENAME = "pdfs/google_report.pdf"

In [None]:

# --- Render every PDF page to an image -------------------------------------
# A 2x zoom in both directions is applied when rasterising each page.
zoom_x = 2.0
zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)

# Only the base name of the PDF goes into the image file name. PDF_FILENAME may
# carry a directory part ("pdfs/..."), which would otherwise send the output to
# a sub-folder of Image_Path that is never created.
pdf_basename = os.path.basename(PDF_FILENAME)

# Record the rendered files in page order. os.listdir() returns entries in
# arbitrary order and also picks up unrelated files in the folder, so it cannot
# be used to map page_id back to a page of this PDF.
image_names = []
with fitz.open(PDF_FILENAME) as doc:  # closes the PDF even if rendering fails
    for page in doc:
        pix = page.get_pixmap(matrix=mat)
        image_name = f"{pdf_basename}_{page.number}.jpg"
        pix.save(os.path.join(Image_Path, image_name))
        image_names.append(image_name)

Max_images = len(image_names)

# --- Extract text and tables from every page image ---------------------------
MAX_ATTEMPTS_PER_IMAGE = 5  # give up on a page after this many failed attempts
RETRY_DELAY_SECONDS = 1  # pause between attempts

prompt_text = "Extract all text content in the image"
prompt_table = (
    "Detect table in this image. Extract content maintaining the structure"
)

page_source = []
page_content = []
page_id = []

p_id = 0  # index of the page currently being processed
rest_count = 0  # failed attempts for the current page

while p_id < Max_images:
    image_path = os.path.join(Image_Path, image_names[p_id])
    try:
        image = Image.load_from_file(image_path)
        text_content = multimodal_model.generate_content([image, prompt_text]).text
        table_content = multimodal_model.generate_content([image, prompt_table]).text
    except Exception as err:
        # Best-effort retry: any failure (quota, blocked response, bad image)
        # is reported and retried; the page is skipped once the limit is hit.
        print(err)
        print("Taking Some Rest")
        time.sleep(RETRY_DELAY_SECONDS)
        rest_count += 1
        if rest_count == MAX_ATTEMPTS_PER_IMAGE:
            rest_count = 0
            print(f"Can not process image no: {image_path}")
            p_id += 1
        continue

    print(f"processed image no: {p_id}")
    page_source.append(image_path)
    page_content.append(text_content + "\n" + table_content)
    page_id.append(p_id)
    p_id += 1
    # Start the next page with a clean slate so earlier failures do not eat
    # into its retry budget.
    rest_count = 0

# One row per successfully processed page.
df = pd.DataFrame(
    {"page_id": page_id, "page_source": page_source, "page_content": page_content}
)
del page_id, page_source, page_content

In [None]:
def generate_text_embedding(text) -> list:
    """Embed a single piece of text with ``text_embedding_model``.

    Args:
        text: The text to embed.

    Returns:
        The ``values`` of the first (and only requested) embedding.
    """
    return text_embedding_model.get_embeddings([text])[0].values

# Wrap every extracted page in a document; the remaining DataFrame columns are
# read back from the document metadata further down (``page_source``).
loader = DataFrameLoader(df, page_content_column="page_content")
documents = loader.load()
print(f"# of documents loaded (pre-chunking) = {len(documents)}")

# Split pages into chunks of at most 10000 characters with 200 overlap.
text_splitter = CharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=200,
)
doc_splits = text_splitter.split_documents(documents)

# Tag each chunk with its position in the split list.
for idx, split in enumerate(doc_splits):
    split.metadata["chunk"] = idx

print(f"# of documents = {len(doc_splits)}")

# Build all columns in one pass so they cannot get out of step. The loop uses
# its own variable names: rebinding ``id`` would hide the builtin for every
# later cell run in the same kernel.
texts = []
text_embeddings_list = []
id_list = []
page_source_list = []
for split in doc_splits:
    texts.append(split.page_content)
    text_embeddings_list.append(generate_text_embedding(split.page_content))
    id_list.append(str(uuid.uuid4()))
    page_source_list.append(split.metadata["page_source"])
    time.sleep(1)  # pause between embedding requests

embedding_df = pd.DataFrame(
    {
        "id": id_list,
        "embedding": text_embeddings_list,
        "page_source": page_source_list,
        "text": texts,
    }
)
embedding_df.head()

In [None]:
# Settings for the vector search index.
# NOTE(review): VECTOR_SEARCH_REGION and VECTOR_SEARCH_EMBEDDING_DIR are not
# referenced by any other cell visible in this notebook -- confirm whether they
# are still needed.
VECTOR_SEARCH_REGION = "{Region name}"
# Display name for the index and its endpoint; also the basis of the deployed index id.
VECTOR_SEARCH_INDEX_NAME = f"{PROJECT_ID}-vector-search-index-ht"
VECTOR_SEARCH_EMBEDDING_DIR = f"{PROJECT_ID}-vector-search-bucket-ht"
# Length of the embedding vectors; must match the index's ``dimensions``.
VECTOR_SEARCH_DIMENSIONS = 768

In [None]:
# Serialise the id/embedding pairs as JSON Lines (one record per line) and
# store them in data.json, the file that is uploaded to the bucket.
index_records = embedding_df[["id", "embedding"]]
jsonl_string = index_records.to_json(orient="records", lines=True)
with open("data.json", "w") as jsonl_file:
    jsonl_file.write(jsonl_string)

In [None]:
# Name and gs:// URI of the GCS bucket that holds the embeddings file.
# This cell only defines the names; it does not create the bucket.
BUCKET_NAME = "documents-bucket-123"
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [None]:
# Fetch the bucket (expected to exist already) and upload the local data.json
# under the same object name.
storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)
bucket.blob("data.json").upload_from_filename("data.json")

In [None]:
# Create the Tree-AH index from the embeddings stored under BUCKET_URI.
# ``dimensions`` is taken from VECTOR_SEARCH_DIMENSIONS so the index cannot
# drift from the configured embedding size through a second hard-coded value.
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=VECTOR_SEARCH_INDEX_NAME,
    contents_delta_uri=BUCKET_URI,
    dimensions=VECTOR_SEARCH_DIMENSIONS,
    approximate_neighbors_count=20,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

In [None]:
# Create a public index endpoint that carries the same display name as the index.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    public_endpoint_enabled=True,
    display_name=VECTOR_SEARCH_INDEX_NAME,
)

In [None]:
# Deploy the index to the endpoint. The deployed index id is the index name
# with every "-" swapped for "_".
DEPLOYED_INDEX_NAME = "_".join(VECTOR_SEARCH_INDEX_NAME.split("-"))
DEPLOYED_INDEX_ID = DEPLOYED_INDEX_NAME
my_index_endpoint.deploy_index(deployed_index_id=DEPLOYED_INDEX_ID, index=my_index)

In [None]:
def Test_LLM_Response(txt):
    """Ask the model whether ``txt`` actually contains an answer.

    The text is classified with a few-shot prompt into "Information Present"
    or "Information Not Present".

    Args:
        txt: Model answer to classify.

    Returns:
        False when the classification mentions "not present" (in any letter
        case), True otherwise.
    """
    # The example labels are spelled exactly like the listed categories and
    # every example uses the same "Text:" delimiter as the real input, so the
    # few-shot pattern is consistent.
    classification_prompt = f""" Classify the text as one of the following categories:
        -Information Present
        -Information Not Present
        Text:The provided context does not contain information.
        Category:Information Not Present
        Text:I cannot answer this question from the provided context.
        Category:Information Not Present
        Text:{txt}
        Category:"""
    classification_response = multimodal_model.generate_content(classification_prompt).text

    # Compare case-insensitively: the model is free to answer
    # "Information not present" or similar.
    return "not present" not in classification_response.lower()


def get_prompt_text(question, context):
    """Build the question-answering prompt for one context chunk.

    Args:
        question: The user's question.
        context: The text the answer has to come from.

    Returns:
        The prompt string: an instruction line followed by the question and
        the context, each on its own indented line.
    """
    indent = " " * 6
    prompt_lines = [
        "",
        f"{indent}Answer the question using the context below. Respond with only from the text provided",
        f"{indent}Question: {question}",
        f"{indent}Context : {context}",
        indent,
    ]
    return "\n".join(prompt_lines)


def get_answer(query):
    """Answer ``query`` from the indexed document chunks.

    The query is embedded and its nearest chunks are fetched from the deployed
    index. Each returned chunk is tried in order as context for the model until
    ``Test_LLM_Response`` accepts the answer.

    Args:
        query: The user's question.

    Returns:
        A ``(result, page_source)`` tuple. ``result`` is the last model answer
        produced (an empty string if no chunk could be tried). ``page_source``
        is the image path of the chunk that produced an accepted answer, or
        "./Images/blank.jpg" when none was accepted.
    """
    result = ""
    page_source = "./Images/blank.jpg"  # fallback when no answer is accepted

    query_embeddings = generate_text_embedding(query)
    response = my_index_endpoint.find_neighbors(
        deployed_index_id=DEPLOYED_INDEX_ID,
        queries=[query_embeddings],
        num_neighbors=5,
    )

    # One query was sent, so only the first neighbour list is relevant. Walk
    # over whatever was returned instead of a fixed count: this tries every
    # requested neighbour and cannot index past a shorter result list.
    neighbors = response[0] if response else []

    for neighbor in neighbors:
        matches = embedding_df[embedding_df["id"] == neighbor.id]
        if matches.empty:
            # The index returned an id this session's embedding_df does not
            # contain; skip it rather than fail on the lookup.
            continue
        chunk = matches.iloc[0]

        prompt = get_prompt_text(query, chunk["text"])
        result = multimodal_model.generate_content(prompt).text

        if Test_LLM_Response(result):
            page_source = chunk["page_source"]
            break

    return result, page_source

In [None]:
# Ask for the 2020 consolidated revenue and show the model's answer.
query = "what is the consolidated revenue of 2020 ?"
result, page_source = get_answer(query)

print(result)

In [None]:
# Ask for the 2020 operating income and show the model's answer.
query = "what is the operating income of 2020 ?"
result, page_source = get_answer(query)

print(result)

In [None]:
# Ask for the 2021 EMEA revenues and show the model's answer.
query = "what is the EMEA revenues of 2021 ?"
result, page_source = get_answer(query)

print(result)

In [None]:
# Ask about the stock-versus-NASDAQ comparison and show the model's answer.
query = "On what date did ALPHABET INC. CLASS C CAPITAL STOCK overtook NASDAQ Composite"
result, page_source = get_answer(query)

print(result)