In [1]:
import os
from pathlib import Path
from time import time

In [2]:
# Absolute path of the directory one level above the current working directory.
repo_dir = Path.cwd().parent


In [3]:
from langchain_community.llms.ollama import Ollama

# Ollama-backed LLM using the "mistrallite:latest" model.
llm = Ollama(model="mistrallite:latest")

# Invoke the model directly with a bare question; no document context is
# supplied in this prompt.
prompt = 'What is the profit firm earned'
response_text = llm.invoke(prompt)

In [4]:
from llama_index.core import VectorStoreIndex
# from llama_index.readers.download_loader import download_loader


In [6]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

def get_embedding_function():
    """Create the embedding model handed to the Chroma vector store.

    Returns:
        An ``OllamaEmbeddings`` instance configured with the
        ``nomic-embed-text`` model.
    """
    # Alternative kept for reference (not active):
    # BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")
    # The Ollama variant is the one noted as "completely local".
    return OllamaEmbeddings(model="nomic-embed-text")

In [7]:
# Chroma is a vector store, similar to Pinecone.
# The document is split into chunks and loaded into Chroma for querying later.
# This lets you run a similarity search using a distance metric
# such as Euclidean distance.

from langchain.vectorstores.chroma import Chroma
from langchain.document_loaders import PyPDFLoader

# Start the wall-clock timer for loading and splitting the PDF.
start_time = time()

# NOTE(review): hard-coded absolute Windows path; this cell only works on a
# machine where this file exists.
loader = PyPDFLoader(r'F:\cc_data\annualreport.pdf')
# Load the PDF and split it into documents.
pages = loader.load_and_split()
# Bare expression in the middle of the cell: its value is not displayed,
# because only a cell's last expression is echoed.
pages[0]
print('\n Time taken: ', time() - start_time)

# Load documents into the vector database (ChromaDB).
# store = Chroma.from_documents(pages, collection_name='annual_report')



 Time taken:  24.381906032562256


In [8]:
# Display the second loaded document to inspect its metadata and text.
pages[1]


Document(metadata={'source': 'F:\\cc_data\\annualreport.pdf', 'page': 1}, page_content='2022 Annual General Meeting\nMacquarie Group Limitedâ€™s 2022 AGM will be held at 10:30 am on \nThursday, 28 July 2022.\nDetails of the meeting will be sent to shareholders separately.\nCover image\nMacquarie manages investments in infrastructure integral to \nthe US economy. Long Beach Container Terminal is part of the \ncombined port complex of Los Angeles and Long Beach, the \nlargest cargo gateway in North America.\nMacquarie is a global financial \nservices group operating in \n33\xa0markets in asset management, \nretail and business banking, wealth \nmanagement, leasing and asset \nfinancing, market access, commodity \ntrading, renewables development, \nspecialist advice, access to capital \nand principal investment.')

In [9]:
def calculate_chunk_ids(chunks, source_marker="annualreport"):
    """Assign a deterministic ``id`` to every chunk's metadata, in place.

    IDs have the form ``"<source>:<page>:<chunk index>"``, for example
    ``"annualreport.pdf:6:2"``. The chunk index counts consecutive chunks
    that share the same source and page, starting at 0.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict with
            ``"source"`` and ``"page"`` entries.
        source_marker: Substring from which the source path is kept. When
            the marker does not occur in the path, the file name (the text
            after the last forward slash or backslash) is used instead.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on each item.
    """
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        # A missing source would otherwise fail on the string operations below.
        source = "unknown" if source is None else str(source)

        marker_pos = source.find(source_marker)
        if marker_pos >= 0:
            source = source[marker_pos:]
        else:
            # str.find returns -1 on a miss; slicing with -1 would keep only
            # the last character of the path, so use the file name instead.
            file_name = source.replace("\\", "/").rsplit("/", 1)[-1]
            source = file_name or source

        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # Same page as the previous chunk -> next index; otherwise restart at 0.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        last_page_id = current_page_id

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"

    return chunks

In [10]:
# Directory used as Chroma's persist_directory throughout this notebook.
CHROMA_PATH = 'chroma_AnnualReport'

In [11]:
from langchain.vectorstores.chroma import Chroma
from langchain.schema.document import Document

def add_to_chroma(chunks: list[Document]):
    """Insert into the Chroma store every chunk whose ID is not stored yet.

    Chunk IDs are produced by ``calculate_chunk_ids``; chunks whose ID is
    already present in the store are skipped.

    Args:
        chunks: Documents to index.
    """
    # Open the store backed by CHROMA_PATH.
    vector_db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # An empty ``include`` list is passed; only the returned "ids" are read.
    stored = vector_db.get(include=[])
    known_ids = set(stored["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks that the store does not contain yet.
    pending = [
        candidate
        for candidate in tagged_chunks
        if candidate.metadata["id"] not in known_ids
    ]

    if not pending:
        print("âœ… No new documents to add")
        return

    print(f"ðŸ‘‰ Adding new documents: {len(pending)}")
    pending_ids = [candidate.metadata["id"] for candidate in pending]
    vector_db.add_documents(pending, ids=pending_ids)
    vector_db.persist()

In [None]:
# Time the indexing of all loaded pages into the Chroma store.
start_time = time()
add_to_chroma(pages)

print('\n Time taken: ', time() - start_time)

  db = Chroma(


Number of existing documents in DB: 0
ðŸ‘‰ Adding new documents: 403


In [None]:
import shutil
import os
def clear_database():
    """Remove the directory at ``CHROMA_PATH`` and its contents, if present."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [None]:
# Uncomment to delete the persisted Chroma store before re-indexing.
# clear_database()

In [None]:
# Prompt template for the LLM; the {context} and {question} placeholders are
# filled in by query_rag.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

def query_rag(query_text: str, k: int = 5, model_name: str = "mistrallite:latest"):
    """Answer ``query_text`` using chunks retrieved from the Chroma store.

    The ``k`` best-matching chunks are joined into a context block, inserted
    into ``PROMPT_TEMPLATE`` together with the question, and sent to an
    Ollama model. The response and the IDs of the chunks used are printed.

    Args:
        query_text: The question to answer.
        k: Number of chunks to request from the similarity search.
        model_name: Name of the Ollama model used to generate the answer.

    Returns:
        A tuple ``(response_text, results)``: the model's response, and the
        ``(document, score)`` pairs returned by the similarity search.
    """
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB for the k chunks most relevant to the query.
    results = db.similarity_search_with_score(query_text, k=k)

    # Separate chunks with a rule so their boundaries stay visible in the prompt.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = Ollama(model=model_name)
    response_text = model.invoke(prompt)

    # Chunk IDs come from metadata["id"]; chunks without one are reported as None.
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text, results

In [None]:
# Time one query; keep the answer and the retrieved results returned by
# query_rag.
start_time = time()

query_text = 'What was the net profit of the company?'
response_text, results = query_rag(query_text)

print('\n Time taken: ', time() - start_time)

In [None]:
# Time a query about sustainability initiatives. The return value is
# discarded; query_rag prints the response and sources itself.
start_time = time()

query_text = 'What initiatives were taken by bank towards sustainability?'
query_rag(query_text)

print('\n Time taken: ', time() - start_time)

In [None]:
# Time a summarization query. The return value is discarded; query_rag
# prints the response and sources itself.
start_time = time()

query_text = 'Summarize the financial performance of the bank'
query_rag(query_text)

print('\n Time taken: ', time() - start_time)