In [1]:
import os
from pathlib import Path
from time import time

In [2]:
# Parent of the current working directory.
# NOTE(review): presumably the repository root when the notebook is run from a
# sub-folder — confirm.
repo_dir = Path.cwd().parent


In [3]:
from langchain_community.llms.ollama import Ollama

# Send a single prompt directly to the Ollama-served "mistrallite:latest" model.
# No retrieved context is supplied here; the prompt is passed as-is.
llm = Ollama(model="mistrallite:latest")

prompt = 'What is the profit firm earned'
# The reply is stored in response_text; this cell does not display it.
response_text = llm.invoke(prompt)

In [4]:
from llama_index.core import VectorStoreIndex
# from llama_index.readers.download_loader import download_loader


In [6]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

def get_embedding_function():
    """Return the embedding model shared by indexing and querying.

    Currently an ``OllamaEmbeddings`` instance for the "nomic-embed-text"
    model (the fully local option). A Bedrock-based alternative was sketched
    here previously:

        BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")
    """
    return OllamaEmbeddings(model="nomic-embed-text")

In [7]:
# Chroma is the vector store used here; Pinecone is a similar alternative.
# The document is split into chunks, embedded, and loaded into Chroma for querying later.
# This allows similarity search over the stored chunks using metrics
# such as Euclidean distance.

from langchain.vectorstores.chroma import Chroma
from langchain.document_loaders import PyPDFLoader

# Time how long it takes to read the PDF and split it into Document objects.
start_time = time()

# NOTE(review): absolute, machine-specific path — adjust for your environment.
loader = PyPDFLoader(r'F:\cc_data\annualreport.pdf')
# Load the PDF and split it; the resulting documents are kept in `pages`.
pages = loader.load_and_split()
print('\n Time taken: ', time() - start_time)

# Alternative kept for reference: load the documents straight into a Chroma
# collection (a vector database).
# store = Chroma.from_documents(pages, collection_name='annual_report')



 Time taken:  24.381906032562256


In [8]:
# Display the second element of `pages` to inspect its metadata and text.
pages[1]


Document(metadata={'source': 'F:\\cc_data\\annualreport.pdf', 'page': 1}, page_content='2022 Annual General Meeting\nMacquarie Group Limited’s 2022 AGM will be held at 10:30 am on \nThursday, 28 July 2022.\nDetails of the meeting will be sent to shareholders separately.\nCover image\nMacquarie manages investments in infrastructure integral to \nthe US economy. Long Beach Container Terminal is part of the \ncombined port complex of Los Angeles and Long Beach, the \nlargest cargo gateway in North America.\nMacquarie is a global financial \nservices group operating in \n33\xa0markets in asset management, \nretail and business banking, wealth \nmanagement, leasing and asset \nfinancing, market access, commodity \ntrading, renewables development, \nspecialist advice, access to capital \nand principal investment.')

In [9]:
def calculate_chunk_ids(chunks):
    """Assign an ID to every chunk under ``metadata["id"]`` and return the chunks.

    IDs have the form ``"<file name>:<page>:<chunk index>"``, for example
    ``"annualreport.pdf:6:2"``. The chunk index counts consecutive chunks that
    share the same file name and page, starting at 0.

    The file name is the last path component of ``metadata["source"]``. Both
    forward slashes and backslashes are treated as separators, so the same
    document gets the same IDs regardless of the operating system or the
    directory it was loaded from. A missing source yields an empty file name.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict with
            ``"source"`` and ``"page"`` entries. Chunks belonging to the same
            page must be adjacent, because the index restarts whenever the
            file/page pair changes.

    Returns:
        The same ``chunks`` object; each element's metadata is updated in place.
    """
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = str(chunk.metadata.get("source") or "")
        # Normalise Windows separators first so one split handles both styles.
        file_name = source.replace("\\", "/").rsplit("/", 1)[-1]
        page = chunk.metadata.get("page")

        current_page_id = f"{file_name}:{page}"

        # Same page as the previous chunk: next index; otherwise restart at 0.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        last_page_id = current_page_id

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"

    return chunks

In [10]:
# Directory handed to Chroma as `persist_directory`; clear_database() deletes it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
CHROMA_PATH = r'F:\cc_data\chroma_AnnualReport'

In [11]:
from langchain.vectorstores.chroma import Chroma
from langchain.schema.document import Document

def add_to_chroma(chunks: list[Document]):
    """Add to the Chroma store at CHROMA_PATH every chunk whose ID is not stored yet.

    Each chunk is first given an ID via ``calculate_chunk_ids``. Chunks whose
    ID is already present in the store are skipped, so calling this again with
    the same chunks adds nothing. Progress is reported with ``print``.

    Args:
        chunks: Documents to index; their metadata gains an ``"id"`` entry.
    """
    # Open the store backed by the persist directory.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # include=[] requests no extra fields; IDs are always included by default.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks the store has not seen.
    fresh_chunks = [
        chunk for chunk in tagged_chunks
        if chunk.metadata["id"] not in known_ids
    ]

    if not fresh_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(fresh_chunks)}")
    fresh_ids = [chunk.metadata["id"] for chunk in fresh_chunks]
    store.add_documents(fresh_chunks, ids=fresh_ids)
    store.persist()

In [12]:
# Index `pages` into the Chroma store and report the elapsed wall-clock seconds.
start_time = time()
add_to_chroma(pages)

print('\n Time taken: ', time() - start_time)

  db = Chroma(


Number of existing documents in DB: 0
👉 Adding new documents: 403

 Time taken:  874.8912787437439


  db.persist()


In [13]:
import shutil
import os
def clear_database():
    """Delete the directory tree at CHROMA_PATH; do nothing if the path does not exist."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [14]:
# Uncomment to delete the on-disk Chroma directory at CHROMA_PATH:
# clear_database()

In [15]:
# Prompt skeleton with two placeholders: {context} receives the retrieved text
# and {question} receives the user's query (both filled in by query_rag).
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [22]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

def query_rag(query_text: str, k: int = 5, model_name: str = "mistrallite:latest"):
    """Answer a question using chunks retrieved from the Chroma store.

    The ``k`` chunks most similar to ``query_text`` are fetched from the store
    at CHROMA_PATH, joined into a context block, inserted into PROMPT_TEMPLATE
    together with the question, and sent to an Ollama model. The response and
    the IDs of the chunks used are printed.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve. Defaults to 5.
        model_name: Ollama model used to generate the answer. Defaults to
            "mistrallite:latest".

    Returns:
        A tuple ``(response_text, results)`` where ``results`` is the list of
        ``(document, score)`` pairs returned by the similarity search.
    """
    # Open the store with the same embedding function used for indexing.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Retrieve the k chunks most relevant to the query.
    results = db.similarity_search_with_score(query_text, k=k)

    # Separate chunks with a visible rule so the model can tell them apart.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = Ollama(model=model_name)
    response_text = model.invoke(prompt)

    # Chunks without an "id" in their metadata are reported as None.
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text, results

In [23]:
# Run one question through query_rag and time it. The answer text and the
# retrieved (document, score) pairs are kept for inspection.
start_time = time()

query_text = 'What was the net profit of the company?'
response_text, results = query_rag(query_text)

print('\n Time taken: ', time() - start_time)

Response:  The net profit of the company for FY2022 was $A4,706 million.
Sources: ['annualreport.pdf:164:0', 'annualreport.pdf:163:0', 'annualreport.pdf:159:0', 'annualreport.pdf:23:0', 'annualreport.pdf:15:0']

 Time taken:  21.364450454711914


In [24]:
# Display the (document, score) pairs retrieved for the most recent captured query.
results

[(Document(metadata={'id': 'annualreport.pdf:164:0', 'page': 164, 'source': 'F:\\cc_data\\annualreport.pdf'}, page_content='163Macquarie Group Limited and its subsidiaries\u20032022 Annual ReportFurther Information Directors’ Report Governance About Financial Report\nNote 3 \nSegment reporting  continued\n(i) Operating segments  continued\nThe following is an analysis of the Consolidated Entity’s revenue and results by reportable segment:\nMacquarie Asset \nManagementBanking and  \nFinancial ServicesCommodities and  \nGlobal Markets Macquarie Capital Corporate Total\n$m $m $m $m $m $m\nCONSOLIDATED 2022\nNet interest and trading (expense)/income (241) 1,972 4,732 327 66 6,856\nFee and commission income/(expense) 4,063 457 507 1,893 (33) 6,887\nNet operating lease income 63 – 335 – 4 402\nShare of net profits/(losses) from associates and joint\xa0ventures 268 (2) 40 (67) 1 240\nNet other operating income:\nCredit and other impairment reversals/(charges) 112 22 (65) (573) (5) (509)\nOthe

In [25]:
# Run a question through query_rag and time it. The returned tuple is not
# captured; query_rag prints the response and its sources itself.
start_time = time()

query_text = 'What initiatives were taken by bank towards sustainability?'
query_rag(query_text)

print('\n Time taken: ', time() - start_time)

Response:  Macquarie, a global financial institution, has taken several initiatives towards sustainability in response to the challenge of climate change. Here are some of the key areas of their approach:

1. Developing and investing in green energy: Macquarage is committed to growing renewable energy capacity and has developed, constructed, operated, and financed over 50 GW of green energy projects around the world. They have invested $A2.3 billion into green energy projects in the past five years, including some that are no longer on their balance sheet but continue to generate green energy.
2. Creating climate resilient infrastructure: Macquarie is focusing on climate resilience and adaptation by investing in climate-specific risk assessments, revised operating procedures, physical enhancements, industrial technologies, and training for its portfolio companies, properties, and farmland. They are also designing new infrastructure projects with climate resilience in mind to deal with 

In [26]:
# Run a question through query_rag and time it. The returned tuple is not
# captured; query_rag prints the response and its sources itself.
start_time = time()

query_text = 'Summarize the financial performance of the bank'
query_rag(query_text)

print('\n Time taken: ', time() - start_time)

Response:  Based on the information provided in the text, the bank's financial performance for the year ended March 31, 2022, is summarized as follows:

The bank reported a net operating income of $17,324 million for the year, which represents a 36% increase from the previous year's net operating income of $12,774 million. Operating expenses also increased by 22%, from $8,867 million in the prior year to $10,785 million in the current year. However, income tax expense increased significantly, from $899 million in the previous year to $1,586 million in the current year, leading to an overall profit attributable to ordinary equity holders of $4,706 million, a 56% increase from the prior year's profit of $3,015 million.

It is important to note that this summary only provides information about the bank's net operating income, operating expenses, and income tax expense. The text also mentions the importance of assessing the business model, including whether sales activity is consistent wit