In [1]:
import os
from huggingface_hub import hf_hub_download
from pathlib import Path
from time import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parent of the current working directory (presumably the repository root
# when the notebook is started from its own sub-folder - TODO confirm).
repo_dir = Path.cwd().parent


# Download a Hugging Face model & make an Ollama modelfile
* Install the Hugging Face CLI - [Hugging Face Hub](https://huggingface.co/docs/huggingface_hub/main/en/guides/cli)
* Log in to Hugging Face - `huggingface-cli login --token $HUGGINGFACE_TOKEN`
* Verify the login - `huggingface-cli whoami`
* Download an LLM - specifically a GGUF one - [GGUF model download](https://www.youtube.com/watch?v=7BH4C6-HP14)
* Write a `modelfile`: `FROM ./huggingface_models/mistral-7b-instruct-v0.2.Q4_K_M.gguf`
* Create a model: `ollama create mistrallite -f mistral_lite_modelfile`

* First, we have our original data source, the PDFs.
* This data is going to be split into small chunks and then transformed into an embedding and stored inside of the vector database.
* Then when we want to ask a question, we'll also turn our query into an embedding.
* This will let us fetch the most relevant entries from the database.
* We can then use those entries together in a prompt and that's how we get our final response.

# Load Docs

In [3]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

# Default directory that holds the source PDFs.
# NOTE: this is a machine-specific absolute path; pass ``data_path`` to
# ``load_documents`` to read from a different directory.
DATA_PATH = r'F:\cc_data\SB'

def load_documents(data_path=None):
    """Load the PDFs found in a directory via ``PyPDFDirectoryLoader``.

    Args:
        data_path: Directory to read from. When omitted (``None``) the
            module-level ``DATA_PATH`` is used, looked up at call time, so
            existing ``load_documents()`` calls behave exactly as before.

    Returns:
        The result of ``PyPDFDirectoryLoader(...).load()``. The rest of the
        notebook reads ``page_content`` and ``metadata`` from each item.
    """
    if data_path is None:
        data_path = DATA_PATH
    return PyPDFDirectoryLoader(data_path).load()

In [4]:
# Load every PDF page and report how long the load took.
start_time = time()
documents = load_documents()
elapsed_seconds = time() - start_time

print('\n Time taken: ', elapsed_seconds)



 Time taken:  11.895373582839966


* So each document is basically an object containing the text content of each page in the PDF. 
* It also has some metadata attached, which tells you the page number and the source of the text.

In [5]:
# Gather the raw text of every loaded page.
page_contents = [page.page_content for page in documents]

# Print the text of the first page as a quick sanity check.
first_page_content = page_contents[0]
print(first_page_content)

 
Scotiabank  First  Quarter  Press  Release  2024    1  
 
First  Quarter  2024  Earnings  Release  
 
Scotiabank  reports  first  quarter  results  
 
All amounts  are in Canadian  dollars  and  are based  on our unaudited  Interim  Condensed  Consolidated  Financial  Statements  for the quarter  ended  January  31, 2024  and  
related  notes  prepared  in accordance  with  International  Financial  Reporting  Standards  (IFRS)  as issued  by the International  Accounting  Standards  Board  (IASB),  unless  
otherwise  noted.  Our  complete  First Quarter  2024  Report  to Shareholders,  including  our unaudited  interim  financial  statements  for the period  ended  January  31, 2024,  can 
also  be found  on the SEDAR+  website  at www.sedarplus.ca  and  on the EDG AR section  of the SEC’s  website  at www.sec.gov . Supplementary  Financial  Information  is also  
available,  together  with  the First  Quarter  2024  Report  to Shareholders  on the Investor  Relations  page  at www

## Chunk

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(documents: list[Document]):
    """Split the loaded documents into smaller chunks.

    Args:
        documents: The documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        a splitter configured with ``chunk_size=800``, ``chunk_overlap=80``,
        lengths measured by ``len`` and plain (non-regex) separators.
    """
    splitter_options = dict(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [7]:
# Time loading + chunking together and preview the first chunk.
start_time = time()

# NOTE(review): the PDFs are loaded again here, so the timing below covers
# both the load and the split.
documents = load_documents()
chunks = split_documents(documents)
first_chunk = chunks[0]
print(first_chunk)

elapsed_seconds = time() - start_time
print('\n Time taken: ', elapsed_seconds)

page_content='Scotiabank  First  Quarter  Press  Release  2024    1  
 
First  Quarter  2024  Earnings  Release  
 
Scotiabank  reports  first  quarter  results  
 
All amounts  are in Canadian  dollars  and  are based  on our unaudited  Interim  Condensed  Consolidated  Financial  Statements  for the quarter  ended  January  31, 2024  and  
related  notes  prepared  in accordance  with  International  Financial  Reporting  Standards  (IFRS)  as issued  by the International  Accounting  Standards  Board  (IASB),  unless  
otherwise  noted.  Our  complete  First Quarter  2024  Report  to Shareholders,  including  our unaudited  interim  financial  statements  for the period  ended  January  31, 2024,  can' metadata={'source': 'F:\\cc_data\\SB\\Q124_Quarterly_Press_Release-EN.pdf', 'page': 0}

 Time taken:  11.88722825050354


## Chunk IDs

We'll use the source path, the page number, and then the chunk number of that page.

In [8]:
def calculate_chunk_ids(chunks, source_marker="SB"):
    """Store an ID of the form ``<source>:<page>:<chunk index>`` on each chunk.

    The ID is written to ``chunk.metadata["id"]``. ``<source>`` is the
    chunk's ``source`` metadata trimmed so that it starts at the first
    occurrence of ``source_marker``; ``<page>`` is its ``page`` metadata;
    ``<chunk index>`` counts consecutive chunks that share the same
    ``<source>:<page>`` and restarts at 0 whenever that pair changes.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict.
        source_marker: Substring from which the source path is kept.
            Defaults to ``"SB"``, the value that used to be hard-coded.

    Returns:
        The same ``chunks`` object, with every chunk's metadata updated in
        place.
    """
    previous_page_id = None
    chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")

        # Trim only when there is a string to trim. A missing source is left
        # untouched instead of raising AttributeError.
        if isinstance(source, str) and source_marker:
            marker_pos = source.find(source_marker)
            # str.find returns -1 on a miss. Slicing with -1 would keep just
            # the last character of the path, so unrelated files would end up
            # with identical IDs. Keep the full path in that case.
            if marker_pos != -1:
                source = source[marker_pos:]

        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # Consecutive chunks of the same page get increasing indices.
        chunk_index = chunk_index + 1 if page_id == previous_page_id else 0
        previous_page_id = page_id

        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

# Embedding Functions & VectorDB

In [9]:
# function returns embedding function
# used at 2 places - 
# The first is going to be when we create the database itself. 
# And the second is when we actually want to query the database

from langchain_community.embeddings.ollama import OllamaEmbeddings
# from langchain_community.embeddings.bedrock import BedrockEmbeddings


def get_embedding_function():
    """Build the embedding model used by the vector store.

    Returns:
        An ``OllamaEmbeddings`` instance for the ``nomic-embed-text`` model.
    """
    # Ollama keeps everything local. A hosted alternative would be
    # BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1").
    return OllamaEmbeddings(model="nomic-embed-text")

In [10]:
# Directory passed to Chroma as ``persist_directory``; a relative path.
CHROMA_PATH = "chroma_SB"


In [11]:
from langchain.vectorstores.chroma import Chroma

def add_to_chroma(chunks: list[Document]):
    """Add to the Chroma store every chunk whose ID is not stored yet.

    IDs are assigned with ``calculate_chunk_ids``; chunks whose ID is already
    present in the store are skipped.

    Args:
        chunks: The chunks to index. Their metadata gains an ``id`` entry.
    """
    # Load the existing database.
    vector_store = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    identified_chunks = calculate_chunk_ids(chunks)

    # An empty ``include`` list skips the payload; the IDs still come back.
    stored_items = vector_store.get(include=[])
    known_ids = set(stored_items["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks the store has not seen yet.
    pending_chunks = [
        candidate
        for candidate in identified_chunks
        if candidate.metadata["id"] not in known_ids
    ]

    if not pending_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending_chunks)}")
    pending_ids = [candidate.metadata["id"] for candidate in pending_chunks]
    vector_store.add_documents(pending_chunks, ids=pending_ids)
    vector_store.persist()

In [None]:
# Index the chunks and report how long it took.
start_time = time()
add_to_chroma(chunks)

elapsed_seconds = time() - start_time
print('\n Time taken: ', elapsed_seconds)

  db = Chroma(


Number of existing documents in DB: 0
👉 Adding new documents: 278


In [None]:
import shutil
import os
def clear_database():
    """Remove the directory at ``CHROMA_PATH`` and its contents, if it exists."""
    if not os.path.exists(CHROMA_PATH):
        # Nothing on disk, nothing to delete.
        return
    shutil.rmtree(CHROMA_PATH)

In [None]:
# clear_database()

# Running RAG 

In [None]:
# Prompt template for the RAG query. ``{context}`` and ``{question}`` are the
# two placeholders filled in by ``query_rag``.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

def query_rag(query_text: str):
    """Answer a question from the chunks stored in Chroma.

    The five best matches for ``query_text`` are fetched from the vector
    store, joined into one context block, placed into ``PROMPT_TEMPLATE``
    and sent to the ``mistrallite:latest`` Ollama model. The answer and the
    IDs of the chunks used are printed.

    Args:
        query_text: The question to answer.

    Returns:
        A ``(response_text, results)`` tuple, where ``results`` is whatever
        ``similarity_search_with_score`` returned ((document, score) pairs).
    """
    # Prepare the DB.
    vector_store = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # The k most relevant chunks for the query, each paired with its score.
    scored_chunks = vector_store.similarity_search_with_score(query_text, k=5)
    retrieved_docs = [doc for doc, _score in scored_chunks]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    response_text = Ollama(model="mistrallite:latest").invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc in retrieved_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text, scored_chunks

In [None]:
# Ask one question against the indexed PDFs and time the round trip.
start_time = time()

query_text = 'What was the net profit of the company?'
response_text, results = query_rag(query_text)

elapsed_seconds = time() - start_time
print('\n Time taken: ', elapsed_seconds)

# Test LLM

In [20]:
# Prompt template used by ``query_and_validate``. ``{expected_response}`` and
# ``{actual_response}`` are its two placeholders, and the model is asked to
# reply with 'true' or 'false'.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 
"""

In [21]:
def query_and_validate(question: str, expected_response: str):
    """Ask the RAG pipeline a question and have the LLM grade the answer.

    The answer from ``query_rag`` and ``expected_response`` are placed into
    ``EVAL_PROMPT``, which is sent to the ``mistrallite:latest`` Ollama
    model. The first whole word ``true`` or ``false`` in the model's reply
    (case-insensitive) is taken as the verdict.

    Args:
        question: The question passed to ``query_rag``.
        expected_response: The reference answer to compare against.

    Returns:
        ``True`` when the verdict is ``true``, ``False`` when it is ``false``.

    Raises:
        ValueError: If the reply contains neither ``true`` nor ``false``.
    """
    import re  # local import: only this function needs it

    rag_output = query_rag(question)
    # query_rag returns (response_text, results). Only the text belongs in
    # the evaluation prompt; formatting the whole tuple would leak the
    # retrieved documents into it. A bare string is still accepted.
    response_text = rag_output[0] if isinstance(rag_output, tuple) else rag_output
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    model = Ollama(model="mistrallite:latest")
    evaluation_results_str = model.invoke(prompt)
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    # Use the FIRST verdict word. A plain substring test for "true" would
    # misread a reply such as "false, it is not true that ..." as a pass.
    verdict = re.search(r"\b(true|false)\b", evaluation_results_str_cleaned)
    if verdict is None:
        raise ValueError(
            "Invalid evaluation result. Cannot determine if 'true' or 'false'."
        )

    if verdict.group(1) == "true":
        # Print response in Green if it is correct.
        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return True

    # Print response in Red if it is incorrect.
    print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
    return False

In [22]:
# Graded check: starting money in Monopoly.
# NOTE(review): this question is about a board game, while the store built
# above is indexed from the PDFs under DATA_PATH - confirm the intended data.
start_time = time()

assert query_and_validate(
    question="How much total money does a player start with in Monopoly? (Answer with the number only)",
    expected_response="$1500",
)

elapsed_seconds = time() - start_time
print('\n Time taken: ', elapsed_seconds)

Response:  A player starts with $1,500 in Monopoly.
Sources: ['data\\monopoly.pdf:0:0', 'data\\monopoly.pdf:0:1', 'data\\monopoly.pdf:2:0', 'data\\monopoly.pdf:1:2', 'data\\monopoly.pdf:2:1']

Expected Response: $1500
Actual Response:  A player starts with $1,500 in Monopoly.
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 

[92mResponse: true. the actual response describes a situation where a player starts the game of monopoly with $1,500, which is equivalent to the expected response of having a starting amount of $1,500.[0m

 Time taken:  27.884098768234253


In [23]:
# Graded check: longest-train bonus in Ticket to Ride.
# NOTE(review): this question is about a board game, while the store built
# above is indexed from the PDFs under DATA_PATH - confirm the intended data.
start_time = time()

assert query_and_validate(
    question="How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)",
    expected_response="10 points",
)

elapsed_seconds = time() - start_time
print('\n Time taken: ', elapsed_seconds)

Response:  The longest continuous train gets a bonus of 10 points.
Sources: ['data\\ticket_to_ride.pdf:3:3', 'data\\ticket_to_ride.pdf:1:3', 'data\\ticket_to_ride.pdf:3:2', 'data\\ticket_to_ride.pdf:0:1', 'data\\ticket_to_ride.pdf:3:1']

Expected Response: 10 points
Actual Response:  The longest continuous train gets a bonus of 10 points.
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 

[92mResponse: true. in the given context, both the expected and actual responses mean that a bonus of 10 points is given for having the longest continuous train.[0m

 Time taken:  24.800504207611084
