### Load documents

In [1]:
import tqdm as notebook_tqdm
from llama_index.core import SimpleDirectoryReader

def load_documents(docs_path):
    """Read every file under ``docs_path`` with ``SimpleDirectoryReader``.

    Prints how many documents were loaded and, when at least one exists,
    a preview of the first.

    Args:
        docs_path: Directory handed to ``SimpleDirectoryReader``.

    Returns:
        Whatever ``SimpleDirectoryReader(docs_path).load_data()`` returns
        (a list of documents in this notebook).
    """
    documents = SimpleDirectoryReader(docs_path).load_data()
    print(f"Loaded {len(documents)} documents")
    # Guard the preview: indexing an empty result would raise IndexError.
    if documents:
        print(f"First document: {documents[0]}")
    return documents

In [2]:
# Directory containing the files to load
docs_path = "./data/docs"
documents = load_documents(docs_path=docs_path)

Loaded 232 documents
First document: Doc ID: 70e45dc8-0321-4586-aad6-f3ae277004d9
Text:


In [3]:
# Report the container type, its length and the type of its elements
container_type = type(documents)
element_type = type(documents[0])
print(f"documents is a {container_type}, of length {len(documents)}, where each element is a {element_type} object")

documents is a <class 'list'>, of length 232, where each element is a <class 'llama_index.core.schema.Document'> object


### Chunking

In [4]:
import re

# Paragraph boundary: a line break, optional whitespace, then at least one more line break
split_pattern = r"\r?\n\s*\r?\n+"


def _mean_or_zero(values):
    """Return the arithmetic mean of ``values``, or 0.0 for an empty sequence."""
    return sum(values) / len(values) if values else 0.0


# Word counts of every paragraph-level chunk and of every whole document text
chunk_word_counts = []
entire_text_word_counts = []

# Running total of non-empty paragraphs across all documents
total_paragraph_count = 0

for doc in documents:
    # Assumes doc.text holds the full text of the document.
    # Split it on blank lines and drop whitespace-only pieces.
    paragraphs = [piece.strip() for piece in re.split(split_pattern, doc.text) if piece.strip()]

    total_paragraph_count += len(paragraphs)
    chunk_word_counts.extend(len(paragraph.split()) for paragraph in paragraphs)

    # Whitespace-separated word count of the whole text
    entire_word_count = len(doc.text.split())
    entire_text_word_counts.append(entire_word_count)

# Summary statistics. Every aggregate falls back to 0 so that an empty document list,
# or documents without any text, report zeros instead of raising
# ZeroDivisionError (sum / len) or ValueError (max of an empty list).
average_paragraph_word_count = _mean_or_zero(chunk_word_counts)
max_paragraph_word_count = max(chunk_word_counts, default=0)
average_entire_text_word_count = _mean_or_zero(entire_text_word_counts)
average_paragraphs_per_document = total_paragraph_count / len(documents) if documents else 0.0

print(f"Average word count for a document: {average_entire_text_word_count}")
print(f"Average word count per paragraph: {average_paragraph_word_count}")
print(f"Longest paragraph: {max_paragraph_word_count}")
print(f"Total number of paragraphs: {total_paragraph_count}")
print(f"Average number of paragraphs per document: {average_paragraphs_per_document}")


Average word count for a document: 127.12068965517241
Average word count per paragraph: 70.05225653206651
Longest paragraph: 320
Total number of paragraphs: 421
Average number of paragraphs per document: 1.8146551724137931


In [5]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter configured with chunk_size=512 and chunk_overlap=20;
# paragraph_separator is not passed, so the splitter's own default applies.
splitter_options = {"chunk_size": 512, "chunk_overlap": 20}
text_splitter = SentenceSplitter(**splitter_options)

# Turn the loaded documents into nodes and report how many were produced
nodes = text_splitter.get_nodes_from_documents(documents)
print(f"Number of nodes: {len(nodes)}")

Number of nodes: 244


In [6]:
# Measure every node in characters and in whitespace-separated words, then report the averages
node_texts = [node.text for node in nodes]
node_char_counts = list(map(len, node_texts))
node_word_counts = [len(text.split()) for text in node_texts]

node_total = len(node_texts)
average_node_char_count = sum(node_char_counts) / node_total
average_node_word_count = sum(node_word_counts) / node_total

print(f"Average character count for a node: {average_node_char_count}")
print(f"Average word count for a node: {average_node_word_count}")

Average character count for a node: 678.0286885245902
Average word count for a node: 120.94672131147541


In [7]:
# Show each metadata field attached to the first node as "key: value"
first_node_metadata = nodes[0].metadata
for key, value in first_node_metadata.items():
    print(f"{key}: {value}")

page_label: 1
file_name: art of simple living.pdf
file_path: c:\Users\ALIMURTA\Documents\Work\Internal\GenAI Tech Workshop Group\Kickoff session\Workshop\workshop\data\docs\art of simple living.pdf
file_type: application/pdf
file_size: 953694
creation_date: 2024-03-28
last_modified_date: 2024-03-17


# Ingestion Pipeline

In [9]:
from llama_index.llms.ollama import Ollama
from llama_index.core.settings import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Model identifiers and the Ollama endpoint used to configure llama-index
ollama_model = "mistral"
ollama_base_url = "http://127.0.0.1:11434"
embedding_model = "sentence-transformers/all-mpnet-base-v2"

# Register the LLM on the global Settings object
Settings.llm = Ollama(
    model=ollama_model,
    base_url=ollama_base_url,
    temperature=0,
    request_timeout=300.0,
)

# Wrap the HuggingFace embedding model so llama-index can use it, then register it
hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
Settings.embed_model = LangchainEmbedding(hf_embeddings)


def load_documents(docs_path):
    """Load the files found under ``docs_path`` and report what was read.

    Prints the number of loaded documents and, unless the result is empty,
    a preview of the first one. Returns the loaded documents unchanged.
    """
    loaded = SimpleDirectoryReader(docs_path).load_data()
    print(f"Loaded {len(loaded)} documents")
    if not loaded:
        # Nothing to preview
        return loaded
    print(f"First document: {loaded[0]}")
    return loaded

In [11]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, VectorStoreIndex


def build_index(client, documents, index_name, chunk_size=512, chunk_overlap=20):
    """Index ``documents`` into a Chroma collection and return the index.

    Args:
        client: Object exposing ``get_or_create_collection`` (a
            ``chromadb.PersistentClient`` in this notebook).
        documents: Documents passed to ``VectorStoreIndex.from_documents``.
        index_name: Name of the Chroma collection to create or reuse.
        chunk_size: ``chunk_size`` given to ``SentenceSplitter``. Defaults to
            512, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to ``SentenceSplitter``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The object returned by ``VectorStoreIndex.from_documents``.
    """
    # Reuse the collection if it exists; otherwise create it, requesting the
    # cosine space through the collection metadata.
    chroma_collection = client.get_or_create_collection(index_name, metadata={"hnsw:space": "cosine"})
    print(f"Created/existing collection {chroma_collection}")

    # Route llama-index storage through the Chroma collection
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    index = VectorStoreIndex.from_documents(
        documents=documents,
        transformations=[splitter],
        storage_context=storage_context,
        show_progress=True,
    )
    return index

In [12]:
import chromadb

# save documents to vector store
def run_ingestion_pipeline(vectoredb_path, docs_path, index_name="test"):
    """Load documents from disk and index them into a persistent Chroma store.

    Args:
        vectoredb_path: Path given to ``chromadb.PersistentClient``. The
            parameter name keeps its original spelling so existing keyword
            callers continue to work.
        docs_path: Directory passed to ``load_documents``.
        index_name: Collection name passed to ``build_index``. Defaults to
            ``"test"``, the value that used to be hard-coded here.

    Returns:
        The index returned by ``build_index``.
    """
    print("Connecting to ChromaDB...")
    chromadb_client = chromadb.PersistentClient(path=vectoredb_path)

    print("Loading documents...")
    documents = load_documents(docs_path)

    print("Building index...")
    index = build_index(
        client=chromadb_client,
        documents=documents,
        index_name=index_name,
    )

    return index

In [13]:
# Locations of the persistent Chroma store and of the source documents
vectordb_path = "./data/vectordb"
docs_path = "./data/docs"

# Bare call as the final expression so the notebook displays the returned index.
# The first parameter is spelled `vectoredb_path` in the function signature.
run_ingestion_pipeline(vectoredb_path=vectordb_path, docs_path=docs_path)

Connecting to ChromaDB...
Loading documents...
Loaded 232 documents
First document: Doc ID: 9ec08f43-646f-46bc-8406-81267d8beb60
Text:
Building index...
Created/existing collection name='test' id=UUID('e59cbf19-b47f-48e8-8f01-550bdf6a3579') metadata={'hnsw:space': 'cosine'} tenant='default_tenant' database='default_database'


Parsing nodes:   0%|          | 0/232 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/244 [00:00<?, ?it/s]

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x211cc37d690>

# Retrieval Pipeline

In [14]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

def retrieve_from_index(chromadb_client, index_name):
    """Build a ``VectorStoreIndex`` on top of an existing Chroma collection.

    The collection named ``index_name`` is fetched from ``chromadb_client``
    (or created when it does not exist), wrapped in a ``ChromaVectorStore``
    and handed to ``VectorStoreIndex.from_vector_store``.
    """
    collection = chromadb_client.get_or_create_collection(
        index_name,
        metadata={"hnsw:space": "cosine"},
    )
    return VectorStoreIndex.from_vector_store(
        ChromaVectorStore(chroma_collection=collection),
        show_progress=True,
    )

In [17]:
import chromadb
from llama_index.llms.ollama  import Ollama
from llama_index.core.query_engine import CitationQueryEngine
# from llama_index.core.settings import Settings


# Settings for the retrieval step.
# NOTE(review): docs_path, chunk_size and embedding_model are assigned here
# but not read anywhere in this cell.
vectordb_path = "./data/vectordb"
docs_path = "./data"
index_name = "test"
chunk_size = 500
ollama_model = "mistral"
ollama_base_url = "http://127.0.0.1:11434"
embedding_model = "sentence-transformers/all-mpnet-base-v2"

# Open the Chroma store persisted at vectordb_path
print("Connecting to Chromadb")
chromadb_client = chromadb.PersistentClient(path=vectordb_path)

# LLM handed explicitly to the query engine below
print("Loading Ollama...")
ollama_options = {
    "model": ollama_model,
    "base_url": ollama_base_url,
    "temperature": 0,
    "request_timeout": 500.0,
}
llm = Ollama(**ollama_options)

print("Retrieving index...")
index = retrieve_from_index(chromadb_client=chromadb_client, index_name=index_name)

# Citation-aware query engine over the retrieved index, using the top 3 matches
print("Constructing query engine...")
query_engine = CitationQueryEngine.from_args(index=index, llm=llm, similarity_top_k=3, verbose=True)

Connecting to Chromadb
Loading Ollama...
Retrieving index...
Constructing query engine...


In [20]:
# Ask the query engine a sample question and print its answer
question = "What is Buddhism?"
response = query_engine.query(question)
print(response)

 Buddhism is a Japanese philosophy derived from the Sanskrit word "dhyana," meaning meditation (Source 1, D. T. Suzuki). It emphasizes experiencing life in the present moment and removing dualistic distinctions between "I" and "you," as well as our spiritual and everyday activities (Source 1). The practice of Zen Buddhism involves learning through physical labor, such as cleaning, which strengthens the mind and enables us to wake up and become more aware (Source 2). Belief in an unconditional protector, the Buddha, infuses us with tremendous energy and encourages us to keep going despite challenges (Source 3).


#### Testing the prompt

In [19]:
from IPython.display import Markdown, display

# define prompt viewing function
def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict``.

    For each ``key -> prompt`` pair this renders a Markdown header with the
    key, prints the text returned by ``prompt.get_template()``, and renders a
    Markdown spacer.
    """
    for prompt_key, prompt in prompts_dict.items():
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        # Plain print keeps the template text unrendered
        print(prompt.get_template())
        display(Markdown("<br><br>"))


# Fetch the prompt templates held by the query engine and show them
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict=prompts_dict)

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Please provide an answer based solely on the provided sources. When referencing information from a source, cite the appropriate source(s) using their corresponding numbers. Every answer should include at least one source citation. Only cite a source when you are explicitly referencing it. If none of the sources are helpful, you should indicate that. For example:
Source 1:
The sky is red in the evening and blue in the morning.
Source 2:
Water is wet when the sky is red.
Query: When is water wet?
Answer: Water will be wet when the sky is red [2], which occurs in the evening [1].
Now it's your turn. Below are several numbered sources of information:
------
{context_str}
------
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

Please provide an answer based solely on the provided sources. When referencing information from a source, cite the appropriate source(s) using their corresponding numbers. Every answer should include at least one source citation. Only cite a source when you are explicitly referencing it. If none of the sources are helpful, you should indicate that. For example:
Source 1:
The sky is red in the evening and blue in the morning.
Source 2:
Water is wet when the sky is red.
Query: When is water wet?
Answer: Water will be wet when the sky is red [2], which occurs in the evening [1].
Now it's your turn. We have provided an existing answer: {existing_answer}Below are several numbered sources of information. Use them to refine the existing answer. If the provided sources are not helpful, you will repeat the existing answer.
Begin refining!
------
{context_msg}
------
Query: {query_str}
Answer: 


<br><br>