In [1]:
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index import VectorStoreIndex, SimpleDirectoryReader, OpenAIEmbedding, set_global_service_context, ServiceContext
import chromadb
from llama_index import download_loader
from dotenv import load_dotenv, find_dotenv
import llama_index

import os
from dotenv import load_dotenv
import openai
from llama_index.llms import OpenAI

load_dotenv()  # This loads the environment variables from .env
llama_index.set_global_handler("simple")

llm = OpenAI(
            # temperature=0.1, 
             # model="gpt-4-1106-preview"
            model="gpt-3.5-turbo-1106"
)

service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context)

openai.api_key = os.environ.get("OPENAI_API_KEY")

In [9]:
WholeSiteReader = download_loader("WholeSiteReader")
# Initialize the scraper with a prefix URL and maximum depth
scraper = WholeSiteReader(
    prefix='https://www.paulgraham.com/users.html', # Example prefix
    max_depth=10
)

# Start scraping from a base URL
documents = scraper.load_data(base_url='https://www.paulgraham.com/articles.html') # Example base URL
index = VectorStoreIndex.from_documents(documents)

Downloading loader from Llama Hub... https://raw.githubusercontent.com/an-bluecat/llama-hub/main/llama_hub
Visiting: https://www.paulgraham.com/articles.html, 0 left
Found 224 new potential links
Visiting: https://www.paulgraham.com/users.html, 0 left
Found 4 new potential links


In [8]:
index.as_query_engine().query("whats PG's advice on talking to users?")

** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
URL: https://www.paulgraham.com/users.html

September 2022

I recently told applicants to Y Combinator that the best advice I could give for getting in, per word, was
Explain what you've learned from users.
That tests a lot of things: whether you're paying attention to users, how well you understand them, and even how much they need what you're making.

Afterward I asked myself the same question. What have I learned from YC's users, the startups we've funded?

The first thing that came to mind was that most startups have the same problems. No two have exactly

Response(response="PG's advice on talking to users is to explain what you've learned from them. This advice tests various aspects, such as whether you're paying attention to users, how well you understand them, and even how much they need what you're making.", source_nodes=[NodeWithScore(node=TextNode(id_='aa29c87a-0e3b-4054-9e1b-ef9e4bf2c03f', embedding=None, metadata={'URL': 'https://www.paulgraham.com/users.html'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='19ab1e29-d855-43ae-a997-c290c0e94d1a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'URL': 'https://www.paulgraham.com/users.html'}, hash='d6d778500a6b14dcc2972e64d1a10d0d215b275e1732e648d131912be8ff2255'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='b5e601ac-938c-44bc-9349-e2d9c5e8f162', node_type=<ObjectType.TEXT: '1'>, metadata={'URL': 'https://www.paulgraham.com/articles.html'}, hash='55445c39f0fe069416d05c096b46113c

## load documents

In [13]:

# UnstructuredURLLoader = download_loader("UnstructuredURLLoader")

# urls = [
#     "https://docs.splunk.com/Documentation/SplunkCloud/9.1.2308/ReleaseNotes/EdgeProcessor",
#     #  "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023",
# ]

# loader = UnstructuredURLLoader(
#     urls=urls, continue_on_failure=False, headers={"User-Agent": "value"})
# # loader.load()


# # load some documents
# # documents = SimpleDirectoryReader("./data").load_data()
# documents = loader.load_data()
# print(documents)


[Document(id_='a4865345-47f5-4bf5-91e5-04e55138b5a1', embedding=None, metadata={'source': 'https://docs.splunk.com/Documentation/SplunkCloud/9.1.2308/ReleaseNotes/EdgeProcessor'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='c5d53220743ba10e9aa76bbe34528d3197e2dfd7ed84a8d70b9108de12c53f19', text='logo\n\nSupport\n\t            \n\t         \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\n\t                       \n\t                           Support Portal\n\t                       \n\n\n\n\t                \n\t\t\t

Visiting: https://docs.llamaindex.ai/en/stable/


In [54]:




from BFS_web_scraper_reader import BFSWebScraperReader
scraper = BFSWebScraperReader(
    prefix='https://docs.llamaindex.ai/en/stable/'
    # 'https://www.paulgraham.com/'
    , max_depth=10)

# Start scraping from a base URL
documents = scraper.load_data(base_url='https://docs.llamaindex.ai/en/stable/'
                              # 'https://www.paulgraham.com/articles.html'
                             )
# print(documents)

Visiting: https://docs.llamaindex.ai/en/stable/, 0 left
Found 1022 new links
Visiting: https://docs.llamaindex.ai/en/stable/, 966 left
Found 55 new links
Visiting: https://docs.llamaindex.ai/en/stable/, 965 left
Found 55 new links
Visiting: https://docs.llamaindex.ai/en/stable/getting_started/installation.html, 964 left
Found 48 new links
Visiting: https://docs.llamaindex.ai/en/stable/getting_started/reading.html, 966 left
Found 46 new links
Visiting: https://docs.llamaindex.ai/en/stable/getting_started/starter_example.html, 965 left
Found 45 new links
Visiting: https://docs.llamaindex.ai/en/stable/getting_started/concepts.html, 964 left
Found 44 new links
Visiting: https://docs.llamaindex.ai/en/stable/getting_started/customization.html, 963 left
Found 43 new links
Visiting: https://docs.llamaindex.ai/en/stable/getting_started/discover_llamaindex.html, 962 left
Found 53 new links
Visiting: https://docs.llamaindex.ai/en/stable/use_cases/q_and_a.html, 961 left
Found 43 new links
Visiting

## Init DB


In [55]:

db = chromadb.PersistentClient(path="./chroma_db")

# create collection
chroma_collection = db.get_or_create_collection("llamaindex")
# chroma_collection = db.get_or_create_collection("PaulG")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)



In [56]:
db.list_collections()
# chroma_collection.get()

[Collection(name=edge_processor),
 Collection(name=llamaindex),
 Collection(name=PaulG)]

In [74]:
chroma_collection.get()
print(len(documents))
print(documents[3].text)
print(documents[0])

990
LlamaIndex 🦙 0.9.22
Search
GETTING STARTED
Installation and Setup
How to read these docs
Starter Tutorial
High-Level Concepts
Customization Tutorial
Discover LlamaIndex Video Series
USE CASES
Q&A
Chatbots
Agents
Structured Data Extraction
Multi-modal
Toggle child pages in navigation
UNDERSTANDING
Building an LLM application
Using LLMs
Toggle child pages in navigation
Loading Data (Ingestion)
Toggle child pages in navigation
Indexing
Storing
Querying
Putting It All Together
Toggle child pages in navigation
Tracing and Debugging
Evaluating
Toggle child pages in navigation
OPTIMIZING
Basic Strategies
Toggle child pages in navigation
Advanced Retrieval Strategies
Toggle child pages in navigation
Agentic strategies
Toggle child pages in navigation
Evaluation
Toggle child pages in navigation
Fine-tuning
Toggle child pages in navigation
Building Performant RAG Applications for Production
Toggle child pages in navigation
Building RAG from Scratch (Lower-Level)
Toggle child pages in navigat

In [58]:
# save it into local storage
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, show_progress=True
)

Parsing nodes:   0%|          | 0/990 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/3985 [00:00<?, ?it/s]

In [11]:
# load from local storage
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context, show_progress=True
)




In [69]:
index

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x120e56370>

In [70]:
# create a query engine and query

# from llama_index.llms import ChatMessage, MessageRole
# from llama_index.prompts import ChatPromptTemplate

# # Text QA Prompt
# chat_text_qa_msgs = [
#     ChatMessage(
#         role=MessageRole.SYSTEM,
#         content=(
#             "Always answer the question, even if the context isn't helpful."
#         ),
#     ),
#     ChatMessage(
#         role=MessageRole.USER,
#         content=(
#             "Context information is below.\n"
#             "---------------------\n"
#             "{context_str}\n"
#             "---------------------\n"
#             "Given the context information and not prior knowledge, "
#             "answer the question: {query_str}\n"
#         ),
#     ),
# ]
# text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)





# query_engine = index.as_query_engine(streaming=True, similarity_top_k=5)
# streaming_response = query_engine.query("what's is edge processor? ")
# # print(response)
# # for text in streaming_response.response_gen:
# #     # do something with text as they arrive.
# #     print(text)
# streaming_response.print_response_stream()


from llama_index.llms import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate

# Text QA Prompt with Markdown and References
chat_text_qa_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            "Always answer the question, even if the context isn't helpful. "
            "Responses will be formatted in markdown. Citations to the URLs should be included with a hyperlink at the end of the corresponding sentence."
        ),
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information and not prior knowledge, "
            "answer the question with appropriate citation: {query_str}\n"
        ),
    ),
]

text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

# Rest of your script for query engine initialization and querying

# For displaying responses
# query_engine = index.as_query_engine(streaming=True, similarity_top_k=1, text_qa_template=text_qa_template)
# streaming_response = query_engine.query("what's edge processor? ")

# for text in streaming_response.response_gen:
#     # Format and print text with markdown
#     print(f"```markdown\n{text}\n```")





# query_engine = index.as_query_engine(streaming=False, similarity_top_k=5, text_qa_template=text_qa_template)
# query_engine = index.as_chat_engine(streaming=False, similarity_top_k=3, text_qa_template=text_qa_template)
query_engine = index.as_chat_engine(streaming=False)
streaming_response = query_engine.chat("how to conduct hybrid search?")
# streaming_response = query_engine.chat("what's Paul Graham's opinion on how to get startup ideas")










In [71]:
from IPython.display import display, Markdown

# Assuming streaming_response is your string variable
markdown_content = f"```markdown\n{streaming_response}\n```"
display(Markdown(markdown_content))

```markdown
To conduct a hybrid search, which combines keyword-based search with vector search, you can follow these general steps:

1. **Prepare Your Dataset**: Ensure that your dataset contains both textual data for keyword search and vector representations for vector search. The textual data will be used for matching specific terms, while vector representations will allow for semantic similarity searches.

2. **Index Your Data**: Use a search engine that supports hybrid search capabilities, such as Elasticsearch with vector scoring plugins or specialized engines like Qdrant. Index your data by storing both keywords and vector embeddings.

3. **Formulate Your Query**: When performing a search, you will need to specify both the keyword query and the vector query. The keyword query is used to filter results based on exact or partial matches, while the vector query ranks the relevance of the results based on conceptual similarity.

4. **Execute the Search**: The search engine will process the hybrid query by first filtering the dataset using the keyword criteria and then ranking the filtered results using the vector similarity scores.

5. **Retrieve and Analyze Results**: The search engine will return a list of results that are both keyword-relevant and semantically similar according to the vector query. You can then analyze these results to find the most relevant information for your needs.

For a specific implementation, you would need to consult the documentation of the search engine or library you are using, as it will provide detailed instructions, code examples, and configuration options for setting up a hybrid search system. Tools like Elasticsearch, Qdrant, or LlamaIndex often have extensive documentation and community support to help with this process.
```

In [72]:
streaming_response

AgentChatResponse(response='To conduct a hybrid search, which combines keyword-based search with vector search, you can follow these general steps:\n\n1. **Prepare Your Dataset**: Ensure that your dataset contains both textual data for keyword search and vector representations for vector search. The textual data will be used for matching specific terms, while vector representations will allow for semantic similarity searches.\n\n2. **Index Your Data**: Use a search engine that supports hybrid search capabilities, such as Elasticsearch with vector scoring plugins or specialized engines like Qdrant. Index your data by storing both keywords and vector embeddings.\n\n3. **Formulate Your Query**: When performing a search, you will need to specify both the keyword query and the vector query. The keyword query is used to filter results based on exact or partial matches, while the vector query ranks the relevance of the results based on conceptual similarity.\n\n4. **Execute the Search**: The 

In [73]:
streaming_response = query_engine.chat("what are the three resources you mentioned eariler?")
streaming_response

AgentChatResponse(response="The three resources mentioned earlier in the context of hybrid search are:\n\n1. **Elasticsearch with Vector Scoring Plugins**: Elasticsearch is a popular open-source search engine that provides full-text search capabilities. It can be extended with plugins to support vector scoring, which allows for hybrid search by combining traditional keyword search with vector-based similarity search.\n\n2. **Qdrant**: Qdrant is a vector search engine that supports storing both keywords and vector embeddings. It is designed for similarity search and can be used to perform hybrid searches by leveraging both keyword matching and semantic similarity.\n\n3. **LlamaIndex**: While not explicitly mentioned in the previous response, LlamaIndex is another example of a tool that could be used for hybrid search. However, it's important to note that LlamaIndex is a hypothetical example and may not refer to an actual product. If you are looking for real-world tools, you should focus

In [15]:
prompts_dict = query_engine.get_prompts()
print(list(prompts_dict.keys()))

['response_synthesizer:text_qa_template', 'response_synthesizer:refine_template']


## condense chat query engine

In [15]:



# from llama_index.prompts import PromptTemplate
# from llama_index.llms import ChatMessage, MessageRole
# from llama_index.chat_engine.condense_question import (
#     CondenseQuestionChatEngine,
# )

# custom_prompt = PromptTemplate(
#     """\
# Given a conversation (between Human and Assistant) and a follow up message from Human, \
# rewrite the message to be a standalone question that captures all relevant context \
# from the conversation. Give corresponding link references for your answer.

# <Chat History>
# {chat_history}

# <Follow Up Message>
# {question}

# <Standalone question>
# """
# )

# # list of `ChatMessage` objects
# custom_chat_history = [
#     ChatMessage(
#         role=MessageRole.USER,
#         content="Hello assistant, we are having a insightful discussion about Paul Graham today.",
#     ),
#     ChatMessage(role=MessageRole.ASSISTANT, content="Okay, sounds good."),
# ]

# query_engine = index.as_query_engine()
# chat_engine = CondenseQuestionChatEngine.from_defaults(
#     query_engine=query_engine,
#     condense_question_prompt=custom_prompt,
#     chat_history=custom_chat_history,
#     verbose=True,
# )

In [14]:
# from llama_index import VectorStoreIndex, SimpleDirectoryReader

# documents = SimpleDirectoryReader("data").load_data()
# index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_chat_engine(streaming=True)
response = query_engine.chat("What's edge processor")
# print(response)

# response = query_engine.chat("Oh interesting, tell me more.")
print(response)


** Messages: **
user: What's edge processor
**************************************************
** Response: **
assistant: An edge processor is a type of computer processor that is designed to perform computations and data processing at the edge of a network, closer to the source of data generation. It is typically used in edge computing systems, where data is processed and analyzed locally on devices or edge servers, rather than being sent to a centralized cloud server for processing.

Edge processors are optimized for low power consumption and high performance, allowing them to handle real-time data processing tasks efficiently. They are commonly used in applications such as Internet of Things (IoT) devices, industrial automation, autonomous vehicles, and smart cities, where low latency and real-time decision-making are critical.

By processing data at the edge, edge processors can reduce the amount of data that needs to be transmitted to the cloud, saving bandwidth and reducing laten