# Example 1

### Ingestion

In [27]:
# Ingest documents from multiple sources 
import uuid
from llama_index.core import Document, SimpleDirectoryReader
from llama_index.readers.web import SimpleWebPageReader

# Source 1: every file found under the local ./data directory.
documents = SimpleDirectoryReader("./data").load_data()

# Source 2: a document built in memory with a random id and custom metadata.
documents.append(
    Document(
        text="The simplest way to store your indexed data is to use the built-in .persist() method of every Index, which writes all the data to disk at the location specified. This works for any type of index.",
        doc_id=str(uuid.uuid4()),
        # metadata will propagate to the nodes
        metadata={"foo": "bar", "category": "documentation"},
        # some keys could be excluded from the text_content()
        excluded_llm_metadata_keys=["foo"],
    )
)

# Source 3: a web page, converted from HTML to text by the reader.
documents.extend(
    SimpleWebPageReader(html_to_text=True).load_data(
        urls=["https://docs.pinecone.io/home"],
    )
)

In [28]:
# Sanity check: how many documents were collected across all sources.
len(documents)

3

In [92]:
# Peek at the third document in the list (index 2).
documents[2]

Document(id_='https://docs.pinecone.io/home', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='[Pinecone Docs home page![light logo](https://mintlify.s3-us-\nwest-1.amazonaws.com/pinecone-2/logo/light.png)![dark\nlogo](https://mintlify.s3-us-\nwest-1.amazonaws.com/pinecone-2/logo/dark.png)](/)\n\nLatest\n\nSearch or ask...\n\n  * [Sign up free](https://app.pinecone.io/?sessionType=signup)\n  * [Status](https://status.pinecone.io)\n  * [Support](https://support.pinecone.io)\n  * [Log In](https://app.pinecone.io/?sessionType=login)\n  * [Sign up free](https://app.pinecone.io/?sessionType=signup)\n\nSearch\n\nNavigation\n\n[Home](/)[Guides](/guides/get-\nstarted/quickstart)[Reference](/reference/api/introduction)[Examples](/examples/notebooks)[Integrations](/integrations/overview)[Tools](/tools/pinecone-\nutilities)[Troubleshooting](/troubleshooting/contact-\nsupport)[Releases](/release-notes/2024)\n\n![](https://mintlify

### Transformation

In [30]:
# Creating nodes/chunks 
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter, TokenTextSplitter, TextSplitter
from llama_index.core.ingestion import IngestionPipeline

# creating text nodes
# from_defaults() builds the parser without any explicit chunking arguments.
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)
# Report how many nodes (chunks) the documents were split into.
print(len(nodes))

24


In [37]:
# Swap in an explicitly configured sentence splitter and run it through an
# ingestion pipeline -> this will create a different number of nodes.
text_splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
pipeline = IngestionPipeline(
    transformations=[text_splitter],
)
nodes = pipeline.run(documents=documents)
print(len(nodes))

20


In [38]:
# List the metadata keys carried by each node.
[node.metadata.keys() for node in nodes]

[dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['file_path', 'file_name', 'file_type', 'fil

In [41]:
# Create nodes enriched with automatically extracted metadata.
# From this point on the notebook makes API requests to an LLM,
# so the OPENAI_API_KEY environment variable MUST be set.
import nest_asyncio

# NOTE(review): presumably required so the pipeline's async calls can run inside
# the notebook's already-running event loop — confirm.
nest_asyncio.apply()

from llama_index.core.extractors import TitleExtractor, KeywordExtractor
from llama_index.core.schema import MetadataMode
from llama_index.llms.openai import OpenAI
# LLM shared by both metadata extractors below.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Pipeline stages run in order: split into chunks, then attach a title and
# keywords to each node's metadata using the LLM.
enrich_metadata_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=1024, chunk_overlap=20),
        TitleExtractor(llm=llm, metadata_mode=MetadataMode.EMBED),
        KeywordExtractor(llm=llm, metadata_mode=MetadataMode.EMBED),
    ]
)
nodes = enrich_metadata_pipeline.run(documents=documents)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:03<00:00,  5.08it/s]


In [42]:
# List the metadata keys of each node after the enrichment pipeline has run.
[node.metadata.keys() for node in nodes]

[dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['file_path', 'file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),

In [45]:
# Show the keywords stored under "excerpt_keywords" for every node.
[node.metadata["excerpt_keywords"] for node in nodes]

['writing, programming, philosophy, art, language learning',
 'AI, Lisp, SHRDLU, Artificial Intelligence, Programming',
 'exploration, programming, art, philosophy, language learning',
 'painting, art, learning, language, exploration',
 'Paul Graham, Interleaf, Lisp, RISD, Art School',
 'painting, signature style, New York, World Wide Web, art galleries',
 'art galleries, online stores, web app, Viaweb, WYSIWYG site builder',
 'software, programming, ecommerce, retail, Viaweb',
 'startup growth, internet bubble, Viaweb, Yahoo acquisition, painting',
 'painting, New York, web apps, Lisp, software as a service',
 'Exploration, Programming, Philosophy, Art, Language Learning',
 'spam filters, painting, startup founders, angel investing, Y Combinator',
 'startups, Y Combinator, batch model, funding, community',
 'writing, programming, philosophy, art, language learning',
 'writing, programming, philosophy, art, language learning',
 'Lisp, Programming Language, Bel, McCarthy, Interpreter',


### Indexing

In [47]:
from llama_index.core import VectorStoreIndex
# On a high-level, index can be created from documents directly, this will use a default node parser
# index = VectorStoreIndex.from_documents(documents, show_progress=True)

# Here the index is built from the already-prepared `nodes` instead of the raw
# documents; show_progress=True displays a progress bar while it is built.
index = VectorStoreIndex(nodes, show_progress=True)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:01<00:00, 18.43it/s]


In [48]:
# Number of node entries tracked by the index structure.
len(index.index_struct.nodes_dict)

20

In [49]:
# no direct way to show the actual vector embeddings :/
# ref_doc_info maps each source document id to the ids of its nodes and its metadata.
index.ref_doc_info

{'5daac5eb-7e75-44d8-aee3-c308f359d2b3': RefDocInfo(node_ids=['b7cd3755-8335-40e5-875e-b06a357ba38f', '8ebbf89f-3196-4b9f-865b-f5fde85479dc', '6c841bcb-3c1c-4189-ae34-979575c95693', '9c9bae0a-f961-47b3-96f3-5c5675b06c74', '1ee49e3d-2b90-43c5-b735-46552a9f71b8', '438ba729-223f-4ffe-bbc8-a3aabb9331b0', 'c13c2f90-42eb-4356-9b52-926376f215b6', '8a135b25-ecd0-4bbc-b907-b17722419151', '0db7e674-47a3-487c-9393-a47ad2104545', '970e3b9b-d05d-412c-8141-ba9454774491', 'df8070e7-3cec-4375-ad30-609f39907071', '494f771d-d55c-43d4-a2d7-08bfb9c471fc', '6968eafd-98af-4a6c-94ed-18a173174c0f', '34733c08-6120-412f-80f8-7cd6c1ee7488', '63d0a688-7cf1-48c2-8c12-f4c03fd71de9', '40c7ceb5-420a-4cb2-94c2-23bab804dc31', '20cc598b-a66e-44e6-927a-4a4282ce7993', 'b6b1f44b-9cf4-45ae-8985-59d21430ee0c'], metadata={'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_d

### Storage

In [77]:
# Persist the index's storage context under ./storage.
# this will overwrite all the json files in storage
index.storage_context.persist(persist_dir="./storage")

In [67]:
import os
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# Handle to the Pinecone index named "quickstart-index".
pinecone_index = pc.Index("quickstart-index")

In [68]:
# Wrap the Pinecone index as a LlamaIndex vector store and create a storage
# context that uses it as its vector store.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Building an index on top of this storage context is currently disabled:
# index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True)

In [71]:
# Calling `index.storage_context.add_vector_store(vector_store, namespace="default")`
# only registers the store on the existing storage context: nothing is embedded
# or uploaded, and the context's default vector store entry gets replaced.
# To actually store the nodes in Pinecone, build an index whose storage context
# is backed by the Pinecone vector store; constructing it embeds the nodes and
# writes them to that store. A separate name is used so the local `index`
# (and what it persists to ./storage) is left untouched.
# NOTE(review): assumes the Pinecone index dimension matches the embedding
# model's output size — confirm before running.
pinecone_backed_index = VectorStoreIndex(
    nodes,
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    show_progress=True,
)

### Loading

In [78]:
from llama_index.core import StorageContext, load_index_from_storage

# Rebuild a storage context from the files persisted under ./storage,
# then load the index from it.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

In [79]:
index.ref_doc_info

{'5daac5eb-7e75-44d8-aee3-c308f359d2b3': RefDocInfo(node_ids=['b7cd3755-8335-40e5-875e-b06a357ba38f', '8ebbf89f-3196-4b9f-865b-f5fde85479dc', '6c841bcb-3c1c-4189-ae34-979575c95693', '9c9bae0a-f961-47b3-96f3-5c5675b06c74', '1ee49e3d-2b90-43c5-b735-46552a9f71b8', '438ba729-223f-4ffe-bbc8-a3aabb9331b0', 'c13c2f90-42eb-4356-9b52-926376f215b6', '8a135b25-ecd0-4bbc-b907-b17722419151', '0db7e674-47a3-487c-9393-a47ad2104545', '970e3b9b-d05d-412c-8141-ba9454774491', 'df8070e7-3cec-4375-ad30-609f39907071', '494f771d-d55c-43d4-a2d7-08bfb9c471fc', '6968eafd-98af-4a6c-94ed-18a173174c0f', '34733c08-6120-412f-80f8-7cd6c1ee7488', '63d0a688-7cf1-48c2-8c12-f4c03fd71de9', '40c7ceb5-420a-4cb2-94c2-23bab804dc31', '20cc598b-a66e-44e6-927a-4a4282ce7993', 'b6b1f44b-9cf4-45ae-8985-59d21430ee0c'], metadata={'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_d

In [80]:
len(index.index_struct.nodes_dict)

20

### Querying

In [84]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer, StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.response.pprint_utils import pprint_source_node

# Reload the persisted index from ./storage.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

# Retrieval step: ask the index for the top 5 most similar nodes.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=5,
)

# Synthesis step: build the answer using the COMPACT response mode.
response_synthesizer = get_response_synthesizer(
    response_mode=ResponseMode.COMPACT,
)

# assemble the query engine; retrieved nodes are post-processed with a
# 0.8 similarity cutoff before synthesis
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.8),
    ],
)

In [83]:
# Run a question through the query engine and print the synthesized answer.
response = query_engine.query("Where did paul graham study?")
print(response)

Paul Graham studied at Harvard University.


In [86]:
# Pretty-print every source node attached to the response.
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: 6c841bcb-3c1c-4189-ae34-979575c95693
Similarity: 0.8319346545834482
Text: There were some surplus Xerox Dandelions floating around the
computer lab at one point. Anyone who wanted one to play around with
could have one. I was briefly tempted, but they were so slow by
present standards; what was the point? No one else wanted one either,
so off they went. That was what happened to systems work.  I wanted
not just to bui...
Node ID: 1ee49e3d-2b90-43c5-b735-46552a9f71b8
Similarity: 0.8277081509817129
Text: Our teacher, professor Ulivi, was a nice guy. He could see I
worked hard, and gave me a good grade, which he wrote down in a sort
of passport each student had. But the Accademia wasn't teaching me
anything except Italian, and my money was running out, so at the end
of the first year I went back to the US.  I wanted to go back to RISD,
but I was ...
Node ID: b7cd3755-8335-40e5-875e-b06a357ba38f
Similarity: 0.8157514284606912
Text: What I Worked On  February 2021  Before college t

In [89]:
# Ask a question whose answer is in the hand-written document added during ingestion.
response = query_engine.query("what is the simplest way to store indexed data?")
print(response)

The simplest way to store indexed data is to use the built-in .persist() method of every Index, which writes all the data to disk at the location specified.


In [90]:
# Pretty-print every source node attached to the response.
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: 335a9ddb-3a4e-410a-bde3-80143a7bea83
Similarity: 0.8419226761880413
Text: The simplest way to store your indexed data is to use the built-
in .persist() method of every Index, which writes all the data to disk
at the location specified. This works for any type of index.


In [96]:
# Query aimed at the scraped Pinecone docs page; print the answer and then
# pretty-print each source node attached to the response.
response = query_engine.query("Pinecone Documentation and Resources: A Comprehensive Guide")
print(response)
for node in response.source_nodes:
    pprint_source_node(node)

The Pinecone Documentation and Resources provide a comprehensive guide that includes practical guides, detailed information about the Pinecone API, SDKs, and architecture, hands-on notebooks and sample apps with common AI patterns and tools, third-party integrations, Pinecone utilities and reference architectures, a troubleshooting guide to resolve common Pinecone issues, and news about features and changes in Pinecone and related tools.
Node ID: ef41f6b7-ff90-4613-a121-f9be2f3c406e
Similarity: 0.8953077987680225
Text: [Pinecone Docs home page![light logo](https://mintlify.s3-us-
west-1.amazonaws.com/pinecone-2/logo/light.png)![dark
logo](https://mintlify.s3-us-
west-1.amazonaws.com/pinecone-2/logo/dark.png)](/)  Latest  Search or
ask...    * [Sign up
free](https://app.pinecone.io/?sessionType=signup)   *
[Status](https://status.pinecone.io)   * [Support](http...


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x168f25910>