# Example 1

### Ingestion

In [8]:
# Ingest documents from multiple sources 
import uuid
from llama_index.core import Document, SimpleDirectoryReader
from llama_index.readers.web import SimpleWebPageReader

# Documents read from the local ./data directory.
local_documents = SimpleDirectoryReader("./data").load_data()

# Documents fetched from a web page (reader configured with html_to_text=True).
web_documents = SimpleWebPageReader(html_to_text=True).load_data(
    urls=["https://www.moabdelhady.com"]
)

# Single list consumed by the transformation / indexing cells below.
documents = local_documents + web_documents

# Disabled example: append a hand-written, in-memory Document with custom metadata.
# documents += [Document(text="The simplest way to store your indexed data is to use the built-in .persist() method of every Index, which writes all the data to disk at the location specified. This works for any type of index.",
#                       doc_id=str(uuid.uuid4()),
#                       metadata={"foo": "bar", "category": "documentation"}, # metadata will propagate to the nodes
#                       excluded_llm_metadata_keys=["foo"] # some keys could be excluded from the text_content()
#                       )]

In [9]:
# Number of Document objects collected by the ingestion cell.
len(documents)

14

In [10]:
# Inspect a single ingested document (third entry) to see its metadata and text.
documents[2]

Document(id_='49e68adf-8dc9-49b1-a45f-44f652790f90', embedding=None, metadata={'page_label': '1', 'file_name': 'Reuse_oriented_SLAM_submission___reviewed.pdf', 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/Reuse_oriented_SLAM_submission___reviewed.pdf', 'file_type': 'application/pdf', 'file_size': 352194, 'creation_date': '2024-06-13', 'last_modified_date': '2024-06-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Reuse-oriented SLAM Framework using Software\nProduct Lines\nMohamed A. Abdelhady∗, Douwe Dresscher∗and Jan F. Broenink∗\n∗Faculty of Electrical Engineering, Mathematics and Computer Science\nUniversity of Twente, 7500 AE Enschede, The Netherlands\nEmails: m.adel.abdelhady@gmail.com, {d.dresscher, j

### Transformation

In [12]:
# Creating nodes/chunks 
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter, TokenTextSplitter, TextSplitter
from llama_index.core.ingestion import IngestionPipeline

# Create text nodes (chunks) from the documents using the parser's default settings.
# NOTE(review): SimpleNodeParser appears to be a legacy alias of SentenceSplitter in
# recent llama_index releases — confirm. That would explain why the SentenceSplitter
# cell that follows reports the same node count.
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)
print(len(nodes))

23


In [13]:
# Same chunking step, this time run through an IngestionPipeline with an explicitly
# configured SentenceSplitter. A different splitter configuration can change the
# number of nodes; in the recorded run the count is the same as in the previous cell.
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
pipeline = IngestionPipeline(transformations=[text_splitter])
nodes = pipeline.run(documents=documents)
print(len(nodes))

23


In [14]:
# Metadata keys carried by each node before any LLM-based enrichment.
[n.metadata.keys() for n in nodes]

[dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date']),
 dict_keys(['page_label', 'file_name', 'file_pat

In [15]:
# Create nodes with automatically extracted metadata.
# The extractors below send requests to an LLM, so the OPENAI_API_KEY
# environment variable must be set before running this cell.
import nest_asyncio

# Applied before the pipeline runs; presumably needed so the extractors' async
# calls work inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

from llama_index.core.extractors import TitleExtractor, KeywordExtractor
from llama_index.core.schema import MetadataMode
from llama_index.llms.openai import OpenAI

# One LLM client shared by both extractors.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Transformations in pipeline order: chunk the text, then run the title and
# keyword extractors on the resulting nodes.
enrichment_steps = [
    SentenceSplitter(chunk_size=1024, chunk_overlap=20),
    TitleExtractor(llm=llm, metadata_mode=MetadataMode.EMBED),
    KeywordExtractor(llm=llm, metadata_mode=MetadataMode.EMBED),
]
enrich_metadata_pipeline = IngestionPipeline(transformations=enrichment_steps)

# Rebinds `nodes` to the enriched nodes used by the indexing cells below.
nodes = enrich_metadata_pipeline.run(documents=documents)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.60it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████

In [16]:
# Metadata keys after enrichment; compare with the same listing taken before the
# extractors ran to see which keys were added.
[n.metadata.keys() for n in nodes]

[dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_

In [17]:
# Keyword string stored under "excerpt_keywords" for every node.
[n.metadata["excerpt_keywords"] for n in nodes]

['Tech Lead, Machine Learning, Robotics Engineering, Fraud Detection, Computer Vision',
 'Robotics, AI, Reinforcement learning, SLAM, Neural networks',
 'Software Reusability, Robotics, Component-Based Development, Refactoring Techniques, SLAM Framework',
 'Component-Based Development, Refactoring Techniques, Reuse-oriented SLAM Framework, Robotics Applications, Software Reusability',
 'SLAM, Reuse-oriented, Software product lines, Reusability, Domain analysis',
 'SLAM, Reuse-Oriented, Inference Back-end, Platform-Specific, Map Representation',
 'SLAM, Product Line Development, Derivation, Modularity, Extensibility',
 'SLAM, reusability, standardization, software product line, interoperability',
 'Software Product Line Engineering, SLAM Systems, Rao-Blackwellized Particle Filters, Brain-Based Systems, Robotics',
 'SLAM, Software Product Line Engineering, Rao-Blackwellized Particle Filters, Brain-Based Systems, Robotics',
 'document understanding, transformation, metadata extraction, em

### Using Pinecone

In [23]:
import os
from pinecone import Pinecone, ServerlessSpec, PodSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.core.response.pprint_utils import pprint_source_node
from llama_index.core import StorageContext

# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable so that no credential is stored in the notebook.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Show the indexes available to this API key.
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'quickstart-index-es1tgmv.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'quickstart-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [24]:
# Handle to the existing "quickstart-index" index.
pinecone_index = pc.Index("quickstart-index")

In [33]:
# Index statistics (dimension, vector counts per namespace).
# NOTE(review): the execution count shows this cell was last run after the upsert
# cell further down, so the recorded vector count likely reflects the populated index.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 23}},
 'total_vector_count': 23}

In [25]:
# Wrap the Pinecone index as a llama_index vector store and build a storage
# context around it, to be passed to VectorStoreIndex.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [26]:
# Build the index from the enriched nodes on top of the Pinecone-backed storage
# context; the progress output shows embeddings being generated and vectors upserted.
index = VectorStoreIndex(
    nodes, storage_context=storage_context, show_progress=True
)

Generating embeddings: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:01<00:00, 16.02it/s]
Upserted vectors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:01<00:00, 15.17it/s]


In [27]:

# Build a query engine with default settings from the current index and run a query.
# Tip: raise the logging level to DEBUG beforehand for more detailed output
# (no logging configuration is done in this cell).
query_engine = index.as_query_engine()
response = query_engine.query("What is reusable SLAM?")

In [28]:
# Print the synthesized answer text.
print(response)

A reusable SLAM is a framework that emphasizes reusability as a primary requirement of SLAM software. It introduces structured development processes and creates a SLAM infrastructure with core separable modules implemented as encapsulated interchangeable components forming a software product line. The reusability of the framework is evaluated based on reuse-readiness levels criteria, resulting in improved modularity and reduction in development and deployment time and effort.


In [30]:
# Show the retrieved source nodes (ID, similarity score, text preview) behind the answer.
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: 9225c8d0-49c2-4def-85e0-cb899774b8d3
Similarity: 0.258820891
Text: Reuse-oriented SLAM Framework using Software Product Lines
Mohamed A. Abdelhady∗, Douwe Dresscher∗and Jan F. Broenink∗ ∗Faculty
of Electrical Engineering, Mathematics and Computer Science University
of Twente, 7500 AE Enschede, The Netherlands Emails:
m.adel.abdelhady@gmail.com, {d.dresscher, j.f.broenink }@utwente.nl
Abstract —Simultaneous Loca...
Node ID: 86e837bc-b907-4dfb-9177-2af65b442169
Similarity: 0.275452733
Text: commonly used perception algorithms and data structures. The
results yield a perception framework with standalone atomic software
components and harmonized interfaces that can easily be used to
interchange algorithms and benchmark them. Thus, allowing the
developers to decide in an early stage which is the most suitable
algorithm for the applica...


### Indexing

In [108]:
from llama_index.core import VectorStoreIndex
# At a high level, an index can also be created directly from documents, in which
# case a default node parser is used:
# index = VectorStoreIndex.from_documents(documents, show_progress=True)

# Here the index is built from the prepared nodes instead. No storage_context is
# passed, so this rebinds `index` to an index that does not use the Pinecone store
# configured in the previous section.
# NOTE(review): the recorded run embeds 22 nodes, not the 23 seen earlier — the
# execution counts suggest the notebook was run out of order; re-run top to bottom.
index = VectorStoreIndex(nodes, show_progress=True)

Generating embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:01<00:00, 16.37it/s]


In [109]:
# Number of node entries tracked by the index structure.
len(index.index_struct.nodes_dict)

22

In [110]:
# The author found no direct way to display the raw embedding vectors; as a
# substitute, list each source document with its node IDs and metadata.
index.ref_doc_info

{'dc634446-09b3-48e3-9079-bcbb63643d06': RefDocInfo(node_ids=['f7d41b6a-e257-4210-9495-d0ef69f92262'], metadata={'page_label': '1', 'file_name': 'CV-M-Abdelhady.pdf', 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/CV-M-Abdelhady.pdf', 'file_type': 'application/pdf', 'file_size': 108188, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11', 'document_title': '"Tech Lead in Machine Learning and Robotics Engineering Resume"', 'excerpt_keywords': 'Tech Lead, Machine Learning, Robotics Engineering, Fraud Detection, Computer Vision'}),
 '90e77d42-d332-4fd7-9091-05e89aa86980': RefDocInfo(node_ids=['63d896de-ff20-473d-ba52-b595089cd2c7'], metadata={'page_label': '2', 'file_name': 'CV-M-Abdelhady.pdf', 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/CV-M-Abdelhady.pdf', 'file_type': 'application/pdf', 'file_size': 108188, 'creation_date': '2024-06-11', 'last_modified_date': '2024-06-11', 'document_title'

### Storage

In [111]:
# Persist the index's storage context to ./storage.
# Warning: this overwrites the JSON files already present in that directory.
index.storage_context.persist(persist_dir="./storage")

In [112]:
import os
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

# Re-create the Pinecone client (API key taken from the PINECONE_API_KEY
# environment variable) and reconnect to the "quickstart-index" index.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
pinecone_index = pc.Index("quickstart-index")

In [68]:
# Pinecone-backed vector store and a storage context that uses it.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Disabled: rebuilding the index on this storage context would embed the nodes again.
# index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True)

In [71]:
# Experimental / unverified: register the Pinecone vector store on the existing
# index's storage context under the "default" namespace.
# NOTE(review): this presumably only attaches the store and does not copy the
# already-computed embeddings into Pinecone — confirm before relying on it.
index.storage_context.add_vector_store(vector_store, namespace="default")

### Loading

In [78]:
from llama_index.core import StorageContext, load_index_from_storage

# Rebuild the storage context from the files persisted in ./storage and load the
# index from it; this rebinds both `storage_context` and `index`.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

In [79]:
# Source documents (with node IDs and metadata) known to the loaded index.
index.ref_doc_info

{'5daac5eb-7e75-44d8-aee3-c308f359d2b3': RefDocInfo(node_ids=['b7cd3755-8335-40e5-875e-b06a357ba38f', '8ebbf89f-3196-4b9f-865b-f5fde85479dc', '6c841bcb-3c1c-4189-ae34-979575c95693', '9c9bae0a-f961-47b3-96f3-5c5675b06c74', '1ee49e3d-2b90-43c5-b735-46552a9f71b8', '438ba729-223f-4ffe-bbc8-a3aabb9331b0', 'c13c2f90-42eb-4356-9b52-926376f215b6', '8a135b25-ecd0-4bbc-b907-b17722419151', '0db7e674-47a3-487c-9393-a47ad2104545', '970e3b9b-d05d-412c-8141-ba9454774491', 'df8070e7-3cec-4375-ad30-609f39907071', '494f771d-d55c-43d4-a2d7-08bfb9c471fc', '6968eafd-98af-4a6c-94ed-18a173174c0f', '34733c08-6120-412f-80f8-7cd6c1ee7488', '63d0a688-7cf1-48c2-8c12-f4c03fd71de9', '40c7ceb5-420a-4cb2-94c2-23bab804dc31', '20cc598b-a66e-44e6-927a-4a4282ce7993', 'b6b1f44b-9cf4-45ae-8985-59d21430ee0c'], metadata={'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_d

In [80]:
# Number of node entries in the loaded index.
# NOTE(review): the recorded value differs from the count shown before persisting,
# which suggests ./storage held an older run — re-run the notebook in order to confirm.
len(index.index_struct.nodes_dict)

20

### Querying

In [114]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer, StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.response.pprint_utils import pprint_source_node

# Tunable settings for this section.
PERSIST_DIR = "./storage"   # where the index was persisted
SIMILARITY_TOP_K = 5        # nodes fetched by the retriever per query
SIMILARITY_CUTOFF = 0.8     # similarity_cutoff handed to SimilarityPostprocessor

# Load the persisted index.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

# Building blocks of the query engine.
retriever = VectorIndexRetriever(index=index, similarity_top_k=SIMILARITY_TOP_K)
response_synthesizer = get_response_synthesizer(response_mode=ResponseMode.COMPACT)
# NOTE(review): one query further down returns "Empty Response", presumably because
# no retrieved node clears the cutoff — confirm and tune SIMILARITY_CUTOFF if needed.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=SIMILARITY_CUTOFF)

# Assemble the query engine from retriever, post-processor and synthesizer.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

In [115]:
# Ask a question answerable from the Paul Graham essay and print the answer.
response = query_engine.query("Where did paul graham study?")
print(response)

Paul Graham studied philosophy in college before switching to AI.


In [116]:
# Source nodes that passed the similarity post-processor for the query above.
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: 0b7dcca8-d36b-4e3a-b089-dfb63945b946
Similarity: 0.8290162462369435
Text: There were some surplus Xerox Dandelions floating around the
computer lab at one point. Anyone who wanted one to play around with
could have one. I was briefly tempted, but they were so slow by
present standards; what was the point? No one else wanted one either,
so off they went. That was what happened to systems work.  I wanted
not just to bui...
Node ID: 43cb3779-4e11-44b6-bd48-497639f8f727
Similarity: 0.8274202550584204
Text: Our teacher, professor Ulivi, was a nice guy. He could see I
worked hard, and gave me a good grade, which he wrote down in a sort
of passport each student had. But the Accademia wasn't teaching me
anything except Italian, and my money was running out, so at the end
of the first year I went back to the US.  I wanted to go back to RISD,
but I was ...
Node ID: f7867f2f-3723-4582-a6e9-fd102933ee6a
Similarity: 0.814594074107659
Text: What I Worked On  February 2021  Before college th

In [117]:
# Ask about persisting indexed data and print the answer.
response = query_engine.query("what is the simplest way to store indexed data?")
print(response)

The simplest way to store indexed data is to use the built-in .persist() method of every Index, which writes all the data to disk at the location specified.


In [118]:
# Source nodes backing the answer above.
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: c3a89c37-809d-4f8b-b077-b6baad787e14
Similarity: 0.8419226761880413
Text: The simplest way to store your indexed data is to use the built-
in .persist() method of every Index, which writes all the data to disk
at the location specified. This works for any type of index.


In [119]:
# Query using a title-like phrase (presumably an extracted document title — confirm),
# then print the answer followed by its source nodes.
response = query_engine.query("Pinecone Documentation and Resources: A Comprehensive Guide")
print(response)
for node in response.source_nodes:
    pprint_source_node(node)

The Pinecone Documentation and Resources provide a comprehensive guide for users, offering practical guides, detailed information about the Pinecone API, SDKs, and architecture, hands-on examples and sample apps, details on third-party integrations, Pinecone utilities and reference architectures, troubleshooting guides, and news about features and changes in Pinecone and related tools.
Node ID: a0754d56-b619-4267-aea0-ab1b3c6be53a
Similarity: 0.89678371872626
Text: [Pinecone Docs home page![light logo](https://mintlify.s3-us-
west-1.amazonaws.com/pinecone-2/logo/light.png)![dark
logo](https://mintlify.s3-us-
west-1.amazonaws.com/pinecone-2/logo/dark.png)](/)  Latest  Search or
ask...    * [Sign up
free](https://app.pinecone.io/?sessionType=signup)   *
[Status](https://status.pinecone.io)   * [Support](http...


In [123]:
# Ask about the author of the ingested PDFs, then print the answer and its sources.
# NOTE(review): the recorded run prints "Empty Response" and lists no source nodes;
# presumably every retrieved node was filtered out by the similarity cutoff, or the
# loaded ./storage index lacks these documents — confirm.
response = query_engine.query("Who is Mohamed A. Abdelhady ?")
print(response)
for node in response.source_nodes:
    pprint_source_node(node)

Empty Response
