# Welocate demo

### Ingestion

In [23]:
# Ingest documents from multiple sources 
import uuid
from llama_index.core import Document, SimpleDirectoryReader
from llama_index.readers.web import SimpleWebPageReader

# Local files: everything found in ./welocate-data is loaded as documents.
local_documents = SimpleDirectoryReader("./welocate-data").load_data()

# Example of adding a hand-written document (kept for reference, not executed):
# documents += [Document(text="The simplest way to store your indexed data is to use the built-in .persist() method of every Index, which writes all the data to disk at the location specified. This works for any type of index.",
#                       doc_id=str(uuid.uuid4()),
#                       metadata={"foo": "bar", "category": "documentation"}, # metadata will propagate to the nodes
#                       excluded_llm_metadata_keys=["foo"] # some keys could be excluded from the text_content()
#                       )]

# Public Welocate pages that are indexed alongside the local files.
welocate_urls = [
    "https://welocate.com/",
    "https://welocate.com/buying_a_house_in_the_netherlands.aspx",
    "https://welocate.com/selling_your_house_in_the_netherlands.aspx",
    "https://welocate.com/renting_out_a_property_in_the_netherlands.aspx",
    "https://welocate.com/renting_a_property_in_the_netherlands.aspx",
    "https://welocate.com/properties_for_rent.aspx",
    "https://welocate.com/relocating_to_the_netherlands.aspx",
    "https://welocate.com/about_welocate.aspx",
]
web_documents = SimpleWebPageReader(html_to_text=True).load_data(urls=welocate_urls)

# Combined corpus: local files first, then the web pages.
documents = local_documents + web_documents

Ignoring wrong pointing object 34 0 (offset 0)


In [24]:
# Total number of loaded documents (directory files plus fetched web pages).
len(documents)

12

In [27]:
# Attach an "access_link" to every document so answers can point back to their source.
# - Welocate web pages: the document id is the page URL (see the printed output), so it
#   is used directly as the link.
# - Everything else: the file name recorded in the metadata is used. A document without
#   a "file_name" entry falls back to its id instead of raising KeyError.
for doc in documents:
    if doc.id_.startswith("https://welocate.com/"):
        print("link: ", doc.id_)
        doc.metadata["access_link"] = doc.id_
    else:
        file_name = doc.metadata.get("file_name", doc.id_)
        print("file: ", file_name)
        doc.metadata["access_link"] = file_name

file:  WelocateHomeBuyersPlan_ABN.pdf
file:  WelocateHomeBuyersPlan_ABN.pdf
file:  WelocateHomeSellersPlan.pdf
file:  WelocateHomeSellersPlan.pdf
link:  https://welocate.com/
link:  https://welocate.com/buying_a_house_in_the_netherlands.aspx
link:  https://welocate.com/selling_your_house_in_the_netherlands.aspx
link:  https://welocate.com/renting_out_a_property_in_the_netherlands.aspx
link:  https://welocate.com/renting_a_property_in_the_netherlands.aspx
link:  https://welocate.com/properties_for_rent.aspx
link:  https://welocate.com/relocating_to_the_netherlands.aspx
link:  https://welocate.com/about_welocate.aspx


In [28]:
# Show the access link stored on each document, one per line.
for access_link in (doc.metadata["access_link"] for doc in documents):
    print(access_link)

WelocateHomeBuyersPlan_ABN.pdf
WelocateHomeBuyersPlan_ABN.pdf
WelocateHomeSellersPlan.pdf
WelocateHomeSellersPlan.pdf
https://welocate.com/
https://welocate.com/buying_a_house_in_the_netherlands.aspx
https://welocate.com/selling_your_house_in_the_netherlands.aspx
https://welocate.com/renting_out_a_property_in_the_netherlands.aspx
https://welocate.com/renting_a_property_in_the_netherlands.aspx
https://welocate.com/properties_for_rent.aspx
https://welocate.com/relocating_to_the_netherlands.aspx
https://welocate.com/about_welocate.aspx


### Transformation

In [29]:
# Creating nodes/chunks 
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter, TokenTextSplitter, TextSplitter
from llama_index.core.ingestion import IngestionPipeline

# Create text nodes (chunks) from the documents using the parser's default settings.
# NOTE: `nodes` is reassigned by the following cells, so this result only feeds the count below.
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)
print(len(nodes))

39


In [30]:
# Use an explicitly configured splitter inside an ingestion pipeline.
# A different splitter or configuration can change the number of nodes, although for
# this data the count printed below is the same as in the previous cell.
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
pipeline = IngestionPipeline(transformations=[text_splitter])
nodes = pipeline.run(documents=documents)
print(len(nodes))

39


In [17]:
# [n.metadata.keys() for n in nodes]

In [31]:
# creating nodes with automatic metadata extraction
# here we need to start making API requests to an LLM
# you NEED to set the OPENAI_API_KEY env variable 
import nest_asyncio

# Patch asyncio so event loops can be nested; presumably required because the notebook
# kernel already runs a loop while the extractors below issue async calls — TODO confirm.
nest_asyncio.apply()

from llama_index.core.extractors import TitleExtractor, KeywordExtractor
from llama_index.core.schema import MetadataMode
from llama_index.llms.openai import OpenAI
# LLM shared by both metadata extractors.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Pipeline stages: chunk the documents, then let the LLM add a title and keywords
# to each chunk's metadata ("document_title" and "excerpt_keywords" in the output below).
sentence_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
title_extractor = TitleExtractor(llm=llm, metadata_mode=MetadataMode.EMBED)
keyword_extractor = KeywordExtractor(llm=llm, metadata_mode=MetadataMode.EMBED)

enrich_metadata_pipeline = IngestionPipeline(
    transformations=[sentence_splitter, title_extractor, keyword_extractor]
)
nodes = enrich_metadata_pipeline.run(documents=documents)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.09it/s]
100%|████████████████████████████████████████████████████████

In [32]:
# Metadata keys per node: file-derived nodes keep their file fields, web-page nodes
# only carry the access link plus the extracted title and keywords.
[n.metadata.keys() for n in nodes]

[dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'access_link', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'access_link', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'access_link', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'access_link', 'document_title', 'excerpt_keywords']),
 dict_keys(['page_label', 'file_name', 'file_path', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'access_link', 'document_title', 'excerpt_keywords']),
 dict_keys(['access_link', 'document_title', 'excerpt_keywords']),
 dict_keys(['access_link', 'document_title', 'excerpt_keywords']),
 dict_

In [33]:
# Keywords stored under "excerpt_keywords" for every node (one comma-separated string each).
[n.metadata["excerpt_keywords"] for n in nodes]

['Home Buyers Plan, Purchase contract, Construction check, Appraisal, Notary',
 'Home Buying, Planning, Negotiations, Real Estate, Property Ownership',
 'Real-estate agent, Construction report, Appraisal report, Notarial deeds, Home Buyers Plan',
 'Home Sellers Plan, Real Estate Agent, Selling Process, Presentation, Negotiations',
 'selling plan, expert support, hassle-free process, home sellers, presentation',
 'Real Estate Services, Netherlands, Buying, Selling, Renting',
 'Home Buying Services, Netherlands, Expats, Complete Package, Real Estate Agent',
 'appraisal report, notary, real-estate agent, construction report, home buying services',
 'Home buying services, Netherlands, Comprehensive package, Real estate agent, Notary',
 'Welocate, Home Sellers Plan, Netherlands, Real-estate agent, Selling house',
 'Netherlands, Selling, House, Welocate, Home Sellers Plan',
 'Welocate, Landlord Rental Plan, renting out, Netherlands, property',
 'Landlord Rental Plan, Property advertising, Te

### Indexing

In [34]:
from llama_index.core import VectorStoreIndex
# On a high-level, an index can be created from documents directly; that would use a default node parser:
# index = VectorStoreIndex.from_documents(documents, show_progress=True)

# Here the index is built from the metadata-enriched nodes instead; embeddings are
# generated for every node (see the progress bar in the output).
index = VectorStoreIndex(nodes, show_progress=True)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:01<00:00, 29.49it/s]


In [35]:
# Number of nodes registered in the index structure.
len(index.index_struct.nodes_dict)

39

In [36]:
# ref_doc_info maps each source document id to its node ids and metadata (see output);
# it does not show the embedding vectors themselves.
# NOTE(review): the original note claimed there is no direct way to display the embeddings —
# the vector store may expose them; confirm before relying on that statement.
index.ref_doc_info

{'d6342288-6973-4cd9-ae85-d8ee1f8e48c9': RefDocInfo(node_ids=['a339df3d-a353-4e4c-aaaf-ceb649f42790', 'a7198243-e487-4587-a52e-7388f17b4bd5'], metadata={'page_label': '1', 'file_name': 'WelocateHomeBuyersPlan_ABN.pdf', 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/welocate-data/WelocateHomeBuyersPlan_ABN.pdf', 'file_type': 'application/pdf', 'file_size': 181451, 'creation_date': '2024-07-25', 'last_modified_date': '2024-07-25', 'access_link': 'WelocateHomeBuyersPlan_ABN.pdf', 'document_title': 'The Ultimate Guide to Home Buying: From Planning to Negotiations', 'excerpt_keywords': 'Home Buyers Plan, Purchase contract, Construction check, Appraisal, Notary'}),
 'e46dfe73-a887-4c00-babd-1e247214e4f4': RefDocInfo(node_ids=['6513e78f-96c5-410b-af5f-74702d8657b9'], metadata={'page_label': '2', 'file_name': 'WelocateHomeBuyersPlan_ABN.pdf', 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/welocate-data/WelocateHomeBuyersPlan

### Storage

In [37]:
# Persist the index's storage context to ./welocate-storage so it can be reloaded later.
# this will overwrite all the json files in storage
index.storage_context.persist(persist_dir="./welocate-storage")

### Loading

In [38]:
from llama_index.core import StorageContext, load_index_from_storage

# Rebuild the index from what was persisted in ./welocate-storage.
storage_context = StorageContext.from_defaults(persist_dir="./welocate-storage")
index = load_index_from_storage(storage_context)

In [40]:
# Sanity check: node count of the reloaded index (matches the count before persisting).
len(index.index_struct.nodes_dict)

39

### Querying

In [47]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer, StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.response.pprint_utils import pprint_source_node

# Retrieval settings for this demo.
SIMILARITY_TOP_K = 3      # how many nodes the retriever fetches per query
SIMILARITY_CUTOFF = 0.4   # threshold handed to the similarity postprocessor

# Load the persisted index so this cell does not depend on the in-memory one.
storage_context = StorageContext.from_defaults(persist_dir="./welocate-storage")
index = load_index_from_storage(storage_context)

# Building blocks of the query engine.
retriever = VectorIndexRetriever(index=index, similarity_top_k=SIMILARITY_TOP_K)
response_synthesizer = get_response_synthesizer(response_mode=ResponseMode.COMPACT)
similarity_filter = SimilarityPostprocessor(similarity_cutoff=SIMILARITY_CUTOFF)

# assemble the query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

In [48]:
# Ask a pricing question that the seller-plan material should be able to answer.
question = "How much does the home seller plan cost?"
response = query_engine.query(question)
print(response)

1.1% of the selling price (excluding 21% VAT)


In [49]:
# Nodes (with their similarity scores) that were used to answer the last query.
response.source_nodes

[NodeWithScore(node=TextNode(id_='ab6d6513-3406-41fe-8c97-f6c6810e00e9', embedding=None, metadata={'page_label': '2', 'file_name': 'WelocateHomeSellersPlan.pdf', 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/welocate-data/WelocateHomeSellersPlan.pdf', 'file_type': 'application/pdf', 'file_size': 179209, 'creation_date': '2024-07-25', 'last_modified_date': '2024-07-25', 'access_link': 'WelocateHomeSellersPlan.pdf', 'document_title': 'The Ultimate Guide to Selling Your Home: Hassle-Free Process with Expert Support', 'excerpt_keywords': 'selling plan, expert support, hassle-free process, home sellers, presentation'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='122f7e4d-6

In [12]:
# "A" when the last response has at least one source node, otherwise "B".
print("A" if response.source_nodes else "B")

B


In [13]:
# Ask about a person; the answer depends on which documents are in the loaded index.
question = "who is Mohamed A. Abdelhady"
response = query_engine.query(question)
print(response)

Mohamed A. Abdelhady is an individual with expertise in various programming languages such as Python, SQL, and Bash, as well as tools and frameworks like TensorFlow, Keras, Pytorch, Git, PySpark, Pandas, Flask, and Docker. He has worked on projects related to reinforcement learning, TensorFlow model monitoring, image classification, pedestrian detection, Kaggle competitions, robotics algorithms, and more. Additionally, he has authored publications related to SLAM frameworks and recurrent neural networks for state estimation. In his extra-curricular activities, he has been involved in team strategy development, robotics competitions, makerspace development, teaching assistance, workshops, and talent development programs.


In [15]:
# Start from an empty string and append the response's text (response.response).
# NOTE(review): `s` is not read by any later cell shown here — looks like scratch work.
s = ""
s += response.response

In [21]:
# Pretty-print every source node of the last response (id, similarity, text preview).
# NOTE: later cells read `node` after this loop, so the loop variable must keep this name.
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: b4ffeb56-7a47-42c9-b045-1cc9aac0da0a
Similarity: 0.8014585457430086
Text: SKILLS Languages Python, SQL, Bash Tools & Frameworks
TensorFlow, Keras, Pytorch, Git, PySpark, Pandas, Flask, Docker
PROJECTS Reinforcement learning approach for traﬃc lights optimization
(2nd place in C4F2 Hackathon) /external_link Monitoring of Tensorﬂow
models using Prometheus, metricbeat, and Kibana/Grafana
Reproducibility of ﬁne-grained-im...


In [24]:
# Select the source node to inspect explicitly instead of relying on a loop variable
# left over from another cell (which is undefined or stale when there were no sources).
# The last source node is used, i.e. the one such a loop would have ended on.
if not response.source_nodes:
    raise ValueError("The last response has no source nodes, so there is no metadata to inspect.")
node = response.source_nodes[-1]

# Metadata of the underlying text node, taken from its dict representation.
metadata = node.to_dict()["node"]["metadata"]

In [26]:
# Display the metadata dict extracted from the selected source node.
metadata

{'page_label': '2',
 'file_name': 'CV-M-Abdelhady.pdf',
 'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/CV-M-Abdelhady.pdf',
 'file_type': 'application/pdf',
 'file_size': 108188,
 'creation_date': '2024-06-11',
 'last_modified_date': '2024-06-11'}

In [28]:
f"{metadata['file_name']} | {metadata['creation_date']} | {metadata['last_modified_date']} | {metadata['file_path']} | {round(node.score, 2)} | {node.text[:100]}|\n"

'CV-M-Abdelhady.pdf | 2024-06-11 | 2024-06-11 | /Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/CV-M-Abdelhady.pdf | 0.8 | SKILLS\nLanguages Python, SQL, Bash\nTools & Frameworks TensorFlow, Keras, Pytorch, Git, PySpark, Pand|\n'

In [6]:
"""
score
metadata.page_label
metadata.file_name
metadata.page_label
metadata.creation_date
metadata.last_modified_date
metadata.file_path
text
"""
node.to_dict()

{'node': {'id_': 'b4ffeb56-7a47-42c9-b045-1cc9aac0da0a',
  'embedding': None,
  'metadata': {'page_label': '2',
   'file_name': 'CV-M-Abdelhady.pdf',
   'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/CV-M-Abdelhady.pdf',
   'file_type': 'application/pdf',
   'file_size': 108188,
   'creation_date': '2024-06-11',
   'last_modified_date': '2024-06-11'},
  'excluded_embed_metadata_keys': ['file_name',
   'file_type',
   'file_size',
   'creation_date',
   'last_modified_date',
   'last_accessed_date'],
  'excluded_llm_metadata_keys': ['file_name',
   'file_type',
   'file_size',
   'creation_date',
   'last_modified_date',
   'last_accessed_date'],
  'relationships': {<NodeRelationship.SOURCE: '1'>: {'node_id': '1cc56f45-6c99-45f6-aaab-0f8f21cf5881',
    'node_type': <ObjectType.DOCUMENT: '4'>,
    'metadata': {'page_label': '2',
     'file_name': 'CV-M-Abdelhady.pdf',
     'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playg

In [117]:
# Ask how indexed data can be stored and print the synthesized answer.
question = "what is the simplest way to store indexed data?"
response = query_engine.query(question)
print(response)

The simplest way to store indexed data is to use the built-in .persist() method of every Index, which writes all the data to disk at the location specified.


In [118]:
# Pretty-print the source nodes behind the answer above (id, similarity, text preview).
for node in response.source_nodes:
    pprint_source_node(node)

Node ID: c3a89c37-809d-4f8b-b077-b6baad787e14
Similarity: 0.8419226761880413
Text: The simplest way to store your indexed data is to use the built-
in .persist() method of every Index, which writes all the data to disk
at the location specified. This works for any type of index.


In [119]:
# Query with a document-title-like phrase, then show the answer followed by its sources.
question = "Pinecone Documentation and Resources: A Comprehensive Guide"
response = query_engine.query(question)
print(response)
for node in response.source_nodes:
    pprint_source_node(node)

The Pinecone Documentation and Resources provide a comprehensive guide for users, offering practical guides, detailed information about the Pinecone API, SDKs, and architecture, hands-on examples and sample apps, details on third-party integrations, Pinecone utilities and reference architectures, troubleshooting guides, and news about features and changes in Pinecone and related tools.
Node ID: a0754d56-b619-4267-aea0-ab1b3c6be53a
Similarity: 0.89678371872626
Text: [Pinecone Docs home page![light logo](https://mintlify.s3-us-
west-1.amazonaws.com/pinecone-2/logo/light.png)![dark
logo](https://mintlify.s3-us-
west-1.amazonaws.com/pinecone-2/logo/dark.png)](/)  Latest  Search or
ask...    * [Sign up
free](https://app.pinecone.io/?sessionType=signup)   *
[Status](https://status.pinecone.io)   * [Support](http...


In [123]:
# Ask about a person again and list any sources; when no source nodes come back,
# only the response itself is printed (the loop body never runs).
question = "Who is Mohamed A. Abdelhady ?"
response = query_engine.query(question)
print(response)
for node in response.source_nodes:
    pprint_source_node(node)

Empty Response
