In [2]:
import os
from dotenv import load_dotenv

import os.path
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
from llama_index.readers.web import BeautifulSoupWebReader

loader = BeautifulSoupWebReader()

In [4]:
# check if storage already exists
PERSIST_DIR = "./storage-woxstai"
urls = ["https://www.ratsit.se/20000920-Alexander_Gunnar_Leander_Woxstrom_Malmo/gJbgl_nvI8cLdFQJv3wY8oUCfdUwua_h9AQ80Wgnx2c"]
if not os.path.exists(PERSIST_DIR):
    # load the documents and create the index
    documents = loader.load_data(urls)
    index = VectorStoreIndex.from_documents(documents)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

In [5]:
query_engine = index.as_query_engine(response_mode="tree_summarize")
response = query_engine.query("WWho are the websites about?")
print(response)

The websites are about Alexander Gunnar Leander Woxström, a 24-year-old individual living in Malmö. The information includes details about his address, phone number, civil status, upcoming birthday, income, and involvement in a company named Woxst AB. It also provides information on his neighbors, average income in his area, and popular individuals with the name Alexander in Sweden.


In [6]:
query_engine = index.as_query_engine(response_mode="tree_summarize", streaming=True)
response = query_engine.query("WHo is the website about? What information is available about the person?")
print(response.source_nodes)

[NodeWithScore(node=TextNode(id_='4e0dcd90-6e6b-403f-a64b-1cc78aba7a3f', embedding=None, metadata={'URL': 'https://www.ratsit.se/20000920-Alexander_Gunnar_Leander_Woxstrom_Malmo/gJbgl_nvI8cLdFQJv3wY8oUCfdUwua_h9AQ80Wgnx2c'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='https://www.ratsit.se/20000920-Alexander_Gunnar_Leander_Woxstrom_Malmo/gJbgl_nvI8cLdFQJv3wY8oUCfdUwua_h9AQ80Wgnx2c', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'URL': 'https://www.ratsit.se/20000920-Alexander_Gunnar_Leander_Woxstrom_Malmo/gJbgl_nvI8cLdFQJv3wY8oUCfdUwua_h9AQ80Wgnx2c'}, hash='164d37c426387e879e3894d1fc75ab4cda3a1797ffc37fd98698c5b075dcd01f'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='de13bd97-c7ae-4ea6-97e7-235422581afe', node_type=<ObjectType.TEXT: '1'>, metadata={'URL': 'https://www.ratsit.se/20000920-Alexander_Gunnar_Leander_Woxstrom_Malmo/gJbgl_nvI8cLdFQJv3wY8oUCfdUwua_h9AQ80Wgnx2c'}, hash='