# Indexing & Storage

In [43]:
# let's try to index documents
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Document
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex

# Read the files under ./data into Document objects (with a progress bar),
# then display how many documents were loaded.
documents = SimpleDirectoryReader(input_dir="./data").load_data(show_progress=True)
len(documents)

Loading files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 391.84file/s]


1

In [31]:
# Build the vector index directly from the loaded documents. The progress bars
# show the two stages that happen here: node parsing and embedding generation.
index = VectorStoreIndex.from_documents(documents=documents, show_progress=True)

Parsing nodes: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.23it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 22.86it/s]


In [32]:
# Count the entries in the index structure's nodes_dict.
len(index.index_struct.nodes_dict)

22

In [23]:
# ref_doc_info is a dict keyed by ingested document id; each value is a
# RefDocInfo carrying the ids of that document's nodes plus its metadata.
index.ref_doc_info

{'83da34ea-8c9c-43d2-9b5a-6e8ba175d167': RefDocInfo(node_ids=['a3a5a449-a298-4211-b5e6-ed8ad398f3b2', '8d5716d6-10ae-464b-b262-9e4f722244df', 'cb9d2071-1e8c-418d-9157-793f78398cc1', '563bff8b-b393-46b0-a21c-01ff42152a7a', 'bacf4a82-63ee-45df-adf9-9107d92ca743', '1624e574-33fa-466e-a7cc-5cfb89c165c4', 'f5c407df-6499-47c8-a5ea-02045ceab011', 'c74e9b4d-27d9-431f-9fb4-120715af8704', '83c39b5e-2287-4f34-b227-86c75311471f', '750c0ab3-abb4-40fd-b37f-ad3273793881', '8f65a55b-4d0a-4a27-ad3f-fdaa2d38b8f4', 'e3695cf9-45fa-44dc-a94e-7c5f78d6eb99', '96e62a20-b121-4429-9450-a83406ae50a4', '87a0e8b4-5e64-4248-910f-110b65008bcb', '6ecdbfaa-337a-4867-ab39-14e354e467bc', '42433f9c-4872-4e4c-bddf-578f825beb7b', 'bc4e69db-196f-40ff-8412-60602fe41cb1', '34d95582-fa42-44f1-8d75-2deeb225b7ba', '9025c393-c285-4b1a-9280-02aeab0d1a6c', '9cc00d2b-4765-45ff-8c9e-ad800319b970', 'e86a40c3-6fdd-4eda-9190-816fb7ad8fea', 'eca5fe5f-4335-4cd7-9758-48013092ac8d'], metadata={'file_path': '/Users/mohamedadelabdelhady/works

In [25]:
# Id of the first loaded document; in the recorded run it equals the key
# displayed by index.ref_doc_info.
documents[0].id_

'83da34ea-8c9c-43d2-9b5a-6e8ba175d167'

In [26]:
# Fetch the first node of the first ingested document from the docstore.
# The node id is read from ref_doc_info rather than pasted in as a literal:
# ids are regenerated whenever the index is built, so a hard-coded UUID stops
# resolving as soon as the notebook is re-run from a fresh kernel.
first_node_id = next(iter(index.ref_doc_info.values())).node_ids[0]
index.docstore.get_node(first_node_id)

TextNode(id_='a3a5a449-a298-4211-b5e6-ed8ad398f3b2', embedding=None, metadata={'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_date': '2024-05-03', 'last_modified_date': '2024-05-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='83da34ea-8c9c-43d2-9b5a-6e8ba175d167', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75041, 'creation_date': '2024-05-03', 'last_modif

### Persisting the index to disk

In [40]:
# Persist the index's storage context to ./storage.
# NOTE: this will overwrite all the json files already in that directory.
index.storage_context.persist(persist_dir="./storage")

### Loading from disk

In [46]:
from llama_index.core import StorageContext, load_index_from_storage

# Recreate a storage context from the files persisted under ./storage, then
# restore the index from it. This rebinds `index` to the loaded copy.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context=storage_context)

In [47]:
# Document id -> RefDocInfo mapping of the current `index`
# (the one restored from ./storage).
index.ref_doc_info

{'f9b3bf49-8018-41ac-9313-6cfc18491216': RefDocInfo(node_ids=['642e947c-4c86-48c6-956a-15fa83b7ac35', '10c9a4d3-2a1b-4fae-bc55-b31082c70aa6', 'b4721a92-cd08-44fc-8e3a-f137d037676d', 'd03763d2-7b94-4032-9321-11a8283cfe25', '7aea43e9-d7db-4d46-898c-c233b3954ce4', 'b994bc38-0b36-4ad5-b1d4-92690462a19b', '9735c4ac-0c08-4d7f-9d2d-32d26892290c', 'c15d950d-0527-4c10-a314-258ec3412740', '4e44d91c-d20b-4caf-8e06-2d28c647fa1d', '98e89a5e-219e-4020-a823-d848dafc5084', '1eb28048-bcab-45fd-8b44-bf75d6beb190', '5ce487d1-f359-4ad6-9884-5a8f908e88b3', '9b49fc59-1679-44d7-b76f-ee6490ef2ff7', '5e07f79e-1d90-445a-9234-34ec5740e1e0', '8498d5fb-093d-436c-ad2b-b6a47d86240a', 'c7f8260f-2d0a-42dd-8b3b-ea365c0ac647', '3e9d169f-e301-486e-8fa1-ce30ccd12d95', '928b1455-9f8d-4a1f-925d-19faa903685c', '1bea3f2d-5011-4f6a-8b8e-71bd2ed805d8', '29526681-8568-4743-b56c-6266f96bc337', '252bbb52-1328-46b1-ad74-b62968484fd1', '43528ae4-90f2-442f-a5f3-37a81c604c3b'], metadata={'file_path': '/Users/mohamedadelabdelhady/works

In [48]:
# Show the first node's content with all of its metadata prepended.
# The node id is looked up from ref_doc_info instead of being hard-coded:
# ids differ between index builds, so a pasted UUID fails once the persisted
# index is regenerated.
loaded_node_id = next(iter(index.ref_doc_info.values())).node_ids[0]
index.docstore.get_node(loaded_node_id).get_content(metadata_mode="all")

'file_path: /Users/mohamedadelabdelhady/workspace/kaggle-sandbox/llm-playground/data/paul_graham_essay.txt\nfile_name: paul_graham_essay.txt\nfile_type: text/plain\nfile_size: 75041\ncreation_date: 2024-05-03\nlast_modified_date: 2024-05-02\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all

### Excluding certain metadata fields from Document

In [54]:
from llama_index.core.node_parser import SimpleNodeParser

# Reload the documents from ./data.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

# Hide "file_path" from the embed-mode text. The assignment replaces the whole
# exclusion list, so the remaining metadata keys (file_name, file_type, ...)
# are included in the embed-mode content, as the recorded output shows.
for document in documents:
    document.excluded_embed_metadata_keys = ["file_path"]

# Split the documents into nodes and display the embed-mode content of the
# first node.
node_parser = SimpleNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(documents=documents)
nodes[0].get_content(metadata_mode="embed")

'file_name: paul_graham_essay.txt\nfile_type: text/plain\nfile_size: 75041\ncreation_date: 2024-05-03\nlast_modified_date: 2024-05-02\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under