# Loading data (Ingestion)

### Simple directory reader

In [4]:
# LlamaIndex Reader objects load data from many kinds of sources into Document objects.
# A Document holds text plus metadata; its fields are:
# ['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name']
from llama_index.core import SimpleDirectoryReader

# Load the contents of the local ./data directory into a list of Documents.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 117 0 (offset 0)


### Web reader

In [None]:
from llama_index.readers.web import SimpleWebPageReader

# Load a single web page as Documents (html_to_text=True is passed to the reader).
# Note: this rebinds `documents`, replacing whatever was loaded before.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    urls=[
        "https://docs.llamaindex.ai/en/stable/understanding/loading/loading/#creating-and-passing-nodes-directly",
    ]
)
print(len(documents))

### Transformation

In [32]:
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter, TextSplitter
from llama_index.core.ingestion import IngestionPipeline

# Splitter that turns documents into nodes.
text_splitter = SentenceSplitter(chunk_overlap=10, chunk_size=1024)

# Ingestion pipeline with a single transformation: the splitter above.
pipeline = IngestionPipeline(
    transformations=[text_splitter],
)

# Run the pipeline over the currently loaded documents to produce nodes.
nodes = pipeline.run(documents=documents)

In [33]:
# Compare the number of input documents with the number of nodes the pipeline produced.
len(documents), len(nodes)

(1, 30)

In [36]:
# Metadata is free-form: arbitrary key/value pairs can be attached to a document.
# Nodes derived from the document carry it along, and it is injected into the
# text used for embedding and LLM calls.
# Note: this assignment replaces the first document's metadata dict wholesale.
documents[0].metadata = dict(category="AI")

In [37]:
# Re-run the pipeline so `nodes` is rebuilt from the documents that now carry metadata.
nodes = pipeline.run(documents=documents)

In [39]:
# Inspect the metadata of the first node produced by the pipeline.
nodes[0].metadata

{'category': 'AI'}

In [43]:
# A node parser can also be used on its own, without going through the IngestionPipeline API.
parser = SentenceSplitter()  # constructed with no arguments, i.e. the library defaults

nodes = parser.get_nodes_from_documents(documents=documents)

In [44]:
# Inspect the metadata of the first node produced by calling the parser directly.
nodes[0].metadata

{'category': 'AI'}

In [50]:
from llama_index.core.schema import MetadataMode
# Take the first node and preview the first 30 characters of its content
# as rendered with MetadataMode.LLM.
n = nodes[0]
n.get_content(metadata_mode=MetadataMode.LLM)[:30]

'category: AI\n\nSkip to content\n'

### Developing a pipeline to add metadata to nodes

In [53]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter, TextSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import SummaryExtractor, TitleExtractor, KeywordExtractor
from llama_index.core.schema import MetadataMode
from llama_index.llms.openai import OpenAI

# OpenAI LLM instance; it is handed to TitleExtractor later in the notebook.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=512,
)

In [59]:
# Three in-memory sample documents built directly from text (no reader involved):
# one on the index number problem in economics, one on waking at night, one on RAG.
# Note: this rebinds `documents`, replacing the previously loaded documents.
documents = [
    Document(text="The number of physically distinct goods and unique types of services that consumers can purchase is in the millions. On the business or production side of the economy, there are even more commodities that are actively traded. This is because firms not only produce commodities for final consumption, but they also produce exports and intermediate commodities that are demanded by other producers. Firms collectively also use millions of imported goods and services, thousands of different types of labour services and hundreds of thousands of specific types of capital. If we further distinguish physical commodities by their geographic location or by the season or time of day that they are produced or consumed, then there are billions of commodities that are traded within each year in any advanced economy. For many purposes, it is necessary to summarize this vast amount of price and quantity information into a much smaller set of numbers. The question that this chapter addresses is: how exactly should the microeconomic information involving possibly millions of prices and quantities be aggregated into a smaller number of price and quantity variables? This is the basic index number problem."),
    Document(text="Waking up at a specific time (like 3:30 a.m.) is likely due to the end of a sleep cycle and/or due to a learned habit.During REM sleep, your body is paralyzed, and upon exiting this phase, the body needs to move and reposition, which often briefly awakens you. Normally, these awakenings are so brief that you don’t remember, but some will wake you up fully.Checking the time when you wake up at night can train your brain to continue waking at that time. This is a form of learning whereby checking the clock strengthens the memory association with that specific time."),
    Document(text="LLMs are trained on enormous bodies of data but they aren't trained on your data. Retrieval-Augmented Generation (RAG) solves this problem by adding your data to the data LLMs already have access to. You will see references to RAG frequently in this documentation.In RAG, your data is loaded and prepared for queries or indexed. User queries act on the index, which filters your data down to the most relevant context. This context and your query then go to the LLM along with a prompt, and the LLM provides a response"),
]

In [73]:
# Baseline pipeline: sentence splitting only, no metadata extraction.
normal_ingestion_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512),
    ],
)
normal_nodes = normal_ingestion_pipeline.run(documents=documents)

# Number of nodes produced by the baseline pipeline.
len(normal_nodes)

3

In [75]:
# Inspect the metadata of the first baseline node (no extractor was run on it).
normal_nodes[0].metadata

{}

In [76]:
# Show the first baseline node's content as rendered with MetadataMode.LLM.
normal_nodes[0].get_content(MetadataMode.LLM)

'The number of physically distinct goods and unique types of services that consumers can purchase is in the millions. On the business or production side of the economy, there are even more commodities that are actively traded. This is because firms not only produce commodities for final consumption, but they also produce exports and intermediate commodities that are demanded by other producers. Firms collectively also use millions of imported goods and services, thousands of different types of labour services and hundreds of thousands of specific types of capital. If we further distinguish physical commodities by their geographic location or by the season or time of day that they are produced or consumed, then there are billions of commodities that are traded within each year in any advanced economy. For many purposes, it is necessary to summarize this vast amount of price and quantity information into a much smaller set of numbers. The question that this chapter addresses is: how exac

In [78]:
import nest_asyncio

# Patch asyncio to allow nested event loops — presumably needed because the
# pipeline runs async code while the notebook's loop is already running (TODO confirm).
nest_asyncio.apply()

# Same splitter settings as the baseline pipeline, followed by an LLM-backed
# TitleExtractor whose result ends up in each node's metadata.
metadata_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512),
        TitleExtractor(llm=llm),
    ],
)
metadata_nodes = metadata_pipeline.run(documents=documents)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


In [80]:
# Inspect the metadata of the first node produced by the metadata pipeline.
metadata_nodes[0].metadata

{'document_title': 'Aggregating Microeconomic Information: The Index Number Problem in a Vast Economy'}

In [81]:
# Show the first metadata-pipeline node's content as rendered with MetadataMode.LLM.
metadata_nodes[0].get_content(MetadataMode.LLM)

'[Excerpt from document]\ndocument_title: Aggregating Microeconomic Information: The Index Number Problem in a Vast Economy\nExcerpt:\n-----\nThe number of physically distinct goods and unique types of services that consumers can purchase is in the millions. On the business or production side of the economy, there are even more commodities that are actively traded. This is because firms not only produce commodities for final consumption, but they also produce exports and intermediate commodities that are demanded by other producers. Firms collectively also use millions of imported goods and services, thousands of different types of labour services and hundreds of thousands of specific types of capital. If we further distinguish physical commodities by their geographic location or by the season or time of day that they are produced or consumed, then there are billions of commodities that are traded within each year in any advanced economy. For many purposes, it is necessary to summarize