## This is the tutorial from the official LlamaIndex docs

#### The default global Settings

In [None]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Tunable values for the global defaults, kept together so they are easy to
# find and change in one place.
LLM_MODEL_NAME = "llama3.2"  # model name handed to the Ollama LLM wrapper
LLM_REQUEST_TIMEOUT = 60.0  # passed through unchanged as `request_timeout`
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"  # Hugging Face embedding model

# Register the global defaults on `Settings`; later cells that do not pass an
# explicit LLM / embedding model rely on whatever is set here.
Settings.llm = Ollama(model=LLM_MODEL_NAME, request_timeout=LLM_REQUEST_TIMEOUT)

# Use the Hugging Face model as the default embedding model instead of OpenAI.
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


#### An ingestion pipeline typically consists of three main stages:

1. Load the data
2. Transform the data
3. Index and store the data

### 1.1 Loading using SimpleDirectoryReader 

In [3]:
from llama_index.core import SimpleDirectoryReader

# Read everything under ./data and keep the loaded documents for later cells.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

### 1.2 Transform the data 

Transformations include chunking, extracting metadata, and embedding each chunk.

This is necessary to make sure that the data can be retrieved and used optimally by the LLM.

#### High-Level Transformation API

In [None]:
from llama_index.core import VectorStoreIndex

# High-level path: build the index directly from the loaded documents.
vector_index = VectorStoreIndex.from_documents(documents=documents)

# No model is passed explicitly, so this uses BAAI/bge-small-en-v1.5 or
# whatever is currently set as the default on Settings.
vector_index.as_query_engine()

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x7fde5a11d780>

#### Lower-Level Transformation API

##### Splitting Your Documents into Nodes

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

# Load the source files from ./data again for this cell.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

# A pipeline whose only transformation is a TokenTextSplitter built with its
# default arguments. Reference:
# https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#tokentextsplitter
pipeline = IngestionPipeline(
    transformations=[
        TokenTextSplitter(),
    ],
)

nodes = pipeline.run(documents=documents)

# Author's observation: looking at the text of each node, the content appears
# to be split per page.
nodes

[TextNode(id_='e2f52654-8a12-47d4-be35-c059587243a1', embedding=None, metadata={'page_label': '1', 'file_name': '2023_canadian_budget.pdf', 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf', 'file_type': 'application/pdf', 'file_size': 376126, 'creation_date': '2024-11-05', 'last_modified_date': '2024-11-05'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='95267e2f-d7ad-457f-863c-5db2e7e891c0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': '2023_canadian_budget.pdf', 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf', 'file_type': 'application/pdf', 'file_size': 376126, 'creation_date': '2

##### Adding Metadata

In [None]:
# Hand-pasted reference copy of a node's metadata containing only the
# file-level keys (presumably captured before the metadata extractors below
# were run -- TODO confirm). Kept as a variable so any existing use of
# `metadata` keeps working.
### OUTPUTS 
metadata={
    'page_label': '1',
    'file_name': '2023_canadian_budget.pdf',
    'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf',
    'file_type': 'application/pdf',
    'file_size': 376126,
    'creation_date': '2024-11-05',
    'last_modified_date': '2024-11-05'
}

# The lookup must be the cell's last expression: a notebook only displays the
# value of the final expression, and a trailing assignment displays nothing.
nodes[0].metadata

{'page_label': '1',
 'file_name': '2023_canadian_budget.pdf',
 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf',
 'file_type': 'application/pdf',
 'file_size': 376126,
 'creation_date': '2024-11-05',
 'last_modified_date': '2024-11-05',
 'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"Key Components and Measures of the 2023 Canadian Federal Budget"\n\nThis title accurately captures the essence of the context, highlighting the unique entities (e.g., budget, government, Prime Minister), themes (e.g., policy objectives, investments in green growth, dental care program), and other relevant information. It is clear, concise, and informative, making it suitable for a document that aims to provide an overview of the 2023 Canadian Federal Budget.',
 'questions_this_excerpt_can_answer': "Based on the provided context, here are three questions with specific answers that are unlikely to be found e

In [19]:
import asyncio # This is an asyncio case

from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import TokenTextSplitter

text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)
title_extractor = TitleExtractor(nodes=5)
qa_extractor = QuestionsAnsweredExtractor(questions=3)

from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor, qa_extractor]
)

# Use 'await' to run the asynchronous 'arun' method
nodes = await pipeline.arun(
    documents=documents,
    in_place=True,
    show_progress=True,
)

nodes

Parsing nodes: 100%|██████████| 4/4 [00:00<00:00, 275.94it/s]
100%|██████████| 2/2 [00:03<00:00,  1.86s/it]
100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
100%|██████████| 3/3 [00:02<00:00,  1.03it/s]
100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
100%|██████████| 8/8 [00:19<00:00,  2.44s/it]


[TextNode(id_='1fb030fb-e569-4e30-842e-a746b8078ae5', embedding=None, metadata={'page_label': '1', 'file_name': '2023_canadian_budget.pdf', 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf', 'file_type': 'application/pdf', 'file_size': 376126, 'creation_date': '2024-11-05', 'last_modified_date': '2024-11-05', 'document_title': 'Based on the provided options, I would suggest:\n\n**"Canada\'s 2023 Federal Budget: A Summary of Key Facts, Figures, and Initiatives"**\n\nThis title captures the main theme of the context (the Canadian federal budget for 2023-2024) while also emphasizing the key facts, figures, and initiatives mentioned in the text.\n\nAlternatively, you could consider these options:\n\n* "Canadian Federal Budget 2023: Key Provisions and Initiatives"\n* "Key Components of the 2023 Canadian Federal Budget"\n* "Notable Measures in the 2023 Canadian Federal Budget"\n* "Highlights from the 2023 Canadian Federal Budget"\n* "2023 Canadian F

In [None]:
# Metadata of the second node. Per the recorded output it includes the
# 'document_title' and 'questions_this_excerpt_can_answer' keys in addition to
# the file-level keys.
nodes[1].metadata

{'page_label': '1',
 'file_name': '2023_canadian_budget.pdf',
 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf',
 'file_type': 'application/pdf',
 'file_size': 376126,
 'creation_date': '2024-11-05',
 'last_modified_date': '2024-11-05',
 'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"Key Components and Measures of the 2023 Canadian Federal Budget"\n\nThis title accurately captures the essence of the context, highlighting the unique entities (e.g., budget, government, Prime Minister), themes (e.g., policy objectives, investments in green growth, dental care program), and other relevant information. It is clear, concise, and informative, making it suitable for a document that aims to provide an overview of the 2023 Canadian Federal Budget.',
 'questions_this_excerpt_can_answer': 'Based on the provided context, here are three potential questions with specific answers that are unlikely to 