## This is the tutorial from the official LlamaIndex docs

#### The default global settings

In [None]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Default LLM for the notebook: a local llama3.2 served by Ollama (60 s request timeout).
Settings.llm = Ollama(request_timeout=60.0, model="llama3.2")

# Default embedding model: BAAI/bge-small-en-v1.5 loaded from Hugging Face, used instead of OpenAI.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


#### Ingestion pipeline typically consists of three main stages:

1. Load the data
2. Transform the data
3. Index and store the data

### 1.1 Loading using SimpleDirectoryReader 

In [3]:
from llama_index.core import SimpleDirectoryReader

# Point a reader at the local ./data folder, then load its contents into `documents`.
data_reader = SimpleDirectoryReader("./data")
documents = data_reader.load_data()

### 1.2 Transform the data 

Transformations include chunking, extracting metadata, and embedding each chunk.

This is necessary to make sure that the data can be retrieved, and used optimally by the LLM.

#### High-Level Transformation API

In [None]:
from llama_index.core import VectorStoreIndex

# High-level API: build the index straight from the loaded documents.
vector_index = VectorStoreIndex.from_documents(documents)

# Uses BAAI/bge-small-en-v1.5, or whatever embedding model is set as the Settings default.
query_engine = vector_index.as_query_engine()
query_engine

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x7fde5a11d780>

#### Lower-Level Transformation API

##### Splitting Your Documents into Nodes

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

documents = SimpleDirectoryReader("./data").load_data()

# TokenTextSplitter reference:
# https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#tokentextsplitter
token_splitter = TokenTextSplitter()
pipeline = IngestionPipeline(transformations=[token_splitter])

# Run the single-stage pipeline; the result is the list of nodes it produced.
nodes = pipeline.run(documents=documents)

# Look at the text section of each node: with this data it ends up split per page.
nodes

##### Adding Metadata

In [None]:
# Metadata carried by the first node. Example output from one run, kept as a
# comment so the cell still ends with the expression and displays the live
# value (file_path is the absolute path on whichever machine runs the notebook):
#
# {
#     'page_label': '1',
#     'file_name': '2023_canadian_budget.pdf',
#     'file_path': '<absolute path to>/data/2023_canadian_budget.pdf',
#     'file_type': 'application/pdf',
#     'file_size': 376126,
#     'creation_date': '2024-11-05',
#     'last_modified_date': '2024-11-05'
# }
nodes[0].metadata

{'page_label': '1',
 'file_name': '2023_canadian_budget.pdf',
 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf',
 'file_type': 'application/pdf',
 'file_size': 376126,
 'creation_date': '2024-11-05',
 'last_modified_date': '2024-11-05',
 'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"Key Components and Measures of the 2023 Canadian Federal Budget"\n\nThis title accurately captures the essence of the context, highlighting the unique entities (e.g., budget, government, Prime Minister), themes (e.g., policy objectives, investments in green growth, dental care program), and other relevant information. It is clear, concise, and informative, making it suitable for a document that aims to provide an overview of the 2023 Canadian Federal Budget.',
 'questions_this_excerpt_can_answer': "Based on the provided context, here are three questions with specific answers that are unlikely to be found e

In [None]:
import asyncio # This is an asyncio case

### These are the MetaData Extractors 
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import TokenTextSplitter

text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)
title_extractor = TitleExtractor(nodes=4)
qa_extractor = QuestionsAnsweredExtractor(questions=3)

from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor, qa_extractor]
)

# Use 'await' to run the asynchronous 'arun' method
nodes = await pipeline.arun(
    documents=documents,
    in_place=True,
    show_progress=True,
)

nodes

In [None]:
nodes[1].metadata  # Now also includes document_title and questions_this_excerpt_can_answer

{'page_label': '1',
 'file_name': '2023_canadian_budget.pdf',
 'file_path': '/home/brooks/rag_llama_index/rag_llama_tutorial/data/2023_canadian_budget.pdf',
 'file_type': 'application/pdf',
 'file_size': 376126,
 'creation_date': '2024-11-05',
 'last_modified_date': '2024-11-05',
 'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"Key Components and Measures of the 2023 Canadian Federal Budget"\n\nThis title accurately captures the essence of the context, highlighting the unique entities (e.g., budget, government, Prime Minister), themes (e.g., policy objectives, investments in green growth, dental care program), and other relevant information. It is clear, concise, and informative, making it suitable for a document that aims to provide an overview of the 2023 Canadian Federal Budget.',
 'questions_this_excerpt_can_answer': 'Based on the provided context, here are three potential questions with specific answers that are unlikely to 

##### Adding Embeddings

In [None]:
# Embed one sample sentence with the globally configured embedding model.
sample_sentence = "It is raining cats and dogs here!"
embedding_test = Settings.embed_model.get_text_embedding(sample_sentence)

# BAAI/bge-small-en-v1.5 yields 384-dimensional vectors:
# https://huggingface.co/BAAI/bge-small-en-v1.5
len(embedding_test)

384

##### Embeddings stored in a VectorStoreIndex using a pipeline

##### Qdrant Vector Store Implementation

In [None]:
# Make sure the Qdrant dependencies are installed before running the next cell,
# e.g. `%pip install qdrant-client llama-index-vector-stores-qdrant`

In [27]:
import asyncio # This is an asyncio case

### These are the MetaData Extractors 
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
# Text splitter for the document 
from llama_index.core.node_parser import TokenTextSplitter
# The Ingestion pipeline 
from llama_index.core.ingestion import IngestionPipeline

# Vector store to store our low level created embeddings 
from llama_index.vector_stores.qdrant import QdrantVectorStore

import qdrant_client 

from llama_index.core import VectorStoreIndex

# you can use :memory: mode for fast and light-weight experiments, No deployment req.
client = qdrant_client.QdrantClient(location=":memory:")
vector_qdrant_store = QdrantVectorStore(client=client, collection_name="test_store")

text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)
title_extractor = TitleExtractor(nodes=4)
qa_extractor = QuestionsAnsweredExtractor(questions=3)



pipeline = IngestionPipeline(
    name="ETL and Qdrant Store pipeline",
    transformations=[text_splitter, title_extractor, qa_extractor],
    vector_store=vector_qdrant_store
)

# Use 'await' to run the asynchronous 'arun' method
low_level_nodes = await pipeline.arun(
    documents=documents,
    in_place=True,
    show_progress=True,
)

qdrant_stored_index = VectorStoreIndex.from_vector_store(vector_qdrant_store)

  return _validate_core_schema(schema)
Parsing nodes: 100%|██████████| 4/4 [00:00<00:00, 123.38it/s]
100%|██████████| 2/2 [00:06<00:00,  3.13s/it]
100%|██████████| 1/1 [00:01<00:00,  1.66s/it]
100%|██████████| 3/3 [00:03<00:00,  1.13s/it]
100%|██████████| 2/2 [00:03<00:00,  1.61s/it]
100%|██████████| 8/8 [00:19<00:00,  2.44s/it]


##### View the nodes produced by the low-level API pipeline (custom vector store)

In [29]:
# Print the ID of every node returned by the Qdrant pipeline run, one per line.
for node in low_level_nodes:
    node_id = node.id_
    print(node_id)

777fa0de-3357-4759-8a79-b2699ab789eb
334f722a-fa7f-44e9-b8ab-0cecf2f62166
b27e25df-bc1b-4de1-906b-16bc744cbf60
ed4dec14-e665-4f9b-af28-0d12904c006f
fa9ea975-db7a-49db-b011-d7d972eb46a1
aa18f65d-a700-4a33-a926-b320173000a1
0f4d7d4a-1450-4426-ac01-2401590d3504
6197071e-0d55-467c-921f-87e74e32f193


In [None]:
# NOTE(review): `_get_node_with_embedding` is a private VectorStoreIndex method;
# it presumably returns the nodes passed in with embeddings attached rather than
# reading them back from the store — confirm before relying on it.
embedded_low_level_nodes = qdrant_stored_index._get_node_with_embedding(low_level_nodes)

# Display only the first node (its ID is the first one in the list printed above).
print(embedded_low_level_nodes[0])

Node ID: 777fa0de-3357-4759-8a79-b2699ab789eb
Text: ‹ 2022 2024›2023 budget of the Canadian federal government
Submitted 28 March 2023 Presented 28 March 2023 Parliament 44th Party
Liberal Finance ministerChrystia Freeland Total revenue$456.8 billion
(projected) Total expenditures$496.9 billion (projected) Deﬁcit $40.1
billion (projected)[ 1 ] GDP TBA Website 2023 Budget (http
s://www.budget.can ...


##### See what has been produced by the high-level Transformation API

In [42]:
# Print the ID of every entry in `nodes`, one per line.
# NOTE(review): in file order, `nodes` was last assigned by the lower-level
# `pipeline.arun(...)` cell, not by VectorStoreIndex.from_documents — confirm
# this is the list intended here.
for node in nodes:
    node_id = node.id_
    print(node_id)

6e9143f6-288e-449b-99fc-bfed45757dae
6d57812b-c448-44ae-a6b7-a29d47483251
9bd1528a-a1bd-4a16-87f4-6b21e9f96abc
072dd4bb-f8fb-4c66-9c7b-ba4305e7cc0d
82210f6a-132d-47eb-b32d-bb3d3bd48c9c
b1d85874-f0e1-45be-93ef-cd389445cba6
13ad8f3d-d0ee-4dc9-946f-25749c9d470b
1eb9f8e6-e44a-4092-baee-53cdf46a4ceb


In [41]:
# NOTE(review): relies on the private `_get_node_with_embedding` helper of
# VectorStoreIndex; presumably it attaches embeddings to the nodes passed in —
# confirm against the installed llama_index version.
embedded_nodes = vector_index._get_node_with_embedding(nodes)

# Display only the first node.
print(embedded_nodes[0])

Node ID: 6e9143f6-288e-449b-99fc-bfed45757dae
Text: ‹ 2022 2024›2023 budget of the Canadian federal government
Submitted 28 March 2023 Presented 28 March 2023 Parliament 44th Party
Liberal Finance ministerChrystia Freeland Total revenue$456.8 billion
(projected) Total expenditures$496.9 billion (projected) Deﬁcit $40.1
billion (projected)[ 1 ] GDP TBA Website 2023 Budget (http
s://www.budget.can ...
