## Loading Data
### Ingestion pipline:
`Load the data` -> `Transform the data` -> `Store and Index the data`

* Load the data

In [3]:
from llama_index.core import SimpleDirectoryReader

document = SimpleDirectoryReader(input_files=['data/Hackers-Painters-BigIdeasFromTheComputerAgebyPaulGraham.pdf']).load_data()

In [19]:
len(document)

274

In [20]:
document[16].__dict__

{'id_': '051963d5-8d80-4fa0-a5bb-fb24a91bc15e',
 'embedding': None,
 'metadata': {'page_label': '3',
  'file_name': 'Hackers-Painters-BigIdeasFromTheComputerAgebyPaulGraham.pdf',
  'file_path': 'data/Hackers-Painters-BigIdeasFromTheComputerAgebyPaulGraham.pdf',
  'file_type': 'application/pdf',
  'file_size': 1742887,
  'creation_date': '2025-06-30',
  'last_modified_date': '2025-06-30'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='why nerds are unpopular\nit would be a step up. Even for someone in the eightieth per-\ncentile (assuming, as everyone seemed to then, that intelligence is\na scal

In [None]:
from llama_index.core import Document

manual_doc = Document(text="This a random text for manual document in llama-index.")
# metadata can also be added to Document
# manual_doc = Document(text="This a random text for manual document in llama-index.", metadata = {'filename': "text.txt"})

In [None]:
manual_doc.__dict__

{'id_': '112baa88-76d2-41d0-90f0-e4bc91059686',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='This a random text for manual document in llama-index.', path=None, url=None, mimetype=None),
 'image_resource': None,
 'audio_resource': None,
 'video_resource': None,
 'text_template': '{metadata_str}\n\n{content}'}

* Transform Data

In [16]:
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(
    chunk_size=128, # in tokens
    chunk_overlap=16, #in tokens
    paragraph_separator="\n\n"
)

nodes = parser.get_nodes_from_documents(documents=document, show_progress=True)

Parsing nodes: 100%|██████████| 274/274 [00:00<00:00, 738.27it/s]


In [21]:
len(nodes)

1692

In [18]:
nodes[76].__dict__

{'id_': 'f04fcf19-7b53-47cd-9435-910d0088cbab',
 'embedding': None,
 'metadata': {'page_label': '8',
  'file_name': 'Hackers-Painters-BigIdeasFromTheComputerAgebyPaulGraham.pdf',
  'file_path': 'data/Hackers-Painters-BigIdeasFromTheComputerAgebyPaulGraham.pdf',
  'file_type': 'application/pdf',
  'file_size': 1742887,
  'creation_date': '2025-06-30',
  'last_modified_date': '2025-06-30'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='be1794e6-7434-426d-a312-d2635fcd4413', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '8', 'file_name': 'Hackers-Painters-BigIdeasFromTheComputerAgebyPaulGraham.pdf', 'file_path': 'data/Hackers-Painters-BigIdeasFromTheComputerAge

In [132]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

nest_asyncio.apply()

load_dotenv()


CO_API_KEY = os.environ['CO_API_KEY'] or getpass("Enter your Cohere API key: ")
OPENAI_API_KEY = os.environ['OPENAI_API_KEY'] or getpass("Enter your OpenAI API key: ")
QDRANT_URL = os.environ['QDRANT_URL'] or getpass("Enter your Qdrant URL:")
QDRANT_API_KEY = os.environ['QDRANT_API_KEY'] or  getpass("Enter your Qdrant API Key:")
AZURE_AI_SEARCH_ENDPOINT= os.environ['AZURE_AI_SEARCH_ENDPOINT']
AZURE_AI_SEARCH_API_KEY= os.environ['AZURE_AI_SEARCH_API_KEY']
AZURE_OPENAI_ENDPOINT= os.environ['AZURE_OPENAI_ENDPOINT']
AZURE_API_KEY= os.environ['AZURE_API_KEY']

* Index Data

In [106]:
CO_API_KEY = os.environ['CO_API_KEY'] or getpass("Enter your Cohere API key: ")

In [28]:
from llama_index.embeddings.cohere import CohereEmbedding

embed_v3 = CohereEmbedding(model_name="embed-english-v3.0", api_key=CO_API_KEY)

embed_v3_light = CohereEmbedding(model_name="embed-english-light-v3.0", api_key=CO_API_KEY)

embed_v2 = CohereEmbedding(model_name="embed-english-v2.0", api_key=CO_API_KEY) 

In [46]:
string = "A"

string_0 = "This is a complete sentence."
string_2 = """In the pursuit of a life well-lived, one must recognize the transient nature of the 
material world and the enduring value of virtue. The Sikh Gurus taught us that the Divine Light 
resides within all, and thus, we are united in our essence beyond the superficial distinctions of 
caste, creed, or status. Similarly, the Stoics emphasized the cultivation of inner virtues such as courage, 
temperance, and wisdom, understanding that true freedom lies in mastery over one's own perceptions and actions. 
As we navigate the vicissitudes of life, let us remember that our choices are our own"""

string_3 = """In the pursuit of a life well-lived, one must recognize the transient nature of the 
material world and the enduring value of virtue. The Sikh Gurus taught us that the Divine Light 
resides within all, and thus, we are united in our essence beyond the superficial distinctions of 
caste, creed, or status. Similarly, the Stoics emphasized the cultivation of inner virtues such as courage, 
temperance, and wisdom, understanding that true freedom lies in mastery over one's own perceptions and actions. 
As we navigate the vicissitudes of life, let us remember that our choices are our own, and in choosing virtue, 
we align ourselves with the cosmic order and the teachings of the Gurus. It is through selfless service, 
compassion, and the relentless pursuit of truth that we may attain a state of inner peace and contribute 
to the harmony of the world, embodying the principles of both Sikhism and Stoicism in our daily lives
"""

In [None]:
example_embedding = embed_v3.get_text_embedding(string)

In [32]:
len(example_embedding)

1024

In [38]:
embedding2 = embed_v3_light.get_text_embedding(string)

In [39]:
len(embedding2)

384

In [40]:
embedding3 = embed_v2.get_text_embedding(string)

In [41]:
len(embedding3)

4096

[comparing similarity]

In [48]:
embed_v3.similarity(
    embed_v3.get_text_embedding(string_3),
    embed_v3.get_text_embedding(string_2),
    mode="cosine"
)

np.float64(0.9612125127558748)

## Creating index from scratch

In [55]:
import requests

def load_text_from_url(url):
    response = requests.get(url)
    response.raise_for_status
    return response.text

url = "https://www.gutenberg.org/cache/epub/10763/pg10763.txt"
text_content = load_text_from_url(url)

In [58]:
from llama_index.core import Document, VectorStoreIndex

document = Document(text=text_content[50000:60000])

In [59]:
index = VectorStoreIndex.from_documents(
    documents=[document],
    embed_model=embed_v3,
    show_progress=True
)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 21.43it/s]
Generating embeddings: 100%|██████████| 3/3 [00:09<00:00,  3.04s/it]


In [61]:
index.__dict__

{'_use_async': False,
 '_store_nodes_override': False,
 '_embed_model': CohereEmbedding(model_name='embed-english-v3.0', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x13ec0ef10>, num_workers=None, embeddings_cache=None, api_key='tLD3G1ZJFKFa0VSNSSBfBsWtn22ICpQsvCHduoYm', base_url=None, truncate='END', input_type=None, embedding_type='float'),
 '_insert_batch_size': 2048,
 '_storage_context': StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x14aaf2f90>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x14aaf3380>, vector_stores={'default': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={'fb1624a5-35ea-49d7-a575-ad6b79905f3d': [0.061157227, -0.003583908, 0.07043457, 0.036010742, -0.010551453, -0.010726929, 0.05618286, -0.06951904, -0.021194458, 0.007637024, -0.050750732, 0.022

* creating index from nodes

In [70]:
from llama_index.core import SimpleDirectoryReader


documents = SimpleDirectoryReader(input_files=["data/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"]).load_data()

len(documents)

242

In [71]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=128,
    chunk_overlap=16,
    paragraph_separator="\n\n"
)

nodes = splitter.get_nodes_from_documents(
    documents=documents,
    show_progress=True
)
len(nodes)

Parsing nodes: 100%|██████████| 242/242 [00:00<00:00, 1940.69it/s]


967

In [72]:
index_2 = VectorStoreIndex(
    nodes=nodes,
    embed_model=embed_v3,
    show_progress=True
)

Generating embeddings: 100%|██████████| 967/967 [00:51<00:00, 18.93it/s]


## Storing and Retrieving

In [None]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

embed_model = AzureOpenAIEmbedding(
    model="text-embedding-3-small",
    deployment_name="text-embedding-3-small",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version="2024-02-01", 
)



In [128]:
from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore
from llama_index.vector_stores.azureaisearch import IndexManagement
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

credential = AzureKeyCredential(AZURE_AI_SEARCH_API_KEY)
index_client = SearchIndexClient(endpoint=AZURE_AI_SEARCH_ENDPOINT, credential=credential)

index_name = "it_can_be_done"
vector_store = AzureAISearchVectorStore(
    search_or_index_client=index_client,
    index_name=index_name,
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    embed_model=embed_model,
    id_field_key="id",
    chunk_field_key="chunk",
    embedding_field_key="embedding",
    metadata_string_field_key="metadata",
    doc_id_field_key="doc_id",
)

async_search_or_index_client is None. Depending on the client type passed in, sync or async functions may not work.


In [None]:
from llama_index.core import StorageContext

storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    )

In [131]:
from llama_index.core import  VectorStoreIndex

# create the index
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
    store_nodes_override=True,
    transformation=[splitter],
    embed_model=embed_model,
    storage_context=storage_context,
)


Parsing nodes: 100%|██████████| 242/242 [00:00<00:00, 4436.96it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Generating embeddings: 100%|██████████| 242/242 [00:11<00:00, 21.83it/s]


### Retrieving

In [137]:
retriever = index.as_retriever(
    similarity_top_k=5,
    similarity_threshold=0.5)

In [139]:
retriever.retrieve("What is happiness?")

[NodeWithScore(node=TextNode(id_='4c52685c-4de9-4877-8275-5b89f0764fec', embedding=None, metadata={'page_label': '129', 'file_name': 'Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf', 'file_path': 'data/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf', 'file_type': 'application/pdf', 'file_size': 1884309, 'creation_date': '2025-06-30', 'last_modified_date': '2025-06-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='74f33b32-68f7-45ac-948c-af198485d230', node_type='4', metadata={'page_label': '129', 'file_name': 'Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf', 'file_path': 'data/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf', 'file_type': 'application/pdf', 'file_size'