# Advanced RAG in LlamaIndex

In [1]:
%pip install nest_asyncio



In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
%pip install -Uq llama-index

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Extract

In [6]:
import os
from llama_index.core import SimpleDirectoryReader

# Create a directory named 'data' if it doesn't already exist
if not os.path.exists('./data'):
    os.makedirs('./data')

# Define the source and destination paths for the PDF file
source_pdf_path = '/content/EKS-v24-10-30080.pdf'
dest_pdf_path = './data/EKS-v24-10-30080.pdf'

# Move the PDF file into the 'data' directory if it's not already there
if os.path.exists(source_pdf_path) and not os.path.exists(dest_pdf_path):
    os.rename(source_pdf_path, dest_pdf_path)

docs = SimpleDirectoryReader(input_dir="./data").load_data()

In [7]:
len(docs)  # one per page

14

In [8]:
import pprint
pprint.pprint(docs)

[Document(id_='760d1486-4b18-4af0-8986-582060bb6d6b', embedding=None, metadata={'page_label': '1', 'file_name': 'EKS-v24-10-30080.pdf', 'file_path': '/content/data/EKS-v24-10-30080.pdf', 'file_type': 'application/pdf', 'file_size': 1148439, 'creation_date': '2026-02-03', 'last_modified_date': '2026-02-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Education in the Knowledge Society 24 (2023)\nEducation in the Knowledge Society\njournal homepage http://revistas.usal.es/index.php/eks/\nEdiciones Universidad de Salamanca | https://doi.org/10.14201/eks.30080 | e30080 24 - 1\nRecibido, 09/08/2022. Revisado, 22/01/2023. Aceptado, 07/02/20

## Transform

In [9]:
# hide some keys from llm

docs[0].__dict__ # too much data about one doc

{'id_': '760d1486-4b18-4af0-8986-582060bb6d6b',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': 'EKS-v24-10-30080.pdf',
  'file_path': '/content/data/EKS-v24-10-30080.pdf',
  'file_type': 'application/pdf',
  'file_size': 1148439,
  'creation_date': '2026-02-03',
  'last_modified_date': '2026-02-03'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='Education in the Knowledge Society 24 (2023)\nEducation in the Knowledge Society\njournal homepage http://revistas.usal.es/index.php/eks/\nEdiciones Universidad de Salamanca | https://doi.org/10.14201/eks.30080 | e30080 24 - 1\nReci

In [10]:
# quick example of what the LLM and Embeddings see when with a test document

from llama_index.core import Document
from llama_index.core.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "finance",
        "author": "LlamaIndex",
    },
    # excluded_embed_metadata_keys=["file_name"],
    excluded_llm_metadata_keys=["category"],
    metadata_seperator="\n",
    metadata_template="{key}:{value}",
    text_template="Metadata:\n{metadata_str}\n-----\nContent:\n{content}",
)

print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
# print(
#     "The Embedding model sees this: \n",
#     document.get_content(metadata_mode=MetadataMode.EMBED),
# )

The LLM sees this: 
 Metadata:
file_name:super_secret_document.txt
author:LlamaIndex
-----
Content:
This is a super-customized document


In [11]:
from llama_index.core.schema import MetadataMode

# print(docs[0].get_content(metadata_mode=MetadataMode.LLM))   # what the llm sees
print(docs[0].get_content(metadata_mode=MetadataMode.EMBED)) # what embeddings see. in this case, same thing

page_label: 1
file_path: /content/data/EKS-v24-10-30080.pdf

Education in the Knowledge Society 24 (2023)
Education in the Knowledge Society
journal homepage http://revistas.usal.es/index.php/eks/
Ediciones Universidad de Salamanca | https://doi.org/10.14201/eks.30080 | e30080 24 - 1
Recibido, 09/08/2022. Revisado, 22/01/2023. Aceptado, 07/02/2023. Publicado, 26/06/2024.
e-ISSN: 2444-8729
Educational Data Science and Machine Learning: A Case Study on University 
Student Dropout in Mexico
Ciencia de Datos Educativos y aprendizaje automático: un caso de estudio sobre la 
deserción estudiantil universitaria en México
Antonieta Kuza*, Rosa Moralesb
a Facultad de Informática, Universidad Metropolitana para la Educación y el Trabajo, Buenos Aires, Argentina.
https://orcid.org/0000-0002-8696-0859 antokuz@esgcffaa.edu.ar
b Departamento de Economía, Universidad de Monterrey, Monterrey, México. 
https://orcid.org/0000-0002-7044-2600 rosa.moralesv@udem.edu
(*) Autor de correspondencia / Correspon

In [12]:
for doc in docs:
    # define the content/metadata template
    doc.text_template = "Metadata:\n{metadata_str}\n---\nContent:\n{content}"

    # exclude page label from embedding
    if "page_label" not in doc.excluded_embed_metadata_keys:
        doc.excluded_embed_metadata_keys.append("page_label")

In [13]:
# after editing the content seen by embedings

print(docs[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
file_path: /content/data/EKS-v24-10-30080.pdf
---
Content:
Education in the Knowledge Society 24 (2023)
Education in the Knowledge Society
journal homepage http://revistas.usal.es/index.php/eks/
Ediciones Universidad de Salamanca | https://doi.org/10.14201/eks.30080 | e30080 24 - 1
Recibido, 09/08/2022. Revisado, 22/01/2023. Aceptado, 07/02/2023. Publicado, 26/06/2024.
e-ISSN: 2444-8729
Educational Data Science and Machine Learning: A Case Study on University 
Student Dropout in Mexico
Ciencia de Datos Educativos y aprendizaje automático: un caso de estudio sobre la 
deserción estudiantil universitaria en México
Antonieta Kuza*, Rosa Moralesb
a Facultad de Informática, Universidad Metropolitana para la Educación y el Trabajo, Buenos Aires, Argentina.
https://orcid.org/0000-0002-8696-0859 antokuz@esgcffaa.edu.ar
b Departamento de Economía, Universidad de Monterrey, Monterrey, México. 
https://orcid.org/0000-0002-7044-2600 rosa.moralesv@udem.edu
(*) Autor de correspondencia / C

Here are other, more advanced transformations. Some require an LLM to work. We will use Qwen 2.5 32B Instruct 128k through Groq, which is an affordble, high-rate model. It should be enough to extract Q&As and titles from the documents.

In [14]:
%pip install -Uq llama-index-llms-groq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.3/566.3 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
from llama_index.llms.groq import Groq
import os
import getpass

os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

Enter your Groq API key: ··········


In [22]:
llm_transformations = Groq(model="llama-3.1-8b-instant", api_key=os.environ["GROQ_API_KEY"])

In [None]:
# other transformations

from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(
    separator=" ", chunk_size=1024, chunk_overlap=128
)
title_extractor = TitleExtractor(llm=llm_transformations, nodes=5)
qa_extractor = QuestionsAnsweredExtractor(llm=llm_transformations, questions=3)


from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        title_extractor,
        qa_extractor
    ]
)

nodes = pipeline.run(
    documents=docs,
    in_place=True,
    show_progress=True,
)

In [27]:
import time
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.llms import ChatMessage

# =========================
# FIX 1: splitter ultra chico
# =========================
text_splitter = SentenceSplitter(
    separator=" ",
    chunk_size=384,      # 🔥 más chico todavía
    chunk_overlap=48
)

# =========================
# FIX 2: extractors MINIMAL
# =========================
title_extractor = TitleExtractor(
    llm=llm_transformations,
    nodes=1              # 🔥 SOLO 1
)

qa_extractor = QuestionsAnsweredExtractor(
    llm=llm_transformations,
    questions=1
)

pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        title_extractor,
        qa_extractor
    ]
)

# =========================
# FIX 3: throttle MANUAL
# =========================
nodes = []
for i, doc in enumerate(docs):
    print(f"Procesando doc {i+1}/{len(docs)}")

    out = pipeline.run(
        documents=[doc],     # 🔥 UNO POR UNO
        in_place=True,
        show_progress=False,
        num_workers=1
    )

    nodes.extend(out)

    # 🔥 pausa obligatoria (Groq-friendly)
    time.sleep(5)



Procesando doc 1/14


100%|██████████| 1/1 [00:00<00:00,  1.13it/s]
100%|██████████| 3/3 [00:01<00:00,  2.61it/s]


Procesando doc 2/14


100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
100%|██████████| 5/5 [00:16<00:00,  3.30s/it]


Procesando doc 3/14


100%|██████████| 1/1 [00:03<00:00,  3.36s/it]
100%|██████████| 4/4 [00:35<00:00,  8.89s/it]


Procesando doc 4/14


100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
100%|██████████| 3/3 [00:20<00:00,  6.73s/it]


Procesando doc 5/14


100%|██████████| 1/1 [00:03<00:00,  3.09s/it]
100%|██████████| 5/5 [00:35<00:00,  7.03s/it]


Procesando doc 6/14


100%|██████████| 1/1 [00:03<00:00,  3.30s/it]
100%|██████████| 4/4 [00:34<00:00,  8.52s/it]


Procesando doc 7/14


100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
100%|██████████| 3/3 [00:23<00:00,  7.97s/it]


Procesando doc 8/14


100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
100%|██████████| 4/4 [00:30<00:00,  7.56s/it]


Procesando doc 9/14


100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
100%|██████████| 3/3 [00:21<00:00,  7.12s/it]


Procesando doc 10/14


100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
100%|██████████| 2/2 [00:16<00:00,  8.01s/it]


Procesando doc 11/14


100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
100%|██████████| 3/3 [00:20<00:00,  6.99s/it]


Procesando doc 12/14


100%|██████████| 1/1 [00:03<00:00,  3.21s/it]
100%|██████████| 5/5 [00:35<00:00,  7.10s/it]


Procesando doc 13/14


100%|██████████| 1/1 [00:04<00:00,  4.15s/it]
100%|██████████| 5/5 [00:37<00:00,  7.57s/it]


Procesando doc 14/14


100%|██████████| 1/1 [00:03<00:00,  3.18s/it]
100%|██████████| 5/5 [00:30<00:00,  6.10s/it]


By default, Llamaindex uses OpenAI's embedding models. But you can choose to load a free model from HuggingFace too (but it it will be slower).

In [28]:
len(nodes)

54

In [29]:
import pprint

# pprint.pprint(nodes[0].__dict__)

print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

[Excerpt from document]
page_label: 1
file_path: /content/data/EKS-v24-10-30080.pdf
document_title: Based on the provided information, a comprehensive title for the document could be:

"University Student Dropout in Mexico: A Case Study on Educational Data Science and Machine Learning Applied to Universidad Metropolitana para la Educación y el Trabajo and Universidad de Monterrey"

However, since the authors are not mentioned in the provided context, the title would be more accurate without including the authors' names.
questions_this_excerpt_can_answer: Based on the provided context, here are some questions that this context can provide specific answers to, which are unlikely to be found elsewhere:

1. **What is the exact date of publication of the article "Educational Data Science and Machine Learning: A Case Study on University Student Dropout in Mexico" in the journal "Education in the Knowledge Society"?**

This question can be answered based on the information provided in the exc

## Index

In [30]:
%pip install -Uq llama-index-embeddings-huggingface

In [31]:
# Embeddings

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

hf_embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

test_embed = hf_embeddings.get_text_embedding("Hello world")
print(test_embed)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[0.015196100808680058, -0.022570667788386345, 0.0085471011698246, -0.07417059689760208, 0.0038364154752343893, 0.0027135491836816072, -0.0312679260969162, 0.04463403671979904, 0.044055208563804626, -0.007871134206652641, -0.025200756266713142, -0.033366620540618896, 0.014427922666072845, 0.04653818905353546, 0.008555104956030846, -0.016145728528499603, 0.007405802607536316, -0.01901242695748806, -0.114726223051548, -0.01815761812031269, 0.12635929882526398, 0.02970289997756481, 0.025281012058258057, -0.034217868000268936, -0.04099970683455467, 0.006617335136979818, 0.010270599275827408, 0.022362269461154938, 0.004436342045664787, -0.12730959057807922, -0.0161492470651865, -0.020380133762955666, 0.047212108969688416, 0.011579900048673153, 0.0681871548295021, 0.007298617158085108, -0.017852986231446266, 0.04078212380409241, -0.010269463062286377, 0.023757092654705048, 0.01060289703309536, -0.028584439307451248, 0.00815972313284874, -0.015180555172264576, 0.0308962594717741, -0.0659798905

In [32]:
# create index

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(nodes, embed_model=hf_embeddings)

## Query

In [33]:
llm_querying = Groq(model="llama-3.3-70b-versatile", api_key=os.environ["GROQ_API_KEY"])

query_engine = index.as_query_engine(llm=llm_querying)
response = query_engine.query(
    "what does this model do?"
)

print(response)

This model, specifically the XGBoost model, is used to predict university student dropout in Mexico. It analyzes various predictors, such as the student's average in the first period, scholarship percentage, and region, to determine the likelihood of a student continuing in the university. The model provides a predictive approach for identifying risk factors and optimizing retention strategies.


In [34]:
response.__dict__

{'response': "This model, specifically the XGBoost model, is used to predict university student dropout in Mexico. It analyzes various predictors, such as the student's average in the first period, scholarship percentage, and region, to determine the likelihood of a student continuing in the university. The model provides a predictive approach for identifying risk factors and optimizing retention strategies.",
 'source_nodes': [NodeWithScore(node=TextNode(id_='4eeec4ce-73fc-4e73-98a5-b6d6e4c75162', embedding=None, metadata={'page_label': '8', 'file_name': 'EKS-v24-10-30080.pdf', 'file_path': '/content/data/EKS-v24-10-30080.pdf', 'file_type': 'application/pdf', 'file_size': 1148439, 'creation_date': '2026-02-03', 'last_modified_date': '2026-02-03', 'document_title': 'Based on the provided entities, a comprehensive title for this document could be:\n\n"Algoritmos de Aprendizaje Supervisado: Regresión Logística y Procesamiento de Datos con Ediciones Universidad de Salamanca"\n\nHowever, i

## Store

In [35]:
index.storage_context.persist(persist_dir="./vectors")

In [36]:
from llama_index.core import StorageContext, load_index_from_storage

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./vectors")

# load index
index_from_storage = load_index_from_storage(storage_context, embed_model=hf_embeddings)

In [37]:
qa = index_from_storage.as_query_engine(llm=llm_querying)

In [38]:
response = qa.query("En que contribuye el articulo?")
print(response)

El artículo contribuye a la literatura creciente en deserción estudiantil universitaria al utilizar técnicas de Aprendizaje Automático (AA) que permiten clasificar y jerarquizar las categorías que predicen el abandono estudiantil institucional a nivel de educación superior.


# Using Vector Stores

In [39]:
%pip install -Uq chromadb
%pip install -Uq llama-index-vector-stores-chroma

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [40]:
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# initialize client, setting path to save data
db = chromadb.PersistentClient(path="./chroma_db")

# create collection
chroma_collection = db.get_or_create_collection("healthGPT")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# create your index
index = VectorStoreIndex(
    nodes, storage_context=storage_context, embed_model=hf_embeddings
)

# You can also load from documents and apply transformations in place
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, transformations=[]
# )

# Or you can initialize your index from your vector store and then add the nodes
# index = VectorStoreIndex.from_vector_store(
#     vector_store=vector_store, embed_model=hf_embeddings
# )
# index.insert_nodes(nodes)


# create a query engine and query
query_engine = index.as_query_engine(llm=llm_querying)

In [41]:
response = query_engine.query("Este estudio en que contribuye?")
print(response)

Este estudio contribuye a la literatura creciente en deserción estudiantil universitaria al usar técnicas de Aprendizaje Automático (AA) que permiten clasificar y jerarquizar las categorías que predicen el abandono estudiantil institucional a nivel de educación superior.
