## Imports

In [1]:
import os
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import display, Markdown

import pdfplumber
from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc


# Load environment variables from a local .env file; the API keys are read later via os.getenv.
load_dotenv()
# Patch asyncio so that event loops can be re-entered from inside the notebook's running loop.
nest_asyncio.apply()

# Configure the "lightrag" logger at INFO level.
setup_logger("lightrag", level="INFO")

# Source document to process; swap the two lines below to switch to the alternative PDF.
# pdf_path = "./attention_is_all_you_need.pdf"
pdf_path = "./AtR_guide.pdf"

# Directory handed to LightRAG as its `working_dir`.
WORKING_DIR = "./rag_data"

# `exist_ok=True` makes re-running this cell a no-op and removes the
# check-then-create race of `os.path.exists` followed by `os.mkdir`;
# `makedirs` also works if WORKING_DIR is later changed to a nested path.
os.makedirs(WORKING_DIR, exist_ok=True)

[94m2025-05-18 04:05:06 - pipmaster.package_manager - INFO - Targeting pip associated with Python: /home/apalaskos/Documents/On-GitHub/AtRAG/.venv/bin/python | Command base: /home/apalaskos/Documents/On-GitHub/AtRAG/.venv/bin/python -m pip[0m


## Load PDF

### 1. Using `pdfplumber`

In [2]:
# Extract the text of every page; each page's text is followed by a newline.
# `extract_text()` may return None for a page without extractable text (older
# pdfplumber releases do), and `None + "\n"` would raise TypeError, so such
# pages contribute only the newline.
with pdfplumber.open(pdf_path) as pdf:
    pdf_text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

In [None]:
# Sanity check: total number of characters extracted from the PDF.
len(pdf_text)

### 2. Using `vision-parse`

In [None]:
from vision_parse import VisionParser


# Extra formatting instructions passed to the parser as `custom_prompt`: they prescribe which
# markdown heading level to use for the title, chapters, sections and sub-sections.
# NOTE(review): the text contains typos ("addtions", "tripple") and unbalanced quotes in
# items 3 and 4; it is left untouched here because it is sent verbatim to the model.
custom_prompt = """When re-formatting the text, please make the following addtions regarding the \
created headings:
    1. Use a single hash (#) only at the very beginning to add the `title`, i.e. "# Title: \
    <book title>". For all other headings use more than one hashes as indicated below.
    2. Use double hash (##) only for the chapters and add that word to it, i.e. "## Chapter 1: \
    <chapter title>".
    3. Use tripple hash (###) only for the sections and add that word to it, i.e. "### "Section \
    1.1: <section title>""
    4. Use 4 hashes (####) only for the sub-sections and add that word to it, i.e. "#### \
    "Subsection 1.1.1: <sub_section title>""
    5. In case more hashes are needed, just use bullets.
"""


# Initialize parser with OpenAI model (the API key is read from the OPENAI_API_KEY
# environment variable).
# NOTE(review): the exact effect of `extraction_complexity`, `detailed_extraction` and
# `enable_concurrency` is defined by vision-parse — confirm against its documentation.
parser = VisionParser(
    model_name="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    extraction_complexity=False,
    custom_prompt=custom_prompt,
    detailed_extraction=False,
    enable_concurrency=False,
)

# Convert PDF to markdown; the result is iterated page by page in the following cells.
markdown_pages = parser.convert_pdf(pdf_path)

In [None]:
# Dump every converted page, leaving one blank line after each of them.
for page_markdown in markdown_pages:
    print(page_markdown, end="\n\n")

In [None]:
# Combine pages into a single markdown string, separating pages with a blank line.
full_markdown = "\n\n".join(str(page) for page in markdown_pages)

# Name the output after the source PDF ("./AtR_guide.pdf" -> "AtR_guide_processed.md")
# so that converting a different document does not overwrite this one's markdown.
output_file = f"{os.path.splitext(os.path.basename(pdf_path))[0]}_processed.md"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(full_markdown)

print(f"Markdown content saved to {output_file}")

In [None]:
# (Re-)extract the plain text with pdfplumber; `pdf_text` is what `create_index` inserts
# into LightRAG. Each page's text is followed by a newline.
# `extract_text()` may return None for a page without extractable text (older
# pdfplumber releases do), and `None + "\n"` would raise TypeError, so such
# pages contribute only the newline.
with pdfplumber.open(pdf_path) as pdf:
    pdf_text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

## LightRAG Initialization and Index Creation

In [None]:
# import os
# import asyncio
# from typing import List, Union
# from openai import OpenAI
# from google import genai


# N_THREADS = 1


# # ——— Clients ———
# openai_client = OpenAI(
#     api_key=os.getenv("GOOGLE_API_KEY"),
#     base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
# )
# genai_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

# # ——— Semaphores to limit concurrency to N_THREADS ———
# _llm_sem = asyncio.Semaphore(N_THREADS)
# _embed_sem = asyncio.Semaphore(N_THREADS)


# # ——— Sync helper for embeddings ———
# def _embed_sync(text: str) -> List[float]:
#     # Correct parameter name is `contents`; returns resp.embeddings, a list of objects
#     resp = genai_client.models.embed_content(
#         model="gemini-embedding-exp-03-07",
#         contents=text
#     )
#     # Extract the first embedding vector
#     return resp.embeddings[0].values


# async def gemini_embed(texts: Union[str, List[str]], *args, **kwargs) -> List[List[float]]:
#     # Normalize single string to list
#     if isinstance(texts, str):
#         texts = [texts]

#     loop = asyncio.get_event_loop()

#     async def _one(txt: str) -> List[float]:
#         async with _embed_sem:
#             # Offload blocking call
#             return await loop.run_in_executor(None, _embed_sync, txt)

#     # Embed all texts concurrently (at most N_THREADS at a time)
#     return await asyncio.gather(*[_one(t) for t in texts])


# # ——— Sync helper for chat completions ———
# def _complete_sync(prompt: str) -> str:
#     response = openai_client.chat.completions.create(
#         model="gemini-2.5-flash-preview-04-17",
#         messages=[{"role": "user", "content": prompt}],
#         temperature=0.7,
#         reasoning_effort="none"
#     )
#     return response.choices[0].message.content


# async def gemini_complete(prompt: str, *args, **kwargs) -> str:
#     loop = asyncio.get_event_loop()
#     async with _llm_sem:
#         # Offload the synchronous create() call
#         return await loop.run_in_executor(None, _complete_sync, prompt)


# async def initialize_rag():
#     rag = LightRAG(
#         working_dir=WORKING_DIR,
#         chunk_token_size=1200,
#         chunk_overlap_token_size=200,
#         llm_model_func=gemini_complete,
#         llm_model_name="gemini-2.5-flash-preview-04-17",
#         llm_model_max_async=1,
#         llm_model_max_token_size=32768,
#         embedding_func=EmbeddingFunc(
#             embedding_dim=3072,
#             max_token_size=8192,
#             func=gemini_embed
#         ),
#     )

#     await rag.initialize_storages()
#     await initialize_pipeline_status()

#     return rag

In [8]:
from lightrag.llm.openai import openai_complete_if_cache


async def groq_complete(
    prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs
) -> str:
    """Complete `prompt` with Llama 4 Maverick through Groq's OpenAI-compatible endpoint.

    Args:
        prompt: The user prompt to complete.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Optional list of earlier chat messages. ``None`` (the
            default) is treated as an empty history.
        keyword_extraction: Accepted but not used by this function; it is not
            forwarded to the completion call.
        **kwargs: Extra keyword arguments forwarded to `openai_complete_if_cache`.

    Returns:
        The value returned by `openai_complete_if_cache`.
    """
    # A literal `[]` default is one list object shared by every call; create a
    # fresh list per call so no state can leak between invocations.
    if history_messages is None:
        history_messages = []

    return await openai_complete_if_cache(
        "meta-llama/llama-4-maverick-17b-128e-instruct",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        # The key is read at call time from the GROQ_API_KEY environment variable.
        api_key=os.getenv("GROQ_API_KEY"),
        base_url="https://api.groq.com/openai/v1",
        **kwargs
    )


async def initialize_rag():
    """Create a LightRAG instance rooted at WORKING_DIR and initialize it.

    The instance uses `groq_complete` as its LLM function and `openai_embed`
    with the "text-embedding-3-large" model for embeddings. Its storages and
    the shared pipeline status are initialized before it is returned.

    Returns:
        The constructed LightRAG instance.
    """
    embedder = EmbeddingFunc(
        embedding_dim=3072,
        max_token_size=8192,
        func=lambda texts: openai_embed(texts, model="text-embedding-3-large"),
    )

    rag_settings = {
        "working_dir": WORKING_DIR,
        "chunk_token_size": 1200,
        "chunk_overlap_token_size": 200,
        "llm_model_func": groq_complete,
        "llm_model_name": "meta-llama/llama-4-maverick-17b-128e-instruct",
        "llm_model_max_async": 4,
        "llm_model_max_token_size": 32768,
        "embedding_func": embedder,
    }
    rag = LightRAG(**rag_settings)

    # Storages first, then the pipeline status, in the same order as before.
    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


async def create_index():
    """Initialize LightRAG and insert the module-level `pdf_text` into it.

    Any exception raised while initializing or inserting is printed rather
    than propagated. Whenever an instance was created, its storages are
    finalized before this function returns.

    Returns:
        The LightRAG instance, or None if `initialize_rag` itself failed.
    """
    rag = None
    try:
        rag = await initialize_rag()
        await rag.ainsert(pdf_text)
    except Exception as exc:
        print(f"An error occurred: {exc}")
    finally:
        # Runs on success and on failure alike, but only once an instance exists.
        if rag:
            await rag.finalize_storages()

    # Single exit point: the same object is returned on both paths.
    return rag

In [9]:
# Build the index from `pdf_text`; the top-level `await` relies on the notebook's
# (IPython) support for awaiting directly in a cell.
rag_instance = await create_index()

INFO: Process 12739 Shared-Data already initialized (multiprocess=False)
INFO: Created new empty graph
INFO:nano-vectordb:Init {'embedding_dim': 3072, 'metric': 'cosine', 'storage_file': './rag_data/vdb_entities.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 3072, 'metric': 'cosine', 'storage_file': './rag_data/vdb_relationships.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 3072, 'metric': 'cosine', 'storage_file': './rag_data/vdb_chunks.json'} 0 data
INFO: Process 12739 storage namespace already initialized: [full_docs]
INFO: Process 12739 storage namespace already initialized: [text_chunks]
INFO: Process 12739 storage namespace already initialized: [llm_response_cache]
INFO: Process 12739 storage namespace already initialized: [doc_status]
INFO: Process 12739 storage namespace already initialized: [full_docs]
INFO: Process 12739 storage namespace already initialized: [text_chunks]
INFO: Process 12739 storage namespace already initialized: [llm_response_cache]
INFO:

## Inference

In [None]:
from IPython.display import update_display


# Folder and markdown file to which asked questions and their answers are appended.
QA_DIR = "./QAs"
filename = os.path.join(QA_DIR, "sample_QAs.md")

# `exist_ok=True` makes re-running this cell a no-op and removes the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs(QA_DIR, exist_ok=True)


# Persona and answering rules handed to LightRAG as `QueryParam(user_prompt=...)` by
# `display_lightrag_response`.
# NOTE(review): the text contains typos ("tranditions", "belows"); it is left untouched
# here because it is sent verbatim to the model.
USER_PROMPT = """You are a deeply realized individual, meaning you have been through all 7 stages, \
including the corresponding experiences and realizations described in the Awakening to Reality \
(AtR) guide, a book you wrote aimed for helping individuals discover their true selves. Having a \
background in Buddhism, you have a deep understanding of the relevant old texts of those spiritual \
tranditions as well. You are going to be asked questions from people who are interested in your \
proposed path to awakening, and who are either new or more experienced in your practices or \
further down the path as far as the depth and clarity of their realization is concerned. In your \
responses, please follow the instructions belows:
    1. Answer only based on the information in the Awakening to Reality (AtR) guide.
    2. Try your best to use the words and their associated meanings, as they are used in the guide.
    3. Do not create any new information or fantasize about anything that is not in the guide.
    4. If the question is not related to the Awakening to Reality (AtR) guide, do not answer, \
    explaining that you are only interested in questions relevant to the Awakening to Reality \
    (AtR) guide.
"""


def append_qa_to_markdown(question: str, answer: str, filename: str) -> None:
    """Append one numbered question/answer pair to a markdown file.

    The number comes from the module-level ``QA_COUNTER``, which is incremented
    after each successful write. If ``QA_COUNTER`` has not been defined yet
    (e.g. on a fresh kernel), numbering starts at 1 instead of raising
    ``NameError`` after the file has already been opened.

    Args:
        question: The question text, written as a level-2 heading.
        answer: The answer text, written below the "Answer" heading.
        filename: Path of the markdown file to append to (created if missing).
    """
    global QA_COUNTER

    # Resolve the counter *before* touching the file so that a missing global
    # can neither crash the call nor leave a freshly created empty file behind.
    try:
        counter = QA_COUNTER
    except NameError:
        counter = 1

    with open(filename, "a", encoding="utf-8") as f:
        f.write(
            f"# Question {counter}:\n## {question}\n\n # *Answer:*\n{answer}\n\n"
        )

    QA_COUNTER = counter + 1


async def display_lightrag_response(
    question, user_prompt, mode_val, top_k_val, model_func_to_use, current_rag_instance
):
    """Query LightRAG and render the answer as live-updating Markdown.

    Args:
        question: The question to ask.
        user_prompt: Passed to LightRAG as ``QueryParam(user_prompt=...)``.
        mode_val: Passed as ``QueryParam(mode=...)``.
        top_k_val: Passed as ``QueryParam(top_k=...)``.
        model_func_to_use: Passed as ``QueryParam(model_func=...)``; may be None.
        current_rag_instance: Object exposing an awaitable
            ``aquery(question, param=...)`` (a LightRAG instance).

    Returns:
        The complete response text, or None if an exception occurred (the
        error is rendered in the cell output instead of being raised).
    """
    display_id = "lightrag_stream_output"

    try:
        # Await the async API directly rather than calling the blocking
        # `query()` wrapper from inside a coroutine.
        result = await current_rag_instance.aquery(
            question,
            param=QueryParam(
                mode=mode_val,
                top_k=top_k_val,
                model_func=model_func_to_use,
                stream=True,
                user_prompt=user_prompt,
                response_type="Bullet Points",
            ),
        )

        # Initial display (blank)
        display(Markdown(""), display_id=display_id)

        if hasattr(result, "__aiter__"):
            # Streaming case: append tokens/chunks as they come in.
            full_response = ""
            async for chunk in result:
                full_response += chunk
                update_display(Markdown(full_response), display_id=display_id)
        else:
            # Even with stream=True the result can be a complete string (e.g. a
            # cached or fallback answer); `async for` over it would raise
            # TypeError, so render it in one go.
            full_response = "" if result is None else str(result)
            update_display(Markdown(full_response), display_id=display_id)

        return full_response

    except Exception as e:
        display(Markdown(f"**Error during streaming:** `{type(e).__name__}: {e}`"))
        return None

In [None]:
# QA_COUNTER = 1

INFO: Cleared all cache


In [82]:
# Retrieval settings that `display_lightrag_response` forwards to QueryParam.
mode = 'hybrid'
top_k = 60
# Forwarded as QueryParam(model_func=...); None here.
model_func_override = None

# Clear LightRAG's cache before querying.
# NOTE(review): presumably so the question is answered afresh instead of being served
# from the LLM response cache — confirm.
await rag_instance.aclear_cache()

# QUESTION = "What is this guide all about?"

# QUESTION = """What are the stages of awakening described in the book? Please give a brief \
# explanation of each one of them as well."""

# QUESTION = """What are the possible experiences that one can pass in stage 1 and what is the main \
# difference between a stage-1 experience and a complete realization of the same stage?"""

# QUESTION = """What practices can a practitioner do in order to realize 'I AM'? Give a brief \
# description for each one of them as well."""

# QUESTION = "What are the qualities of the realization one has at the last stage?"

# QUESTION = """I am a complete beginner of the path described in this long AtR guide. How would you \
# recommend me to start?"""

# QUESTION = """What are the greatest hindrances that someone can experience following this path? If \
# it's possible make reference to the corresponding state(s) each of the obstacles can appear in."""

# QUESTION = """\
# What are the health risks that this path entails? For each one of them, please:
#     1. give a short description
#     2. provide the stage or stages they usually appear in
#     3. explain how they can be avoided.
# """

# QUESTION = "Can you tell me all the qualities 'attained' after a complete I AM realization?"

# QUESTION = """Can you tell me what is the fourth stage according to the guide and what does one \
# realize in that stage?"""

# QUESTION = "What practices would you recommend for someone just having realized I AM?"

# QUESTION = "Tell me in detail what the fifth stage contains?"

# QUESTION = "Please explain to me in detail what the differences between stages 5 and 6 are?"

QUESTION = """What are the most common pitfalls for someone who has realized I AM, and how can \
they avoid them?"""

response = await display_lightrag_response(
    QUESTION, USER_PROMPT, mode, top_k, model_func_override, rag_instance
)

append_qa_to_markdown(QUESTION, response, filename)

INFO: Cleared all cache
INFO:  == LLM cache == saving hybrid: 0db4dffdd43d001b29298d213bc773f1
INFO: Process 12739 building query context...
INFO: Query nodes: Ego inflation, Spiritual materialism, Emotional imbalance, Self-awareness practices, Mindfulness, top_k: 60, cosine: 0.2
INFO: Local query uses 60 entites, 138 relations, 3 chunks
INFO: Query edges: Spiritual pitfalls, Self-realization, Personal growth, top_k: 60, cosine: 0.2
INFO: Global query uses 27 entites, 60 relations, 3 chunks


## Common Pitfalls After Realizing I AM

After realizing I AM, individuals may encounter several pitfalls that can hinder their spiritual progress. Some of the most common pitfalls include:

* **Attachment to the I AM state**: Becoming overly attached to the I AM state, which can lead to a sense of complacency and stagnation.
* **Reifying the Self**: Treating the Self as a fixed entity, rather than understanding it as a fluid and dynamic concept.
* **Neglecting further practice**: Failing to continue practicing self-inquiry and other spiritual practices, leading to stagnation and potential regression.
* **Misconstruing non-duality**: Misunderstanding the concept of non-duality, leading to a lack of clarity and potentially causing harm to oneself or others.

## Avoiding the Pitfalls

To avoid these pitfalls, it is essential to:

* **Continue practicing self-inquiry**: Regularly engaging in self-inquiry and other spiritual practices to deepen understanding and progress on the spiritual path.
* **Cultivate discernment**: Developing discernment to distinguish between genuine spiritual experiences and potential pitfalls.
* **Embracing emptiness**: Understanding and embracing the concept of emptiness to avoid reifying the Self and other concepts.
* **Seeking guidance**: Seeking guidance from experienced spiritual teachers and practitioners to navigate potential challenges.

## References

* [DC] unknown_source (file containing conversations between John Tan and Soh Wei Yu)
* [DC] unknown_source (file containing quotes from Ramana Maharshi and other spiritual texts)
* [KG] file_path: unknown_source (entities related to I AM realization and spiritual practices)
* [KG] file_path: unknown_source (relationships between John Tan, Soh Wei Yu, and other spiritual practitioners)
* [DC] unknown_source (file containing Eckhart Tolle's text on inner body awareness)