Main Goal: To create a RAG system that provides information about sonnets, using Shakespeare's sonnets as the source text
# To-Do


In [1]:
import os
from dotenv import load_dotenv

In [2]:
from huggingface_hub import InferenceClient
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load variables via python-dotenv before reading them below.
load_dotenv()

# Pinecone credentials/placement and the Hugging Face token.
# Each name is bound to None when its environment variable is not set.
PINECONE_API_KEY, PINECONE_REGION, PINECONE_CLOUD, HUGGING_FACE_API = (
    os.environ.get(variable_name)
    for variable_name in (
        "PINECONE_API_KEY",
        "PINECONE_REGION",
        "PINECONE_CLOUD",
        "HUGGING_FACE_API",
    )
)

In [4]:
# Read the contents of the relative "data" directory into a list of documents.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()

In [5]:
# Report how many documents were loaded, then preview the first 500
# characters of the first one.
document_count = len(documents)
print(f"Loaded {document_count} documents\n\n")

first_document_preview = documents[0].text[:500]
print(first_document_preview)

Loaded 1 documents


From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own bright eyes,
Feed'st thy light's flame with self-substantial fuel,
Making a famine where abundance lies,
Thy self thy foe, to thy sweet self too cruel:
Thou that art now the world's fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And, tender 


In [6]:
# Break the loaded documents into overlapping chunks ("nodes").
chunking_options = {"chunk_size": 512, "chunk_overlap": 50}
splitter = SentenceSplitter(**chunking_options)
nodes = splitter.get_nodes_from_documents(documents)

In [7]:
# Print a one-element slice so the first node's full repr can be inspected.
first_node_slice = nodes[:1]
print(first_node_slice)

[TextNode(id_='71e40f00-6944-4637-be38-e58a781b5c30', embedding=None, metadata={'file_path': '/home/vertex/Desktop/RAG-using-LlamaIndex/data/Sonnets.txt', 'file_name': 'Sonnets.txt', 'file_type': 'text/plain', 'file_size': 96390, 'creation_date': '2025-07-10', 'last_modified_date': '2025-07-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a400c16a-e471-48d8-a664-3f386d5f67c0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/home/vertex/Desktop/RAG-using-LlamaIndex/data/Sonnets.txt', 'file_name': 'Sonnets.txt', 'file_type': 'text/plain', 'file_size': 96390, 'creation_date': '2025-07-10', 'last_modified_date': '2025-07-10'}, hash='d2acb3db69fc18a716639cb931dc5c9f53757707801db29cb1543711edc

In [8]:
# Embedding model loaded through the HuggingFace integration.
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=embedding_model_name)

In [9]:
# First pass: build an index over the nodes without an explicit vector store.
# `index` is assigned again further down, once the Pinecone store is set up.
index = VectorStoreIndex(nodes=nodes, embed_model=embed_model)

In [10]:
from llama_index.core import VectorStoreIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore

from pinecone import Pinecone, ServerlessSpec

In [11]:
# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "rag-llamaindex"
# Vector size the Pinecone index is created with; it has to agree with the
# size of the vectors produced by the embedding model.
embedding_dim = 384

# Only create the serverless index when no index with this name is listed.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=serverless_spec,
    )

In [12]:
# Get a handle to the Pinecone index named by `index_name`.
pinecone_index = pc.Index(index_name)

# Wrap that handle in LlamaIndex's Pinecone vector-store adapter so it can be
# used as the storage backend of a LlamaIndex index.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [13]:
# Build the index on top of the Pinecone vector store.
# VectorStoreIndex receives its vector store through a StorageContext; it has
# no `vector_store` parameter, so passing one as a bare keyword is silently
# ignored and the index would not use Pinecone at all.
# NOTE(review): constructing the index writes the nodes' embeddings to the
# store, so re-running this cell presumably upserts the same chunks again
# under new node ids -- confirm whether de-duplication is needed.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)

In [15]:
# retriever = index.as_retriever()
# results = retriever.retrieve("What is this document about?")
# results[0]

In [16]:
def _render_context_block(rank, hit):
    """Format one retrieved hit as a labelled block: source file, page, text.

    `rank` is the 1-based position of the hit in the result list. Missing
    metadata falls back to "N/A" (page) and "Unknown Source" (file name).
    """
    hit_metadata = hit.metadata or {}
    page_number = hit_metadata.get("page_number", "N/A")
    source = hit_metadata.get("file_name", "Unknown Source")
    return f"""
[Document {rank}]
Source: {source}
Page: {page_number}
Content:
{hit.get_text()}
"""


# Fetch the nodes that best match the query from the index.
retriever = index.as_retriever()
results = retriever.retrieve("What do they say about the painter")

# One formatted block per hit, joined into a single context string for the prompt.
context_blocks = [
    _render_context_block(rank, hit)
    for rank, hit in enumerate(results, start=1)
]
retrieved_text = "\n".join(context_blocks)


In [25]:
# Print the text of every retrieved hit, each followed by a dashed rule.
# The header line says "Metadata", but only the text content is printed.
divider = "-" * 40
for rank, hit in enumerate(results, start=1):
    print(f"Document {rank} Metadata:")
    print("Content:")
    print(hit.get_text())
    print(divider)

Document 1 Metadata:
Content:
O! let my looks be then the eloquence
And dumb presagers of my speaking breast,
Who plead for love, and look for recompense,
More than that tongue that more hath more express'd.
O! learn to read what silent love hath writ:
To hear with eyes belongs to love's fine wit.

Mine eye hath played the painter and hath steeled,
Thy beauty's form in table of my heart;
My body is the frame wherein 'tis held,
And perspective that is best painter's art.
For through the painter must you see his skill,
To find where your true image pictured lies,
Which in my bosom's shop is hanging still,
That hath his windows glazed with thine eyes.
Now see what good turns eyes for eyes have done:
Mine eyes have drawn thy shape, and thine for me
Are windows to my breast, where-through the sun
Delights to peep, to gaze therein on thee;
Yet eyes this cunning want to grace their art,
They draw but what they see, know not the heart.

Let those who are in favour with their stars
Of public ho

In [17]:
# Echo the assembled context string; as the cell's last expression it is shown
# as a repr, so line breaks appear as escape sequences.
retrieved_text

"\n[Document 1]\nSource: Sonnets.txt\nPage: N/A\nContent:\nO! let my looks be then the eloquence\r\nAnd dumb presagers of my speaking breast,\r\nWho plead for love, and look for recompense,\r\nMore than that tongue that more hath more express'd.\r\nO! learn to read what silent love hath writ:\r\nTo hear with eyes belongs to love's fine wit.\r\n\r\nMine eye hath played the painter and hath steeled,\r\nThy beauty's form in table of my heart;\r\nMy body is the frame wherein 'tis held,\r\nAnd perspective that is best painter's art.\r\nFor through the painter must you see his skill,\r\nTo find where your true image pictured lies,\r\nWhich in my bosom's shop is hanging still,\r\nThat hath his windows glazed with thine eyes.\r\nNow see what good turns eyes for eyes have done:\r\nMine eyes have drawn thy shape, and thine for me\r\nAre windows to my breast, where-through the sun\r\nDelights to peep, to gaze therein on thee;\r\nYet eyes this cunning want to grace their art,\r\nThey draw but what

In [18]:
# Question put to the model. It is unrelated to the query used for retrieval
# above, so the retrieved context contains nothing that answers it.
user_question = "What does Krishna say about the Bhagavad Gita?"

# Prompt layout: retrieved context first, then the question, then an answer cue.
prompt = "Context:\n{}\n\nQuestion: {}\nAnswer:".format(retrieved_text, user_question)


In [19]:
# import os
# from huggingface_hub import InferenceClient
 
# client = InferenceClient(
#     provider="auto",
#     api_key=os.getenv("HUGGING_FACE_API"),
# )

# completion = client.chat.completions.create(
#     model="microsoft/phi-4",
#     messages=[
#         {
#             "role": "user",
#             "content": "What is the capital of France?"
#         }
#     ],
# )
 
# print(completion.choices[0].message)

In [27]:
from huggingface_hub import InferenceClient

# Inference client; the token is read from the HUGGING_FACE_API environment variable.
client = InferenceClient(provider="auto", api_key=os.getenv("HUGGING_FACE_API"))

# The system message restricts the model to the supplied context; the user
# message carries the prompt (context + question) assembled earlier.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context only."},
    {"role": "user", "content": prompt},
]

completion = client.chat.completions.create(
    model="microsoft/phi-4",
    messages=chat_messages,
)

# Print only the text of the first returned choice.
first_choice = completion.choices[0]
print(first_choice.message.content)


The context provided does not contain any information about Krishna or the Bhagavad Gita. The content is focused on excerpts from sonnets, which discuss themes of love and beauty. Therefore, I cannot provide an answer regarding what Krishna says about the Bhagavad Gita based on the given documents.
