In [2]:
import os
from dotenv import load_dotenv 

In [3]:
# Pull settings from a local .env file (if one exists) into the process environment.
load_dotenv()

# Credentials and deployment settings for Pinecone and Hugging Face.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_REGION = os.getenv("PINECONE_REGION")
PINECONE_CLOUD = os.getenv("PINECONE_CLOUD")
HUGGING_FACE_API = os.getenv("HUGGING_FACE_API")

# os.getenv() silently yields None for unset variables, which would otherwise
# only surface later as an opaque error inside a client library. Report the
# variable names (never their values) so a misconfigured environment is
# obvious right away. This is a warning, not a hard failure, because not every
# setting is needed on every run (e.g. cloud/region only matter when the
# Pinecone index has to be created).
_missing_settings = [
    name
    for name, value in {
        "PINECONE_API_KEY": PINECONE_API_KEY,
        "PINECONE_REGION": PINECONE_REGION,
        "PINECONE_CLOUD": PINECONE_CLOUD,
        "HUGGING_FACE_API": HUGGING_FACE_API,
    }.items()
    if not value
]
if _missing_settings:
    print(f"WARNING: environment variables not set: {', '.join(_missing_settings)}")

In [9]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from huggingface_hub import InferenceClient
from llama_index.core import Document

In [5]:
# Read the contents of the local "data" directory into LlamaIndex documents.
data_reader = SimpleDirectoryReader("data")
documents = data_reader.load_data()

In [6]:
# Report how many documents were loaded.
document_count = len(documents)
print(f"Loaded {document_count} documents\n\n")

# Preview the first 500 characters of the first document.
first_document_text = documents[0].text
print(first_document_text[:500])

Loaded 1 documents


From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own bright eyes,
Feed'st thy light's flame with self-substantial fuel,
Making a famine where abundance lies,
Thy self thy foe, to thy sweet self too cruel:
Thou that art now the world's fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And, tender 


In [10]:
# Embedding checkpoint shared by the semantic splitter and the vector index.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [10]:
# Show the embedding model's printed representation (its configuration fields).
print(embed_model)

model_name='sentence-transformers/all-MiniLM-L6-v2' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7b0fd0858610> num_workers=None embeddings_cache=None max_length=256 normalize=True query_instruction=None text_instruction=None cache_folder=None show_progress_bar=False


In [13]:
# Chunking settings for the semantic splitter, grouped so they are easy to tune.
splitter_settings = {
    "buffer_size": 1,
    "breakpoint_percentile_threshold": 95,
    "include_metadata": True,
    "include_prev_next_rel": True,
}

# The splitter embeds text with the same model that the index uses.
splitter = SemanticSplitterNodeParser.from_defaults(
    embed_model=embed_model,
    **splitter_settings,
)


In [None]:
# Split the loaded documents into semantic chunks ("nodes").
nodes = splitter.get_nodes_from_documents(documents)

node_count = len(nodes)
print(f"Created {node_count} semantic nodes")

# Preview the first 300 characters of the first chunk.
first_node_text = nodes[0].text
print(first_node_text[:300])

Created 29 semantic nodes
From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own bright eyes,
Feed'st thy light's flame with self-substantial fuel,
Making a famine where abund


In [15]:
# Build a vector index over the semantic nodes, embedding them with `embed_model`.
# NOTE(review): no storage context / vector store is passed here, so this index
# presumably uses LlamaIndex's default (in-memory) storage — confirm.
index = VectorStoreIndex(nodes, embed_model=embed_model)

In [12]:
# Quick check that the index object was created: print its representation.
print(index)

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x7b0eb2b7a440>


In [16]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

In [17]:
# Pinecone client authenticated with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "rag-llamaindex"
# Vector dimension of the Pinecone index; it has to equal the size of the
# embeddings produced by `embed_model` (all-MiniLM-L6-v2 emits 384-d vectors).
embedding_dim = 384

existing_index_names = pc.list_indexes().names()
index_missing = index_name not in existing_index_names

# Create the serverless index only when it is not there yet, so that
# re-running this cell is harmless.
if index_missing:
    serverless_spec = ServerlessSpec(
        cloud=PINECONE_CLOUD,
        region=PINECONE_REGION,
    )
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=serverless_spec,
    )

In [18]:
# Connect to the Pinecone index named `index_name` (created by the previous
# cell when it did not exist yet).
pinecone_index = pc.Index(index_name)

# Wrap the Pinecone index in LlamaIndex's Pinecone vector store adapter.
# NOTE(review): none of the cells visible here pass `vector_store` to the
# VectorStoreIndex, so the retrieval below appears to run against `index` as
# built earlier rather than against Pinecone — confirm whether that is intended.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [None]:
# NOTE(review): disabled earlier draft of the retrieval / prompt-building step.
# A later cell builds `context_blocks` and `retrieved_text` from the retrieved
# nodes instead. This draft reads `sonnet_docs`, which no visible cell defines.
# retriever = index.as_retriever()
# results = retriever.retrieve("What do they say about the painter")

# # Build prompt with text and metadata
# context_blocks = []

# for i, node in enumerate(results):
#     metadata = node.metadata or {}
#     page_number = metadata.get("page_number", "N/A")
#     source = metadata.get("file_name", "Unknown Source")

#     block = f"""
# [Document {i+1}]
# Source: {source}
# Page: {page_number}
# Content:
# {sonnet_docs[i].text}
# """
#     context_blocks.append(block)

# retrieved_text = "\n".join(context_blocks)


In [None]:
# for i, block in enumerate(context_blocks, start=1):
#     print(f"--- Document {i} ---")
#     print(block.strip())
#     print("\n")  # Extra newline for spacing

--- Document 1 ---
[Document 1]
Source: data/Sonnets.txt
Page: N/A
Content:
From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own bright eyes,
Feed'st thy light's flame with self-substantial fuel,
Making a famine where abundance lies,
Thy self thy foe, to thy sweet self too cruel:
Thou that art now the world's fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And, tender churl, mak'st waste in niggarding:
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.


--- Document 2 ---
[Document 2]
Source: data/Sonnets.txt
Page: N/A
Content:
When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the 

In [40]:
# Display the joined context string.
# NOTE(review): `retrieved_text` is assigned only in a later cell (the active
# retrieval cell); the earlier draft that assigned it is commented out. On a
# fresh kernel run top-to-bottom this cell would raise NameError — consider
# moving it below the retrieval cell.
retrieved_text

'\n[Document 1]\nSource: data/Sonnets.txt\nPage: N/A\nContent:\n\n\n\n[Document 2]\nSource: data/Sonnets.txt\nPage: N/A\nContent:\n\n'

In [21]:
user_question = "What is said about the painter in the sonnets?"

# Retrieve the chunks most similar to the user's actual question.
retriever = index.as_retriever()
results = retriever.retrieve(user_question)

# Turn every retrieved chunk into a labelled block for the prompt context.
context_blocks = []
for i, node_with_score in enumerate(results):
    # Each result wraps a node; the text and metadata are read from `.node`.
    node = node_with_score.node
    metadata = node.metadata or {}

    # Prefer an explicit "title" entry. No visible cell sets one, so fall back
    # to the source file name before giving up with "Unknown".
    # NOTE(review): assumes the loader populates "file_name" — confirm.
    title = metadata.get("title", metadata.get("file_name", "Unknown"))
    line_count = metadata.get("line_count", "N/A")

    block = f"""
[Document {i+1}]
Title: {title}
Lines: {line_count}
Content:
{node.text}
"""

    context_blocks.append(block)

print(context_blocks)
retrieved_text = "\n".join(context_blocks)

# Final prompt: retrieved context first, then the question, then an answer cue.
prompt = f"Context:\n{retrieved_text}\n\nQuestion: {user_question}\nAnswer:"



In [22]:
# print(prompt)

In [23]:
from huggingface_hub import InferenceClient
 
# Hugging Face inference client used to answer the question with a hosted model.
client = InferenceClient(
    provider="auto",
    # Reuse the token loaded in the configuration cell rather than reading the
    # environment a second time; this matches how the Pinecone key is handled.
    api_key=HUGGING_FACE_API,
)

# The system message restricts the model to the retrieved context; the user
# message carries the assembled prompt (context + question).
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context only ."},
    {"role": "user", "content": prompt},
]

completion = client.chat.completions.create(
    model="microsoft/phi-4",
    messages=chat_messages,
)

# Print the text of the first returned choice.
print(completion.choices[0].message.content)


The sonnets touch upon the theme of painting and the artist's attempt to capture beauty in several ways:

1. **Critique of Art's Impermanence**: The sonnets suggest that the painter's efforts to capture beauty are ultimately futile because art is transient. For example, one sonnet speaks of painting applying a "mouldy lease" to beauty, implying that art can only offer temporary preservation ("For fair nothing can be so enduring").

2. **The Inadequacy of Portraits**: Another sonnet discusses the weaknesses in portraits. It mentions how a painted image can only "blot the beauty" of the subject, indicating that the representation falls short of the original's allure and vitality.

3. **Art's Temporal Nature**: Art is described as mortal and limited in comparison to the enduring nature of beauty described within the sonnets. The painted image is acknowledged as a "poor ornament," emphasizing its inability to truly encapsulate the true essence of its subject.

4. **Artist’s Recognition of 