In [10]:
import os
from dotenv import load_dotenv

In [11]:
# Loading the environment variables from a .env file
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_REGION = os.getenv("PINECONE_REGION")
PINECONE_CLOUD = os.getenv("PINECONE_CLOUD")
HUGGING_FACE_API = os.getenv("HUGGING_FACE_API")

# Data Pipeline / Indexing

In [12]:
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
from huggingface_hub import InferenceClient



In [13]:
documents = SimpleDirectoryReader("data", required_exts=[".pdf"]).load_data()

# Combine all pages into one big document
combined_text = "\n\n".join([doc.text for doc in documents])
combined_doc = Document(text=combined_text)

In [14]:
print(f"Loaded {len(documents)} documents from the 'data' directory.\n\n")
print(combined_doc.text[:500])  # First 500 characters

Loaded 256 documents from the 'data' directory.






AN	IMPRINT	OF	PENGUIN	RANDOM	HOUSE	LLC
375	Hudson	Street
New	York,	New	York	10014
Copyright	©	2018	by	James	Clear
Penguin	supports	copyright.	Copyright	fuels	creativity,	encourages	diverse	voices,	promotes	free	speech,	and	creates	a	vibrant	culture.	Thank	you	for	buying	an	authorized	edition	of	this	book	and	for
complying	with	copyright	laws	by	not	reproducing,	scanning,	or	distributing	any	part	of	it	in	any	form	without	permission.	You	are	supporting	writers	and	allowing	Penguin	to	continue


In [15]:
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
splitter = SemanticSplitterNodeParser.from_defaults(
    embed_model=embed_model,
    buffer_size=3,
    breakpoint_percentile_threshold=90,
    include_metadata=True,
    include_prev_next_rel=True,
)


In [17]:
nodes = splitter.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} semantic nodes")
print(nodes[0].text[:300])  # View first chunk

Created 767 semantic nodes



In [18]:
print(nodes[80].text[:500])  # First 500 characters of the first node

FIGURE	3:	There	are	three	layers	of	behavior	change:	a	change	in	your	outcomes,	a	change	in	your	processes,	or	a	change	in	your	identity.
The	first	layer	is	changing	your	outcomes.
	This	level	is	concerned	with
changing	 your	 results:	 losing	 weight,	 publishing	 a	 book,	 winning	 a
championship.	Most	of	the	goals	you	set	are	associated	with	this	level	of	change.



In [20]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.retrievers.bm25 import BM25Retriever

In [21]:
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "rag-llamaindex"
embedding_dim = 384

In [22]:
# Create index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=PINECONE_CLOUD,
            region=PINECONE_REGION
        )
    )

In [24]:
# Connect to the Pinecone index
pinecone_index = pc.Index(index_name)

In [25]:
# Create LlamaIndex Pinecone vector store wrapper
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [None]:
# Create index that USES Pinecone
index = VectorStoreIndex(
    nodes, 
    vector_store=vector_store,  # ← This stores in Pinecone!
    embed_model=embed_model
)

# Add nodes to Pinecone
vector_store.add_nodes(nodes)

In [27]:
from sentence_transformers import CrossEncoder

#### Reranking

In [28]:
# Initialize reranker model
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-2-v2')

In [29]:
def rerank_results(query, results, top_k=5):
    """Rerank retrieved results for better quality"""
    
    # Prepare query-document pairs for reranking
    query_doc_pairs = []
    for result in results:
        query_doc_pairs.append([query, result.node.text])
    
    # Get reranking scores
    rerank_scores = reranker.predict(query_doc_pairs)
    
    # Combine original results with new scores
    reranked_results = []
    for i, result in enumerate(results):
        # Create new result with reranking score
        result.score = rerank_scores[i]  # Update with rerank score
        reranked_results.append(result)
    
    # Sort by rerank score (higher is better)
    reranked_results.sort(key=lambda x: x.score, reverse=True)
    
    return reranked_results[:top_k]

# Retrieval

In [None]:
# # Modified retrieval with reranking
# user_question = "Summarize this book"

# # Step 1: Get vector retriever
# vector_retriever = index.as_retriever(similarity_top_k=10)

# # Step 2: Create query fusion retriever combining vector and BM25
# fusion_retriever = QueryFusionRetriever(
#     retrievers=[vector_retriever, bm25_retriever],
#     similarity_top_k=20,  # Get 20 candidates from fusion
#     num_queries=1,        # Use original query
#     use_async=False
# )

# # Step 3: Retrieve using fusion (combines vector + BM25)
# fusion_results = fusion_retriever.retrieve(user_question)

# print(f"Retrieved {len(fusion_results)} results from fusion retriever")

# # Step 4: Rerank the fused results
# reranked_results = rerank_results(user_question, fusion_results, top_k=5)

# # Step 3: Build context with reranked results
# context_blocks = []
# for i, node_with_score in enumerate(reranked_results):
#     node = node_with_score.node
    
#     block = f"""
# --- Excerpt {i+1} (Rerank Score: {node_with_score.score:.3f}) ---
# {node.text}
# """
#     context_blocks.append(block)

# retrieved_text = "\n".join(context_blocks)


In [30]:
# Step 1: Create BM25 retriever (sparse vectors)
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,  # Nodes created from your documents
    similarity_top_k=10  # Top 10 sparse results
)

# Step 2: Create dense vector retriever
vector_retriever = VectorIndexRetriever(
    index=index,  # Your Pinecone-backed index
    similarity_top_k=10  # Top 10 dense results
)

In [31]:
# Step 3: Manual hybrid retrieval (no LLM needed)
def hybrid_retrieve(query, vector_retriever, bm25_retriever, top_k=20):
    """Manually combine vector and BM25 results"""
    
    # Get results from both retrievers
    vector_results = vector_retriever.retrieve(query)
    bm25_results = bm25_retriever.retrieve(query)
    
    # Combine and deduplicate by node ID
    all_results = []
    seen_node_ids = set()
    
    # Add vector results
    for result in vector_results:
        if result.node.node_id not in seen_node_ids:
            all_results.append(result)
            seen_node_ids.add(result.node.node_id)
    
    # Add BM25 results (skip duplicates)
    for result in bm25_results:
        if result.node.node_id not in seen_node_ids:
            all_results.append(result)
            seen_node_ids.add(result.node.node_id)
    
    # Sort by score and return top_k
    all_results.sort(key=lambda x: x.score, reverse=True)
    return all_results[:top_k]

In [32]:
# Step 4: Retrieve results using hybrid retriever
user_question = "Who won the Ballon d'Or in 2023, according to Atomic Habits?"
hybrid_results = hybrid_retrieve(user_question, vector_retriever, bm25_retriever, top_k=20)
print(f"Retrieved {len(hybrid_results)} results using hybrid retrieval.")

Retrieved 20 results using hybrid retrieval.


In [33]:
# Step 5: Rerank hybrid results
reranked_results = rerank_results(user_question, hybrid_results, top_k=5)

# Step 6: Build context with reranked results
context_blocks = []
for i, node_with_score in enumerate(reranked_results):
    node = node_with_score.node
    block = f"""
--- Excerpt {i+1} (Rerank Score: {node_with_score.score:.3f}) ---
{node.text}
"""
    context_blocks.append(block)

retrieved_text = "\n".join(context_blocks)

# Generateion

In [None]:

# Final prompt
prompt = f"Context:\n{retrieved_text}\n\nQuestion: {user_question}\nAnswer:"


client = InferenceClient(
    provider="auto",
    api_key=os.getenv("HUGGING_FACE_API"),
)

completion = client.chat.completions.create(
    model="microsoft/phi-4",
    messages=[
        {"role": "system", "content": """You are a specialized Book Analysis Assistant.

CAPABILITIES:
- Provide comprehensive book summaries and chapter overviews
- Answer questions about specific concepts, themes, and ideas from the book
- Explain key principles and their practical applications
- Compare different concepts within the book
- Provide relevant quotes and examples from the text
- Offer detailed analysis of characters, plot, or arguments (depending on book type)
- Generate thoughtful book reviews and critical analysis
- Discuss the book's relevance and impact

STRICT GUIDELINES:
1. ONLY use information from the provided context below
2. If asked about topics outside this specific book, respond: "I can only discuss content from this book. Please ask me about topics, concepts, or themes covered in this book."
3. If asked about other books, authors, or general topics, politely redirect to this book's content
4. Be comprehensive but focused in your responses
5. Include specific examples, quotes, and page references when available in context
6. Structure answers clearly with headings or bullet points for complex topics

RESPONSE STRUCTURE:
- Direct answer to the question
- Supporting evidence from the book
- Relevant examples or case studies from the text  
- Practical implications or takeaways when applicable
- Clear indication if context is insufficient for a complete answer

TONE: Professional, knowledgeable, and helpful while staying strictly within the book's scope."""
},
        {"role": "user", "content": prompt}
    ],
)

print(completion.choices[0].message.content)


I can only discuss content from this book. Please ask me about topics, concepts, or themes covered in "The Surprising Power of Atomic Habits." The excerpts provided do not mention anything about the Ballon d'Or or its winner in 2023. If you have questions about the principles or ideas from the book, feel free to ask!


# Performance matrix

In [None]:
# Optional: Compare different retrieval methods
print("=== COMPARISON OF RETRIEVAL METHODS ===\n")

# Vector only
vector_only = vector_retriever.retrieve(user_question)
print(f"Vector Search Only: {len(vector_only)} results")
print(f"Top result score: {vector_only[0].score:.3f}")
print(f"Sample result: {vector_only[0].node.text[:150]}...\n")

# BM25 only  
bm25_only = bm25_retriever.retrieve(user_question)
print(f"BM25 Only: {len(bm25_only)} results")
print(f"Top result score: {bm25_only[0].score:.3f}")
print(f"Sample result: {bm25_only[0].node.text[:150]}...\n")

# Hybrid (Vector + BM25)
print(f"Hybrid (Vector + BM25): {len(hybrid_results)} results")
print(f"Top result score: {hybrid_results[0].score:.3f}")
print(f"Sample result: {hybrid_results[0].node.text[:150]}...\n")

# Final reranked
print(f"Final Reranked: {len(reranked_results)} results")
print(f"Top result score: {reranked_results[0].score:.3f}")
print(f"Sample result: {reranked_results[0].node.text[:150]}...\n")

print("=== SUMMARY ===")
print(f"Question: '{user_question}'")
print(f"Vector retrieval found {len(vector_only)} results")
print(f"BM25 retrieval found {len(bm25_only)} results") 
print(f"Hybrid retrieval combined to {len(hybrid_results)} unique results")
print(f"Final reranking selected top {len(reranked_results)} most relevant results")

=== COMPARISON OF RETRIEVAL METHODS ===

Vector Search Only: 10 results
Top result score: 0.507
Sample result: Acknowledgments
I
	
HAVE	RELIED	HEAVILY	
on	others	during	the	creation	of	this	book.	Before	anyone	else,	I
must	thank	my	wife,	Kristy,	who	has	been	in...

BM25 Only: 10 results
Top result score: 1.670
Sample result: She
gave	me	the	space	I	needed	to	create	a	book	I	was	proud	of	and	championed	my
ideas	at	every	step.	To	Nina,	for	her	ability	to	transform	my	writing...

Hybrid (Vector + BM25): 16 results
Top result score: -8.847
Sample result: *
	Readers	of
	The	Power	of	Habit
	by	Charles	Duhigg	will	recognize	these	terms.
Duhigg	wrote	a	great	book	and	my	intention	is	to	pick	up	where	he	lef...

Final Reranked: 5 results
Top result score: -7.331
Sample result: Acknowledgments
I
	
HAVE	RELIED	HEAVILY	
on	others	during	the	creation	of	this	book.	Before	anyone	else,	I
must	thank	my	wife,	Kristy,	who	has	been	in...

=== SUMMARY ===
Question: 'Summarize this book'
Vector retrieval fo

In [34]:
import pandas as pd

In [35]:
data = pd.read_csv("data/test_queries.csv", delimiter="\t")  # For tab-separated values

In [36]:
data.head()

Unnamed: 0,question
0,Can you use Atomic Habits to invest in the sto...
1,What is the Goldilocks Rule in Atomic Habits?
2,How does the book connect habits with long-ter...
3,Can I learn how to code in Python from Atomic ...
4,How does feedback reinforce or hinder habits?


In [37]:
questions = data["question"].tolist()

In [38]:
questions

['Can you use Atomic Habits to invest in the stock market?',
 'What is the Goldilocks Rule in Atomic Habits?',
 'How does the book connect habits with long-term success?',
 'Can I learn how to code in Python from Atomic Habits?',
 'How does feedback reinforce or hinder habits?',
 'What role does identity play in habit formation?',
 'What are some limitations of the habit strategies in the book?',
 'What’s the significance of the plateau of latent potential?',
 'Can I use Atomic Habits to invest in the stock market?',
 'What are the four laws of behavior change according to Atomic Habits?',
 'How can friction be used to eliminate bad habits?',
 'Does the book include a workout plan for bodybuilding?',
 'How can someone recover from a habit relapse?',
 'Does Atomic Habits talk about AI or machine learning?',
 'What psychological theories support the habit loop in Atomic Habits?',
 'How do small habits compound over time?',
 'What are the top 10 tourist places in Japan mentioned in Atomic

In [55]:
import json
import numpy as np

def to_native(val):
    """Convert numpy float types to native Python floats."""
    if isinstance(val, (np.float32, np.float64)):
        return float(val)
    return val

# Assuming: questions = data["question"].tolist()
questions = data["question"].tolist()

all_scores = {}

for i, question in enumerate(questions):
    print(f"Processing Q{i}: {question}")

    # Retrieve results
    sparse_results = bm25_retriever.retrieve(question)
    dense_results = vector_retriever.retrieve(question)
    hybrid_results = hybrid_retrieve(question, vector_retriever, bm25_retriever)
    reranked_results = rerank_results(question, hybrid_results)

    # Store scores only
    all_scores[f"q_{i}"] = {
        "question": question,
        "sparse_scores": [to_native(r.score) for r in sparse_results],
        "dense_scores": [to_native(r.score) for r in dense_results],
        "hybrid_scores": [to_native(r.score) for r in hybrid_results],
        "reranked_scores": [to_native(r.score) for r in reranked_results]
    }

# Save all scores to JSON
with open("retrieval_score_results.json", "w") as f:
    json.dump(all_scores, f, indent=2)

print("✅ All question scores saved to retrieval_score_results.json")


Processing Q0: Can you use Atomic Habits to invest in the stock market?
Processing Q1: What is the Goldilocks Rule in Atomic Habits?
Processing Q2: How does the book connect habits with long-term success?
Processing Q3: Can I learn how to code in Python from Atomic Habits?
Processing Q4: How does feedback reinforce or hinder habits?
Processing Q5: What role does identity play in habit formation?
Processing Q6: What are some limitations of the habit strategies in the book?
Processing Q7: What’s the significance of the plateau of latent potential?
Processing Q8: Can I use Atomic Habits to invest in the stock market?
Processing Q9: What are the four laws of behavior change according to Atomic Habits?
Processing Q10: How can friction be used to eliminate bad habits?
Processing Q11: Does the book include a workout plan for bodybuilding?
Processing Q12: How can someone recover from a habit relapse?
Processing Q13: Does Atomic Habits talk about AI or machine learning?
Processing Q14: What psy