In [90]:
# ========================================
# 📌 STEP 1: Load PDF
# ========================================
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

pdf_path = "./PDF's/The_Great_Gatsby.pdf"
doc = fitz.open(pdf_path)
pages = [page.get_text() for page in doc]


print("🔍 Raw text loaded.")
print("Text Preview:\n", raw_text[:500])

print("Contains key quote?", "hope she’ll be a fool" in raw_text)
print("fool" in raw_text)


🔍 Raw text loaded.
Text Preview:
 The Great
Gatsby
By F. Scott Fitzgerald
Download free eBooks of classic literature, books
and novels at Planet eBook. Subscribe to our free
eBooks blog and email newsletter.
Then wear the gold hat, if that will move her;
If you can bounce high, bounce for her too,
Till she cry ‘Lover, gold-hatted,
high-bouncing lover, I must have you!’

—THOMAS PARKE D’INVILLIERS
The Great Gatsby
Chapter 1

In my younger and more vulnerable years my father gave
me some advice that I’ve been turning over in my
mi
Contains key quote? False
True


In [91]:
# ========================================
# 📌 STEP 2: Chunking with Page Numbers
# ========================================
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=800,
    chunk_overlap=150
)

chunks = []
chunk_id = 0

for page_num, page_text in enumerate(pages, start=1):  # start=1 for 1-based page numbers
    page_chunks = splitter.create_documents([page_text])
    for doc in page_chunks:
        content = doc.page_content
        doc.metadata = {
            "chunk_id": chunk_id,
            "source": f"Chunk {chunk_id+1}",
            "char_range": f"{chunk_id*800}-{(chunk_id+1)*800}",
            "word_count": len(content.split()),
            "page_number": page_num
        }
        print(f"📌 Created Chunk {chunk_id} (Page: {page_num})")
        chunks.append(doc)
        chunk_id += 1

print(f"\n✅ Total Chunks: {len(chunks)}")
print("🧩 Sample Chunk:\n", chunks[10].page_content[:400])
print("📎 Metadata:", chunks[10].metadata)


📌 Created Chunk 0 (Page: 1)
📌 Created Chunk 1 (Page: 2)
📌 Created Chunk 2 (Page: 3)
📌 Created Chunk 3 (Page: 4)
📌 Created Chunk 4 (Page: 5)
📌 Created Chunk 5 (Page: 6)
📌 Created Chunk 6 (Page: 7)
📌 Created Chunk 7 (Page: 8)
📌 Created Chunk 8 (Page: 9)
📌 Created Chunk 9 (Page: 10)
📌 Created Chunk 10 (Page: 11)
📌 Created Chunk 11 (Page: 12)
📌 Created Chunk 12 (Page: 13)
📌 Created Chunk 13 (Page: 14)
📌 Created Chunk 14 (Page: 15)
📌 Created Chunk 15 (Page: 16)
📌 Created Chunk 16 (Page: 17)
📌 Created Chunk 17 (Page: 18)
📌 Created Chunk 18 (Page: 19)
📌 Created Chunk 19 (Page: 20)
📌 Created Chunk 20 (Page: 21)
📌 Created Chunk 21 (Page: 22)
📌 Created Chunk 22 (Page: 23)
📌 Created Chunk 23 (Page: 24)
📌 Created Chunk 24 (Page: 25)
📌 Created Chunk 25 (Page: 26)
📌 Created Chunk 26 (Page: 27)
📌 Created Chunk 27 (Page: 28)
📌 Created Chunk 28 (Page: 29)
📌 Created Chunk 29 (Page: 30)
📌 Created Chunk 30 (Page: 31)
📌 Created Chunk 31 (Page: 32)
📌 Created Chunk 32 (Page: 33)
📌 Created Chunk 33 (Page: 34)

In [92]:
# ========================================
# 📌 STEP 3: Embed Chunks
# ========================================
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-mpnet-base-v2")
texts = [doc.page_content for doc in chunks]
embeddings = embedding_model.encode(texts, show_progress_bar=True)

print("✅ Embeddings generated.")
print("Embedding shape:", embeddings.shape)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Embeddings generated.
Embedding shape: (91, 768)


In [93]:
# ========================================
# 📌 STEP 4: Store in FAISS
# ========================================
import faiss
import numpy as np

dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.array(embeddings))

print("✅ FAISS index built. Total vectors:", faiss_index.ntotal)

✅ FAISS index built. Total vectors: 91


In [94]:
# ========================================
# 📌 STEP 5: Retrieval + Reranking
# ========================================
from transformers import GPT2TokenizerFast
from sentence_transformers import CrossEncoder

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def retrieve_top_k_chunks(query, k=20, max_tokens=3500):
    query_embedding = embedding_model.encode([query])
    distances, indices = faiss_index.search(np.array(query_embedding), k)

    candidate_chunks = []
    for i in indices[0]:
        candidate_chunks.append(chunks[i])

    # 🔁 Rerank using cross-encoder
    pairs = [(query, chunk.page_content) for chunk in candidate_chunks]
    scores = cross_encoder.predict(pairs)
    ranked_chunks = sorted(zip(scores, candidate_chunks), reverse=True)

    selected = []
    total_tokens = 0
    for score, chunk in ranked_chunks:
        tokens = len(tokenizer.encode(chunk.page_content))
        if total_tokens + tokens <= max_tokens:
            selected.append(chunk)
            total_tokens += tokens
        if len(selected) >= 5:  # top 5 relevant
            break

    return selected

# ✅ Retrieval Tester
test_query = "What did Daisy say when her daughter was born?"
test_results = retrieve_top_k_chunks(test_query)
print("\n🧪 Top Chunks for Query:")
for i, doc in enumerate(test_results):
    print(f"\nChunk {i+1} (Score-based):\n", doc.page_content[:300], "\nMetadata:", doc.metadata)


🧪 Top Chunks for Query:

Chunk 1 (Score-based):
 ‘That’s true.’ She hesitated. ‘Well, I’ve had a very
bad time, Nick, and I’m pretty cynical about
everything.’ Evidently she had reason to be. I waited
but she didn’t say any more, and after a moment I
returned rather feebly to the subject of her daughter.
‘I suppose she talks, and—eats, and everyth 
Metadata: {'chunk_id': 21, 'source': 'Chunk 22', 'char_range': '16800-17600', 'word_count': 263, 'page_number': 22}

Chunk 2 (Score-based):
 Hot Springs and Palm Beach. I had heard some story
of her too, a critical, unpleasant story, but what it was
I had forgot ten long ago.
‘Good night,’ she said softly. ‘Wake me at eight,
won’t you.’
‘If you’ll get up.’
‘I will. Good night, Mr. Carraway. See you anon.’ ‘Of
course you will,’ confirmed  
Metadata: {'chunk_id': 23, 'source': 'Chunk 24', 'char_range': '18400-19200', 'word_count': 234, 'page_number': 24}

Chunk 3 (Score-based):
 said.
‘Don’t talk. I want to hear what happens.’
‘Is something h

In [100]:
# ========================================
# 📌 STEP 6: Ask Question with Gemini (No Summarization)
# ========================================
import google.generativeai as genai
from dotenv import load_dotenv
import os
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Load environment & API key
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Ask a question
query = input("🔎 Enter your question: ").strip()

# Get top chunks
retrieved = retrieve_top_k_chunks(query)

if not retrieved:
    print("⚠️ No relevant context found.")
    exit()

joined_chunks = [
    f"[Page {c.metadata['page_number']}] {c.page_content}"
    for c in retrieved
]

# Truncate token-wise
token_limit = 1000
final_context = ""
total_tokens = 0

for chunk in joined_chunks:
    tokens = tokenizer.encode(chunk)
    if total_tokens + len(tokens) > token_limit:
        break
    final_context += chunk + "\n\n"
    total_tokens += len(tokens)

context = final_context.strip()
print(f"\n🧱 Total tokens in truncated context: {total_tokens}")

# 🧠 Gemini Prompt
system_instruction = (
    "You are a helpful assistant answering questions about a book. "
    "Use only the provided context and cite the chunk number when possible. "
    "Be accurate and concise. Don't guess beyond the context."
)

prompt = (
    f"{system_instruction}\n\n"
    f"Context:\n{context}\n\n"
    f"Question: {query}"
)

# Generate Answer
model = genai.GenerativeModel("models/gemini-2.5-pro")
response = model.generate_content(prompt)

# Show results
print("\n📄 Top-k Retrieved Context:\n")
print(context[:1000])  # Preview

print("\n📘 Final Answer from Gemini:\n")
print(response.text.strip())


🧱 Total tokens in truncated context: 820

📄 Top-k Retrieved Context:

[Page 14] ‘She’s asleep. She’s two years old. Haven’t you
ever seen her?’
‘Never.’
‘Well, you ought to see her. She’s——‘
Tom Buchanan who had been hovering restlessly
about the room stopped and rested his hand on my
shoulder.
12 The Great Gatsby
‘What you doing, Nick?’
‘I’m a bond man.’
‘Who with?’
I told him.
‘Never heard of them,’ he remarked
decisively. This annoyed me.
‘You will,’ I answered shortly. ‘You will if you stay in
the East.’
‘Oh, I’ll stay in the East, don’t you worry,’ he said,
glanc ing at Daisy and then back at me, as if he were
alert for something more. ‘I’d be a God Damned fool
to live any where else.’
At this point Miss Baker said ‘Absolutely!’ with such
suddenness that I started—it was the first word she
uttered
since I came into the room. Evidently it
surprised her as
much as it did me, for she yawned
and with a series of rapid, deft movements stood up
into the room.
‘I’m stiff,’ she complaine

In [15]:
models = genai.list_models()
for m in models:
    print(m.name)


models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thin

In [16]:
print(any("hope she’ll be a fool" in c.page_content for c in chunks))


False


In [17]:
query = "What did Daisy say when her daughter was born?"
results = retrieve_top_k_chunks(query)
for doc in results:
    print(doc.page_content[:300])


settee.
Daisy took her face in her hands, as if feeling its
love ly shape, and her eyes moved gradually out into
the velvet dusk. I saw that turbulent emotions
possessed her, so I asked what I thought would be
some sedative questions about her little girl.
Free eBooks at Planet eBook.com 19
‘We don’
sea in a boat, and all that sort of thing——‘
‘Good night,’ called Miss Baker from the stairs. ‘I
haven’t heard a word.’
‘She’s a nice girl,’ said Tom after a moment. ‘They
oughtn’t to let her run around the country this way.’
‘Who oughtn’t to?’ inquired Daisy coldly.
‘Her family.’
‘Her family is one
and get married to each other right away.’
‘Doesn’t she like Wilson either?’
The answer to this was unexpected. It came from
Myrtle who had overheard the question and it was
violent and ob scene.
‘You
see?’
cried
Catherine
triumphantly.
She
lowered her
voice again. ‘It’s really his wife that’s
keepi
here this summer. I think the home influence will be
very good for her.’
Daisy and Tom looked at 