In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Load and split PDF
loader = PyPDFLoader("sample.pdf")   # change filename if needed
docs = loader.load()
splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=120)
chunks = splitter.split_documents(docs)

# Build vectorstore
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [4]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("What is the main conclusion of the paper?")
for i, d in enumerate(docs, 1):
    print(f"\n--- CHUNK {i} ---\n{d.page_content[:500]}")


--- CHUNK 1 ---
Proceedings of 15th International Conference on Science and Innovative Engineering 2025 
April 26th - 27th, 2025 
Prince Dr.K.Vasudevan college of Engineering and Technology, India  
Manipal University College Malaysia, Melaka, Malaysia           ISBN 978-81-983498-5-9                                                                                                                       
 
 
 
Sidon sets-SSs are subsets of real numbers possessing different totals for pair wise sums. Simon Sidon 
i

--- CHUNK 2 ---
Proceedings of 15th International Conference on Science and Innovative Engineering 2025 
April 26th - 27th, 2025 
Prince Dr.K.Vasudevan college of Engineering and Technology, India  
Manipal University College Malaysia, Melaka, Malaysia           ISBN 978-81-983498-5-9                                                                                                                       
 
 
Jerusalem College of Engineering, Chennai-600100. 
 
In the era of big d

In [5]:
# 1. Imports
from langchain.chains import RetrievalQA
from groq_remote_llm import GroqRemoteLLM   # your custom wrapper

# 2. Create the LLM
llm = GroqRemoteLLM()

# 3. Create the RetrievalQA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",     # you can also try "map_reduce" or "refine"
    retriever=retriever
)

# 4. Now ask a question and get both chunks + answer
query = "What is the main conclusion of the paper?"

# Inspect retrieved docs
docs = retriever.invoke(query)
print(f"\n🔎 Retrieved {len(docs)} chunks for query: {query}\n")
for i, d in enumerate(docs, 1):
    print(f"\n--- CHUNK {i} ---\n{d.page_content[:400]}")

# Model's final answer
print("\n🤖 Model Answer:\n", qa.run(query))


🔎 Retrieved 3 chunks for query: What is the main conclusion of the paper?


--- CHUNK 1 ---
Proceedings of 15th International Conference on Science and Innovative Engineering 2025 
April 26th - 27th, 2025 
Prince Dr.K.Vasudevan college of Engineering and Technology, India  
Manipal University College Malaysia, Melaka, Malaysia           ISBN 978-81-983498-5-9                                                                                                                       
 
 
 
Sidon

--- CHUNK 2 ---
Proceedings of 15th International Conference on Science and Innovative Engineering 2025 
April 26th - 27th, 2025 
Prince Dr.K.Vasudevan college of Engineering and Technology, India  
Manipal University College Malaysia, Melaka, Malaysia           ISBN 978-81-983498-5-9                                                                                                                       
 
 
Jerusal

--- CHUNK 3 ---
Proceedings of 15th International Conference on Science and Innovative 

  print("\n🤖 Model Answer:\n", qa.run(query))


MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?

In [6]:
# === REBUILD FAISS WITH SMALLER CHUNKS (Notebook cell) ===
import os, json
from dotenv import load_dotenv
load_dotenv()

# Config - change PDF_PATH if needed
PDF_PATH = "sample.pdf"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 80
EMBED_MODEL = "all-MiniLM-L6-v2"
SAVE_DIR = "faiss_index"
CHUNKS_JSONL = os.path.join(SAVE_DIR, "chunks.jsonl")

# Imports (make sure packages are installed)
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# 1) Load PDF
if not os.path.exists(PDF_PATH):
    raise FileNotFoundError(f"Put your PDF at '{PDF_PATH}' or change the PDF_PATH variable.")
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()
print("Loaded pages:", len(docs))

# 2) Split into smaller chunks
splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(docs)
print("Created chunks:", len(chunks))

# 3) Save chunks to JSONL (safe, portable)
os.makedirs(SAVE_DIR, exist_ok=True)
with open(CHUNKS_JSONL, "w", encoding="utf-8") as f:
    for doc in chunks:
        rec = {"page_content": doc.page_content, "metadata": getattr(doc, "metadata", {})}
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print("Saved chunks JSONL to:", CHUNKS_JSONL)

# 4) Create embeddings and FAISS vectorstore
embeddings = SentenceTransformerEmbeddings(model_name=EMBED_MODEL)
vectorstore = FAISS.from_documents(chunks, embeddings)
print("Built FAISS vectorstore with vectors:", len(chunks))

# 5) Persist FAISS index to disk
vectorstore.save_local(SAVE_DIR)
print("Saved FAISS index to folder:", SAVE_DIR)

# 6) Create retriever for immediate use (k=2 recommended)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
print("Retriever ready with k=2. Done.")


Loaded pages: 3
Created chunks: 3
Saved chunks JSONL to: faiss_index\chunks.jsonl
Built FAISS vectorstore with vectors: 3
Saved FAISS index to folder: faiss_index
Retriever ready with k=2. Done.
