In [None]:
# Install required packages (run once)
!pip install pdfplumber langchain faiss-cpu sentence-transformers ollama langchain-text-splitters langchain-community

import subprocess
from pathlib import Path

import pdfplumber
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# -------------------------------
# Step 1: Extract text from PDFs
# -------------------------------
def extract_pdf_text(pdf_path, verbose=True):
    """Extract the text of every page of a single PDF file.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.
        verbose: When True (the default), print a preview of the first 200
            characters of each page that yielded text. Pass False to extract
            silently.

    Returns:
        str: The text of all pages that yielded text, in page order, each
        followed by a newline. An empty string if no page yielded text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # extract_text() gave None or "" for this page: nothing to keep.
                continue
            if verbose:
                print(f"\n--- Page {page_number} Preview ({pdf_path}) ---")
                print(page_text[:200], "...")
            page_texts.append(page_text + "\n")
    # Join once at the end instead of growing a string page by page.
    return "".join(page_texts)

In [None]:
# -------------------------------
# Step 2: Build vectorstore from multiple PDFs
# -------------------------------
def _preview_chunks(chunks, embeddings):
    """Print a text, size and embedding preview for every chunk in ``chunks``."""
    for i, ch in enumerate(chunks):
        print(f"\n--- Chunk {i+1} ---")
        print(f"Text Preview: {ch[:200]}...")
        # Whitespace-separated word count, used as a rough stand-in for tokens.
        print(f"Token count (approx): {len(ch.split())}")

        emb = embeddings.embed_query(ch)
        print(f"Embedding length: {len(emb)}")
        print(f"First 10 values: {emb[:10]}")


def _describe_vectorstore(vectorstore, sample_size=5):
    """Print index statistics and the first ``sample_size`` ID/text/vector entries."""
    print(f"--- Vectorstore Info ---\nTotal vectors: {vectorstore.index.ntotal}")
    print(f"Embedding dimension: {vectorstore.index.d}")
    # index_to_docstore_id maps vector position -> document ID, so its values
    # are the stored IDs; no need to reach into the docstore's private dict.
    print(f"Docstore IDs: {list(vectorstore.index_to_docstore_id.values())[:sample_size]} ...")

    # Show embedding preview for first vector
    emb0 = vectorstore.index.reconstruct(0)
    print(f"\n--- First Embedding Vector Preview ---\nLength: {len(emb0)}\nFirst 20 values: {emb0[:20]}")

    # Map IDs to text
    print("\n--- ID → Text → Embedding Mapping ---")
    for pos, doc_id in list(vectorstore.index_to_docstore_id.items())[:sample_size]:
        doc = vectorstore.docstore.search(doc_id)
        emb = vectorstore.index.reconstruct(pos)
        print(f"Vector Pos: {pos}\nID: {doc_id}\nText Preview: {doc.page_content[:100]}...\nEmbedding First 10 values: {emb[:10]}\n")


def build_vectorstore_from_pdfs(
    pdf_paths,
    verbose=True,
    chunk_size=500,
    chunk_overlap=50,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Build a single FAISS vectorstore from the text of several PDFs.

    Each PDF is read with ``extract_pdf_text``, split into chunks, and all
    chunks are embedded together into one FAISS index. A summary of the
    finished index is always printed.

    Args:
        pdf_paths: Iterable of PDF paths to index.
        verbose: When True (the default), additionally print a text and
            embedding preview for every chunk.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.
        model_name: Model name given to ``HuggingFaceEmbeddings``.

    Returns:
        The FAISS vectorstore containing every chunk of every PDF.

    Raises:
        ValueError: If no text chunk was produced from any of the PDFs, since
            there would be nothing to index.
    """
    all_chunks = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # Create the embedding model once; it is reused for the per-chunk previews
    # and for building the index instead of being re-created for every chunk.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Loop through each PDF
    for path in pdf_paths:
        text = extract_pdf_text(path)
        chunks = splitter.split_text(text)
        print(f"\n✅ {path} → {len(chunks)} chunks created")

        # Print every chunk if verbose=True
        if verbose:
            _preview_chunks(chunks, embeddings)

        all_chunks.extend(chunks)

    print(f"\n✅ Total chunks across PDFs: {len(all_chunks)}")

    if not all_chunks:
        # Fail with a clear message rather than an opaque error from FAISS.
        raise ValueError(
            "No text could be extracted from the given PDFs; cannot build a vectorstore."
        )

    # Build FAISS vectorstore
    vectorstore = FAISS.from_texts(all_chunks, embeddings)
    print("\n✅ Vectorstore built successfully with FAISS.")
    _describe_vectorstore(vectorstore)

    return vectorstore


In [None]:
# -------------------------------
# Step 3: Ask question across PDFs
# -------------------------------
def ask_question(vectorstore, query, k=3, model="llama3"):
    """Answer ``query`` from the indexed PDFs using a local Ollama model.

    The ``k`` chunks most similar to the query are retrieved from the
    vectorstore, printed, and joined into a context block that is sent,
    together with the question, to ``ollama run <model>`` on standard input.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` attribute.
        query: The question to answer.
        k: Number of chunks to retrieve as context.
        model: Name of the Ollama model to run.

    Returns:
        str: The text the Ollama process wrote to standard output.

    Raises:
        RuntimeError: If the Ollama process exits with a non-zero status; the
            message includes the process's standard error output.
    """
    docs = vectorstore.similarity_search(query, k=k)

    print("\n--- Retrieved Chunks for Query ---")
    for i, doc in enumerate(docs):
        print(f"\nChunk {i+1}:\n{doc.page_content}\n")

    context = "\n".join(d.page_content for d in docs)
    prompt = f"Answer the question based on the PDFs:\n\n{context}\n\nQuestion: {query}"

    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        encoding="utf-8",
        errors="replace",  # tolerate undecodable bytes in the model output
        capture_output=True
    )
    if result.returncode != 0:
        # Surface the failure instead of silently returning an empty answer.
        raise RuntimeError(
            f"ollama run {model} failed with exit code {result.returncode}: "
            f"{(result.stderr or '').strip()}"
        )
    return result.stdout

In [None]:
# -------------------------------
# Example usage
# -------------------------------
# Folder that holds the input PDFs. Change this one line to run the notebook
# on another machine instead of editing absolute per-user paths.
PDF_DIR = Path.home() / "Downloads"

pdf_paths = [
    str(PDF_DIR / "Shivang_Soni_Enhanced_CV.pdf"),
    str(PDF_DIR / "Research_Paper_On_Artificial_Intelligence_And_Its.pdf"),
]

# Fail early with a readable message if an input file is not where expected.
missing_paths = [p for p in pdf_paths if not Path(p).is_file()]
if missing_paths:
    raise FileNotFoundError(f"PDF file(s) not found: {missing_paths}")

vectorstore = build_vectorstore_from_pdfs(pdf_paths, verbose=True)

question = "Summarize Shivang Soni's skills and the main AI research topics."
answer = ask_question(vectorstore, question)

print("\n--- Final Answer from Ollama ---\n")
print(answer)