In [None]:
# Install required packages (run once)
!pip install pdfplumber langchain faiss-cpu sentence-transformers ollama

import subprocess
import pdfplumber

In [None]:
!pip install langchain-text-splitters


In [None]:
# Smoke test: confirm that pdfplumber and the text-splitter package import cleanly.
import pdfplumber
from langchain_text_splitters import RecursiveCharacterTextSplitter
print("✅ Import works")

In [None]:
!pip install langchain-community


In [None]:
# Smoke test: confirm that every third-party import used by the pipeline resolves
# (PDF reader, text splitter, embedding wrapper, FAISS vector store).
import pdfplumber
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

print("✅ All imports working!")


In [None]:
# --- Text splitter import with a fallback to the legacy module path ---
# The previous fallback imported from the same `langchain_text_splitters`
# package that had just failed, so it could never succeed. Older LangChain
# releases ship the splitter inside the main package, so fall back to that.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    print("✅ Using RecursiveCharacterTextSplitter")
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    print("✅ Using RecursiveCharacterTextSplitter (older LangChain version)")

# Later cells refer to the splitter through this alias only.
SplitterClass = RecursiveCharacterTextSplitter

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [None]:
# -------------------------------
# Step 1: Extract text from PDF
# -------------------------------
def extract_pdf_text(pdf_path, preview_chars=20):
    """Extract the text of every page of a PDF into a single string.

    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped. For every kept page a short preview is printed as a
    progress aid.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pdfplumber.open``.
        preview_chars: Number of leading characters of each page to print in
            the preview. Defaults to 20, which matches the slice that was
            previously hard-coded here.

    Returns:
        The text of all non-empty pages in page order, each followed by a
        newline. An empty string if no page produced any text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                continue
            print(f"\n--- Page {page_number} Preview ---")
            print(page_text[:preview_chars], "...")
            page_texts.append(page_text + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(page_texts)

# Candidate PDFs for this notebook (local absolute paths).
pdf_paths = [
    r"C:\Users\2shiv\Downloads\Shivang_Soni_Enhanced_CV.pdf",
    r"C:\Users\2shiv\Downloads\Research_Paper_On_Artificial_Intelligence_And_Its.pdf"
]

# Active document used by the cells below. Without this assignment the call
# underneath fails with NameError on a fresh kernel. Pick another index to
# switch PDFs.
pdf_path = pdf_paths[0]
print(extract_pdf_text(pdf_path))


In [None]:
# -------------------------------
# Step 2: Build vectorstore
# -------------------------------
def build_pdf_vectorstore(pdf_path):
    """Build a FAISS vector store from the text of a PDF.

    The PDF text is extracted with ``extract_pdf_text``, split into chunks of
    500 characters with an overlap of 50 using ``SplitterClass``, embedded
    with the ``sentence-transformers/all-MiniLM-L6-v2`` model and indexed with
    ``FAISS.from_texts``. Diagnostic information is printed after each stage.

    Args:
        pdf_path: Path to the PDF file to index.

    Returns:
        The FAISS vector store built from the PDF's text chunks.

    Raises:
        ValueError: If splitting produced no chunks (for example because no
            text could be extracted from the PDF), since there is nothing to
            index in that case.
    """
    separator = "\n\n******************************************"

    text = extract_pdf_text(pdf_path)
    splitter = SplitterClass(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)
    if not chunks:
        # Fail early with a clear message instead of an IndexError on chunks[0].
        raise ValueError(
            f"No text could be extracted from {pdf_path!r}; cannot build a vector store."
        )

    print(separator)
    print(f"\n✅ Total chunks created: {len(chunks)}")
    print(separator)
    print(f"\n--- First Chunk Preview --- {chunks[0][:500]}")
    print(separator)
    print(f"\n--- Splitter Preview --- {splitter}")
    print(separator)

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print(separator)
    print("\n✅ Embeddings model loaded successfully.")
    print(separator)
    print(f"\n--- Embeddings Info --- {embeddings}")
    print(separator)
    # Embeds one sample sentence and prints the resulting vector as a sanity check.
    print(f"\n--- Embeddings Info --- {embeddings.embed_documents(['Sample text for embedding.'])}")
    print(separator)

    vectorstore = FAISS.from_texts(chunks, embeddings)
    print(separator)
    print("\n✅ Vectorstore built successfully with FAISS.")
    print(separator)
    print(f"\n--- Vectorstore Info --- {vectorstore}")
    print(separator)
    # NOTE(review): reads the docstore's private `_dict` to list the stored document ids.
    print(f"\n--- Vectorstore Info --- {vectorstore.docstore._dict.keys()}")
    print(separator)
    return vectorstore

# Run the builder once as a smoke test; the returned store is only printed here, not kept.
print(build_pdf_vectorstore(pdf_path))

In [None]:
import sys

# Build the FAISS-backed store for the active PDF
# (a langchain_community.vectorstores.faiss.FAISS object).
vectorstore = build_pdf_vectorstore(pdf_path)

# Read the index statistics once: vector count and embedding width.
num = vectorstore.index.ntotal
dim = vectorstore.index.d
print("Total vectors:", num)

# Rough footprint of the raw vectors, assuming float32 storage
# (4 bytes per component).
bytes_used = 4 * num * dim

print(f"Approx size: {bytes_used/1024:.2f} KB ({bytes_used/(1024*1024):.2f} MB)")

# # Get the Document object from the docstore
# doc_id = "77bf888b-4436-47c1-a85a-afa6c6ef0d4d"
# document = vectorstore.docstore.search(doc_id)

# if document is not None:
#     print("Document text:", document)
# else:
#     print("No document found for that ID")

# # Map FAISS position to doc_id
# id_map = vectorstore.index_to_docstore_id
# # Find the position for this doc_id
# pos = [k for k, v in id_map.items() if v == doc_id][0]

# # Get the embedding vector from FAISS
# embedding = vectorstore.index.reconstruct(pos)
# print("Embedding length:", len(embedding))
# print("First 10 values:", embedding[:10])

In [None]:
# Reconstruct the vector stored at position 0 of the FAISS index.
# Relies on `vectorstore` having been created by an earlier cell.
embedding = vectorstore.index.reconstruct(0)

print("Embedding length:", len(embedding))   # e.g. 384
# The slice bound is hard-coded: this shows the whole vector only when its length is <= 384.
print("First 384 dimensions:", embedding[:384])

# Index-wide statistics: number of stored vectors and the embedding dimension.
print("Total vectors:", vectorstore.index.ntotal)
print("Embedding dimension:", vectorstore.index.d)

In [None]:
# -------------------------------
# Step 3: Ask question from PDF
# -------------------------------
def ask_pdf_question(pdf_path, query, k=3, model="llama3", vectorstore=None):
    """Answer a question about a PDF using retrieved chunks and a local Ollama model.

    The ``k`` chunks most similar to ``query`` are retrieved from the vector
    store, printed, and joined into a prompt that is piped to ``ollama run``
    on stdin.

    Args:
        pdf_path: Path to the PDF. Used to build the vector store when
            ``vectorstore`` is not supplied.
        query: The question to ask.
        k: Number of chunks to retrieve. Defaults to 3.
        model: Model name passed to ``ollama run``. Defaults to ``"llama3"``.
        vectorstore: Optional pre-built store exposing ``similarity_search``.
            Pass one to avoid re-embedding the PDF for every question.

    Returns:
        The text written to stdout by the ``ollama run`` process.

    Raises:
        RuntimeError: If the ``ollama`` process exits with a non-zero status;
            the message includes the exit code and the captured stderr.
    """
    if vectorstore is None:
        vectorstore = build_pdf_vectorstore(pdf_path)
    docs = vectorstore.similarity_search(query, k=k)

    print("\n--- Retrieved Chunks for Query ---")
    for position, doc in enumerate(docs, start=1):
        print(f"\nChunk {position}:\n{doc.page_content}\n")

    context = "\n".join(doc.page_content for doc in docs)
    prompt = f"Answer the question based on the PDF:\n\n{context}\n\nQuestion: {query}"

    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        encoding="utf-8",
        errors="replace",
        capture_output=True
    )
    # Surface failures instead of silently returning an empty answer.
    if result.returncode != 0:
        raise RuntimeError(
            f"ollama exited with code {result.returncode}: {(result.stderr or '').strip()}"
        )
    return result.stdout

In [None]:
# -------------------------------
# Example usage
# -------------------------------
pdf_paths = [
    r"C:\Users\2shiv\Downloads\Shivang_Soni_Enhanced_CV.pdf",
    r"C:\Users\2shiv\Downloads\Research_Paper_On_Artificial_Intelligence_And_Its.pdf"
]

# Active document for this run. Without this assignment the call below fails
# with NameError on a fresh kernel. Index 0 is the CV, which matches the
# question asked here; use index 1 for the research paper.
pdf_path = pdf_paths[0]
question = "Give me Shivang Soni mobile number"
#question = "Summarize the Research Paper on AI topic of this PDF in 100 words."
answer = ask_pdf_question(pdf_path, question)

print("\n--- Final Answer from Ollama ---\n")
print(answer)

In [None]:
# # -------------------------------
# # Example usage
# # -------------------------------
# pdf_path = r"C:\Users\2shiv\Downloads\Shivang_Soni_Enhanced_CV.pdf"
# pdf_path = r"C:\Users\2shiv\Downloads\Research_Paper_On_Artificial_Intelligence_And_Its.pdf"  # raw string avoids escape errors
# #pdf_path = "C:\Users\2shiv\Downloads\Shivang_Soni_Enhanced_CV.pdf"  # raw string avoids escape errors

# question = "Can you please share Shivang mobile number from Shivang Soni CV PDF ? and also give me References links from AI paper PDF"
# answer = ask_pdf_question(pdf_path, question)

# print("\n--- Final Answer from Ollama ---\n")
# print(answer)

In [None]:
# question = "Can you please share Shivang address from PDF ?"
# answer = ask_pdf_question(pdf_path, question)

# print("\n--- Final Answer from Ollama ---\n")
# print(answer)