In [1]:
import fitz  # PyMuPDF
import numpy as np
import faiss
import os
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm





In [7]:

class PDFRetrievalPipeline:
    """Semantic search over the text of a folder of PDF files.

    Page text is extracted with PyMuPDF, split into overlapping word-count
    chunks, embedded with a SentenceTransformer model and stored in a flat
    (exact) L2 FAISS index.

    Attributes:
        model: SentenceTransformer used to embed both chunks and queries.
        index: FAISS index over the chunk embeddings, or None while nothing
            has been ingested.
        chunks: Chunk strings; position i corresponds to row i of ``index``.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the pre-trained embedding model; nothing is indexed yet."""
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.chunks = []

    @staticmethod
    def _chunk_words(words, chunk_size, overlap):
        """Join ``words`` into windows of ``chunk_size`` words sharing ``overlap`` words.

        Assumes ``0 <= overlap < chunk_size`` (validated by the caller).
        Returns a list of space-joined strings; an empty input gives [].
        """
        step = chunk_size - overlap
        pieces = []
        for start in range(0, len(words), step):
            end = start + chunk_size
            pieces.append(" ".join(words[start:end]))
            # Once a window reaches the last word, any further window would be
            # fully contained in this one, so stop instead of emitting it.
            if end >= len(words):
                break
        return pieces

    def ingest_pdfs(self, pdf_folder, chunk_size=36, overlap=8):
        """Extract text from PDFs, chunk it, and build the FAISS index.

        Args:
            pdf_folder: Directory whose ``*.pdf`` files (any letter case) are read.
            chunk_size: Number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must satisfy
                ``0 <= overlap < chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` / ``overlap`` violate the bounds above.
            OSError: Propagated from ``os.listdir`` when the folder cannot be read.

        Replaces any previously ingested chunks and index. Chunks never span
        a page boundary, and each is prefixed with ``[<filename>]``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        all_text_chunks = []  # list of "[filename] chunk text" strings

        # Sorted so that chunk order (and hence index row ids) is reproducible.
        for filename in sorted(os.listdir(pdf_folder)):
            if not filename.lower().endswith(".pdf"):
                continue
            filepath = os.path.join(pdf_folder, filename)
            # Context manager closes the document even if extraction fails.
            with fitz.open(filepath) as doc:
                for page in doc:
                    # split() with no argument already collapses all whitespace.
                    words = page.get_text("text").split()
                    for chunk in self._chunk_words(words, chunk_size, overlap):
                        all_text_chunks.append(f"[{filename}] {chunk}")

        # Build the index first so chunks/index are only swapped in together.
        self._build_index(all_text_chunks)
        self.chunks = all_text_chunks
        print(f"Ingested {len(all_text_chunks)} chunks from PDFs.")

    def _build_index(self, chunks):
        """Create a FAISS index for similarity search over ``chunks``.

        With no chunks there is nothing to embed, so the index is reset to
        None (``retrieve`` then reports that nothing has been ingested).
        """
        if not chunks:
            self.index = None
            return

        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(self.model.encode(chunks), dtype='float32')

        # Using IndexFlatL2 for exact L2 distance search
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        self.index = index

    def retrieve(self, query, top_k=2):
        """Return up to ``top_k`` chunks from the index search for ``query``.

        Returns:
            A list of chunk strings in the order reported by the index search.
            The list is empty when ``top_k`` is not positive. For backward
            compatibility, the string "No documents ingested yet." is returned
            (instead of a list) when no index has been built.
        """
        if self.index is None:
            return "No documents ingested yet."

        # Never ask for more neighbours than there are chunks.
        k = min(top_k, len(self.chunks))
        if k <= 0:
            return []

        # 1. Convert query to embedding
        query_vector = np.ascontiguousarray(self.model.encode([query]), dtype='float32')
        # 2. Search the index (distances are not needed by callers)
        _, indices = self.index.search(query_vector, k)
        # 3. Return the retrieved chunks, skipping -1 "no result" placeholders
        return [self.chunks[idx] for idx in indices[0] if idx >= 0]




# --- Usage Example ---

# 1. Initialize pipeline
#pipeline = PDFRetrievalPipeline()

# 2. Ingest PDFs from a local directory
# pipeline.ingest_pdfs("./my_documents") 

# 3. Perform a query
# results = pipeline.retrieve("What are the project requirements?")
# for i, res in enumerate(results):
#     print(f"Match {i+1}: {res}\n")

In [19]:
# Folder containing the PDFs to index. Set the PDF_FOLDER environment variable
# to run the notebook on another machine; otherwise the local default is used.
filepath = os.environ.get(
    "PDF_FOLDER",
    r"C:\Users\tinka001\Documents\Python\PROJECTS\AI CHATBOT",
)

In [20]:
# Build the pipeline and index every PDF found in the configured folder.
pipeline = PDFRetrievalPipeline()
try:
    pipeline.ingest_pdfs(filepath)
except OSError as exc:
    # Only filesystem errors (e.g. os.listdir failing on a missing or unreadable
    # folder) are reported here; any other failure propagates with its own
    # traceback instead of being mislabelled as a missing file.
    print(f"cannot find the file: {exc}")

    


Ingested 27 chunks from PDFs.


In [21]:
# Preview the first few ingested chunks rather than dumping the whole list.
pipeline.chunks[:5]

['[Data Science 3.pdf] Area of Interest Automation & Predictive Modelling, AI & Machine Learning, Natural Language Processing, , Education Year Degree/Examination Institution/Board CGPA/ Percentage 2024 B.Tech. 4th Year Indian Institute of Technology, Roorkee 8.104 2021 Intermediate (Class XII) RG',
 '[Data Science 3.pdf] Technology, Roorkee 8.104 2021 Intermediate (Class XII) RG NAVODAYA VIDYALAYA SYAT KOTABAGH NAINITAL 86.60 % 2019 Matriculate (Class X) RG NAVODAYA VIDYALAYA SYAT KOTABAGH NAINITAL 95.60 % Projects AI Chatbot for Querying Multiple PDFs | Self',
 '[Data Science 3.pdf] AI Chatbot for Querying Multiple PDFs | Self Project Developed a retrieval-based chatbot for querying multiple PDF documents using natural language. Implemented a pipeline for PDF ingestion, text extraction, chunking, and semantic similarity search. Built a',
 '[Data Science 3.pdf] extraction, chunking, and semantic similarity search. Built a locally hosted web interface to test conversational querying an

In [22]:
# Run a query and print every returned chunk, numbered from 1.
results = pipeline.retrieve("when india got independence")
for rank, hit in enumerate(results, start=1):
    print(f"Match {rank}: {hit}\n")

Match 1: [pdf1.pdf] and philosophy. After gaining independence from British rule in 1947, India adopted a democratic constitution, making it the largest democracy in the world. Its political system is a federal structure with a parliamentary form of government.

Match 2: [pdf1.pdf] . No extra response , no extra explaining India continues to inspire the world through its spiritual depth, resilience, and commitment to peace and progress.



In [18]:
# Show the first chunk returned for the festivals question.
festival_query = "Which festivals are celbrated in INDIA"
results = pipeline.retrieve(festival_query)
results[0]

'[pdf1.pdf] festivals such as Diwali, Holi, and Eid, which reflect its rich traditions and communal harmony. Landmarks like the Taj Mahal, Qutub Minar, and the ghats of Varanasi showcase its artistic heritage. Modern India balances tradition with'

In [None]:
# Show the second chunk returned for the same festivals question.
festival_query = "Which festivals are celbrated in INDIA"
results = pipeline.retrieve(festival_query)
results[1]

'[pdf1.pdf] federal structure with a parliamentary form of government. The economy is one of the fastest-growing globally, driven by agriculture, industry, and technology. Culturally, India is renowned for its festivals such as Diwali, Holi, and Eid, which'