In [1]:
import os
from pathlib import Path
import re
import pickle
import numpy as np
import faiss
from unstructured.partition.pdf import partition_pdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

ModuleNotFoundError: No module named 'pdfminer'

In [None]:
# ========== CONFIG ==========
# --- Input: folder that is globbed for "*.pdf" files ---
PDF_FOLDER = "Data_Training/15_06_2025/"

# --- Chunking: default size/overlap handed to the text splitter ---
CHUNK_OVERLAP = 50
CHUNK_SIZE = 500

# --- Embedding: model identifier passed to SentenceTransformer ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# --- Output: directory and file paths for the persisted index and chunks ---
VECTOR_DIR = "vector_store"
CHUNKS_FILE = os.path.join(VECTOR_DIR, "chunks.pkl")
FAISS_INDEX_FILE = os.path.join(VECTOR_DIR, "index.faiss")

In [None]:
# ========== STEP 1: PDF Text Extraction ==========
def extract_text_from_pdf(pdf_path):
    """Return the text of the elements parsed from a PDF, one element per line.

    Args:
        pdf_path: Location of the PDF; it is converted with ``str()`` before
            being handed to ``partition_pdf``.

    Returns:
        A single string made of each element's ``text`` joined with newlines.
        Elements that have no ``text`` attribute are left out.
    """
    pieces = []
    for element in partition_pdf(str(pdf_path)):
        if not hasattr(element, "text"):
            continue
        pieces.append(element.text)
    return "\n".join(pieces)

In [None]:
# ========== STEP 2: Clean Text ==========
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [None]:
# ========== STEP 3: Chunking ==========
def chunk_text(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split ``text`` into pieces with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [None]:
# ========== STEP 4: Embed Text ==========
def embed_chunks(chunks, model):
    """Encode ``chunks`` with ``model`` and return the result unchanged.

    Args:
        chunks: The texts to embed; passed to ``model.encode`` as-is.
        model: Any object exposing ``encode(texts, convert_to_tensor=...)``.

    Returns:
        The value produced by ``model.encode`` with ``convert_to_tensor=False``.
    """
    vectors = model.encode(chunks, convert_to_tensor=False)
    return vectors

In [None]:
# ========== STEP 5: Save FAISS Index and Chunks ==========
def save_vector_store(index, chunks):
    """Persist the FAISS index and its chunk list under ``VECTOR_DIR``.

    The directory is created when missing. The index is written to
    ``FAISS_INDEX_FILE`` first, then ``chunks`` is pickled to ``CHUNKS_FILE``.

    Args:
        index: The FAISS index to write with ``faiss.write_index``.
        chunks: The object to pickle alongside the index.
    """
    os.makedirs(VECTOR_DIR, exist_ok=True)
    faiss.write_index(index, FAISS_INDEX_FILE)
    with open(CHUNKS_FILE, "wb") as chunk_stream:
        chunk_stream.write(pickle.dumps(chunks))

In [None]:
# ========== STEP 6: Load FAISS Index and Chunks ==========
def load_vector_store():
    """Read back the FAISS index and the pickled chunks.

    Returns:
        A ``(index, chunks)`` tuple: the index read from ``FAISS_INDEX_FILE``
        and the object unpickled from ``CHUNKS_FILE``.
    """
    stored_index = faiss.read_index(FAISS_INDEX_FILE)
    # NOTE(review): unpickling executes arbitrary code from the file; only
    # load a CHUNKS_FILE that this notebook (or a trusted source) produced.
    with open(CHUNKS_FILE, "rb") as chunk_stream:
        stored_chunks = pickle.loads(chunk_stream.read())
    return stored_index, stored_chunks

In [None]:
# ========== STEP 7: Retrieve Top-K Chunks ==========
def retrieve_chunks(question, model, index, chunks, top_k=5):
    """Return the chunks whose embeddings are nearest to ``question``.

    Args:
        question: The query text; it is embedded with ``model.encode``.
        model: Object exposing ``encode(texts, convert_to_tensor=...)``.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair, as a FAISS index does.
        chunks: Sequence indexed by the ids that ``index.search`` returns.
        top_k: Maximum number of neighbours to request.

    Returns:
        A list of at most ``top_k`` items from ``chunks``, in the order the
        index returned them.

    Raises:
        IndexError: If the index returns an id beyond the end of ``chunks``
            (the index and the chunk list are out of sync).
    """
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query_vector = np.ascontiguousarray(
        model.encode([question], convert_to_tensor=False), dtype="float32"
    )
    _, neighbor_ids = index.search(query_vector, top_k)

    # When the index holds fewer than top_k vectors, FAISS pads the id row
    # with -1. Indexing chunks[-1] would silently return the LAST chunk as a
    # bogus hit, so padded slots are dropped instead.
    return [chunks[int(i)] for i in neighbor_ids[0] if i >= 0]

In [None]:
# ========== MAIN PROCESS ==========
def process_pdfs_and_store():
    """Build the vector store from every ``*.pdf`` directly inside ``PDF_FOLDER``.

    Each PDF is extracted, cleaned, chunked and embedded; the embeddings are
    added to a ``faiss.IndexFlatL2`` and the index plus the chunk list are
    written with ``save_vector_store``.

    Raises:
        ValueError: If ``PDF_FOLDER`` contains no PDF files, or if none of the
            PDFs yields any text chunk. Nothing is written in either case.
    """
    # Sorted so the order of chunks/vectors in the store is reproducible;
    # glob order otherwise depends on the filesystem.
    pdf_files = sorted(Path(PDF_FOLDER).glob("*.pdf"))
    if not pdf_files:
        # Fail before loading the model: there is nothing to embed.
        raise ValueError(f"No PDF files found in {PDF_FOLDER!r}")

    print("⏳ Loading embedding model...")
    embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

    all_chunks, embedding_blocks = [], []
    for pdf_file in pdf_files:
        print(f"📄 Processing: {pdf_file.name}")
        chunks = chunk_text(clean_text(extract_text_from_pdf(pdf_file)))
        if not chunks:
            # A PDF with no extractable text contributes no rows to the index.
            print(f"⚠️ No text extracted from {pdf_file.name}; skipping.")
            continue

        embeddings = np.asarray(embed_chunks(chunks, embed_model), dtype="float32")
        all_chunks.extend(chunks)
        embedding_blocks.append(embeddings)

    if not all_chunks:
        raise ValueError(
            f"No text chunks could be extracted from the PDFs in {PDF_FOLDER!r}"
        )

    # Stack the per-file (n_chunks, dim) blocks into one (total_chunks, dim)
    # matrix; row i corresponds to all_chunks[i].
    embedding_matrix = np.vstack(embedding_blocks)
    index = faiss.IndexFlatL2(embedding_matrix.shape[1])
    index.add(embedding_matrix)

    save_vector_store(index, all_chunks)
    print("✅ All done! FAISS index and chunks saved.")

In [None]:
# ========== OPTIONAL: Query Interface ==========
def ask_question():
    """Interactive prompt that prints the top 5 chunks for each question.

    Loads the embedding model and the stored index/chunks once, then loops
    reading questions from ``input()``. The loop ends when the user types
    ``exit`` (any case, surrounding whitespace ignored) or when input reaches
    end-of-file. Blank input is ignored and the prompt is shown again.
    """
    print("⏳ Loading model and vector store...")
    embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    index, chunks = load_vector_store()

    while True:
        try:
            question = input("\n💬 Ask a question (or type 'exit'): ").strip()
        except EOFError:
            # stdin was closed (e.g. non-interactive run); leave the loop
            # instead of surfacing a traceback.
            break

        if not question:
            # Nothing to search for; embedding an empty string is pointless.
            continue
        if question.lower() == "exit":
            break

        top_chunks = retrieve_chunks(question, embed_model, index, chunks, top_k=5)
        print("\n📌 Top Relevant Chunks:\n")
        for i, chunk in enumerate(top_chunks, 1):
            print(f"--- Chunk {i} ---\n{chunk}\n")