In [None]:
# Preprocessing.ipynb

# Step 1: Setup
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import pickle

# Directory scanned for input PDF files (relative to the working directory).
DOCUMENTS_DIR = "../data/documents"
# Directory that receives the FAISS index and the pickled chunk metadata.
INDEX_DIR = "../embeddings/faiss_index"
# Name of the sentence-transformers model used to embed the chunks.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Or use a GPU-optimized model

# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(INDEX_DIR, exist_ok=True)

In [None]:
# Step 2: Load SentenceTransformer
# Let SentenceTransformer choose the device: when no device is passed it uses
# a GPU if one is usable and otherwise falls back to CPU. Forcing
# model.to('cuda') here would raise on machines without CUDA.
model = SentenceTransformer(MODEL_NAME)
print(f"Embedding model loaded on: {model.device}")

In [None]:
# Step 3: PDF to Text
def extract_text_from_pdf(file_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The text of all pages joined with no separator between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        # join() avoids repeatedly re-copying a growing string for long PDFs.
        return "".join(page.get_text() for page in doc)


In [None]:
# Step 4: Chunking and Cleaning
def clean_text(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()

def chunk_text(text, max_tokens=256):
    """Split text into chunks of whole sentences under a size budget.

    Sentences are found by splitting on ". ". Consecutive sentences are packed
    into one chunk while the running length stays below ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Size budget per chunk. Despite the name, this is measured
            in characters (``len`` of the strings), not model tokens. A single
            sentence longer than the budget is kept whole as its own chunk.

    Returns:
        list[str]: Non-empty, stripped chunks in document order. Empty or
        whitespace-only input yields an empty list.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1
    chunks, current = [], ""
    for i, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            # Skip blank fragments (empty input, repeated separators).
            continue
        # Flush first when this sentence would overflow the budget, but only if
        # there is something to flush: avoids emitting an empty chunk when the
        # very first sentence is already over the limit.
        if current and len(current) + len(sentence) >= max_tokens:
            chunks.append(current.strip())
            current = ""
        # split() consumed the "." of every fragment except the last, so put it
        # back only there; the last fragment keeps whatever ending it had,
        # which avoids producing a doubled period ("..").
        if i != last_index:
            sentence += "."
        current += sentence + " "
    if current:
        chunks.append(current.strip())
    return chunks


In [None]:

# Step 5: Embed and Store
texts, metadata = [], []

# Sort the listing so chunk order (and therefore the FAISS row ids) is the same
# on every run; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(DOCUMENTS_DIR)):
    # Case-insensitive match so files such as "REPORT.PDF" are not skipped.
    if not filename.lower().endswith(".pdf"):
        continue
    print(f"Processing {filename}...")
    pdf_path = os.path.join(DOCUMENTS_DIR, filename)
    text = clean_text(extract_text_from_pdf(pdf_path))
    for chunk in chunk_text(text):
        if not chunk:
            # Never index an empty string: it carries no content to retrieve.
            continue
        texts.append(chunk)
        # metadata[i] describes texts[i]; keep the two lists in lockstep.
        metadata.append({"source": filename})

print(f"Total chunks: {len(texts)}")
if not texts:
    # Fail here with a clear message instead of an obscure shape error later.
    raise ValueError(f"No text chunks were extracted from PDFs in {DOCUMENTS_DIR!r}")

# No explicit device: encode() runs on whatever device the model is already on,
# so this cell also works on CPU-only machines.
embeddings = model.encode(texts, show_progress_bar=True)



In [None]:
# Step 6: Save FAISS Index
# FAISS expects a C-contiguous float32 matrix; ascontiguousarray converts only
# when needed instead of always copying.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# Row i of the index must correspond to texts[i] / metadata[i]; a mismatch
# would make every lookup return the wrong chunk.
assert index.ntotal == len(texts) == len(metadata), "index rows and metadata are out of sync"

faiss.write_index(index, os.path.join(INDEX_DIR, "docs.index"))
# NOTE: pickle files can execute arbitrary code when loaded; only load
# metadata.pkl from a location you trust.
with open(os.path.join(INDEX_DIR, "metadata.pkl"), "wb") as f:
    pickle.dump({"texts": texts, "meta": metadata}, f)

print("Indexing complete.")