In [None]:
# ingest.py
import time
from pdf_parser import chunk_pages_by_heading
from embeddings import Embedder
from vector_store import FaissStore
import numpy as np
import argparse
import json


def run_ingest(pdf_path: str, index_dir: str) -> None:
    """Chunk a PDF, embed the chunks, and persist them in a ``FaissStore``.

    The PDF is split with ``chunk_pages_by_heading``; the text of every chunk
    is embedded with ``Embedder``; the resulting vectors are added to a
    ``FaissStore`` together with per-chunk metadata, and the store is saved.
    Progress and the embedding wall-clock time are printed to stdout.

    Args:
        pdf_path: Path of the PDF to ingest.
        index_dir: Location passed to ``FaissStore`` as ``index_path``.

    Raises:
        ValueError: If the PDF yields no chunks, or if the embedder output is
            not a 2-D array with exactly one row per chunk.
        KeyError: If a chunk lacks "text" or one of the metadata keys.
    """
    metadata_keys = ("source", "chapter", "page", "chunk_id")

    print(f"Ingesting {pdf_path} -> {index_dir}")
    chunks = chunk_pages_by_heading(pdf_path)
    if not chunks:
        # Fail early with a clear message instead of an opaque error further
        # down (and before paying the cost of constructing the embedder).
        raise ValueError(f"No chunks extracted from {pdf_path}; nothing to ingest")

    texts = [c["text"] for c in chunks]
    metadatas = [{key: c[key] for key in metadata_keys} for c in chunks]

    embedder = Embedder()
    start = time.perf_counter()
    embeddings = embedder.embed_texts(texts)
    end = time.perf_counter()
    print(f"Embedding time: {end - start:.2f}s for {len(texts)} chunks")

    # Coerce once, *before* inspecting the shape, so a list-returning embedder
    # works too; asarray avoids copying when the result is already an ndarray.
    vectors = np.asarray(embeddings)
    if vectors.ndim != 2 or vectors.shape[0] != len(texts):
        # A mismatch here would silently misalign vectors and metadata.
        raise ValueError(
            f"Expected embeddings of shape ({len(texts)}, dim), "
            f"got {vectors.shape}"
        )

    store = FaissStore(dim=vectors.shape[1], index_path=index_dir)
    store.add(vectors, metadatas)
    store.save()
    print("Index saved.")


if __name__ == "__main__":
    # Command-line entry point: --pdf is mandatory, while --index falls back
    # to "data/index" when it is not supplied.
    cli = argparse.ArgumentParser()
    cli.add_argument("--pdf", required=True)
    cli.add_argument("--index", default="data/index")
    opts = cli.parse_args()
    run_ingest(pdf_path=opts.pdf, index_dir=opts.index)
