In [None]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

# Directory that main() lists for input files and that process_pdf() joins
# with each filename; a relative path, so it is resolved against the current
# working directory.
PDF_FOLDER = "pdfs"
# Passed as chunk_size to RecursiveCharacterTextSplitter in process_pdf().
CHUNK_SIZE = 500
# Passed as chunk_overlap to RecursiveCharacterTextSplitter in process_pdf().
CHUNK_OVERLAP = 50
# Only the first this-many chunks of each PDF are sent to the model; a falsy
# value (0 or None) disables the cap.
MAX_CHUNKS_PER_PDF = 5
# max_workers for the ThreadPoolExecutor in main(), i.e. the upper bound on
# PDFs processed concurrently.
MAX_WORKERS = 8

def process_pdf(filename):
    """Run the leading chunks of one PDF through the Ollama model.

    The file is read from ``PDF_FOLDER``, split with
    ``RecursiveCharacterTextSplitter`` using ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``, truncated to the first ``MAX_CHUNKS_PER_PDF`` chunks
    when that setting is truthy, and each chunk's ``page_content`` is passed
    to ``llm.invoke``. The model's responses are not kept; progress and
    failures are reported with ``print`` only.

    Args:
        filename: Name of a file inside ``PDF_FOLDER``. Names that do not
            end in ".pdf" (case-sensitive) are skipped silently.

    Returns:
        None. Exceptions raised while loading, splitting, or invoking the
        model are caught and printed rather than propagated.
    """
    if not filename.endswith(".pdf"):
        return None

    source_path = os.path.join(PDF_FOLDER, filename)
    try:
        pages = PyPDFLoader(source_path).load()
        pieces = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        ).split_documents(pages)

        # A falsy cap (0 / None) means "send every chunk".
        pieces = pieces[:MAX_CHUNKS_PER_PDF] if MAX_CHUNKS_PER_PDF else pieces

        model = OllamaLLM(model="llama3.2:1b")
        for number, piece in enumerate(pieces, start=1):
            # One bad chunk must not abort the remaining chunks of this PDF,
            # so each invocation has its own handler.
            try:
                model.invoke(piece.page_content)
                print(f"✅ {filename} | Chunk {number}")
            except Exception as chunk_error:
                print(f"❌ Error on chunk {number} of {filename}: {chunk_error}")
    except Exception as pdf_error:
        # Loader / splitter / model-construction failures end up here, so a
        # broken PDF is reported without raising into the caller.
        print(f"❌ Failed to process {filename}: {pdf_error}")

def main():
    """Process every ".pdf" file in ``PDF_FOLDER`` on a thread pool.

    Each matching directory entry is submitted to ``process_pdf`` on a
    ``ThreadPoolExecutor`` with ``MAX_WORKERS`` threads. The function blocks
    until all submitted tasks have finished; calling ``result()`` on each
    future re-raises any exception that escaped ``process_pdf``.
    """
    pdf_names = []
    for entry in os.listdir(PDF_FOLDER):
        if entry.endswith(".pdf"):
            pdf_names.append(entry)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = [pool.submit(process_pdf, name) for name in pdf_names]
        # Drain futures in completion order so a worker exception surfaces
        # as soon as that task finishes.
        for finished in as_completed(pending):
            finished.result()

# Run the pipeline only when this module is executed directly, not when it
# is imported.
if __name__ == "__main__":
    main()


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.