In [None]:
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

# Folder that main() lists and process_pdf() joins file names onto.
PDF_FOLDER = "pdfs"
# Passed to the text splitter below as chunk_size / chunk_overlap.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# load_and_split_pdf() keeps at most this many chunks per file;
# a falsy value (0 / None) keeps every chunk.
MAX_CHUNKS_PER_PDF = 5
# Size of the shared thread pool created below.
MAX_WORKERS = 8

llm = OllamaLLM(model="llama3.2:1b")  # Shared LLM instance, invoked from the pool's worker threads
# Splitter shared by every call to load_and_split_pdf().
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# One pool serves both PDF loading/splitting and the per-chunk LLM calls.
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

def load_and_split_pdf(pdf_path):
    """Load one PDF and return its leading text chunks.

    The file at ``pdf_path`` is read with ``PyPDFLoader`` and split with the
    module-level ``splitter``. When ``MAX_CHUNKS_PER_PDF`` is truthy only the
    first ``MAX_CHUNKS_PER_PDF`` chunks are returned; otherwise all of them are.
    """
    pages = PyPDFLoader(pdf_path).load()
    pieces = splitter.split_documents(pages)
    if not MAX_CHUNKS_PER_PDF:
        # No cap configured: hand back everything the splitter produced.
        return pieces
    return pieces[:MAX_CHUNKS_PER_PDF]

def process_chunk_sync(content, filename, i):
    """Send one chunk of text to the shared ``llm`` and print the outcome.

    Args:
        content: Text of the chunk, passed to ``llm.invoke``.
        filename: Name of the PDF the chunk came from (used only in messages).
        i: Zero-based chunk index; messages show it one-based.

    The LLM response is not returned. Any exception is caught and reported on
    stdout so a single failing chunk does not propagate to the caller.
    """
    try:
        llm.invoke(content)
        print(f"✅ {filename} | Chunk {i+1}")
    except Exception as failure:
        print(f"❌ Error on chunk {i+1} of {filename}: {failure}")

async def process_pdf(filename):
    """Load one PDF from ``PDF_FOLDER`` and push its chunks through the LLM.

    File names that do not end in ``.pdf`` are ignored. Loading/splitting and
    every per-chunk LLM call are dispatched to the shared ``executor``; the
    per-chunk calls are awaited together with ``asyncio.gather``. Any exception
    raised while loading or awaiting is reported on stdout instead of
    propagating.

    Args:
        filename: File name relative to ``PDF_FOLDER``.

    Returns:
        None.
    """
    if not filename.endswith(".pdf"):
        return

    pdf_path = os.path.join(PDF_FOLDER, filename)
    try:
        # Inside a coroutine, get_running_loop() is the documented way to get
        # the loop; get_event_loop() has policy-dependent behaviour.
        loop = asyncio.get_running_loop()
        chunks = await loop.run_in_executor(executor, load_and_split_pdf, pdf_path)

        tasks = [
            loop.run_in_executor(
                executor, process_chunk_sync, chunk.page_content, filename, i
            )
            for i, chunk in enumerate(chunks)
        ]
        await asyncio.gather(*tasks)

    except Exception as e:
        print(f"❌ Failed to process {filename}: {e}")

async def main():
    """Run ``process_pdf`` concurrently for every ``.pdf`` entry in ``PDF_FOLDER``."""
    pending = []
    for entry in os.listdir(PDF_FOLDER):
        if entry.endswith(".pdf"):
            pending.append(process_pdf(entry))
    await asyncio.gather(*pending)

if __name__ == "__main__":
    import nest_asyncio

    # Jupyter already has an event loop running; nest_asyncio patches asyncio
    # so that asyncio.run() can be called re-entrantly from inside it.
    nest_asyncio.apply()
    # asyncio.run() replaces asyncio.get_event_loop().run_until_complete(),
    # which is deprecated on recent Python versions when no event loop exists
    # yet (e.g. when this cell is executed as a plain script).
    asyncio.run(main())



✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 1
✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 2
✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 3
✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 5
✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 4
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 2
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 1
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 4
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 5
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 1
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 3
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 2
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 4
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 5
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 3
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 2
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 1
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 3
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 4
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 5
✅ 1-s2.0-S0927024816300071-main.pdf | Chunk 1
✅ 1-s2.0-S0927024816300071-main.pd

In [4]:
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

# Configuration
# Folder that main() lists and process_pdf_sync() joins file names onto.
PDF_FOLDER = "pdfs"
# Passed to the text splitter below as chunk_size / chunk_overlap.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Used as a slice bound in process_pdf_sync(): None keeps every chunk, 0 keeps none.
MAX_CHUNKS_PER_PDF = 5
# One worker per CPU; os.cpu_count() can return None, in which case 8 is used.
MAX_WORKERS = os.cpu_count() or 8

# Shared objects
# Single LLM client, invoked from the pool's worker threads.
llm = OllamaLLM(model="llama3.2:1b")
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# Thread pool on which main() schedules one process_pdf_sync() call per PDF.
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

# Purely synchronous processing for one PDF file
def process_pdf_sync(filename):
    """Load, split and LLM-process one PDF, entirely synchronously.

    Names that do not end in ``.pdf`` are ignored. The first
    ``MAX_CHUNKS_PER_PDF`` chunks are sent to ``llm`` one after another; a
    failing chunk is reported and the loop moves on to the next one. A failure
    while loading or splitting is reported once for the whole file.

    Args:
        filename: File name relative to ``PDF_FOLDER``.
    """
    if not filename.endswith(".pdf"):
        return

    source_path = os.path.join(PDF_FOLDER, filename)
    try:
        pages = PyPDFLoader(source_path).load()
        selected = splitter.split_documents(pages)[:MAX_CHUNKS_PER_PDF]
        position = 0
        while position < len(selected):
            i = position
            try:
                llm.invoke(selected[i].page_content)
                print(f"✅ {filename} | Chunk {i+1}")
            except Exception as chunk_error:
                print(f"❌ Error on chunk {i+1} of {filename}: {chunk_error}")
            position += 1
    except Exception as file_error:
        print(f"❌ Failed to process {filename}: {file_error}")

# Async wrapper that delegates sync task to a thread
async def main():
    """Schedule ``process_pdf_sync`` on the thread pool for each PDF and wait for all."""
    pdf_names = [name for name in os.listdir(PDF_FOLDER) if name.endswith(".pdf")]
    running_loop = asyncio.get_running_loop()
    futures = []
    for name in pdf_names:
        # Each file gets its own worker-thread job on the shared executor.
        futures.append(running_loop.run_in_executor(executor, process_pdf_sync, name))
    await asyncio.gather(*futures)

if __name__ == "__main__":
    import nest_asyncio

    # Jupyter already has an event loop running; nest_asyncio patches asyncio
    # so that asyncio.run() can be called re-entrantly from inside it.
    nest_asyncio.apply()
    # asyncio.run() replaces asyncio.get_event_loop().run_until_complete(),
    # which is deprecated on recent Python versions when no event loop exists
    # yet (e.g. when this cell is executed as a plain script).
    asyncio.run(main())



  return ByteStringObject(string)


✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 1
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 1
✅ 1-s2.0-S0927024816300071-main.pdf | Chunk 1
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 1
✅ 1-s2.0-S0927024816000313-main.pdf | Chunk 1
✅ 1-s2.0-S0927024815003244-main.pdf | Chunk 1
✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 2
✅ 1-s2.0-S0927024816302069-main.pdf | Chunk 1
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 1
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 2
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 2
✅ 1-s2.0-S0927024816300071-main.pdf | Chunk 2
✅ 1-s2.0-S0927024816000313-main.pdf | Chunk 2
✅ 1-s2.0-S0927024815003244-main.pdf | Chunk 2
✅ 1-s2.0-S1876610215007183-main.pdf | Chunk 3
✅ 1-s2.0-S0927024816302069-main.pdf | Chunk 2
✅ 1-s2.0-S0927024816301519-main.pdf | Chunk 2
✅ 1-s2.0-S0038092X16303383-main.pdf | Chunk 3
✅ 1-s2.0-S0927024816300071-main.pdf | Chunk 3
✅ 1-s2.0-S0927024815001415-main.pdf | Chunk 3
✅ 1-s2.0-S0927024815003244-main.pdf | Chunk 3
✅ 1-s2.0-S0927024816000313-main.pd

In [4]:
import os
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

# Root folder that load_chunks() walks recursively for .txt chunk files.
CHUNKS_FOLDER = "chunks"
# Folder where prepare_vectorstore() saves the FAISS index and looks for an existing one.
VECTORSTORE_PATH = "vectorstore"

# 🔍 Recursively load .txt chunks from all subfolders
def load_chunks():
    """Collect every non-empty ``.txt`` file under ``CHUNKS_FOLDER`` as a Document.

    Each file is read as UTF-8 first and re-read as latin-1 when decoding
    fails. Files that cannot be opened or read at all are reported and skipped
    instead of aborting the whole load. Files that are empty after stripping
    whitespace are reported and skipped as well.

    Returns:
        list[Document]: one document per readable, non-empty file, with
        ``metadata["source"]`` set to the path relative to ``CHUNKS_FOLDER``.
    """
    print("📄 Recursively loading .txt chunks...")
    docs = []
    for root, _, files in os.walk(CHUNKS_FOLDER):
        for filename in files:
            if not filename.endswith(".txt"):
                continue

            path = os.path.join(root, filename)
            # Used in every message and in the metadata, so compute it once.
            rel_path = os.path.relpath(path, CHUNKS_FOLDER)
            try:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read().strip()
            except UnicodeDecodeError:
                try:
                    with open(path, "r", encoding="latin-1") as f:
                        content = f.read().strip()
                    print(f"⚠️ Non-UTF8 file read with latin-1: {rel_path}")
                except Exception as e:
                    print(f"❌ Skipping unreadable file: {rel_path} - {e}")
                    continue
            except OSError as e:
                # Open/read failures (permissions, file removed mid-walk, ...)
                # surface on the first attempt; without this handler they
                # would escape and abort the entire load.
                print(f"❌ Skipping unreadable file: {rel_path} - {e}")
                continue

            if content:
                docs.append(Document(
                    page_content=content,
                    metadata={"source": rel_path}
                ))
            else:
                print(f"⚠️ Skipped empty file: {rel_path}")
    print(f"✅ Loaded {len(docs)} non-empty documents.")
    return docs


# 🧠 Embed and prepare vectorstore
def prepare_vectorstore(documents):
    """Return a FAISS vectorstore, loading it from disk when one already exists.

    Args:
        documents: Documents to index when nothing exists at
            ``VECTORSTORE_PATH``. Ignored when a saved store is found.

    Returns:
        The loaded or newly built (and saved) FAISS vectorstore.

    Raises:
        ValueError: If a new store must be built but ``documents`` is empty.
    """
    embedder = OllamaEmbeddings(model="llama3.2:1b")

    if os.path.exists(VECTORSTORE_PATH):
        print("📂 Loading existing vectorstore...")
        # FAISS.load_local unpickles the saved docstore, and current
        # langchain_community releases refuse to do so unless the caller opts
        # in explicitly.
        # SECURITY: this is acceptable only because the folder is written by
        # save_local() below; never point VECTORSTORE_PATH at an index that
        # came from an untrusted source.
        # NOTE(review): a saved index is reused as-is, so it must have been
        # built with the same embedding model; delete the folder to rebuild
        # after changing the model.
        return FAISS.load_local(
            VECTORSTORE_PATH, embedder, allow_dangerous_deserialization=True
        )

    print("🔧 Creating new FAISS vectorstore...")
    if not documents:
        raise ValueError("❌ No documents to index. Check your chunks folder.")
    vs = FAISS.from_documents(documents, embedder)
    vs.save_local(VECTORSTORE_PATH)
    return vs

# 💬 Create the RAG QA chain
def create_qa_chain(vectorstore):
    """Build a ``RetrievalQA`` chain on top of ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever`` is called with
            ``search_kwargs={"k": 3}``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``.
    """
    # This cell imports ``Ollama`` (langchain_community.llms), not
    # ``OllamaLLM``; referencing the latter only worked when an earlier cell
    # had already been run in the same kernel.
    llm = Ollama(model="llama3.2:1b")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# 🚀 Main loop
if __name__ == "__main__":
    documents = load_chunks()
    vectorstore = prepare_vectorstore(documents)
    qa_chain = create_qa_chain(vectorstore)

    print("\n🧠 RAG is ready. Ask anything (type 'exit' to quit):")
    while True:
        # Strip so that " exit " still quits and whitespace-only input is ignored.
        query = input("❓> ").strip()
        if query.lower() == "exit":
            break
        if not query:
            # Nothing to ask; avoid a pointless retrieval + LLM round trip.
            continue
        # Chain.run() is deprecated; invoke() takes the chain's input dict and
        # returns its output dict, with the answer under "result".
        result = qa_chain.invoke({"query": query})["result"]
        print(f"💬 {result}")


📄 Recursively loading .txt chunks...
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0038092X16303383-main\chunk_009.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_052.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_059.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_061.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815003244-main\chunk_055.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024816000313-main\chunk_065.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024816300071-main\chunk_072.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007183-main\chunk_038.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_030.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_031.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_033.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_036.txt
⚠️ 

KeyboardInterrupt: 

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

# Root folder that load_chunks() walks recursively for .txt chunk files;
# document sources are reported relative to it.
CHUNKS_FOLDER = "chunks"
# Folder where prepare_vectorstore() saves the FAISS index and looks for an existing one.
VECTORSTORE_PATH = "vectorstore"

# 🧹 Load a single file
def load_file(path):
    """Read one text file and wrap its content in a ``Document``.

    The file is read as UTF-8 first and re-read as latin-1 when decoding
    fails. Files that cannot be opened or read are reported and skipped.

    Args:
        path: Path of the file to read.

    Returns:
        A ``Document`` whose ``metadata["source"]`` is ``path`` relative to
        ``CHUNKS_FOLDER``, or ``None`` when the file is unreadable or empty
        after stripping whitespace.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        try:
            with open(path, "r", encoding="latin-1") as f:
                content = f.read().strip()
            print(f"⚠️ Non-UTF8 file read with latin-1: {os.path.relpath(path, CHUNKS_FOLDER)}")
        except Exception as e:
            print(f"❌ Could not read file: {path} - {e}")
            return None
    except OSError as e:
        # Open/read failures (missing file, permissions, ...) surface on the
        # first attempt; without this handler they would propagate out of the
        # thread pool's map() and abort the whole load.
        print(f"❌ Could not read file: {path} - {e}")
        return None
    if not content:
        print(f"⚠️ Skipped empty file: {os.path.relpath(path, CHUNKS_FOLDER)}")
        return None
    return Document(page_content=content, metadata={"source": os.path.relpath(path, CHUNKS_FOLDER)})

# 🔄 Load all documents concurrently from chunks/
def load_chunks():
    """Load every ``.txt`` file under ``CHUNKS_FOLDER`` through a thread pool.

    Paths are gathered with ``os.walk`` and each one is handed to
    ``load_file``; falsy results (files that were skipped) are dropped.

    Returns:
        The list of documents that ``load_file`` produced.
    """
    print("📂 Loading .txt files from 'chunks/' recursively...")
    paths = []
    for root, _, files in os.walk(CHUNKS_FOLDER):
        paths.extend(os.path.join(root, name) for name in files if name.endswith(".txt"))
    with ThreadPoolExecutor() as pool:
        loaded = list(pool.map(load_file, paths))
    # filter(None, ...) keeps only truthy entries, i.e. drops the skipped files.
    documents = list(filter(None, loaded))
    print(f"✅ Loaded {len(documents)} documents.")
    return documents

# 🧠 Prepare or load FAISS vectorstore
def prepare_vectorstore(documents):
    """Return a FAISS vectorstore, loading it from disk when one already exists.

    Args:
        documents: Documents to index when nothing exists at
            ``VECTORSTORE_PATH``. Ignored when a saved store is found.

    Returns:
        The loaded or newly built (and saved) FAISS vectorstore.

    Raises:
        ValueError: If a new store must be built but ``documents`` is empty.
    """
    embedder = OllamaEmbeddings(model="nomic-embed-text")

    if os.path.exists(VECTORSTORE_PATH):
        print("📦 Loading existing FAISS vectorstore...")
        # FAISS.load_local unpickles the saved docstore, and current
        # langchain_community releases refuse to do so unless the caller opts
        # in explicitly.
        # SECURITY: this is acceptable only because the folder is written by
        # save_local() below; never point VECTORSTORE_PATH at an index that
        # came from an untrusted source.
        # NOTE(review): a saved index is reused as-is, so it must have been
        # built with the same embedding model; delete the folder to rebuild
        # after changing the model.
        return FAISS.load_local(
            VECTORSTORE_PATH, embedder, allow_dangerous_deserialization=True
        )

    print("🔧 Building new FAISS vectorstore...")
    if not documents:
        raise ValueError("❌ No documents found in chunks/.")
    vectorstore = FAISS.from_documents(documents, embedder)
    vectorstore.save_local(VECTORSTORE_PATH)
    return vectorstore

# 🧵 Create RAG chain
def create_qa_chain(vectorstore):
    """Assemble the RetrievalQA chain used by the question loop.

    The chain combines an ``Ollama`` LLM with a retriever obtained from
    ``vectorstore`` (``search_kwargs={"k": 5}``) and is configured with
    ``return_source_documents=True``.
    """
    return RetrievalQA.from_chain_type(
        llm=Ollama(model="llama3.2:1b"),
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
        return_source_documents=True,
    )

# 🚀 Main
if __name__ == "__main__":
    docs = load_chunks()
    vectordb = prepare_vectorstore(docs)
    qa = create_qa_chain(vectordb)

    print("\n🧠 RAG ready. Ask questions (type 'exit' to quit):")
    while True:
        # Strip so that " exit " still quits and whitespace-only input is ignored.
        query = input("❓> ").strip()
        if query.lower() == "exit":
            break
        if not query:
            # Nothing to ask; avoid a pointless retrieval + LLM round trip.
            continue
        # Calling the chain object directly is deprecated; invoke() takes the
        # chain's input dict and returns the same output dict.
        result = qa.invoke({"query": query})
        answer = result["result"]
        sources = result["source_documents"]

        print(f"\n💬 {answer}\n")
        if sources:
            print("📚 Sources:")
            for doc in sources:
                print(f" - {doc.metadata['source']}")
        else:
            print("⚠️ No relevant sources found.\n")


📂 Loading .txt files from 'chunks/' recursively...
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0038092X16303383-main\chunk_009.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_052.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_059.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_061.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024815003244-main\chunk_055.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024816000313-main\chunk_065.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S0927024816300071-main\chunk_072.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007183-main\chunk_038.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_030.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_033.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_031.txt
⚠️ Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chu

  answer = qa.run(query)


💬 I don't know, as this information isn't provided in the context.
💬 I don't know the answer to your question about whether it's possible to prevent Al diffusion during a strong annealing step in the titanium nitride (TiN) deposition process. Can I help you with anything else?
💬 I don't know the answer to what type of container or packaging the Jinko Solar product is stored in.
💬 I don't know the answer to your question about what to expect in terms of electrical characteristics and packaging specifications for the Jinko Solar 36pcs/pallets, 72pcs/stack, 936pcs/40'HQ Container TiN layers.
💬 I don't know when it comes to the specific details of the question regarding "tract" in the context of the given text. If you could provide more information or clarify what you are asking about, I would be happy to try and assist further.
💬 I don't know the answer to what is being asked. The text provided does not explicitly state the question at the end of the passage. Can you please provide more c