In [1]:
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

PDF_FOLDER = "pdfs"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
MAX_CHUNKS_PER_PDF = 5
MAX_WORKERS = 8

llm = OllamaLLM(model="llama3.2:1b")  # Shared LLM instance
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

def load_and_split_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    chunks = splitter.split_documents(documents)
    return chunks[:MAX_CHUNKS_PER_PDF] if MAX_CHUNKS_PER_PDF else chunks

def process_chunk_sync(content, filename, i):
    try:
        llm.invoke(content)
        print(f" {filename} | Chunk {i+1}")
    except Exception as e:
        print(f"Error on chunk {i+1} of {filename}: {e}")

async def process_pdf(filename):
    if not filename.endswith(".pdf"):
        return

    pdf_path = os.path.join(PDF_FOLDER, filename)
    try:
        chunks = await asyncio.get_event_loop().run_in_executor(executor, load_and_split_pdf, pdf_path)

        tasks = [
            asyncio.get_event_loop().run_in_executor(
                executor, process_chunk_sync, chunk.page_content, filename, i
            )
            for i, chunk in enumerate(chunks)
        ]
        await asyncio.gather(*tasks)

    except Exception as e:
        print(f" Failed to process {filename}: {e}")

async def main():
    pdf_files = [f for f in os.listdir(PDF_FOLDER) if f.endswith(".pdf")]
    tasks = [process_pdf(f) for f in pdf_files]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    import nest_asyncio                 
    nest_asyncio.apply()  # Allows nested use of asyncio.run()
    asyncio.get_event_loop().run_until_complete(main())



FileNotFoundError: [WinError 3] The system cannot find the path specified: 'pdfs'

In [2]:
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

# Configuration
PDF_FOLDER = "pdfs"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
MAX_CHUNKS_PER_PDF = 5
MAX_WORKERS = os.cpu_count() or 8

# Shared objects
llm = OllamaLLM(model="llama3.2:1b")
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

# Purely synchronous processing for one PDF file
def process_pdf_sync(filename):
    if not filename.endswith(".pdf"):
        return

    pdf_path = os.path.join(PDF_FOLDER, filename)
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        chunks = splitter.split_documents(documents)
        for i, chunk in enumerate(chunks[:MAX_CHUNKS_PER_PDF]):
            try:
                llm.invoke(chunk.page_content)
                print(f" {filename} | Chunk {i+1}")
            except Exception as e:
                print(f" Error on chunk {i+1} of {filename}: {e}")
    except Exception as e:
        print(f" Failed to process {filename}: {e}")

# Async wrapper that delegates sync task to a thread
async def main():
    pdf_files = [f for f in os.listdir(PDF_FOLDER) if f.endswith(".pdf")]
    loop = asyncio.get_running_loop()
    tasks = [
        loop.run_in_executor(executor, process_pdf_sync, f)
        for f in pdf_files
    ]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()  # Allows nested use of asyncio.run()
    asyncio.get_event_loop().run_until_complete(main())



 JinkoSolar Eagle 54HM G6 Datasheet (420–440 W, N-Type TOPCon).pdf | Chunk 4
 Model_Based_Continuous_Improvement_of_Industrial_p.pdf | Chunk 4
 JinkoSolar Eagle 72 G6B Datasheet (570–590 W, N-Type Bifacial).pdf | Chunk 4
 1-s2.0-S1876610215008851-main.pdf | Chunk 5
 Adani ELAN SHINE TOPCon Datasheet (550–575 W, N-Type Bifacial).pdf | Chunk 5
 1-s2.0-S1876610215008206-main.pdf | Chunk 5
 Qcells Q.TRON BLK M-G2+ Series Datasheet (415–440 Wp, 2024).pdf | Chunk 1
 Intl J of Energy Research - 2021 - Gawusu - The dynamics of green supply chain management within the framework of renewable.pdf | Chunk 5
 Model_Based_Continuous_Improvement_of_Industrial_p.pdf | Chunk 5
 JinkoSolar Eagle 54HM G6 Datasheet (420–440 W, N-Type TOPCon).pdf | Chunk 5
 JinkoSolar Eagle 72 G6B Datasheet (570–590 W, N-Type Bifacial).pdf | Chunk 5
 Rayzon TOPCon Datasheet (570–590 W, N-Type Bifacial, 2024).pdf | Chunk 1
 Resistive_Power_Loss_Analysis_of_PV_Modules_Made_From_Halved_15.615.6_cm2_Silicon_PERC_Solar_Cells_Wi

In [None]:
import os
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

CHUNKS_FOLDER = "chunks"
VECTORSTORE_PATH = "vectorstore"

#  Recursively load .txt chunks from all subfolders
def load_chunks():
    print(" Recursively loading .txt chunks...")
    docs = []
    for root, _, files in os.walk(CHUNKS_FOLDER):
        for filename in files:
            if filename.endswith(".txt"):
                path = os.path.join(root, filename)
                try:
                    with open(path, "r", encoding="utf-8") as f:
                        content = f.read().strip()
                except UnicodeDecodeError:
                    try:
                        with open(path, "r", encoding="latin-1") as f:
                            content = f.read().strip()
                        print(f" Non-UTF8 file read with latin-1: {os.path.relpath(path, CHUNKS_FOLDER)}")
                    except Exception as e:
                        print(f" Skipping unreadable file: {os.path.relpath(path, CHUNKS_FOLDER)} - {e}")
                        continue

                if content:
                    docs.append(Document(
                        page_content=content,
                        metadata={"source": os.path.relpath(path, CHUNKS_FOLDER)}
                    ))
                else:
                    print(f" Skipped empty file: {os.path.relpath(path, CHUNKS_FOLDER)}")
    print(f" Loaded {len(docs)} non-empty documents.")
    return docs


#  Embed and prepare vectorstore
def prepare_vectorstore(documents):
    embedder = OllamaEmbeddings(model="llama3.2:1b")
    
    if os.path.exists(VECTORSTORE_PATH):
        print(" Loading existing vectorstore...")
        return FAISS.load_local(VECTORSTORE_PATH, embedder)

    print(" Creating new FAISS vectorstore...")
    if not documents:
        raise ValueError(" No documents to index. Check your chunks folder.")
    vs = FAISS.from_documents(documents, embedder)
    vs.save_local(VECTORSTORE_PATH)
    return vs

#  Create the RAG QA chain
def create_qa_chain(vectorstore):
    llm = OllamaLLM(model="llama3.2:1b")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

#  Main loop
if __name__ == "__main__":
    documents = load_chunks()
    vectorstore = prepare_vectorstore(documents)
    qa_chain = create_qa_chain(vectorstore)

    print("\n RAG is ready. Ask anything (type 'exit' to quit):")
    while True:
        query = input("> ")
        if query.lower() == "exit":
            break
        result = qa_chain.run(query)
        print(f" {result}")


 Recursively loading .txt chunks...
 Non-UTF8 file read with latin-1: 1-s2.0-S0038092X16303383-main\chunk_009.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_052.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_059.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_061.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815003244-main\chunk_055.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024816000313-main\chunk_065.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024816300071-main\chunk_072.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007183-main\chunk_038.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_030.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_031.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_033.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_036.txt
 Non-UTF8 file read with lat

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

CHUNKS_FOLDER = "chunks"
VECTORSTORE_PATH = "vectorstore"

# 🧹 Load a single file
def load_file(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        try:
            with open(path, "r", encoding="latin-1") as f:
                content = f.read().strip()
            print(f" Non-UTF8 file read with latin-1: {os.path.relpath(path, CHUNKS_FOLDER)}")
        except Exception as e:
            print(f" Could not read file: {path} - {e}")
            return None
    if not content:
        print(f" Skipped empty file: {os.path.relpath(path, CHUNKS_FOLDER)}")
        return None
    return Document(page_content=content, metadata={"source": os.path.relpath(path, CHUNKS_FOLDER)})

#  Load all documents concurrently from chunks/
def load_chunks():
    print(" Loading .txt files from 'chunks/' recursively...")
    paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(CHUNKS_FOLDER)
        for file in files if file.endswith(".txt")
    ]
    with ThreadPoolExecutor() as executor:
        docs = list(executor.map(load_file, paths))
    documents = [doc for doc in docs if doc]
    print(f" Loaded {len(documents)} documents.")
    return documents

#  Prepare or load FAISS vectorstore
def prepare_vectorstore(documents):
    embedder = OllamaEmbeddings(model="nomic-embed-text")

    if os.path.exists(VECTORSTORE_PATH):
        print(" Loading existing FAISS vectorstore...")
        return FAISS.load_local(VECTORSTORE_PATH, embedder)

    print(" Building new FAISS vectorstore...")
    if not documents:
        raise ValueError(" No documents found in chunks/.")
    vectorstore = FAISS.from_documents(documents, embedder)
    vectorstore.save_local(VECTORSTORE_PATH)
    return vectorstore

#  Create RAG chain
def create_qa_chain(vectorstore):
    llm = Ollama(model="llama3.2:1b")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

#  Main
if __name__ == "__main__":
    docs = load_chunks()
    vectordb = prepare_vectorstore(docs)
    qa = create_qa_chain(vectordb)



In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain_community.llms import Ollama

# Load embedding model and FAISS vectorstore
embedder = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = FAISS.load_local("vectorstore", embedder, allow_dangerous_deserialization=True)

# Create retriever from vectorstore
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Load Ollama LLM
llm = Ollama(model="llama3.2:1b")

# Load QA chain that only uses provided sources (no external info)
qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff")  # "stuff" uses only retrieved context

# Define a function that queries only the FAISS data
def query_faiss_only(question):
    docs = retriever.get_relevant_documents(question)
    result = qa_chain({"input_documents": docs, "question": question})
    return result

# Example usage
query = """
Act like a researcher assistant. Summarize what you know about 'dopedsilicon' 
and return the response in JSON format with the keys: "definition", "applications", and "source".
"""
response = query_faiss_only(query)

# Output the result and source documents
print(response["output_text"])
