In [1]:
import os
import sys
import pypdf

from langchain.llms import Ollama
from langchain_chroma import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [2]:
def ingest_multiple_pdfs(folder_path, persist_directory="./sql_chroma_db"):
    """Chunk every PDF in ``folder_path`` and persist the chunks in a Chroma store.

    Args:
        folder_path: Directory scanned (non-recursively) for PDF files.
        persist_directory: Directory the Chroma collection is written to.

    Returns:
        None. Progress is reported with ``print``. Nothing is written when the
        folder holds no PDFs or when no chunks were produced.
    """
    # Match the extension case-insensitively (".PDF" is common) and sort so the
    # processing order does not depend on the directory listing order.
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
    )
    if not pdf_files:
        print(f"No PDF files found in {folder_path}; nothing to ingest.")
        return

    # The splitter configuration is the same for every file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    all_chunks = []  # Chunks from all PDFs
    total_pages = 0  # Number of pages loaded across all PDFs

    for pdf_file in pdf_files:
        file_path = os.path.join(folder_path, pdf_file)
        print(f"Processing: {pdf_file}")

        # load() returns one Document per page. load_and_split() would first
        # run its own default splitter, so the page count would be wrong and
        # start_index would refer to pre-split fragments instead of pages.
        pages = PyPDFLoader(file_path).load()
        total_pages += len(pages)

        chunks = text_splitter.split_documents(pages)
        all_chunks.extend(chunks)
        print(f"Split {len(pages)} pages into {len(chunks)} chunks from {pdf_file}.")

    print(f"Processed {total_pages} pages into {len(all_chunks)} chunks across all PDFs.")

    if not all_chunks:
        # e.g. scanned PDFs without extractable text: there is nothing to embed.
        print("No text chunks were produced; vector store was not created.")
        return

    # Embed the chunks and write them to the on-disk Chroma collection.
    embedding = FastEmbedEmbeddings()
    Chroma.from_documents(
        documents=all_chunks,
        embedding=embedding,
        persist_directory=persist_directory,
    )
    print("Vector store created and persisted.")

In [5]:
# Build the vector database from the PDFs in `folder_path`.
# Ingestion is expensive, so it only runs when the persisted store directory
# does not exist yet; delete ./sql_chroma_db to force a rebuild.
folder_path = "patchy_particles"
if not os.path.isdir("./sql_chroma_db"):
    ingest_multiple_pdfs(folder_path)

In [6]:
# Authenticate with the Hugging Face Hub.
# Tokens are read from the environment so that no secret is stored in the
# notebook: set HF_TOKEN (or HF_TOKEN_READ / HF_TOKEN_WRITE for separate
# tokens) before starting the kernel. When no read token is found, login()
# receives None and falls back to its interactive prompt.
from huggingface_hub import login

access_token_read = os.environ.get("HF_TOKEN_READ") or os.environ.get("HF_TOKEN")
# NOTE(review): the write token is not used in the cells visible here; it is
# kept in case later cells reference it.
access_token_write = os.environ.get("HF_TOKEN_WRITE") or os.environ.get("HF_TOKEN")
login(token=access_token_read)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\remya\.cache\huggingface\token
Login successful


In [7]:
def rag_chain(
    model_name="llama3",
    base_url="http://localhost:11434/",
    persist_directory="./sql_chroma_db",
    k=3,
    score_threshold=0.5,
    temperature=0.0,
):
    """Build a retrieval-augmented generation chain over the persisted Chroma store.

    Args:
        model_name: Name of the Ollama model to use.
        base_url: URL of the Ollama server.
        persist_directory: Directory of the persisted Chroma collection.
        k: Maximum number of chunks passed to the retriever's search.
        score_threshold: Score threshold passed to the retriever's search.
        temperature: Sampling temperature passed to the model.

    Returns:
        The chain produced by ``create_retrieval_chain``. It is invoked with
        ``{"input": question}``; the result is indexed with ``"answer"`` and
        ``"context"`` elsewhere in this notebook.
    """
    model = Ollama(model=model_name, base_url=base_url, temperature=temperature)

    # {input} is the user question; {context} is filled by the stuff-documents
    # chain with the retrieved chunks.
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )

    # Load the vector store with the same embedding class used at ingestion.
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )

    # Retrieve at most k chunks, filtered by the score threshold.
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": score_threshold,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)

    return chain

In [9]:
# Smoke test: build the RAG chain once; the next cell invokes it via `chain`.
chain = rag_chain()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Ask a sample question and print the generated text stored under "answer".
result = chain.invoke({"input": "What are the gaps in patchy particle self-assembly?"})
print(result["answer"])

What are the gaps in patchy particle self-assembly?

Based on the provided context, it seems that there is a gap in understanding the optimal patch size for producing monodisperse clusters. The synthesized particles had an A patch with α = 60° and a B patch with β = 40° half-opening angle, but the results suggest that the patch size of the wider patch was suboptimal for the first stage of assembly.

Additionally, there is a gap in understanding the role of the range of patch-patch interactions in the self-assembly process.


In [11]:
# Convenience wrapper: ask a question and print the answer with its sources.
def ask(query: str, chain=None):
    """Run ``query`` through a RAG chain and print the answer and its sources.

    Args:
        query: The question to ask.
        chain: Optional chain to reuse. When omitted, a new one is built with
            ``rag_chain()``; pass an existing chain to avoid rebuilding it on
            every call.

    Returns:
        None. The answer is printed first, followed by one ``Source:`` line
        per retrieved document.
    """
    if chain is None:
        chain = rag_chain()

    result = chain.invoke({"input": query})

    print(result["answer"])
    for doc in result.get("context", []):
        # Not every document is guaranteed to carry a "source" entry.
        print("Source: ", doc.metadata.get("source", "unknown"))

In [None]:
# Same sample question via the helper, which also prints the source of each retrieved document.
ask("What are the gaps in patchy particle self-assembly?")