In [1]:
import os 
import sys
import pypdf

from langchain.llms import Ollama
from langchain_chroma import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [2]:
#Creating embeddings of the text from the PDF document
def ingest(
    pdf_path="D:\\Ubuntu_Data\\Jupyter_Notebook\\RAG\\Paper1.pdf",
    persist_directory="./sql_chroma_db",
    chunk_size=1024,
    chunk_overlap=100,
):
    """Embed the text of a PDF and store the vectors in a persistent Chroma DB.

    Args:
        pdf_path: PDF to ingest. Defaults to the path originally hard-coded
            in this notebook.
        persist_directory: Directory in which Chroma persists the collection.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        None. The vector store is written to ``persist_directory``.

    Note:
        The notebook instructs to run this only once per database; calling it
        again on the same ``persist_directory`` presumably adds the chunks a
        second time -- verify before re-ingesting.
    """
    #Load the document as-is. load() is used instead of load_and_split() because
    #the latter already applies a default splitter, which would make the split
    #below a second pass and the "documents" count in the message misleading.
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    #Split the pages by char
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    #Create embeddings
    embedding = FastEmbedEmbeddings()
    #Create Vector store
    Chroma.from_documents(documents=chunks,
                          embedding=embedding,
                          persist_directory=persist_directory)

In [3]:
#Create an access token on Hugging Face and expose it through the HF_TOKEN
#environment variable; never paste the token into the notebook itself.
from getpass import getpass
from huggingface_hub import login

#Fall back to an interactive, non-echoing prompt when HF_TOKEN is not set
access_token_read = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
#The same token is used as the read and the write token
access_token_write = access_token_read
login(token = access_token_read)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\remya\.cache\huggingface\token
Login successful


In [4]:
#Run this only once to create the vector database 
#ingest()

In [5]:
def rag_chain(
    model_name="llama3",
    base_url="http://localhost:11434/",
    persist_directory="./sql_chroma_db",
    k=3,
    score_threshold=0.5,
    temperature=0.0,
):
    """Build the retrieval-augmented generation chain used to answer questions.

    Args:
        model_name: Name of the Ollama model to query.
        base_url: URL of the Ollama server.
        persist_directory: Directory of the persisted Chroma vector store.
        k: Maximum number of chunks the retriever returns per question.
        score_threshold: Minimum similarity score a chunk needs to be returned.
        temperature: Sampling temperature of the model.

    Returns:
        A chain that is invoked with ``{"input": question}``; the notebook
        reads the keys "answer" and "context" from its result.
    """
    #model definition
    model = Ollama(model=model_name, base_url=base_url, temperature=temperature)
    #Low temperature = less creativity or variation in the answer by the model

    #prompt definition ({input} is the question, {context} the retrieved chunks)
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )

    #Load text embeddings from vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding)

    #Create chain: only chunks scoring at least score_threshold are retrieved
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": score_threshold,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)

    return chain

In [7]:
#Build the retrieval chain once; the query cells below reuse this `chain` object
chain = rag_chain()
print("chain generated")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

chain generated


In [8]:
#Send the question under the "input" key; the generated text is read from the "answer" key
result = chain.invoke({"input": "What is a lobed colloid?"})
print(result["answer"])

A friendly assistant here!

Based on the provided context, I'd say that a "lobed colloid" refers to a type of colloidal particle that has uniform lobes or protrusions. In other words, it's a colloidal particle with a specific shape characterized by one or more rounded or pointed projections (lobes).


In [9]:
#Second query against the same chain; only the "answer" entry of the result is printed
result1 = chain.invoke({"input": "What is polydispersity?"})
print(result1["answer"])

Polydispersity refers to the degree of variation in the size or shape of particles or molecules within a system. In this context, it appears that polydispersity is being used to describe the range of pore sizes and structures formed in porous hydrogel-like scaffolds. The text suggests that by tuning the polydispersity, one can attain a desired structure, such as a crystalline structure, which may be desirable for certain applications.


In [10]:
#Third query against the same chain; only the "answer" entry of the result is printed
result2 = chain.invoke({"input": "What is radial distribution function?"})
print(result2["answer"])

The radial distribution function (RDF) gives the local density and arrangement of particles with respect to the distance from a reference particle (r) in comparison to the bulk density (ρ). It is calculated using equation 4, g(r), which determines the spatial arrangement and phase characteristics of each system.


In [11]:
#Creating a function for the query search
def ask(query: str, chain=None):
    """Answer ``query`` with the RAG chain and print the answer and its sources.

    Args:
        query: Question, passed to the chain under the "input" key.
        chain: Optional, already-built chain (any object whose ``invoke``
            returns a mapping with "answer" and, optionally, "context").
            Passing one avoids rebuilding the model, embeddings and retriever
            for every question. When omitted, a new chain is created with
            ``rag_chain()``, exactly as before.

    Returns:
        None. The answer and one "Source:" line per context document are
        printed.
    """
    #Reuse the caller's chain when given; otherwise create one
    if chain is None:
        chain = rag_chain()
    #invoke chain
    result = chain.invoke({"input": query})
    #print results with source
    print(result["answer"])
    #Tolerate a missing context list and documents without a "source" entry
    for doc in result.get("context") or []:
        print("Source: ", doc.metadata.get("source", "unknown"))

In [12]:
#Same question as the direct chain.invoke call earlier, now via ask(), which also prints the source of each context document
ask("What is a lobed colloid?")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

A friendly assistant here!

Based on the provided context, I'd say that a "lobed colloid" refers to a type of colloidal particle that has uniform lobes or protrusions. In other words, it's a colloidal particle with a specific shape characterized by one or more rounded or pointed projections (lobes).
Source:  D:\Ubuntu_Data\Jupyter_Notebook\RAG\Paper1.pdf
Source:  D:\Ubuntu_Data\Jupyter_Notebook\RAG\Paper1.pdf
Source:  D:\Ubuntu_Data\Jupyter_Notebook\RAG\Paper1.pdf


In [13]:
#ask() prints the answer followed by one "Source:" line per context document
ask("What is radial distribution function?")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

The radial distribution function (RDF) gives the local density and arrangement of particles with respect to the distance from a reference particle (r) in comparison to the bulk density (ρ). It is calculated using equation 4, g(r), which determines the spatial arrangement and phase characteristics of each system.
Source:  D:\Ubuntu_Data\Jupyter_Notebook\RAG\Paper1.pdf
Source:  D:\Ubuntu_Data\Jupyter_Notebook\RAG\Paper1.pdf
Source:  D:\Ubuntu_Data\Jupyter_Notebook\RAG\Paper1.pdf
