In [None]:
!python --version

# Python 3.9 or above is required to run this project successfully

In [None]:
import PIL
print(PIL.__version__)

# If the Pillow (PIL) version printed above is lower than 9.1.0, upgrade it
# from a notebook cell with: %pip install --upgrade Pillow

Install the dependencies listed in requirements.txt if you haven't done so already.

# Import LangChain libraries

In [1]:
#from langchain_community.vectorstores import Chroma - deprecated; use langchain_chroma.Chroma instead
#from langchain_community.chat_models import ChatOllama - deprecated; use langchain_ollama.ChatOllama instead
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import sys

In [8]:
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma


# Split the PDF into bite-sized chunks so the model can ingest them easily

In [9]:
def ingest(pdf_path="CNET0_sg.pdf", persist_directory="./sql_chroma_db"):
    """Load a text-based PDF, split it into chunks and persist their embeddings in Chroma.

    Args:
        pdf_path: Path of the PDF to ingest. Defaults to the notebook's sample document.
        persist_directory: Directory of the on-disk Chroma database to write to.

    Returns:
        None. Prints how many pages were loaded and how many chunks were produced.

    Notes:
        Chunks are stored under deterministic ids derived from ``pdf_path`` and the
        chunk position, so re-running this cell overwrites the chunks stored by a
        previous run instead of appending duplicates (duplicates would otherwise
        fill the retriever's top-k with copies of the same passage).
        Entries written by older runs that used random ids are not removed; delete
        ``persist_directory`` once to start from a clean store.
    """
    # load() returns one Document per page. load_and_split() would first run the
    # pages through a default splitter, so the explicit splitter below would then
    # operate on intermediate pieces and `start_index` would no longer be
    # relative to the page.
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    #split_documents is a langchain method. Alternatively, can also use create_documents
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    #Stable ids: the same PDF split the same way always maps to the same ids,
    #which lets the vector store overwrite rather than duplicate on re-ingestion.
    chunk_ids = [f"{pdf_path}:{position}" for position in range(len(chunks))]

    #Generate vector embedding for each chunk
    embedding = FastEmbedEmbeddings()
    #Create vector store - aka create chroma db from the pdf doc.
    #Persist the db on disk so other apps (chatbots/cloud services) can reuse it
    Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        ids=chunk_ids,
        persist_directory=persist_directory,
    )

More about langchain's text splitting: 
[1](https://www.reddit.com/r/LangChain/comments/170mfkc/recursivecharactertextsplitter_create_documents/)

In [12]:
ingest()

Split 5 documents into 12 chunks.


In [13]:
from huggingface_hub import login

#Read the Hugging Face API key from a local text file
#(put your own key from huggingface in hugging_face_API.txt)
with open("hugging_face_API.txt", "r") as file:
    #The context manager closes the file handle once the key has been read.
    #strip() drops surrounding whitespace such as the trailing newline that
    #editors usually append; it is not part of the token.
    content = file.read().strip()

access_token_read = content
access_token_write = content
login(token = access_token_read)

# Creating a RAG chain

In [14]:
def rag_chain():
    """Assemble the retrieval-augmented generation chain used to answer queries.

    The chain combines a llama3 chat model served through Ollama, a prompt that
    restricts answers to the retrieved context, and a retriever over the Chroma
    store persisted in ``./sql_chroma_db``.

    Returns:
        The chain produced by ``create_retrieval_chain``; invoke it with a
        dict holding the question under the ``"input"`` key.
    """
    llm = ChatOllama(model="llama3")

    # The whitespace inside this template is part of the prompt text itself.
    template_text = """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    qa_prompt = PromptTemplate.from_template(template_text)

    # Open the persisted vector store with the embedding function used here.
    store = Chroma(
        persist_directory="./sql_chroma_db",
        embedding_function=FastEmbedEmbeddings(),
    )

    # Retrieve at most 3 documents, keeping only those scoring >= 0.5.
    search_settings = {
        "k": 3,
        "score_threshold": 0.5,
    }
    doc_retriever = store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_settings,
    )

    # Stuff the retrieved documents into the prompt, then wire in the retriever.
    return create_retrieval_chain(
        doc_retriever,
        create_stuff_documents_chain(llm, qa_prompt),
    )

In [16]:
def ask(query: str):
    """Answer ``query`` with the RAG chain and print the answer and its sources.

    Args:
        query: The question to send to the chain as its ``"input"``.

    Prints the chain's ``"answer"`` followed by one ``Source:`` line per
    document in the returned ``"context"``, using each document's
    ``"source"`` metadata entry.
    """
    # Build a fresh chain for this question and run it.
    response = rag_chain().invoke({"input": query})

    print(response["answer"])
    for retrieved_doc in response["context"]:
        print("Source: ", retrieved_doc.metadata["source"])

# Ask a query

In [17]:
ask("How does CNET50 make soil testing more accessible for community gardeners and low-income households?")

According to the provided context, CNET50 makes soil testing more accessible for community gardeners and low-income households by bringing real-time, accessible soil insights to elderly residents, low-income households, and community growers through collaborations with groups like Edible Garden City, NParks Allotment Gardens, and social farming collectives. This allows community gardeners to make better decisions on fertilization and crop choices, leading to improved yields, lower costs, potential revenue streams, and more sustainable practices.
Source:  CNET0_sg.pdf
Source:  CNET0_sg.pdf
Source:  CNET0_sg.pdf


In [19]:
ask("In what ways does the CNET50 initiative support the UNSDG 10 (Reduced Inequalities)?")

No Context available for this question. Since the provided context only talks about CNET50, a portable soil health and carbon measurement device, its relationship with UNSDG 10 (Reduced Inequalities) is not explicitly mentioned. Therefore, I cannot provide an answer based on the given information.
Source:  CNET0_sg.pdf
Source:  CNET0_sg.pdf
Source:  CNET0_sg.pdf


In [None]:
# Debugging tips: 

1) The Ollama server must be running before you ask a query
2) Ensure the document is a text-based PDF, not scanned images; pypdf reads text-based PDFs
3) Check that all dependencies are installed in the active environment