!pip install langchain

!pip install langchain-community

In [None]:
!pip install pypdf

In [None]:
!pip install fastembed

In [None]:
!pip install chromadb

In [4]:
!python --version

Python 3.8.8


In [None]:
import PIL
print(PIL.__version__)

#if your PIL is < 9.1.0 version, need to upgrade it.
#pip install --upgrade Pillow

Install dependencies above if you haven't done so

#    Import Langchain libraries

In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import sys

#    Split PDF into bitesized chunks, so AI can ingest easily

In [2]:
def ingest():
    loader = PyPDFLoader("./aiforurbanagri.pdf")
    pages = loader.load_and_split()
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1024,
        chunk_overlap = 100, 
        length_function = len,
        add_start_index = True, 
    )
    #split_documents is a langchain method. Alternatively, can also use create_documents  
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")
    
    #Generate vector embedding for each chunk
    embedding = FastEmbedEmbeddings()
    #Create vector store - aka create chroma db from the pdf doc.
    #Create db on the disk to make it modular for chatbots/cloud if uw
    Chroma.from_documents(documents=chunks,  embedding=embedding, persist_directory="./sql_chroma_db")

More about langchain's text splitting: 
[1](https://www.reddit.com/r/LangChain/comments/170mfkc/recursivecharactertextsplitter_create_documents/)

In [3]:
ingest()

incorrect startxref pointer(3)
parsing for Object Streams


Split 0 documents into 0 chunks.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

TypeError: 'type' object is not subscriptable

In [None]:
from huggingface_hub import login

#Open API key file in read mode, and read
file = open("hugging_face_API.txt", "r")
content = file.read()

access_token_read = content
access_token_write = content
login(token = access_token_read)

In [None]:
print(content)

# Creating a RAG chain

In [None]:
def rag_chain():
    model = ChatOllama(model="llama3")
    #
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )
    #Load vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

    #Create chain by using the vector store as a retriever obj
    #Retriever will search for docs based on similarity score >=0.5, up to 3 docs
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 3,
            "score_threshold": 0.5,
        },
    )
    
    #create document change using llama3 and prompt tempalte
    document_chain = create_stuff_documents_chain(model, prompt)
    
    #create chain using document, and query
    chain = create_retrieval_chain(retriever, document_chain)
    
    return chain

In [None]:
def ask(query: str):
    chain = rag_chain()
    # invoke chain
    result = chain.invoke({"input": query})
    # print results
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])