In [9]:
import getpass
import os
import sys

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI

In [10]:
def ingest(
    pdf_path: str = "Data/Alice-extract.pdf",
    persist_directory: str = "./sql_chroma_db",
    chunk_size: int = 1024,
    chunk_overlap: int = 100,
) -> None:
    """Load a PDF, split it into overlapping chunks and index them in Chroma.

    Args:
        pdf_path: Path of the PDF to index.
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        None. The number of loaded documents and produced chunks is printed.

    Note:
        The notebook asks for this to be run only once per store; calling it
        again with the same ``persist_directory`` is expected to add the same
        chunks a second time.
    """
    # Load the PDF; load_and_split() already returns a list of documents.
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()

    # Re-split by characters so every chunk respects chunk_size/chunk_overlap
    # and records its start offset in the metadata (add_start_index).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    # Embed the chunks and write them to the vector store directory.
    embedding = FastEmbedEmbeddings()
    Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        persist_directory=persist_directory,
    )

In [11]:
# Generate the vector store only when it does not exist yet: running ingest()
# again against an existing store would index the same chunks a second time.
# Delete the ./sql_chroma_db directory to force a rebuild.
if not os.path.isdir("./sql_chroma_db"):
    ingest()

Split 23 documents into 36 chunks.


In [None]:
# Set up OpenAI API key
# You can get your API key from: https://platform.openai.com/api-keys
# Do not paste the key into the notebook. A key already exported as
# OPENAI_API_KEY is reused as-is; otherwise it is requested interactively
# without being echoed into the cell output.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    openai_api_key = getpass.getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = openai_api_key

In [13]:
def rag_chain(
    model_name: str = "gpt-3.5-turbo",
    persist_directory: str = "./sql_chroma_db",
    k: int = 3,
    score_threshold: float = 0.5,
    temperature: float = 0.3,
    max_tokens: int = 1000,
):
    """Build the retrieval-augmented generation chain over the Chroma store.

    Args:
        model_name: OpenAI chat model to use (for example "gpt-4" or
            "gpt-4-turbo" instead of the default).
        persist_directory: Directory of the Chroma store to open.
        k: Number of chunks requested from the retriever.
        score_threshold: Threshold passed to the "similarity_score_threshold"
            search.
        temperature: Sampling temperature passed to the chat model.
        max_tokens: ``max_tokens`` value passed to the chat model.

    Returns:
        The chain created by ``create_retrieval_chain``. It is invoked with a
        dict holding the question under the "input" key.
    """
    # Use OpenAI GPT model
    model = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # {input} is the user question, {context} the retrieved chunks.
    prompt = PromptTemplate.from_template(
        """
        You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, "No context available for this question: {input}".
        
        Question: {input} 
        Context: {context} 
        Answer:
        """
    )

    # Load the vector store with the same embedding class used in ingest().
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )

    # Retrieve chunks by similarity, filtered by the score threshold.
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": score_threshold,
        },
    )

    # Stuff the retrieved chunks into the prompt, then wire retriever and LLM.
    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)

    return chain

In [14]:
# Install the OpenAI integration for LangChain.
# If the langchain-openai package is missing, uncomment the line below, run this
# cell, and then re-run the imports cell at the top of the notebook.
# %pip install langchain-openai

In [15]:
def ask(query: str, chain=None):
    """Answer a question with the RAG chain and print the answer and sources.

    Args:
        query: The question, sent to the chain under the "input" key.
        chain: Optional prebuilt chain exposing ``invoke``. When omitted, a new
            chain is built with ``rag_chain()`` for this call; pass a chain
            built once to avoid rebuilding it for every question.

    Returns:
        None. The answer is printed, followed by one "Source" line per
        document in the result's "context" list.
    """
    # Build the chain lazily so existing ask(query) calls keep working.
    if chain is None:
        chain = rag_chain()
    # invoke chain
    result = chain.invoke({"input": query})
    # print results
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])

In [17]:
# Example question; ask() prints the answer followed by the source of each retrieved chunk.
ask("Where did Alice meet the rabbit ?")


Alice met the rabbit in a long, low hall lit up by a row of lamps hanging from the roof.
Source:  Data/Alice-extract.pdf
Source:  Data/Alice-extract.pdf
Source:  Data/Alice-extract.pdf
