In [20]:
import os
# Set a USER_AGENT value for this process.
# NOTE(review): this is presumably placed before the langchain_community import
# below so the web loader sees it at import time — confirm before moving it.
os.environ['USER_AGENT'] = 'RAGUuserAgent'
from langchain_community.document_loaders import WebBaseLoader
import bs4
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker

from langchain_core.runnables import RunnableParallel


In [21]:
# Credentials: indexing os.environ raises KeyError right here if the key is unset.
openai.api_key = os.environ['OPENAI_API_KEY']

# The question that is sent through the RAG chain at the end of the notebook.
user_query = "What are the advantages of using RAG?"

# Shared components reused by the cells below.
embedding_function = OpenAIEmbeddings()
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,  # presumably chosen to keep answers as repeatable as possible
)
str_output_parser = StrOutputParser()


In [22]:
# Fetch the source page, asking BeautifulSoup to parse only the elements whose
# CSS class is one of the three listed below.
loader = WebBaseLoader(
    web_paths=['https://kbourne.github.io/chapter1.html'],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)

# Performs the actual download; `docs` is what the splitter cell consumes.
docs = loader.load()


In [23]:
# Build the chunker on top of the same embedding model used for the vector
# store, then break the loaded documents into chunks.
text_splitter = SemanticChunker(embeddings=embedding_function)
splits = text_splitter.split_documents(docs)

In [24]:
# Embed every chunk with `embedding_function` and index it in a Chroma store.
# NOTE(review): re-running this cell in the same kernel may add the same chunks
# again — confirm, or restart the kernel before re-indexing.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
)

In [25]:
# Expose the vector store as a retriever. No search arguments are passed, so
# the vector store's default retrieval settings apply.
retriever = vectorstore.as_retriever()

In [26]:
# Pull the prompt identified by "jclemens24/rag-prompt" via `hub.pull`.
# NOTE(review): the chains below feed it `context` and `question` keys, so it
# presumably declares those two input variables — confirm against the hub entry.
prompt = hub.pull("jclemens24/rag-prompt")



In [27]:
def format_docs(docs):
    """Merge the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in iteration order, separated by a blank
        line (two newlines). An empty iterable yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [28]:
# Answer-generation chain. It receives a mapping whose "context" entry holds
# the retrieved documents and processes it in four stages.
rag_chain_from_docs = (
    # 1. Replace the "context" entry with one string built by format_docs.
    RunnablePassthrough.assign(
        context=lambda inputs: format_docs(inputs["context"])
    )
    # 2. Fill the prompt pulled from the hub.
    | prompt
    # 3. Send the filled prompt to the chat model.
    | llm
    # 4. Pass the model output through the string output parser.
    | str_output_parser
)

In [29]:
# Full pipeline: feed the incoming query to the retriever (stored under
# "context") and pass it through unchanged (stored under "question"), then add
# an "answer" entry computed by rag_chain_from_docs. The retrieved documents
# stay in the result alongside the answer.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [30]:
# Run the full pipeline on the configured question. The chain is built to
# produce "context", "question" and "answer" entries.
result = rag_chain_with_source.invoke(user_query)
# Bare last expression so the notebook displays the result.
result

{'context': [Document(metadata={'source': 'https://kbourne.github.io/chapter1.html'}, page_content='Can you imagine what you could do with all of the benefits mentioned above, but combined with all of the data within your company, about everything your company has ever done, about your customers and all of their interactions, or about all of your products and services combined with a knowledge of what a specific customer’s needs are? You do not have to imagine it, that is what RAG does! Even smaller companies are not able to access much of their internal data resources very effectively. Larger companies are swimming in petabytes of data that is not readily accessible or is not being fully utilized. Prior to RAG, most of the services you saw that connected customers or employees with the data resources of the company were really just scratching the surface of what is possible compared to if they could access ALL of the data in the company. With the advent of RAG and generative AI in gen