In [1]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25

# Install compatible versions of langchain libraries
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install beautifulsoup4==4.12.3
%pip install python-dotenv==1.0.1
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

Note: you may need to restart the kernel to use updated packages.
Found existing installation: langchain-core 0.3.28
Uninstalling langchain-core-0.3.28:
  Successfully uninstalled langchain-core-0.3.28
Found existing installation: langchain-openai 0.2.1
Uninstalling langchain-openai-0.2.1:
  Successfully uninstalled langchain-openai-0.2.1
Found existing installation: langchain-experimental 0.3.2
Uninstalling langchain-experimental-0.3.2:
  Successfully uninstalled langchain-experimental-0.3.2
Found existing installation: langchain-community 0.3.1
Uninstalling langchain-community-0.3.1:
  Successfully uninstalled langchain-community-0.3.1
Found existing installation: langchain 0.3.1
Uninstalling langchain-0.3.1:
  Successfully uninstalled langchain-0.3.1
Found existing installation: chromadb 0.5.11
Uninstalling chromadb-0.5.11:
  Successfully uninstalled chromadb-0.5.11
Found existing installation: beautifulsoup4 4.12.3
Uninstalling beautifulsoup4-4.12.3:
  Successfully uninstalled beau

In [1]:
import os
import re
os.environ['USER_AGENT'] = 'RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [2]:
# Pull variables from a .env file (if one is found) into the process
# environment. Values already present in the environment are kept, which is
# load_dotenv's default behaviour.
load_dotenv(find_dotenv())

# Fail early with a readable message. The previous
# `os.environ[...] = os.getenv(...)` round-trip did nothing when the key was
# set and raised an opaque TypeError (None is not a str) when it was missing.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment or add it to a .env file."
    )
openai.api_key = os.environ['OPENAI_API_KEY']

# Shared objects and settings used by the cells below.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
embedding_function = OpenAIEmbeddings()
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"
str_output_parser = StrOutputParser()
user_query = "What are Google's environmental initiatives?"


In [3]:
# Read the PDF at `pdf_path` and build one Document per non-empty text segment.
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    # Join pages with a blank line. Joining with "" glued the last word of one
    # page onto the first word of the next and erased every page boundary
    # before the "\n\n" split below. `or ""` guards against a page that yields
    # no text.
    pdf_text = "\n\n".join((page.extract_text() or "") for page in pdf_reader.pages)

# Whitespace-only segments carry nothing to embed or index, so skip them.
docs = [
    Document(page_content=segment)
    for segment in pdf_text.split("\n\n")
    if segment.strip()
]

In [4]:
# Chunk the loaded documents: 1000-character chunks with a 200-character
# overlap, trying the separators in the order listed.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)
splits = recursive_splitter.split_documents(docs)

In [5]:
def _tag_for_search(chunks, search_source):
    """Copy each chunk into a new Document tagged with its position and search source."""
    return [
        Document(
            page_content=chunk.page_content,
            metadata={"id": str(position), "search_source": search_source},
        )
        for position, chunk in enumerate(chunks)
    ]


# Two copies of the same chunks, differing only in the "search_source" tag,
# so a retrieved document can be traced back to the retriever that found it.
dense_documents = _tag_for_search(splits, "dense")
sparse_documents = _tag_for_search(splits, "sparse")

In [6]:
# Embed the dense-tagged chunks into a Chroma collection. The client is
# created with chromadb's defaults (no storage path is configured here).
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    dense_documents,
    embedding_function,
    client=chroma_client,
    collection_name=collection_name,
)

In [7]:
# Dense (vector store) and sparse (BM25) retrievers, each returning 10 hits,
# combined with equal weights by the ensemble retriever.
dense_retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    weights=[0.5, 0.5],
    c=0,
)

In [8]:
# Base answer prompt: frames the model as an environment expert and asks it to
# answer {question} from {context}. This is the template used by the "answer"
# branch of rag_chain_from_docs.
prompt = PromptTemplate.from_template(
    """
    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    """
)

In [9]:
# Show the whole PromptTemplate object: its input variables and the raw template.
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} template="\n    You are an environment expert assisting others in \n    understanding what large companies are doing to \n    improve the environment. Use the following pieces \n    of retrieved context with information about what \n    a particular company is doing to improve the \n    environment to answer the question. \n    \n    If you don't know the answer, just say that you don't know.\n    \n    Question: {question} \n    Context: {context} \n    \n    Answer:\n    "


In [10]:
# Print only the template text so its line breaks render as written.
print(prompt.template)


    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    


In [11]:
# Variant prompt: asks for a marketing description and wraps {context} in
# triple backticks. Not referenced by the chain cells visible in this notebook.
prompt2 = PromptTemplate.from_template(
    """
    Your task is to help a marketing team create a 
    description for the website about the environmental
    initiatives our clients are promoting.

    Write a marketing description based on the information 
    provided in the context delimited by triple backticks. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: ```{context}``` 
    
    Answer:
    """
)

In [12]:
# Variant prompt: marketing description capped at 50 words, with {context}
# wrapped in triple backticks.
prompt3 = PromptTemplate.from_template(
    """
    Your task is to help a marketing team create a 
    description for the website about the environmental
    initiatives our clients are promoting.

    Write a marketing description based on the information 
    provided in the context delimited by triple backticks. 
    
    If you don't know the answer, just say that you don't know.

    Use at most 50 words.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [13]:
# Variant prompt: marketing description aimed at a technology audience, so the
# model is told to focus only on technology-related efforts.
prompt4 = PromptTemplate.from_template(
    """
    Your task is to help a marketing team create a 
    description for the website about the environmental
    initiatives our clients are promoting.

    Write a marketing description based on the information 
    provided in the context delimited by triple backticks. 

    The description is intended for a technology audience, 
    so this should focus on only the aspects of the company's 
    efforts that relate to using technology.

    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: ```{context}``` 
    
    Answer:
    """
)

In [14]:
# Variant prompt: summarize the backtick-delimited context in at most 30 words.
prompt5 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    Summarize the retrieved context below, delimited by 
    triple backticks, in at most 30 words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [15]:
# Variant prompt: summary of at most 30 words that focuses on the
# eco-friendliness of the company's products.
prompt6 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    Summarize the retrieved context below, delimited by 
    triple backticks, in at most 30 words, and focusing 
    on any aspects that mention the eco-friendliness of 
    their products. 
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [16]:
# Variant prompt: phrased as extraction rather than summarization; pulls out
# product eco-friendliness details from the context, limited to 30 words.
prompt7 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [17]:
# Variant prompt: 30-word eco-friendliness extraction followed by a one-word
# sentiment label ("positive" or "negative") for the context.
prompt8 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After this summary, determine what the sentiment 
    of context is, providing your answer as a single word, 
    either "positive" or "negative". 
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [18]:
# Variant prompt: 30-word eco-friendliness extraction followed by a
# 'Related products: ' list of product names found in the context.
prompt9 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After this summary, determine any specific products
    that are identified in the context below, delimited 
    by triple backticks.  Indicate that this is a list
    of related products with the words 'Related products: '
    and then list those product names after those words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [19]:
# Variant prompt: 30-word eco-friendliness extraction followed by a
# 'Related topics: ' list of eight topics, each one or two words long.
prompt10 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After this summary, determine eight topics that are 
    being discussed in the context below delimited 
    by triple backticks.  
    Make each item one or two words long. 
    Indicate that this is a list of related topics 
    with the words 'Related topics: '
    and then list those topics after those words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [20]:
# Variant prompt: 30-word eco-friendliness extraction, then the summary
# rendered under English / Spanish / French / English pirate labels.
prompt11 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    Translate the summary into three additional languages,
    Spanish, French, and English Pirate:
    labeling each language with a format like this:
    English: [summary]
    
    Spanish: [summary]
    
    French: [summary]
    
    English pirate: [summary]
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [21]:
# Variant prompt: 30-word eco-friendliness extraction, then the same summary
# rewritten as a friendly, casual email.
prompt12 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After providing the summary, translate the summary 
    into an email format with a more friendly and 
    casual tone.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [22]:
# Variant prompt: 30-word eco-friendliness extraction, then a broader
# investor-oriented description that must be based only on that summary,
# not on the raw context.
prompt13 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness 
    of their products. Limit to 30 words.

    After providing the summary, provide a broader
    description of what the company is doing to 
    improve the environment and explain how this 
    can be useful to investors in that company.  
    
    For this broader description, do not use any of 
    the data provided in the context below, using 
    only the summary you have generated as the basis 
    for this description.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [23]:
# Relevance-grading prompt: asks for a bare 1-5 score of how relevant
# {retrieved_context} is to {question}. Its output is parsed by extract_score
# and gates the final answer in conditional_answer.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [24]:
# Print the relevance template text so its line breaks render as written.
print(relevance_prompt_template.template)



    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:


In [25]:
def format_docs(docs):
    """Return the `page_content` of every document, joined by blank lines."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


In [26]:
def extract_score(llm_output):
    """Parse a numeric relevance score out of raw LLM text.

    The whole (stripped) string is tried as a float first. If that fails,
    the first number embedded in the text is used, so replies such as
    "Relevance Score: 4", "4/5" or "**5**" still yield their score instead
    of collapsing to 0, which would force an "I don't know." answer downstream.

    Args:
        llm_output: The text returned by the relevance-scoring chain.

    Returns:
        The parsed score as a float, or 0 when the text contains no number.
    """
    text = llm_output.strip()
    try:
        return float(text)
    except ValueError:
        pass

    # Fallback: take the first integer or decimal number found in the reply.
    match = re.search(r"\d+(?:\.\d+)?", text)
    if match is None:
        return 0
    return float(match.group())

# Gate the generated answer on the parsed relevance score.
def conditional_answer(x):
    """Return x['answer'], or "I don't know." when the relevance score is below 4.

    `x` is a mapping holding the raw 'relevance_score' text and the generated
    'answer'; the score text is parsed with extract_score.
    """
    below_threshold = extract_score(x['relevance_score']) < 4
    return "I don't know." if below_threshold else x['answer']

In [27]:
# Branch 1: render the relevance prompt from the question and the formatted
# context, send it to the LLM, and keep the reply as plain text.
_relevance_chain = (
    RunnablePassthrough()
    | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
    | llm
    | str_output_parser
)

# Branch 2: answer the question with the base prompt.
_answer_chain = RunnablePassthrough() | prompt | llm | str_output_parser

# Full pipeline: flatten the retrieved documents into one context string, run
# both branches on the same input, then add 'final_answer', which
# conditional_answer derives from the two branch outputs.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | RunnableParallel(relevance_score=_relevance_chain, answer=_answer_chain)
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [28]:
# Entry point: retrieve documents for the incoming query, pass the query
# through as 'question', and attach the output of rag_chain_from_docs under
# 'answer' so the retrieved documents stay available next to it.
rag_chain_with_source = RunnableParallel(
    context=ensemble_retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)


In [None]:
# Run the full chain on the configured query and report the score, the gated
# answer, and every retrieved document with its id and originating retriever.
result = rag_chain_with_source.invoke(user_query)
retrieved_docs = result['context']
answer_payload = result['answer']

print(f"Original Question: {user_query}\n")
print(f"Relevance Score: {answer_payload['relevance_score']}\n")
print(f"Final Answer:\n{answer_payload['final_answer']}\n\n")
print("Retrieved Documents:")
for position, document in enumerate(retrieved_docs, 1):
    meta = document.metadata
    print(f"Document {position}: Document ID: {meta['id']} source: {meta['search_source']}")
    print(f"Content:\n{document.page_content}\n")