In [2]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25

# Install compatible versions of langchain libraries
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install beautifulsoup4==4.12.3
%pip install python-dotenv==1.0.1
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

Note: you may need to restart the kernel to use updated packages.
Found existing installation: langchain-core 0.3.28
Uninstalling langchain-core-0.3.28:
  Successfully uninstalled langchain-core-0.3.28
Found existing installation: langchain-openai 0.2.1
Uninstalling langchain-openai-0.2.1:
  Successfully uninstalled langchain-openai-0.2.1
Found existing installation: langchain-experimental 0.3.2
Uninstalling langchain-experimental-0.3.2:
  Successfully uninstalled langchain-experimental-0.3.2
Found existing installation: langchain-community 0.3.1
Uninstalling langchain-community-0.3.1:
  Successfully uninstalled langchain-community-0.3.1
Found existing installation: langchain 0.3.1
Uninstalling langchain-0.3.1:
  Successfully uninstalled langchain-0.3.1
Found existing installation: chromadb 0.5.11
Uninstalling chromadb-0.5.11:
  Successfully uninstalled chromadb-0.5.11
Found existing installation: beautifulsoup4 4.12.3
Uninstalling beautifulsoup4-4.12.3:
  Successfully uninstalled beau

In [1]:
import math
import os
import re
os.environ['USER_AGENT'] = 'RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [2]:
# Pick up OPENAI_API_KEY from a local .env file when one exists. By default
# load_dotenv leaves variables that are already set in the environment untouched.
load_dotenv(find_dotenv())

# Fail early with an actionable message instead of the opaque
# "str expected, not NoneType" TypeError raised by assigning None into os.environ.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment or add it to a .env file "
        "before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key
openai.api_key = openai_api_key

# Shared model objects and notebook-wide settings used by the cells below.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
embedding_function = OpenAIEmbeddings()
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"
str_output_parser = StrOutputParser()
user_query = "What are Google's environmental initiatives?"

In [3]:
# Extract the text of every page, concatenate it into one string, and wrap each
# blank-line-separated piece of that string in its own Document.
docs = []
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    page_texts = [page.extract_text() for page in pdf_reader.pages]
    pdf_text = "".join(page_texts)
    docs = [Document(page_content=piece) for piece in pdf_text.split("\n\n")]

In [4]:
# Chunk the documents into pieces of at most 1000 characters with a 200-character
# overlap, using the separator list below.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)

splits = recursive_splitter.split_documents(docs)

In [5]:
# Two copies of every chunk: same text and positional id, tagged with the
# retriever ("dense" or "sparse") that will serve it.
dense_documents = [
    Document(
        page_content=chunk.page_content,
        metadata={"id": str(idx), "search_source": "dense"},
    )
    for idx, chunk in enumerate(splits)
]
sparse_documents = [
    Document(
        page_content=chunk.page_content,
        metadata={"id": str(idx), "search_source": "sparse"},
    )
    for idx, chunk in enumerate(splits)
]

In [6]:
# Embed the dense copies of the chunks into a Chroma collection.
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    # Reuse each chunk's own id as the Chroma record id. Without explicit ids the
    # store generates fresh ones on every call, so re-running this cell in the same
    # kernel would append a second copy of every chunk to the same collection.
    # NOTE(review): chunks left over from an earlier run with more splits are not
    # removed by this; restart the kernel if the chunking changes.
    ids=[doc.metadata["id"] for doc in dense_documents],
    collection_name=collection_name,
    client=chroma_client
)

In [7]:
# Retriever over the Chroma vector store (k=10).
dense_retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))
# BM25 retriever over the "sparse" copies of the same chunks (k=10).
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)
# Combine both retrievers with equal weights.
# NOTE(review): c=0 is passed straight through to EnsembleRetriever; confirm its
# meaning against the library docs before tuning it.
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    weights=[0.5, 0.5],
    c=0,
)

In [8]:
# Answer-generation prompt. It takes two template variables, {question} and
# {context}, and tells the model to say it does not know when it lacks the answer.
# The template text is sent to the model as written, including its indentation.
prompt = PromptTemplate.from_template(
    """
    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    """
)

In [9]:
# Relevance-grading prompt. It takes {question} and {retrieved_context} and asks
# the model for a bare 1-5 score with no other text. The reply is parsed by
# extract_score, which is defined in a later cell.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [10]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents joined by ``"\\n\\n"`` (empty for no documents).
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [11]:
def extract_score(llm_output):
    """Parse a numeric relevance score out of the grader model's reply.

    The grading prompt asks for a bare number, but models sometimes wrap it
    ("Relevance Score: 4", "4/5", "**5**"). A reply that is exactly a number is
    used as is; otherwise the first number found in the text is used.

    Args:
        llm_output: the model reply, normally a string. ``None`` is treated as
            an empty reply.

    Returns:
        The score as a float, or 0.0 when no finite number can be extracted.
    """
    text = "" if llm_output is None else str(llm_output).strip()
    try:
        score = float(text)
    except ValueError:
        # Not a bare number: fall back to the first numeric token in the reply.
        match = re.search(r"[-+]?\d+(?:\.\d+)?", text)
        if match is None:
            return 0.0
        score = float(match.group())
    # float() accepts "nan"/"inf"; a NaN score would slip past a `score < 4`
    # check, so treat non-finite values as unparseable.
    return score if math.isfinite(score) else 0.0

# Gate the generated answer on the parsed relevance score
def conditional_answer(x):
    """Return the generated answer only when the context was judged relevant.

    Args:
        x: mapping with a ``'relevance_score'`` entry (parsed with
            ``extract_score``) and an ``'answer'`` entry.

    Returns:
        ``"I don't know."`` when the parsed score is below 4, otherwise ``x['answer']``.
    """
    score_is_low = extract_score(x['relevance_score']) < 4
    return "I don't know." if score_is_low else x['answer']

In [12]:
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

def augment_query_generated(user_query):
    """Ask the chat model for an example answer to ``user_query``.

    The system message asks for an answer of the kind found in an annual
    environmental report; ``user_query`` is sent as the human message.

    Args:
        user_query: the question text substituted into the human message.

    Returns:
        The ``content`` of the model's reply.
    """
    system_message_prompt = SystemMessagePromptTemplate.from_template(
        "You are a helpful expert environmental research assistant. Provide an example answer to the given question, that might be found in a document like an annual environmental report."
    )
    human_message_prompt = HumanMessagePromptTemplate.from_template("{query}")

    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    messages = chat_prompt.format_prompt(query=user_query).to_messages()

    # Calling the model object directly (`llm(messages)`) is deprecated in
    # langchain-core; `invoke` is the supported entry point.
    result = llm.invoke(messages)
    return result.content

In [16]:
# Build the augmented query: the original question followed by a model-generated
# hypothetical answer, separated by a single space.
original_query = "What are Google's environmental initiatives?"
hypothetical_answer = augment_query_generated(original_query)
joint_query = "{} {}".format(original_query, hypothetical_answer)
print(joint_query)

What are Google's environmental initiatives? Google has implemented a variety of environmental initiatives aimed at reducing its carbon footprint and promoting sustainability. Key initiatives include:

1. **Carbon Neutrality**: Since 2007, Google has been carbon neutral, meaning that it offsets its carbon emissions through renewable energy purchases and investments in carbon offset projects.

2. **Renewable Energy Commitment**: Google has committed to operating on 24/7 carbon-free energy by 2030. This involves sourcing renewable energy for all of its data centers and campuses at all times, not just on an annual basis.

3. **Sustainable Data Centers**: Google continuously works to improve the energy efficiency of its data centers. The company has achieved a 50% reduction in energy usage for its data centers since 2010 through advanced cooling techniques and machine learning technologies.

4. **Circular Economy**: Google is focused on minimizing waste through a circular economy approach.

In [17]:
# Grades how relevant the formatted context is to the question; yields the raw
# model reply as a string.
relevance_chain = (
    RunnablePassthrough()
    | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
    | llm
    | str_output_parser
)

# Generates the answer from the same question/context input.
answer_chain = (
    RunnablePassthrough()
    | prompt
    | llm
    | str_output_parser
)

# 1) replace the retrieved documents under "context" with their formatted text,
# 2) run grading and answering side by side on that input,
# 3) add "final_answer", chosen by conditional_answer from the two results.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | RunnableParallel(relevance_score=relevance_chain, answer=answer_chain)
    | RunnablePassthrough.assign(final_answer=conditional_answer)
)

In [18]:
# Retrieve documents for the incoming query, keep the query itself as "question",
# then attach the output of rag_chain_from_docs under "answer".
rag_chain_with_source = RunnableParallel(
    context=ensemble_retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [19]:
# Run the full chain on the augmented query (question + hypothetical answer).
result_alt = rag_chain_with_source.invoke(joint_query)
retrieved_docs_alt = result_alt['context']

# Print the user's question and the augmented query under separate labels:
# joint_query is not the original question, so it gets its own heading.
print(f"Original Question: {original_query}\n")
print(f"Augmented Query: {joint_query}\n")
print(f"Relevance Score: {result_alt['answer']['relevance_score']}\n")
print(f"Final Answer:\n{result_alt['answer']['final_answer']}\n\n")
print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs_alt, start=1):
    print(f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['search_source']}")
    print(f"Content:\n{doc.page_content}\n")

Original Question: What are Google's environmental initiatives? Google has implemented a variety of environmental initiatives aimed at reducing its carbon footprint and promoting sustainability. Key initiatives include:

1. **Carbon Neutrality**: Since 2007, Google has been carbon neutral, meaning that it offsets its carbon emissions through renewable energy purchases and investments in carbon offset projects.

2. **Renewable Energy Commitment**: Google has committed to operating on 24/7 carbon-free energy by 2030. This involves sourcing renewable energy for all of its data centers and campuses at all times, not just on an annual basis.

3. **Sustainable Data Centers**: Google continuously works to improve the energy efficiency of its data centers. The company has achieved a 50% reduction in energy usage for its data centers since 2010 through advanced cooling techniques and machine learning technologies.

4. **Circular Economy**: Google is focused on minimizing waste through a circula

In [20]:
from IPython.display import Markdown, display
# Render the final answer for the augmented query as Markdown instead of plain text.
markdown_text_alt = result_alt["answer"]["final_answer"]
display(Markdown(data=markdown_text_alt))

Google has implemented a variety of environmental initiatives aimed at reducing its carbon footprint and promoting sustainability. Key initiatives include:

1. **Carbon Neutrality**: Google has been carbon neutral since 2007, offsetting its carbon emissions through renewable energy purchases and investments in carbon offset projects.

2. **Renewable Energy Commitment**: The company aims to operate on 24/7 carbon-free energy by 2030, sourcing renewable energy for all its data centers and campuses at all times.

3. **Sustainable Data Centers**: Google has improved the energy efficiency of its data centers, achieving a 50% reduction in energy usage since 2010 through advanced cooling techniques and machine learning.

4. **Circular Economy**: Google focuses on minimizing waste through a circular economy approach, aiming for 100% of its products to be made from recycled or renewable materials.

5. **Sustainable Product Design**: The company designs energy-efficient and environmentally friendly products, using sustainable materials and ensuring longevity and recyclability.

6. **Community Engagement and Advocacy**: Google engages with communities and stakeholders to promote environmental awareness and sustainability practices, partnering with organizations focused on climate action.

7. **Biodiversity and Conservation**: Google invests in projects that support biodiversity and conservation, protecting ecosystems and promoting sustainable land use.

8. **Empowering Individuals**: Google aims to help 1 billion people make more sustainable choices through features in its products, such as eco-friendly routing in Google Maps and carbon emissions information in Google Flights.

9. **Collaboration and Innovation**: Google collaborates with various organizations and initiatives to accelerate climate action and sustainability efforts globally.

Through these initiatives, Google aims to lead by example in the tech industry and contribute to global efforts to combat climate change and promote sustainability.

In [21]:
# Run the same chain on the plain user question.
result = rag_chain_with_source.invoke(user_query)
retrieved_docs = result['context']

answer_section = result['answer']
print(f"Original Question: {user_query}\n")
print(f"Relevance Score: {answer_section['relevance_score']}\n")
print(f"Final Answer:\n{answer_section['final_answer']}\n\n")
print("Retrieved Documents:")
# List each retrieved chunk with its id and the retriever copy it came from.
for position, retrieved in enumerate(retrieved_docs, start=1):
    meta = retrieved.metadata
    print(f"Document {position}: Document ID: {meta['id']} source: {meta['search_source']}")
    print(f"Content:\n{retrieved.page_content}\n")

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google has implemented a variety of environmental initiatives aimed at improving sustainability and reducing carbon emissions. Here are some key aspects of their efforts:

1. **Empowering Individuals**: Google has reached its goal of helping 1 billion people make more sustainable choices through features in its products, such as eco-friendly routing in Google Maps, energy efficiency features in Google Nest thermostats, and carbon emissions information in Google Flights.

2. **Net-Zero Carbon**: Google has committed to achieving net-zero carbon emissions and has invested in renewable energy sources, including wind farms like the Golden Hills wind farm in California.

3. **Water Stewardship**: Their Bay View campus is designed to be net water-positive, incorporating stormwater retention and restoration of natural habitats.

4. **Circular Economy**: Google is working towards a circular econo

In [22]:
from IPython.display import Markdown, display
# Render the final answer for the plain question as Markdown instead of plain text.
markdown_text = result["answer"]["final_answer"]
display(Markdown(data=markdown_text))

Google has implemented a variety of environmental initiatives aimed at improving sustainability and reducing carbon emissions. Here are some key aspects of their efforts:

1. **Empowering Individuals**: Google has reached its goal of helping 1 billion people make more sustainable choices through features in its products, such as eco-friendly routing in Google Maps, energy efficiency features in Google Nest thermostats, and carbon emissions information in Google Flights.

2. **Net-Zero Carbon**: Google has committed to achieving net-zero carbon emissions and has invested in renewable energy sources, including wind farms like the Golden Hills wind farm in California.

3. **Water Stewardship**: Their Bay View campus is designed to be net water-positive, incorporating stormwater retention and restoration of natural habitats.

4. **Circular Economy**: Google is working towards a circular economy by engaging with suppliers to reduce energy consumption and greenhouse gas emissions, and by promoting the reuse and repair of products.

5. **Sustainable Operations**: Google has made its data centers among the most efficient in the world, focusing on maximizing the efficient use of energy, water, and materials.

6. **Collaboration and Partnerships**: Google collaborates with various organizations, such as the Nature Conservancy and the iMasons Climate Accord, to support projects aimed at carbon reduction and environmental restoration.

7. **Innovative Technology**: Google is leveraging its technology to provide data analytics tools that help organizations optimize their operations for sustainability, such as predicting wind power output for energy suppliers.

8. **Public Policy Advocacy**: Google actively engages in public policy discussions to promote strong climate action and sustainable practices.

9. **Sustainability Reporting**: Google requires its suppliers to report environmental data and has conducted audits to ensure compliance with environmental standards.

Overall, Google's environmental initiatives are structured around empowering individuals, collaborating with partners, and operating sustainably, with a goal to collectively reduce carbon emissions significantly by 2030.