In [1]:
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

Note: you may need to restart the kernel to use updated packages.
Collecting rank_bm25==0.2.2
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [59]:
import os
os.environ['USER_AGENT']='RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever

In [60]:
openai.api_key = os.getenv('OPENAI_API_KEY')
embedding_function = OpenAIEmbeddings()
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
pdf_path = "google-2023-environmental-report.pdf"

In [61]:
collection_name = "google_environmental_report"
str_output_parser = StrOutputParser()
user_query = "What are Google's environmental initiatives?"

In [62]:
pdf_reader = PdfReader(pdf_path)
text = ""
for page in pdf_reader.pages:
    text += page.extract_text()

In [63]:
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap = 200
)
splits = character_splitter.split_text(text)


In [64]:
documents = [Document(page_content=text, metadata={"id": str(i)}) for i, text in enumerate(splits)]

In [65]:
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(documents=documents, embedding=embedding_function, collection_name=collection_name, client=chroma_client)

In [66]:
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

In [67]:
sparse_retriever = BM25Retriever.from_documents(documents, k=10)

In [68]:
def hybrid_search(query, k=10, dense_weight=0.5, sparse_weight=0.5):
    dense_docs = dense_retriever.get_relevant_documents(query)[:k]
    dense_doc_ids = [doc.metadata['id'] for doc in dense_docs]
    print("\nCompare IDs:")
    print("dense IDs:", dense_doc_ids)
    sparse_docs = sparse_retriever.get_relevant_documents(query)[:k]
    sparse_doc_ids = [doc.metadata['id'] for doc in sparse_docs]
    print("sparse IDs: ", sparse_doc_ids)
    
    all_doc_ids = list(set(dense_doc_ids + sparse_doc_ids))
    dense_reciprocal_ranks = {doc_id: 0.0 for doc_id in all_doc_ids}
    sparse_reciprocal_ranks = {doc_id: 0.0 for doc_id in all_doc_ids}
    
    for i, doc_id in enumerate(dense_doc_ids):
        dense_reciprocal_ranks[doc_id] = 1.0 / (i + 1)

    for i, doc_id in enumerate(sparse_doc_ids):
        sparse_reciprocal_ranks[doc_id] = 1.0 / (i + 1)

    combined_reciprocal_ranks = {doc_id: 0.0 for doc_id in all_doc_ids}
    for doc_id in all_doc_ids:
        combined_reciprocal_ranks[doc_id] = dense_weight * dense_reciprocal_ranks[doc_id] + sparse_weight * sparse_reciprocal_ranks[doc_id]

    sorted_doc_ids = sorted(all_doc_ids, key = lambda doc_id: combined_reciprocal_ranks[doc_id], reverse=True)
    sorted_docs = []
    all_docs = dense_docs + sparse_docs
    for doc_id in sorted_doc_ids:
        matching_docs = [doc for doc in all_docs if doc.metadata['id'] == doc_id]
        if matching_docs:
            doc = matching_docs[0]
            doc.metadata['score'] = combined_reciprocal_ranks[doc_id]
            doc.metadata['rank'] = sorted_doc_ids.index(doc_id) + 1
            if len(matching_docs) > 1:
                doc.metadata['retriever'] = 'both'
            elif doc in dense_docs:
                doc.metadata['retriever'] = 'dense'
            else:
                doc.metadata['retriever'] = 'sparse'
            sorted_docs.append(doc)

    return sorted_docs[:k]
        
        


In [69]:
prompt = hub.pull("jclemens24/rag-prompt")



In [70]:
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [71]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [72]:
def extract_score(llm_output):
    try:
        score = float(llm_output.strip())
        return score
    except ValueError:
        return 0

# Chain it all together with LangChain
def conditional_answer(x):
    relevance_score = extract_score(x['relevance_score'])
    if relevance_score < 4:
        return "I don't know."
    else:
        return x['answer']

In [73]:
rag_chain_from_docs = (
    RunnablePassthrough.assign(context = (lambda x: format_docs(x["context"])))
    | RunnableParallel(
        {"relevance_score": (
            RunnablePassthrough()
            | (lambda x: relevance_prompt_template.format(question = x['question'], retrieved_context = x['context']))
            | llm
            | str_output_parser
        ), "answer": (
            RunnablePassthrough()
            | prompt
            | llm
            | str_output_parser
        )}
    )
    | RunnablePassthrough().assign(final_answer = conditional_answer)
)


In [74]:
rag_chain_with_source = RunnableParallel(
    {"context": hybrid_search, "question": RunnablePassthrough()}).assign(answer = rag_chain_from_docs)

In [75]:
result = rag_chain_with_source.invoke(user_query)
relevance_score = result['answer']['relevance_score']
final_answer = result['answer']['final_answer']
retrieved_docs = result['context']

print(f"\nOriginal Question: {user_query}\n")
print(f"Relevance Score: {relevance_score}\n")
print(f"Final Answer:\n {final_answer}\n\n")

print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, start=1):
    doc_id = doc.metadata['id']
    doc_score = doc.metadata.get('score', 'N/A')
    doc_rank = doc.metadata.get('rank', 'N/A')
    doc_retriever = doc.metadata.get('retriever', 'N/A')
    print(f"Document {i}: Document ID: {doc_id} Score: {doc_score} Rank: {doc_rank} Retriever: {doc_retriever}\n")
    print(f"Content:\n{doc.page_content}\n")


Compare IDs:
dense IDs: ['12', '12', '12', '12', '311', '311', '311', '311', '13', '13']
sparse IDs:  ['150', '309', '298', '311', '328', '415', '139', '432', '91', '22']

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
 Google's environmental initiatives include a focus on sustainability across various aspects of their operations and supply chain. They engage with suppliers to reduce energy consumption and greenhouse gas (GHG) emissions, requiring them to report environmental data and comply with environmental management standards. Google conducts audits to ensure compliance with environmental criteria and assesses risks related to their operations and supply chain.

In 2022, Google aimed to help 1 billion people make more sustainable choices through features in their products, such as eco-friendly routing in Google Maps and energy efficiency features in Google Nest thermostats. They aspire to help reduce 1 gigaton of carbon equivale