Import environment variables

In [22]:
!pip install -q langchain==0.2.16 langchain-openai ragas==0.1.14 pandas langchain-qdrant qdrant-client python-dotenv langchain-anthropic langchain_experimental

In [23]:
from dotenv import load_dotenv
import os

load_dotenv('../app/.env')

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

if not OPENAI_API_KEY or not ANTHROPIC_API_KEY:
    print("Error retrieving API keys")

Load our document corpus from a file. (fetch_data.ipynb can be used to generate the file)

In [24]:
myfile = "source_documents.json"

import json
from langchain.schema import Document

# Load JSON data
with open(myfile, 'r') as file:
    data = json.load(file)

# Convert JSON data into a list of LangChain Document objects
docs = [
    Document(page_content=item["page_content"], metadata=item["metadata"])
    for item in data
]

print(f"loaded {len(docs)} docs")

loaded 216 docs


Baseline chunking strategy: Fixed width, 1500 chars

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,       
    chunk_overlap=150,     
)

split_docs = []

for doc in docs:

    splits = text_splitter.split_text(doc.page_content)
    for i,split in enumerate(splits):
        metadata_with_chunk = {**doc.metadata, "chunk_id": i}
            
        # Create the document with the updated metadata
        split_doc = Document(page_content=split, metadata=metadata_with_chunk)
        split_docs.append(split_doc)

print(f"len(docs): {len(docs)}, len(split_docs):{len(split_docs)}")
print(split_docs[0])

len(docs): 216, len(split_docs):1349
page_content='alzheimer's disease and dementia | alzheimer's disease and dementia | cdc     alzheimer's disease and dementia alzheimer's basics learn about signs and symptoms of alzheimer's disease and who is affected. aug. 15, 2024 dementia basics learn about common types of dementia, signs and symptoms, and risk factors. aug. 17, 2024 signs and symptoms of alzheimer's learn how to recognize the early signs of alzheimer's disease. signs and symptoms of dementia learn what early signs and symptoms of dementia to look out for. tools and resources find a variety of resources about alzheimer’s disease and healthy aging. reducing risk learn what lifestyle behaviors can reduce the risk of developing dementia. additional topics healthy aging at any age information to help you stay healthy and strong throughout your life. sept. 3, 2024 alzheimer's disease program evidence-based, scientific information to educate, inform, and assist translating research int

In [25]:
from langchain_openai import OpenAIEmbeddings
embedding_model = "text-embedding-3-large"
openai_embeddings = OpenAIEmbeddings(
    model=embedding_model,
    openai_api_key=OPENAI_API_KEY  
)

In [28]:
from langchain_experimental.text_splitter import SemanticChunker
from tqdm import tqdm

semantic_text_splitter = SemanticChunker(openai_embeddings,
    breakpoint_threshold_type="percentile")

semantic_split_docs = []

for doc in tqdm(docs):

    splits = semantic_text_splitter.split_text(doc.page_content)
    for i,split in enumerate(splits):
        metadata_with_chunk = {**doc.metadata, "chunk_id": i}
            
        # Create the document with the updated metadata
        semantic_split_doc = Document(page_content=split, metadata=metadata_with_chunk)
        semantic_split_docs.append(semantic_split_doc)

print(f"len(docs): {len(docs)}, len(semantic_split_docs):{len(semantic_split_docs)}")
print(semantic_split_docs[0])

100%|██████████| 216/216 [03:01<00:00,  1.19it/s]

len(docs): 216, len(semantic_split_docs):1074
page_content='alzheimer's disease and dementia | alzheimer's disease and dementia | cdc     alzheimer's disease and dementia alzheimer's basics learn about signs and symptoms of alzheimer's disease and who is affected. aug. 15, 2024 dementia basics learn about common types of dementia, signs and symptoms, and risk factors. aug. 17, 2024 signs and symptoms of alzheimer's learn how to recognize the early signs of alzheimer's disease. signs and symptoms of dementia learn what early signs and symptoms of dementia to look out for. tools and resources find a variety of resources about alzheimer’s disease and healthy aging. reducing risk learn what lifestyle behaviors can reduce the risk of developing dementia. additional topics healthy aging at any age information to help you stay healthy and strong throughout your life. sept. 3, 2024 alzheimer's disease program evidence-based, scientific information to educate, inform, and assist translating res




Let's add the docs to a vector store. Make sure qdrant is running first (see README.md for more details). We can create it once and re-use it after that.

In [29]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

url="http://localhost:6333"

client = QdrantClient(url=url, prefer_grpc=True)
print(client.get_collections())

# Delete any existing collections here, otherwise the code below will extend rather than overwrite them

collections=[CollectionDescription(name='PottyTraining'), CollectionDescription(name='DementiaCare_Semantic'), CollectionDescription(name='DementiaCare_Fixed')]


In [30]:
collection_name_fixed = "DementiaCare_Fixed"
collection_name_semantic = "DementiaCare_Semantic"


In [None]:

client.delete_collection(collection_name_fixed)
try:
    qdrant_vector_store_fixed = QdrantVectorStore.from_documents(
        split_docs,
        openai_embeddings,
        url=url,
        prefer_grpc=True,
        collection_name=collection_name_fixed,
    )
except Exception as e:
    print(f"Encountered error creating vector store: {e}")

if qdrant_vector_store_fixed: print(f"Created vector store {collection_name_fixed}")


In [31]:

client.delete_collection(collection_name_semantic)
try:
    qdrant_vector_store_semantic = QdrantVectorStore.from_documents(
        semantic_split_docs,
        openai_embeddings,
        url=url,
        prefer_grpc=True,
        collection_name=collection_name_semantic,
    )
except Exception as e:
    print(f"Encountered error creating vector store: {e}")

if qdrant_vector_store_semantic: print(f"Created vector store {collection_name_semantic}")

print(client.get_collections())


NameError: name 'qdrant_vector_store_fixed' is not defined

In [32]:
# If the collections already exist, just load them
from langchain_qdrant import QdrantVectorStore
url="http://localhost:6333"


store_fixed = QdrantVectorStore.from_existing_collection(
    embedding=openai_embeddings,
    collection_name=collection_name_fixed,
    url=url
)

store_semantic = QdrantVectorStore.from_existing_collection(
    embedding=openai_embeddings,
    collection_name=collection_name_semantic,
    url=url
)

Set up RAGAS

In [33]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
RAGAS_METRICS = [ faithfulness, answer_relevancy, context_precision, context_recall ]

Set up retrievers

In [34]:
from langchain.retrievers import EnsembleRetriever

mmr_retriever_fixed = store_fixed.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 10, 'lambda_mult': 0.1}
)
similarity_retriever_fixed = store_fixed.as_retriever(k=10)
ensemble_retriever_fixed = EnsembleRetriever(retrievers=[mmr_retriever_fixed,similarity_retriever_fixed])

results = ensemble_retriever_fixed.invoke("How does stress impact dementia caregivers?")
for result in results[:3]: print(result)

page_content='dementia caregivers, though the phenomenon appears to be more severe or extreme for dementia caregivers. Further, dementia caregivers more often experience emotional stress and physical strain than do non-dementia caregivers—a situation that contributes to the disproportionately worsening health among dementia caregivers. Prior research has documented the ways caregiving can affect caregivers’ health and well-being. Caregiving has been associated with higher levels depression and anxiety, compromised immune function, and increased mortality.17 These negative health impacts mean that caregiving is a crucially important public health issue—caregivers’ health suffers under the pressure of caregiving responsibilities, and such decline in health compromises the caregiver’s ability to provide care. This problem is particularly pressing for dementia caregivers, as health impacts are worse and loved ones rely on these caregivers for more daily assistance. Therefore, it is critica

In [35]:

mmr_retriever_semantic = store_semantic.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 10, 'lambda_mult': 0.1}
)
similarity_retriever_semantic = store_semantic.as_retriever(k=10)
ensemble_retriever_semantic = EnsembleRetriever(retrievers=[mmr_retriever_semantic,similarity_retriever_semantic])

results = ensemble_retriever_semantic.invoke("How does stress impact dementia caregivers?")
print("\n")
for result in results[:3]: print(result)




page_content='Levine, and S. Samis, “Home Alone: Family Caregivers Providing Complex Chronic Care,” AARP Public Policy Institute & United Hospital Fund, 2012. 2 Considering all of the responsibilities that dementia caregivers often shoulder, it is of no surprise that the Burden of Care Index2 shows them as one of the more burdened groups of caregivers. Nearly half of dementia caregivers are in a high-burden situation. Dementia caregivers are not the most-burdened group—for example, cancer caregivers are more likely to be in high-burden care relationships (62 percent).3 However, whereas cancer caregiver relationships are short and episodic, dementia caregiver relationships tend to be longer: nearly seven in ten (69 percent) dementia caregivers have provided care for more than a year, and three in ten have provided care for more than five years. This high burden of care over a longer period can take a significant mental and physical toll on dementia caregivers. Nearly half of dementia 

Test it out in a simple RAG chain: Create a prompt, initialize an LLM, and then use the retriever in a chain

In [36]:
from langchain_core.prompts import PromptTemplate

RAG_PROMPT_TEMPLATE = """
You are a helpful assistant. Answer the question based on the context. If you don't know, say you don't know.

<context>
{context}
</context>

<question>
{query}
<question>
"""

rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [37]:
from langchain_anthropic import ChatAnthropic

haiku_model_id = "claude-3-haiku-20240307" # cheaper and better to use for prototyping, although we'll use 3.5 in our app
claude_3_5_sonnet_model_id = "claude-3-5-sonnet-20240620"

llm = ChatAnthropic(
    model=haiku_model_id,    
    anthropic_api_key=ANTHROPIC_API_KEY,
    temperature=0
)

In [38]:
from datasets import Dataset
import pandas as pd
from tqdm.asyncio import tqdm_asyncio

async def gen_rag_responses(rag_chain) -> Dataset:
    """Wrapper function to run a RAG chain against a test dataset and generate/store responses"""
    test_df = pd.read_csv("ragas_test_data.csv")

    test_questions = test_df["question"].to_list()
    test_gt = test_df["ground_truth"].to_list()
    print("read test questions")

    answers = []
    contexts = []

    print("generating responses")
    for question in tqdm_asyncio(test_questions,desc="Processing Questions"):
        response = await rag_chain.ainvoke({"query" : question})
        answers.append(response["response"].content)
        contexts.append([context.page_content for context in response["context"]])

    # Put in huggingface dataset format and save it for later re-use
    response_dataset = Dataset.from_dict({
        "question" : test_questions,
        "answer" : answers,
        "contexts" : contexts,
        "ground_truth" : test_gt
    })

    return response_dataset

In [39]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from ragas import evaluate


In [23]:

# standard RAG that passes the context through
fixed_similarity_rag_chain = (
    {"context": itemgetter("query") | similarity_retriever_fixed | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(fixed_similarity_rag_chain)

from ragas import evaluate

results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])


read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:37<00:00,  3.26s/it]
Evaluating: 100%|██████████| 120/120 [01:09<00:00,  1.73it/s]


{'faithfulness': 0.9507, 'answer_relevancy': 0.7835, 'context_precision': 0.8676, 'context_recall': 0.9200}


In [13]:
fixed_mmr_rag_chain = (
    {"context": itemgetter("query") | mmr_retriever_fixed | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(fixed_mmr_rag_chain)


read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:47<00:00,  3.59s/it]


NameError: name 'evaluate' is not defined

In [15]:
results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])

Evaluating: 100%|██████████| 120/120 [01:02<00:00,  1.92it/s]


{'faithfulness': 0.9091, 'answer_relevancy': 0.9486, 'context_precision': 0.8074, 'context_recall': 0.8522}


In [16]:
fixed_ensemble_rag_chain = (
    {"context": itemgetter("query") | ensemble_retriever_fixed | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(fixed_ensemble_rag_chain)
results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:43<00:00,  3.44s/it]
Evaluating: 100%|██████████| 120/120 [00:53<00:00,  2.25it/s]


{'faithfulness': 0.9586, 'answer_relevancy': 0.8842, 'context_precision': 0.8537, 'context_recall': 0.9133}


In [40]:
semantic_similarity_rag_chain = (
    {"context": itemgetter("query") | similarity_retriever_semantic | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(semantic_similarity_rag_chain)
results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])

read test questions
generating responses


Processing Questions:   0%|          | 0/30 [00:00<?, ?it/s]

Processing Questions: 100%|██████████| 30/30 [01:40<00:00,  3.36s/it]
Evaluating: 100%|██████████| 120/120 [01:06<00:00,  1.80it/s]


{'faithfulness': 0.9521, 'answer_relevancy': 0.8805, 'context_precision': 0.8713, 'context_recall': 0.9272}


In [42]:
semantic_mmr_rag_chain = (
    {"context": itemgetter("query") | mmr_retriever_semantic | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(semantic_mmr_rag_chain)
results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])

read test questions
generating responses


Processing Questions:   0%|          | 0/30 [00:00<?, ?it/s]

Processing Questions: 100%|██████████| 30/30 [01:56<00:00,  3.87s/it]
Evaluating: 100%|██████████| 120/120 [00:48<00:00,  2.48it/s]


{'faithfulness': 0.9500, 'answer_relevancy': 0.8781, 'context_precision': 0.8185, 'context_recall': 0.8211}


In [43]:
semantic_ensemble_rag_chain = (
    {"context": itemgetter("query") | ensemble_retriever_semantic | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(semantic_ensemble_rag_chain)
results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:53<00:00,  3.78s/it]
Evaluating:  29%|██▉       | 35/120 [00:14<00:26,  3.19it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 120/120 [00:53<00:00,  2.23it/s]


{'faithfulness': 0.9345, 'answer_relevancy': 0.8475, 'context_precision': 0.8333, 'context_recall': 0.8983}


In [44]:
ensemble_retriever_similarity = EnsembleRetriever(retrievers=[similarity_retriever_semantic,similarity_retriever_fixed])
similarity_ensemble_rag_chain = (
    {"context": itemgetter("query") | ensemble_retriever_similarity | (lambda docs: docs[:4]), "query": itemgetter("query")} 
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

response_dataset = await gen_rag_responses(similarity_ensemble_rag_chain)
results = evaluate(response_dataset, 
                   RAGAS_METRICS)

# Check out the results
print(results)
results_df = pd.DataFrame([results])

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:46<00:00,  3.56s/it]
Evaluating: 100%|██████████| 120/120 [00:54<00:00,  2.20it/s]


{'faithfulness': 0.9335, 'answer_relevancy': 0.9111, 'context_precision': 0.8833, 'context_recall': 0.9500}


- Fixed-size similarity retriever: {'faithfulness': 0.9507, 'answer_relevancy': 0.7835, 'context_precision': 0.8676, 'context_recall': 0.9200}
- Fixed-size MMR retriever: {'faithfulness': 0.9091, 'answer_relevancy': 0.9486, 'context_precision': 0.8074, 'context_recall': 0.8522}
- Fixed-size ensemble retriever: {'faithfulness': 0.9586, 'answer_relevancy': 0.8842, 'context_precision': 0.8537, 'context_recall': 0.9133}

- Semantic chunking similarity retriever:  {'faithfulness': 0.9521, 'answer_relevancy': 0.8805, 'context_precision': 0.8713, 'context_recall': 0.9272}
- Semantic chunking ensemble retriever: {'faithfulness': 0.9345, 'answer_relevancy': 0.8475, 'context_precision': 0.8333, 'context_recall': 0.8983}
- Semantic chunking MMR retriever: {'faithfulness': 0.9500, 'answer_relevancy': 0.8781, 'context_precision': 0.8185, 'context_recall': 0.8211}

- Semantic and fixed size similarity ensemble:  {'faithfulness': 0.9335, 'answer_relevancy': 0.9111, 'context_precision': 0.8833, 'context_recall': 0.9500}