In [1]:
!pip install -q langchain-core==0.2.40 langchain-openai==0.1.25 langchain-huggingface==0.0.3 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-cohere 0.3.0 requires langchain-core<0.4,>=0.3.0, but you have langchain-core 0.2.40 which is incompatible.
langchain-experimental 0.3.2 requires langchain-community<0.4.0,>=0.3.0, but you have langchain-community 0.2.17 which is incompatible.
langchain-experimental 0.3.2 requires langchain-core<0.4.0,>=0.3.6, but you have langchain-core 0.2.40 which is incompatible.


In [2]:
!pip install -qU ragas

In [5]:
import os
# Display the kernel's current working directory as the cell output.
os.getcwd()

'c:\\Users\\andre\\OneDrive\\Documents\\AIE4\\AIE4\\Week 7\\Day 2\\Activity2'

In [12]:
import os
import getpass

# Prompt for the key without echoing it and store it in this process's environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")

In [13]:
# Prompt for the Cohere key without echoing it and store it in this process's environment.
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [14]:
import os
from pathlib import Path

from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta

# Directory holding jw1.csv ... jw4.csv. Defaults to the parent of the working
# directory; set JOHN_WICK_DATA_DIR to load the files from somewhere else.
data_dir = Path(os.environ.get("JOHN_WICK_DATA_DIR", ".."))

# String form with a trailing separator, so `f"{directory_path}jw1.csv"` still works.
directory_path = os.path.join(data_dir, "")

# Single reference time so the per-movie offsets below are exact multiples of a day.
loaded_at = datetime.now()

documents = []

for i in range(1, 5):
    loader = CSVLoader(
        file_path=str(data_dir / f"jw{i}.csv"),
        metadata_columns=["Review_Date", "Review_Title", "Review_Url", "Author", "Rating"],
    )

    movie_docs = loader.load()
    for doc in movie_docs:
        # Add the "Movie Title" (John Wick 1, 2, ...)
        doc.metadata["Movie_Title"] = f"John Wick {i}"

        # Convert "Rating" to an `int`; an empty rating is treated as 0.
        doc.metadata["Rating"] = int(doc.metadata["Rating"]) if doc.metadata["Rating"] else 0

        # Newer movies get a more recent "last_accessed_at" (movie 4 -> now, movie 1 -> 3 days ago).
        doc.metadata["last_accessed_at"] = loaded_at - timedelta(days=4 - i)

    documents.extend(movie_docs)

In [17]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

# Embedding model used for the "JohnWick" collection.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Index every loaded document in an in-memory Qdrant collection.
vectorstore = Qdrant.from_documents(documents, embeddings, location=":memory:", collection_name="JohnWick")

# Plain similarity retriever configured with k=10.
naive_retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))

In [10]:
from tqdm import tqdm
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

generator_llm = ChatOpenAI(model="gpt-4o-mini-2024-07-18")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
# Dedicated name so this cell does not rebind `embeddings`, which the vector-store
# cell defines with a different model; otherwise its value depends on cell order.
generator_embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    generator_embeddings
)

# Share of generated questions per evolution type (sums to 1.0).
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

In [None]:
# Generate 20 synthetic test rows from the loaded documents, with debug logging enabled.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=20,
    distributions=distributions,
    with_debugging_logs=True,
)

In [28]:
# Persist the generated testset. `index=False` keeps the row index out of the file,
# so reloading it with `pd.read_csv` does not add an extra "Unnamed: 0" column.
testset.to_pandas().to_csv("john_wick_dataset.csv", index=False)

In [1]:
import pandas as pd
# Reload the saved testset; from here on `testset` is a pandas DataFrame.
testset = pd.read_csv("john_wick_dataset.csv")

In [5]:
import os
import getpass
# Set the LangChain tracing flag and prompt (without echo) for the LangChain API key.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API Key:")

In [6]:
from langsmith import Client

# LangSmith client; the dataset created here receives the generated test questions.
client = Client()

dataset_name = "John Wick Questions"

# NOTE(review): presumably this fails on a re-run if a dataset with this name
# already exists — confirm, and consider reusing the existing dataset in that case.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Questions about John Wick"
)

In [7]:
# Upload every test row to the LangSmith dataset as one example.
for _, row in testset.iterrows():
    client.create_example(
        inputs={"question": row["question"]},
        outputs={"answer": row["ground_truth"]},
        # Store the row's contexts under "context" (not the DataFrame row index).
        metadata={"context": row["contexts"]},
        dataset_id=dataset.id,
    )

#### LCEL Chain

In [10]:
# Plain Python lists of the questions and their reference answers, in row order.
test_questions = testset["question"].tolist()
test_groundtruths = testset["ground_truth"].tolist()

In [16]:
from langchain.prompts import ChatPromptTemplate

# Prompt template with two placeholders: {context} and {question}.
RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

If you cannot answer the question based on the context - you must say "I don't know".

Context: {context}
Question: {question}
"""

# Chat prompt built from the template above; shared by every retrieval chain below.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [18]:
from langchain_openai import ChatOpenAI

# Chat model shared by the retrieval chains and by the multi-query retriever.
chat_model = ChatOpenAI(model="gpt-4o-mini")

## Retrievers

**BM25**

In [19]:
from langchain_community.retrievers import BM25Retriever

# BM25 retriever built directly from the loaded documents.
bm25_retriever = BM25Retriever.from_documents(documents)

In [20]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser


def build_rag_chain(retriever):
    """Compose the notebook's standard RAG chain around `retriever`.

    The chain takes `{"question": ...}` and produces a mapping with:
      - "response": the output of `rag_prompt | chat_model`
      - "context":  whatever `retriever` returned for the question

    Args:
        retriever: Runnable that receives the question and returns the context.

    Returns:
        The composed chain.
    """
    return (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
    )


bm25_retriever_chain = build_rag_chain(bm25_retriever)

In [24]:
from datasets import Dataset
from ragas import evaluate
from tqdm import tqdm
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
    ContextEntityRecall
)

# Metrics passed to every `evaluate` call below. Only the three uncommented
# entries are active; the commented-out ones are imported above but disabled.
metrics = [
    # faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    # ContextEntityRecall,
    # answer_correctness,
]

In [29]:
def get_response_dataset(chain, test_questions, ground_truths=None):
    """Run `chain` over every question and collect the results for evaluation.

    Args:
        chain: Object whose `invoke({"question": ...})` returns a mapping with a
            "response" entry exposing `.content` and a "context" entry that
            iterates over items exposing `.page_content`.
        test_questions: Iterable of question strings.
        ground_truths: Reference answers aligned one-to-one with
            `test_questions`. Defaults to the notebook-level `test_groundtruths`.

    Returns:
        The result of `Dataset.from_dict` with the columns "question",
        "answer", "contexts" and "ground_truth".

    Raises:
        ValueError: If the number of ground truths differs from the number of
            questions.
    """
    # Materialise once so a generator can be both iterated and stored.
    questions = list(test_questions)

    if ground_truths is None:
        # Backward-compatible default: the notebook-level reference answers.
        ground_truths = test_groundtruths
    ground_truths = list(ground_truths)

    if len(ground_truths) != len(questions):
        raise ValueError(
            f"Got {len(questions)} questions but {len(ground_truths)} ground truths"
        )

    answers = []
    contexts = []

    for question in questions:
        response = chain.invoke({"question": question})
        answers.append(response["response"].content)
        contexts.append([doc.page_content for doc in response["context"]])

    return Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

In [26]:
# Run the BM25 chain over the test questions, score the responses, and display the scores.
bm25_results = evaluate(
    get_response_dataset(bm25_retriever_chain, test_questions),
    metrics=metrics,
)
bm25_results

{'answer_relevancy': 0.5114, 'context_recall': 0.7588, 'context_precision': 0.6111}

**Contextual Compression**

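The Contextual Compression retriever depends on `langchain-cohere`, which requires `langchain-core>=0.3.0` (see the dependency conflicts reported by the install cell at the top of this notebook) and is therefore incompatible with the `langchain-core==0.2.40` pin used here alongside RAGAS. I ran Contextual_compression_and_Ensemble.ipynb in Colab to obtain the response dataset, and then imported that dataset into this notebook. The next two cells are kept for reference only and were not executed here.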

In [None]:
# Cohere might require the most recent version of langchain or some other package (incompatible with RAGAS).
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

# Cohere reranker used as the compressor on top of the naive vector-store retriever.
compressor = CohereRerank(model="rerank-english-v3.0")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=naive_retriever
)

In [None]:
# Same three-stage chain as the other retrievers, built around `compression_retriever`:
#   1. map the input to "context" (question piped into the retriever) and "question"
#   2. re-assign "context" from itself
#   3. emit "response" (rag_prompt | chat_model) alongside the retrieved "context"
contextual_compression_retrieval_chain = (
    {"context": itemgetter("question") | compression_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [106]:
# Load the response dataset from a local parquet file instead of running the chain in this notebook.
contextual_compression_dataset = Dataset.from_parquet("contextual_compression_dataset.parquet")

In [109]:
# Score the imported contextual-compression responses with the shared metric list.
contextual_compression_results = evaluate(
    contextual_compression_dataset,
    metrics=metrics,
)

Evaluating: 100%|██████████| 57/57 [00:17<00:00,  3.17it/s]


In [110]:
# Display the contextual-compression scores as the cell output.
contextual_compression_results

{'answer_relevancy': 0.9690, 'context_recall': 0.9474, 'context_precision': 0.8202}

**Multi-query**

In [34]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Multi-query retriever built from the naive retriever and the shared chat model.
multi_query_retriever = MultiQueryRetriever.from_llm(retriever=naive_retriever, llm=chat_model)

In [35]:
# Same three-stage chain as the other retrievers, built around `multi_query_retriever`:
#   1. map the input to "context" (question piped into the retriever) and "question"
#   2. re-assign "context" from itself
#   3. emit "response" (rag_prompt | chat_model) alongside the retrieved "context"
multi_query_retrieval_chain = (
    {"context": itemgetter("question") | multi_query_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [36]:
# Run the multi-query chain over the test questions, score the responses, and display the scores.
multi_query_results = evaluate(
    get_response_dataset(multi_query_retrieval_chain, test_questions),
    metrics=metrics,
)
multi_query_results

Evaluating: 100%|██████████| 57/57 [01:35<00:00,  1.67s/it]


{'answer_relevancy': 0.8175, 'context_recall': 1.0000, 'context_precision': 0.7363}

**Parent document**

In [37]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models

# The loaded documents are used as-is as the parent documents.
parent_docs = documents
# Splitter handed to the parent-document retriever as its child splitter (chunk_size=200).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

In [38]:
# Dedicated name for the Qdrant client so this cell does not rebind `client`,
# which the LangSmith cells earlier in this notebook use for `langsmith.Client`.
parent_qdrant_client = QdrantClient(location=":memory:")

parent_qdrant_client.create_collection(
    collection_name="full_documents",
    # NOTE(review): size=1536 is assumed to match the output dimension of the
    # embedding model configured below — confirm if the model changes.
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
)

parent_document_vectorstore = Qdrant(
    collection_name="full_documents",
    embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
    client=parent_qdrant_client,
)

  parent_document_vectorstore = Qdrant(


In [39]:
# In-memory store passed to the retriever as its docstore.
store = InMemoryStore()

parent_document_retriever = ParentDocumentRetriever(
    vectorstore=parent_document_vectorstore, docstore=store, child_splitter=child_splitter
)

# Register the parent documents with the retriever (ids left as None).
parent_document_retriever.add_documents(parent_docs, ids=None)

In [40]:
# Same three-stage chain as the other retrievers, built around `parent_document_retriever`:
#   1. map the input to "context" (question piped into the retriever) and "question"
#   2. re-assign "context" from itself
#   3. emit "response" (rag_prompt | chat_model) alongside the retrieved "context"
parent_document_retrieval_chain = (
    {"context": itemgetter("question") | parent_document_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [41]:
# Run the parent-document chain over the test questions, score the responses, and display the scores.
parent_document_results = evaluate(
    get_response_dataset(parent_document_retrieval_chain, test_questions),
    metrics=metrics,
)
parent_document_results

Evaluating: 100%|██████████| 57/57 [00:17<00:00,  3.27it/s]


{'answer_relevancy': 0.6104, 'context_recall': 0.8421, 'context_precision': 0.7719}

**Ensemble**

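The EnsembleRetriever has the same dependency issue as Contextual Compression, because the ensemble built below includes `compression_retriever`. As described in the Contextual Compression note above, the response dataset was generated in Colab and imported into this notebook; the next two cells were not executed here.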

In [None]:
from langchain.retrievers import EnsembleRetriever

# All five retrievers defined above take part in the ensemble.
retriever_list = [
    bm25_retriever,
    naive_retriever,
    parent_document_retriever,
    compression_retriever,
    multi_query_retriever,
]

# Every retriever receives the same share of the total weight.
equal_weighting = [1 / len(retriever_list) for _ in retriever_list]

ensemble_retriever = EnsembleRetriever(retrievers=retriever_list, weights=equal_weighting)

In [None]:
# Same three-stage chain as the other retrievers, built around `ensemble_retriever`:
#   1. map the input to "context" (question piped into the retriever) and "question"
#   2. re-assign "context" from itself
#   3. emit "response" (rag_prompt | chat_model) alongside the retrieved "context"
ensemble_retrieval_chain = (
    {"context": itemgetter("question") | ensemble_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [111]:
# Load the ensemble responses from a local parquet file, score them, and display the scores.
ensemble_retrieval_dataset = Dataset.from_parquet("ensemble_response_dataset.parquet")
ensemble_retrieval_results = evaluate(
    ensemble_retrieval_dataset,
    metrics=metrics,
)
ensemble_retrieval_results

Generating train split: 19 examples [00:00, 1260.25 examples/s]
Evaluating: 100%|██████████| 57/57 [02:05<00:00,  2.20s/it]


{'answer_relevancy': 0.9711, 'context_recall': 1.0000, 'context_precision': 0.6940}

The results from the retriever comparison study are provided below. For this specific dataset, the metric I would want to optimize is answer_relevancy, as that metric is ideal when the primary goal is to ensure that the final answer directly addresses the user’s question. From the table, we can see that Ensemble had the highest answer relevancy score (0.9711). However, the score for contextual compression (0.9690) was only 0.0021 lower, and it was less than 1/6th the cost of the ensemble retriever. In addition, the latency for contextual compression was roughly 1/7th the latency of ensemble. Therefore, I would choose the contextual compression retriever for this dataset. Note that the latency figures closely match the wall-clock times of the `evaluate` calls shown above (e.g. 02:05 for Ensemble), so they appear to describe the RAGAS evaluation runs rather than the retrieval chains alone.

| Method                 | Cost   | Num Tokens | Answer_relevancy | Context_recall | Context_precision | Latency (s) |
|------------------------|--------|------------|------------------|----------------|-------------------|-------------|
| Ensemble               | 0.1415 | 876,695    | 0.9711           | 1.0000         | 0.6940            | 125.54      |
| Contextual Compression | 0.0224 | 127,186    | 0.9690           | 0.9474         | 0.8202            | 18.04       |
| Multi-query            | 0.1081 | 664,295    | 0.8175           | 1.0000         | 0.7363            | 95.42       |
| Parent document        | 0.0157 | 86,947     | 0.6104           | 0.8421         | 0.7719            | 17.46       |
| BM25                   | 0.0264 | 150,360    | 0.5114           | 0.7588         | 0.6111            | 20.03       |
