In [1]:
from ragas.metrics import context_recall
from ragas.metrics import context_precision
from ragas import evaluate

In [2]:
import nest_asyncio
from sympy import false

nest_asyncio.apply()

In [3]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader

doc1 = "Blueprint-for-an-AI-Bill-of-Rights.pdf"
loader = PyMuPDFLoader(
    doc1
)
documents = loader.load()

doc2 = "NIST.AI.600-1.pdf"
loader = PyMuPDFLoader(
    doc2
)
documents.extend(loader.load())

In [5]:
len(documents)

137

#### Loading OpenAI Embeddings Model

We'll need a process by which we can convert our text into vectors that allow us to compare to our query vector.

Let's use OpenAI's `text-embedding-ada-002` for this task!

- [`OpenAIEmbeddings`](https://api.python.langchain.com/en/latest/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain-openai-embeddings-base-openaiembeddings)

> NOTE: We are purposefully using an older embedding model to try and answer the guiding question: Is TE3 better than Ada-002?

In [6]:
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings(
    model = EMBEDDING_MODEL
)

#### Creating a QDrant VectorStore

Now that we have documents - we'll need a place to store them alongside their embeddings.

- [`Qdrant`](https://api.python.langchain.com/en/latest/qdrant/langchain_qdrant.qdrant.QdrantVectorStore.html#langchain_qdrant.qdrant.QdrantVectorStore)

> NOTE: You'll need to provide the embedding dimension for Ada-002!

In [7]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

LOCATION = ":memory:"
COLLECTION_NAME = "PMarca Blogs"
VECTOR_SIZE = 1536

In [8]:
qdrant_client = QdrantClient(LOCATION)

qdrant_client.create_collection(
    collection_name = COLLECTION_NAME,
    vectors_config = VectorParams(size = VECTOR_SIZE, distance = Distance.COSINE)
)

qdrant_vector_store = QdrantVectorStore(
    client = qdrant_client,
    collection_name = COLLECTION_NAME,
    embedding = embeddings
)

qdrant_vector_store.add_documents(documents)

['fbad130f71ec46a583c960ec96b80ea6',
 '7eb5c5d7526c49cd8d84fd63d07a7852',
 '379ce24d7ada43dd84cc5f5d72c6bc33',
 '3b85b9b25e9b407787b09dcc69007586',
 '1b712612487b4a2c9addfbc122418a25',
 '067686f87ec84da7b2ce1c20adab4024',
 'ec48356dba2447a5af5cf0151f90b2cd',
 '1b4a29f1e3f74d669ee56d61e5f82885',
 '818e1d11679a499f81259187372d42bf',
 '096d67db6ff7488fbdfcb522fa3ebdbe',
 '6ca0fe3c1815495ba9b8b1b5ba8f78f8',
 'b03ef3d4c2a644a3bdd6984dc9eb187e',
 '645dc8440f1646598b06b58a13043b5a',
 'afb189468ac447a9b74607df84d3f782',
 'f94ec4e30bfa46adba1946956f2ec0f6',
 '62634279dd2948b0970e22cca1d7bdb7',
 '1d8c19d185d345358609f5c1fdd4546d',
 '60e441d45183449092e8f907f1605b20',
 '6da538b0aa384c9a8d5a7d27d866a6cd',
 '0b80220dfef84a96849611a81b7ee473',
 '0769143291f9479dbe9234ccb63eb68e',
 '35c619ff2dae44a6b89a948db5fe93f4',
 '67136c3c288740f48bc8729bd38ce70a',
 '3c1cd3d141dc4eb29e302dce1ad537c1',
 'b8e39281b9014744b245cb11cede3392',
 'fd9a4af874334a079016bbbb8abb7008',
 '3b034f109edd40dcaba68afcdd27059d',
 

In [9]:
retriever = qdrant_vector_store.as_retriever()

In [10]:
from langchain import hub

retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")



In [11]:
print(retrieval_qa_prompt.messages[0].prompt.template)

Answer any use questions based solely on the context below:

<context>
{context}
</context>


As you can see - the prompt template is simple (and has a small error) - so we'll create our own to be a bit more specific!

In [12]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the following context. If you can't find the answer within the context, respond with 'I don't know'.

Question:
{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_template(template)

#### Setting Up our Basic QA Chain

Now we can instantiate our basic RAG chain!

We'll use LCEL directly just to see an example of it - but you could just as easily use an abstraction here to achieve the same goal!

We'll also ensure to pass-through our context - which is critical for RAGAS.

In [13]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

### Generating Responses with RAG Pipeline

Now that we have some QC pairs, and some ground truths, let's evaluate our RAG pipeline using Ragas.

The process is, again, quite straightforward - thanks to Ragas and LangChain!

Let's start by extracting our questions and ground truths from our create testset.

We can start by converting our test dataset into a Pandas DataFrame.

In [14]:
import pandas as pd

test_df = pd.read_csv("testset.csv")

In [15]:
test_df

Unnamed: 0.1,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,What are some strategies that aim to mitigate ...,"['assessments, auditing mechanisms, assessment...","Assessments, auditing mechanisms, assessment o...",simple,[{'source': 'Blueprint-for-an-AI-Bill-of-Right...,True
1,1,How are risks associated with transparency and...,['34 \nMS-2.7-009 Regularly assess and verify ...,Risks associated with transparency and account...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
2,2,How can risks from confabulations impact real-...,['contextual and/or domain expertise. \nRisks...,Risks from confabulations may impact real-worl...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
3,3,What expectations should be set for automated ...,['SAFE AND EFFECTIVE \nSYSTEMS \nWHAT SHOULD B...,The expectations for automated systems should ...,simple,[{'source': 'Blueprint-for-an-AI-Bill-of-Right...,True
4,4,What are the potential risks associated with d...,"['tracked, e.g., via a specialized type in a d...",The potential risks associated with data reuse...,simple,[{'source': 'Blueprint-for-an-AI-Bill-of-Right...,True
5,5,How should machine learning models be monitore...,['based on changing real-world conditions or d...,This ongoing monitoring should include continu...,simple,[{'source': 'Blueprint-for-an-AI-Bill-of-Right...,True
6,6,What is the purpose of red-teaming in identify...,['environment and in collaboration with AI dev...,The purpose of red-teaming in identifying pote...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
7,7,"How can the AI model be explained, validated, ...","['35 \nMEASURE 2.9: The AI model is explained,...","The AI model can be explained, validated, and ...",simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
8,8,What risks do GAI systems pose to data privacy?,['2.4. Data Privacy \nGAI systems raise severa...,GAI systems pose risks to data privacy by requ...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
9,9,How do school audio surveillance systems monit...,"['teenage girl was pregnant, and sent maternit...",School audio surveillance systems monitor stud...,simple,[{'source': 'Blueprint-for-an-AI-Bill-of-Right...,True


In [16]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

Now we'll generate responses using our RAG pipeline using the questions we've generated - we'll also need to collect our retrieved contexts for each question.

We'll do this in a simple loop to see exactly what's happening!

In [17]:
answers = []
contexts = []

for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

Now we can wrap our information in a Hugging Face dataset for use in the Ragas library.

In [18]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

Let's take a peek and see what that looks like!

In [19]:
response_dataset[0]

{'question': "What are some strategies that aim to mitigate risks posed by the use of AI to companies' reputation, legal responsibilities, and product safety concerns, including documentation procedures specific to model assessments?",
 'answer': "Some strategies that aim to mitigate risks posed by the use of AI to companies' reputation, legal responsibilities, and product safety concerns include:\n\n1. **Periodic Monitoring**: Conduct periodic monitoring of AI-generated content for privacy risks and address any possible instances of personally identifiable information (PII) or sensitive data exposure.\n\n2. **Intellectual Property Processes**: Implement processes for responding to potential intellectual property infringement claims or other rights.\n\n3. **Integration with Governance**: Connect new generative AI (GAI) policies, procedures, and processes to existing model, data, software development, and IT governance, as well as to legal, compliance, and risk management activities.\n\

## Task 1: Evaluating our Pipeline with Ragas

Now that we have our response dataset - we can finally get into the "meat" of Ragas - evaluation!

First, we'll import the desired metrics, then we can use them to evaluate our created dataset!

Check out the specific metrics we'll be using in the Ragas documentation:

- [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)
- [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html)
- [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html)
- [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html)
- [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html)

See the accompanied presentation for more in-depth explanations about each of the metrics!

In [20]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

All that's left to do is call "evaluate" and away we go!

In [21]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/95 [00:00<?, ?it/s]

In [22]:
results

{'faithfulness': 0.9392, 'answer_relevancy': 0.9721, 'context_recall': 0.9868, 'context_precision': 0.9123, 'answer_correctness': 0.6878}

In [23]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What are some strategies that aim to mitigate ...,[ \n26 \nMAP 4.1: Approaches for mapping AI te...,Some strategies that aim to mitigate risks pos...,"Assessments, auditing mechanisms, assessment o...",0.962963,0.922407,1.0,1.0,0.277714
1,How are risks associated with transparency and...,[ \n16 \nGOVERN 1.5: Ongoing monitoring and pe...,Risks associated with transparency and account...,Risks associated with transparency and account...,0.666667,0.960816,1.0,1.0,0.584416
2,How can risks from confabulations impact real-...,[ \n6 \n2.2. Confabulation \n“Confabulation” r...,Risks from confabulations can significantly im...,Risks from confabulations may impact real-worl...,1.0,0.951874,1.0,0.916667,0.529269
3,What expectations should be set for automated ...,[ \n \n \n \n \n \n \nSAFE AND EFFECTIVE \nSYS...,To ensure that automated systems are safe and ...,The expectations for automated systems should ...,1.0,0.996364,1.0,1.0,0.338468
4,What are the potential risks associated with d...,[ \n \n \n \nDATA PRIVACY \nEXTRA PROTECTIONS ...,The potential risks associated with data reuse...,The potential risks associated with data reuse...,1.0,1.0,0.75,1.0,0.808657
5,How should machine learning models be monitore...,[ \n \n \n \n \n \n \n \n \n \n \n \nSAFE AND ...,Machine learning models should be monitored an...,This ongoing monitoring should include continu...,1.0,0.929352,1.0,1.0,0.534392
6,What is the purpose of red-teaming in identify...,[ \n50 \nParticipatory Engagement Methods \nOn...,The purpose of red-teaming in identifying pote...,The purpose of red-teaming in identifying pote...,1.0,1.0,1.0,0.916667,0.846671
7,"How can the AI model be explained, validated, ...",[ \n35 \nMEASURE 2.9: The AI model is explaine...,"The AI model can be explained, validated, and ...","The AI model can be explained, validated, and ...",1.0,0.971875,1.0,1.0,0.489919
8,What risks do GAI systems pose to data privacy?,[ \n7 \nunethical behavior. Text-to-image mode...,GAI systems pose several risks to data privacy...,GAI systems pose risks to data privacy by requ...,1.0,0.988109,1.0,1.0,0.66353
9,How do school audio surveillance systems monit...,[ \n \n \n \n \n \n \nDATA PRIVACY \nEXTRA PRO...,School audio surveillance systems monitor stud...,School audio surveillance systems monitor stud...,1.0,0.96498,1.0,1.0,0.877946


## Task : Testing OpenAI's Claim

Now that we've seen how our retriever can impact the performance of our RAG pipeline - let's see how changing our embedding model impacts performance.

####🏗️ Activity #1:

Please provide markdown, or code comments, to explain which each of the following steps are doing!

In [41]:
# Uses the TE3 embedding model (instead of ada) 
te3_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [45]:
# Creates a new collection for use with the te3 embedding model
# qdrant_client.create_collection(
#     collection_name=COLLECTION_NAME+"TE3",
#     vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
# )

# Creates the vector store/index
qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME+"TE3",
    embedding=te3_embeddings,
)

# Adds documents to the store/index
qdrant_vector_store.add_documents(documents)

['b3d1d68342ad4955a21a90f4bc8b09da',
 '78a60a0b893e461e8f98a78a5a176770',
 '1ed248a16def4b248aa9af755ddd7922',
 '9a2a8eaa2bd148ab9134f44f64217a87',
 '52525476cd9d4744b4e2220648d51130',
 '73d6cfcd2e2e487c8f7726868baf0a82',
 'db8c22fcf244462287ba0e2337ab36ec',
 '3fa70a257f734670a4490a953743a9f4',
 '9ddfcd8917634ec2b4a630263ca5568a',
 '84849aeec268423a8c3421e2305999fe',
 '14b6c22c39f7450aa56e49f8b2f0124c',
 '800bafe24af84ef5b0074f98b3dbb232',
 '73c2c526e02e49278ea197d885608019',
 'debe8e24129a4d64961efb1153d46831',
 '4039f60b144543eb9f0b22ac8517a993',
 '3f0e4e14300f4dec920a11930e5f0032',
 '0a6e439b324344f1a9e475600e164f1c',
 '81401c874e5040c288aed437181d5f0b',
 '7ca2e4bae89d4259b3d64450d516015a',
 '065c57b2bac54e919e752c4d174a66a7',
 '70cbdd7b345c4fd4b1c805760dd8c479',
 '7d10aca352004b2b81602a87290f1d05',
 '7827b4bb590e42e08f51a1a46534ae1e',
 '59411f5d635a4357bb793e2e475dd13a',
 '327a75dfdc7948399694120447ec74f1',
 '70fd9007754e4c83ba5c189ecb682a92',
 'f3c7a34aef734172a3486803e2f52692',
 

In [47]:
# Adapts the vector store/index as a retriever/callable
te3_retriever = qdrant_vector_store.as_retriever()

In [48]:
from langchain.chains.combine_documents import create_stuff_documents_chain

# Creates chain that can be used for passing list of documents to a model
document_chain = create_stuff_documents_chain(primary_qa_llm, retrieval_qa_prompt)

In [49]:
from langchain.chains import create_retrieval_chain

# Creates a retrieval chain that retrieves documents then passes them to a document chain (from previous step)
te3_retrieval_chain = create_retrieval_chain(te3_retriever, document_chain)

In [50]:
# Invoke chain for each test question and collect answer and context
answers = []
contexts = []

for question in test_questions:
  response = te3_retrieval_chain.invoke({"input" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["context"]])

In [51]:
# Creates a dataset from the results from the previous step
te3_response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [52]:
# Use ragas to evaluate the dataset against our chosen metrics
te3_advanced_retrieval_results = evaluate(te3_response_dataset_advanced_retrieval, metrics)

Evaluating:   0%|          | 0/95 [00:00<?, ?it/s]

In [53]:
# Print the evaluation results
te3_advanced_retrieval_results

{'faithfulness': 0.8819, 'answer_relevancy': 0.9663, 'context_recall': 0.9079, 'context_precision': 0.9123, 'answer_correctness': 0.6938}

In [45]:
# # Compare the baseline results to our te3 retrieval results
# df_baseline = pd.DataFrame(list(results.items()), columns=['Metric', 'ADA'])
# df_comparison = pd.DataFrame(list(te3_advanced_retrieval_results.items()), columns=['Metric', 'TE3'])
# 
# df_merged = pd.merge(df_baseline, df_comparison, on='Metric')
# 
# df_merged['Baseline -> TE3'] = df_merged['TE3'] - df_merged['ADA']
# 
# df_merged

Unnamed: 0,Metric,ADA,TE3,Baseline -> TE3
0,faithfulness,0.706871,0.777672,0.070801
1,answer_relevancy,0.717355,0.972447,0.255092
2,context_recall,0.600877,0.622807,0.02193
3,context_precision,0.703216,0.628655,-0.074561
4,answer_correctness,0.593403,0.625143,0.03174


####❓ Question #3:

Do you think, in your opinion, `text-embedding-3-small` is significantly better than `ada`?

Yes, on almost every metric we care about except context precision, te3 provided significantly better results. The answer relevancy was hugely improved.

# Test Fine-tuned Embedding Model

In [31]:
from langchain_huggingface import HuggingFaceEmbeddings

finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic")

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
qdrant_client.create_collection(
    collection_name=COLLECTION_NAME+"FT",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME+"FT",
    embedding=finetune_embeddings,
)

qdrant_vector_store.add_documents(documents)

['7f26e8b7385d49c5becca7bc6495f46c',
 'f7d6f55bcf0440449d623d388ca1fe73',
 '95997a77fda54973a3407194b7989952',
 '931829664f7e4291a634a415e4ee53f4',
 'ff7336687df0493788bcdd8511ea8584',
 'fbf62cea62cc46a7a5865c91e1abc9f3',
 '6aabede721fb4fc785b90cc5735a787e',
 '07fee9da81da4061a1146c07e56b6aaa',
 '14dbcf1b48ba4ca3bf81d204eea46c6e',
 '15cf4ff7dba340f6a654fe69697df424',
 '82458c1087b4461eb1358c5c54218e84',
 '6d37582534a74c33a51b82192ec8cd2d',
 'd59a7573cb3e43de90d020dbe8b0476c',
 '0948b10820c94f1994c1e416f1ad9d55',
 '225f585948004ec6ad7bd6de5dbd4a6d',
 '7e9a2f476c9e42a3a294ba134252f714',
 '704f814ad8884cf5ab6c4a685743b252',
 '88cbb7be59314734ac33a0c24379f5a4',
 '30ce4198e28941c98a8565b51d086188',
 '6edf310cafe843fa96ab023b16f53cb5',
 'd4e4217bc15341cb9ee22d84cec6f5e4',
 '34bdcf6ed70e471eb00120c47bb44323',
 '6847581f273d42a88c06ef99ee3c88bd',
 'efb58810f07d41869a1be5c2d4b3d8f6',
 '3345ab7997b34a2bad2e2be7340cd5b2',
 '855139db38204be1af7dd402acf621bc',
 'daa0c244d3894925a24a1a88bcaecf07',
 

In [34]:
ft_retriever = qdrant_vector_store.as_retriever()

In [35]:
ft_retrieval_chain = create_retrieval_chain(ft_retriever, document_chain)

In [36]:
answers = []
contexts = []

for question in test_questions:
    response = ft_retrieval_chain.invoke({"input" : question})
    answers.append(response["answer"])
    contexts.append([context.page_content for context in response["context"]])

In [37]:
ft_response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [38]:
ft_advanced_retrieval_results = evaluate(ft_response_dataset_advanced_retrieval, metrics)

Evaluating:   0%|          | 0/95 [00:00<?, ?it/s]

In [39]:
ft_advanced_retrieval_results

{'faithfulness': 0.9134, 'answer_relevancy': 0.9168, 'context_recall': 0.9035, 'context_precision': 0.9430, 'answer_correctness': 0.6306}

In [54]:
# df_baseline = pd.DataFrame(list(results.items()), columns=['Metric', 'ADA'])
# df_comparison = pd.DataFrame(list(te3_advanced_retrieval_results.items()), columns=['Metric', 'TE3'])
df_baseline = pd.DataFrame(list(te3_advanced_retrieval_results.items()), columns=['Metric', 'TE3'])
df_comparison = pd.DataFrame(list(ft_advanced_retrieval_results.items()), columns=['Metric', 'FT'])

df_merged = pd.merge(df_baseline, df_comparison, on='Metric')

df_merged['TE3 -> FT'] = df_merged['FT'] - df_merged['TE3']

df_merged

Unnamed: 0,Metric,TE3,FT,TE3 -> FT
0,faithfulness,0.881923,0.913377,0.031454
1,answer_relevancy,0.966298,0.916824,-0.049473
2,context_recall,0.907895,0.903509,-0.004386
3,context_precision,0.912281,0.942982,0.030702
4,answer_correctness,0.693761,0.630627,-0.063134


# Test Chunking Strategies

In [56]:
doc1 = PyMuPDFLoader("Blueprint-for-an-AI-Bill-of-Rights.pdf").load()
doc2 = PyMuPDFLoader("NIST.AI.600-1.pdf").load()

In [57]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(
        text,
    )
    return len(tokens)

text_splitter1 = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 20,
    length_function = tiktoken_len,
)

In [58]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter2 = SemanticChunker(OpenAIEmbeddings())

In [59]:
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

qdrant_vectorstore1 = Qdrant.from_documents(
    documents=text_splitter1.split_documents(doc1) + text_splitter1.split_documents(doc2),
    embedding=embedding_model,
    location=":memory:"
)

NameError: name 'text_splitter1' is not defined

In [None]:
qdrant_vectorstore2 = Qdrant.from_documents(
    documents=text_splitter2.split_documents(doc1) + text_splitter2.split_documents(doc2),
    embedding=embedding_model,
    location=":memory:"
)

In [None]:
qdrant_retriever1 = qdrant_vectorstore1.as_retriever()

In [None]:
qdrant_retriever2 = qdrant_vectorstore2.as_retriever()

In [None]:
import pandas as pd

test_df = pd.read_csv("testset.csv")

In [None]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [None]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

retrieval_augmented_qa_chain1 = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
        {"context": itemgetter("question") | qdrant_retriever1, "question": itemgetter("question")}
        # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
        #              by getting the value of the "context" key from the previous step
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # "response" : the "context" and "question" values are used to format our prompt object and then piped
        #              into the LLM and stored in a key called "response"
        # "context"  : populated by getting the value of the "context" key from the previous step
        | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [None]:
retrieval_augmented_qa_chain2 = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
        {"context": itemgetter("question") | qdrant_retriever2, "question": itemgetter("question")}
        # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
        #              by getting the value of the "context" key from the previous step
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # "response" : the "context" and "question" values are used to format our prompt object and then piped
        #              into the LLM and stored in a key called "response"
        # "context"  : populated by getting the value of the "context" key from the previous step
        | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [None]:
answers1 = []
contexts1 = []

for question in test_questions:
    response = retrieval_augmented_qa_chain1.invoke({"question" : question})
    answers1.append(response["response"].content)
    contexts1.append([context.page_content for context in response["context"]])

In [None]:
answers2 = []
contexts2 = []

for question in test_questions:
    response = retrieval_augmented_qa_chain2.invoke({"question" : question})
    answers2.append(response["response"].content)
    contexts2.append([context.page_content for context in response["context"]])

In [None]:
from datasets import Dataset

response_dataset1 = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers1,
    "contexts" : contexts1,
    "ground_truth" : test_groundtruths
})

In [None]:
from datasets import Dataset

response_dataset2 = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers2,
    "contexts" : contexts2,
    "ground_truth" : test_groundtruths
})

In [None]:
results1 = evaluate(response_dataset1, metrics)
results1

In [None]:
results2 = evaluate(response_dataset1, metrics)
results2

In [None]:
df_baseline = pd.DataFrame(list(results1.items()), columns=['Metric', 'RCTS'])
df_comparison = pd.DataFrame(list(results2.items()), columns=['Metric', 'SC'])

df_merged = pd.merge(df_baseline, df_comparison, on='Metric')

df_merged['RCTS -> SC'] = df_merged['RCTS'] - df_merged['SC']

df_merged