In [None]:
'''
Simple RAG Implementation
Author: Christian Sarmiento
Purpose: This notebook is intended to get a simple implementation of RAG set up with LangChain.
Date Created: 10/1/24
Last Updated: 11/28/24
Data: https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis
Sources:
- https://python.langchain.com/docs/tutorials/rag/
- https://python.langchain.com/docs/tutorials/llm_chain/
- https://medium.com/@dinabavli/rag-basics-basic-implementation-of-retrieval-augmented-generation-rag-e80e0791159d
- ChatGPT: o1-preview
-----------------------------------------------------------------------------------------------------------------------
RAG Research             |               Machine Learning Independent Study             |              DR. EITEL LAURIA
'''

In [44]:
# Download RAGAS for RAG metrics
%pip install ragas
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [19]:
# Imports
import sys
sys.path.append("/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Private Code")
from api_keys import openAIKey
from api_keys import langchainKey
from langchain_openai import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain import hub  # for RAG prompt
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.docstore.document import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter


import pandas as pd
import os
import gradio as gr  # easy frontend implementation
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")


from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import SingleTurnSample


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=e298dfbb-e96e-4408-a711-0b005564d66e; trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=ef7e7017-33f0-4ed5-9ac2-f0c7092c1b20; trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=98c94455-1623-4462-8ff0-d802b763d73c; trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=ebe3fe47-6c30-468d-8aa4-3252f14b2481; trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=378897f6-1a70-4d3a-b13c-98a40414cd95; trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=18277165-eff0-4ec0-8702-e21aaa70aeb6; trace=a4d8d137-ae81-49e9-9ebd-604a51b49ffb,id=0c7122fb-ac3d-429f-bbf6-9270c8d033b2; trace=9e855d11-b7d8-4416-958b-b74bc5a91721,id=9e855d11-b7d8-4416-958b-b74bc5a91721; trace=d41ddfe7-c7df-41c7

In [2]:
# LangChain Enviornment Variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = langchainKey()
os.environ["OPENAI_API_KEY"] = openAIKey()

In [3]:
# Load OpenAI model with metric wrapper
#llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
#evalEmbeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
llm = ChatOpenAI(model="gpt-4o-mini")

In [13]:
# Load Data
folderPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/sports_articles_corpus/Raw data"
sportsArticles = []

for fileName in os.listdir(folderPath):
    filePath = os.path.join(folderPath, fileName)
    loader = TextLoader(filePath, encoding='latin1')  # UTF-8 not working for the files
    doc = loader.load()
    sportsArticles.extend(doc)

In [14]:
# Split Documents into Chunks
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(sportsArticles)

In [15]:
# Store Documents in Vector DB (Chroma)
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

In [16]:
# Setup Retrieval System
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 documents

# To get retrieved documents:
# retrievedDocuments = retriever.invoke("query")

In [None]:
# Setting up the RAG Chain

# Function to format documents into the prompt
def formatDocs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Setup RAG Chain
prompt = hub.pull("rlm/rag-prompt")
ragChain = (
    {"context": retriever | formatDocs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [22]:
# Results
for chunk in ragChain.stream("Explain the offside rule in soccer."):
    print(chunk, end="", flush=True)

The offside rule in soccer states that a player is in an offside position if they are nearer to the opponent's goal line than both the ball and the second-to-last opponent when the ball is played to them, unless they are in their own half or level with the second-to-last opponent. Being in an offside position is not an offense in itself; the player must become involved in active play to be penalized. The rule aims to prevent players from gaining an unfair advantage by lingering near the opponent's goal.

In [2]:
# Test if answers are coming from the llm or from the documents
# Try giving documents that aren't real then asking questions on things off of that
# Avoids the model relying on trained info 
# Play with the system prompt

# Next step after QA - feed answers into the system to make it more conversational
# Implement Gradio
# Knowledge Graph 
# Identifying Metrics - do research!!


# Implement with Marist Data

In [4]:
# Load Data
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Cleaned_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)

# To use RecursiveCharacterTextSplitter, we need a list of dictionaries
maristContext = [Document(page_content=text) for text in maristQA[1].tolist()]

In [5]:
# Split Documents into Chunks
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(maristContext)

In [6]:
# Store Documents in Vector DB (Chroma)
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

In [7]:
# Setup Retrieval System
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 documents

In [8]:
# Prompts
systemPrompt = (
    
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
    
)

prompt = ChatPromptTemplate.from_messages(
    
    [
        ("system", systemPrompt),
        ("human", "{input}"),
    ]
    
)

contextualizeSystemPrompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualizePrompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualizeSystemPrompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
historyAwareRetriever = create_history_aware_retriever(
    llm, retriever, contextualizePrompt
)

qaPrompt = ChatPromptTemplate.from_messages(
    [
        ("system", systemPrompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [9]:
# Make chains
questionAnswerChain = create_stuff_documents_chain(llm, qaPrompt)
ragChain = create_retrieval_chain(historyAwareRetriever, questionAnswerChain)

In [11]:
# Talking to ChatGPT, making RAG conversational
conversationHistory = []
userQuery = input("Prompt (0 to quit): ")
while userQuery != '0':

    # Print input - this is just for a VSCode enviornment to see I/O together, feel free to comment out in Jupyter
    print(f"User: {userQuery}")

    # Call ChatGPT using RAG chain
    llmResponse = ragChain.invoke({"input": userQuery, "chat_history": conversationHistory})
    print(f"LLM: {llmResponse['answer']}")
    print()
    conversationHistory.extend([
        
        HumanMessage(content=userQuery),
        AIMessage(content=llmResponse["answer"]),
        
    ])

    # New prompt
    userQuery = input("Prompt (0 to quit): ")

Marist College is located on the banks of the Hudson River and also has a campus in Florence, Italy.

Marist College is in Poughkeepsie, New York, situated along the Hudson River. The Florence campus is located in Florence, Italy.



In [10]:
# Frontend with Gradio

'''
Function that calls the RAG chain for Gradio
'''
evaluationSamples = []
def simpleRAG(userQuery, history, correctAnswer):

    # Ensure there is a list to use for the conversation history
    if history is None:
        history = []

    # Call ChatGPT using RAG chain
    llmResponse = ragChain.invoke({"input": userQuery, "chat_history": history})

    # Get response and context for evaluation
    responseText = llmResponse["answer"]
    retrievedContexts = [context.page_content for context in retriever.get_relevant_documents(userQuery)]

    # Save information for RAG metrics
    evaluationSamples.append({
        "user_input": userQuery,
        "retrieved_contexts": retrievedContexts,
        "response": responseText,
        "reference": correctAnswer    # ground truth 
    })

    # Save chat history for conversational aspect
    history.extend([
        
        HumanMessage(content=userQuery),
        AIMessage(content=llmResponse["answer"]),
        
    ])

    # Save input and output to history
    history.append(HumanMessage(content=userQuery))
    history.append(AIMessage(content=llmResponse["answer"]))

    # Prepare display of data
    #chatDisplay = [(msg.content, "User" if isinstance(msg, HumanMessage) else "LLM") for msg in history]

    return history  #, chatDisplay


In [None]:
# Frontend
interface = gr.Interface(
    fn=simpleRAG,  
    inputs=["text", "state", gr.Textbox(label="Correct Answer")],  
    outputs=["chatbot", "state"],  
    title="Simple RAG",  
    description="Initial setup for a simple conversational RAG process."
)

# Launch the frontend
interface.launch()

In [11]:
# Function to evaluate our RAG pipeline
async def pipelineEvaluation(dataset, metrics):

    # Run through our runs
    results = []
    for run in dataset:

        # Save our inputs/outputs
        inputQuery = run["user_input"]
        groundTruthAnswer = run["reference"]
        contexts = run["retrieved_contexts"]
        response = run["response"]

        # Create a SingleTurnSample object
        sample = SingleTurnSample(
            user_input=inputQuery,
            response=response,
            reference=groundTruthAnswer,
            retrieved_contexts=contexts 
        )

        # Evaluate metrics
        runResults = {"input_query": inputQuery}
        for metric in metrics:

            # Get the score for the given metric
            try:

                score = await metric.single_turn_ascore(sample)
                runResults[type(metric).__name__] = score

            except Exception as e:
                # Catch errors for debugging
                runResults[type(metric).__name__] = f"Error: {str(e)}"
        
        # Save metric results
        results.append(runResults)

    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__
        scores = [result[metricName] for result in results if isinstance(result[metricName], (int, float))]
        
        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }
            
        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }
    
    return results, metricsStats

In [12]:
# Load metrics
evalMetrics = [LLMContextRecall(llm=LangchainLLMWrapper(llm)), 
               FactualCorrectness(llm=LangchainLLMWrapper(llm)), 
               Faithfulness(llm=LangchainLLMWrapper(llm)), 
               SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()))]

In [13]:
# Evaluate our pipeline responses
evalResults = await pipelineEvaluation(evaluationSamples, evalMetrics)
for result in evalResults:
    print(result)

{'input_query': 'Who is Carolyn Matheus?', 'LLMContextRecall': 0.0, 'FactualCorrectness': 0.0, 'Faithfulness': 1.0, 'SemanticSimilarity': 0.9328844088386496}


In [13]:
# Sample records from our dataset
maristTestSample = maristQA.sample(50, replace=False)
maristTestSample.head()

Unnamed: 0,0,1
486,When do you apply to the special education tea...,Education DepartmentDual Certification: Childh...
468,Eileen Curley Bio?,Contact InformationAcademic SchoolOfficeEmailP...
518,Dr. Joanne Myers role,Contact InformationAcademic SchoolOfficeEmailP...
553,Phone number Dr. Wermuth?,School of Liberal ArtsDr. Thomas S. WermuthVic...
14,Where is student financial services?,Graduate: Student Financial ServicesReceive th...


In [None]:
# Run our chain with each question and evaluate
chatHistory = None
for row in maristTestSample.iterrows():
    chatHistory = simpleRAG(row[1][0], chatHistory, row[1][1])

## Evaluation
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, evalMetrics)
for result in evalResults:
    print(result)

for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

# Define Simple RAG w/o Conversational Aspect
- Since we are looking at samples independently here, no need to have a conversational aspect here
- This may help with runtimes since as of right now, this is slower than the other two implementations

In [4]:
# Setting up the RAG Chain

## Load data
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Cleaned_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)
maristContext = [Document(page_content=text) for text in maristQA[1].tolist()]

## Split Documents into Chunks
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(maristContext)

# Store Documents in Vector DB (Chroma)
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

# Setup Retrieval System
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 document

# Function to format documents into the prompt
def formatDocs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Setup RAG Chain
prompt = hub.pull("rlm/rag-prompt")
ragChain = (
    {"context": retriever | formatDocs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
# SimpleRAG function

'''
Function that calls the runs the simpleRAG chain and saves responses to a global list for access
'''
evaluationSamples = []
def simpleRAG(userQuery, correctAnswer):

    # Call ChatGPT using RAG chain
    responseText = ragChain.invoke(userQuery)

    # Get context for evaluation
    retrievedContexts = [context.page_content for context in retriever.get_relevant_documents(userQuery)]

    # Ensure that we are only passing along contexts that are different from eachother
    finalContexts = [retrievedContexts[0]]
    for context in retrievedContexts:

        if context != retrievedContexts[0]:
            finalContexts.append(context)

    print(f"Final Context Length: {len(finalContexts)}")
        
    # Save information for RAG metrics
    evaluationSamples.append({
        "user_input": userQuery,
        "retrieved_contexts": finalContexts,
        "response": responseText,
        "reference": correctAnswer    # ground truth 
    })

In [6]:
# Function to evaluate our RAG pipeline
async def pipelineEvaluation(dataset, metrics):

    # Run through our runs
    results = []
    for run in dataset:

        # Save our inputs/outputs
        inputQuery = run["user_input"]
        groundTruthAnswer = run["reference"]
        contexts = run["retrieved_contexts"]
        response = run["response"]

        # Create a SingleTurnSample object
        sample = SingleTurnSample(
            user_input=inputQuery,
            response=response,
            reference=groundTruthAnswer,
            retrieved_contexts=contexts 
        )

        # Evaluate metrics
        runResults = {"input_query": inputQuery}
        for metric in metrics:

            # Get the score for the given metric
            try:

                score = await metric.single_turn_ascore(sample)
                runResults[type(metric).__name__] = score

            except Exception as e:
                # Catch errors for debugging
                runResults[type(metric).__name__] = f"Error: {str(e)}"
        
        # Save metric results
        results.append(runResults)

    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__
        scores = [result[metricName] for result in results if isinstance(result[metricName], (int, float))]
        
        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }
            
        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }
    
    return results, metricsStats

In [28]:
# Function to clean up evaluation code
async def computeEvaluationMetrics(samples, metrics):
    '''
    Helper method to compute the metrics and its averages for a given sample. parameter "samples" is a list of
    dictonaries that was aggregated from running a RAG process and collecting the input and output data. parameter
    "metrics" is a list of metric objects from RAGAS to evaluate the given samples. 
    '''

    # Keep track of time
    startTime = time.time()

    # Evaluate every sample
    print("Starting Evaluation...")
    evalResults, metricStats = await pipelineEvaluation(samples, metrics)
    print("Evaluation Finished!")
    for result in evalResults:
        print(result)

    # Compute and output the mean and standard deviation for each metric
    print("+-+-+-+-+-+-+-+-+-+-+-+FINAL RESULTS+-+-+-+-+-+-+-+-+-+-+-+")
    for metric in metricStats.keys():
        print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")
    
    # Elapsed time
    endTime = time.time()
    elapsedTime = endTime - startTime
    if elapsedTime > 60:
        secToMin = elapsedTime / 60
        if secToMin > 60:
            print(f"\nExecution Time: {(secToMin / 60):.2f} hrs")
        else:
            print(f"\nExecution Time: {secToMin:.2f} min")
    
    else:
        print(f"\nExecution Time: {(elapsedTime):.2f} sec")

In [21]:
# Load metrics
evalMetrics = [LLMContextRecall(llm=LangchainLLMWrapper(llm)), 
               FactualCorrectness(llm=LangchainLLMWrapper(llm)), 
               Faithfulness(llm=LangchainLLMWrapper(llm)), 
               SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()))]

In [13]:
# Sample records from our dataset
maristTestSample = maristQA.sample(50, replace=False)
maristTestSample.head()

Unnamed: 0,0,1
301,Undergrad admission Contact details for genera...,Undergraduate: Admission CounselorsHave questi...
295,What is the cost of ASIP Program?,InternshipsAlbany Summer Internship Program (A...
508,What position does Dr. Stephen M. Mercier hold?,Contact InformationAcademic SchoolOfficeEmailP...
75,Who is Dr. Sally Dwyer-McNulty?,Contact InformationAcademic SchoolOfficeEmailP...
460,Who is the chair of the history department?,Contact InformationAcademic SchoolOfficeEmailP...


In [17]:
# Run simple RAG on sampled dataset from SelfRAG (we know these questions work with SelfRAG so we will run them here
# as a benchmark)
#verifiedCSVPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/marist_sampled_verified_QA.csv"
#verifiedQA = pd.read_csv(verifiedCSVPath)

## Run process pipeline
for row in maristTestSample.iterrows():
    simpleRAG(row[1][0], row[1][1])

print("Generations Done!")

Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 3
Final Context Length: 3
Final Context Length: 1


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=95362aff-61ad-47cf-b0bf-fbdcc8657b6c; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=79551a88-714c-42d0-9154-392af7bbe738; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=1a8365fb-423f-49c7-9369-0041fa7b7a70; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=10f7bda5-f65f-4907-a1e4-b10969b2ab6e; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=4e2d23d6-f266-4f3f-9d20-d19068d6c2e4; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=ccab9e9e-8c9f-4c52-8aa0-87e5d075e367; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=1a39d9e5-19d1-4192-8ff4-c680a25d4ce6; trace=95362aff-61ad-47cf-b0bf-fbdcc8657b6c,id=900926c5-cacf-4b67-9788-f7f60f65e694; trace=95362aff-61ad-47cf

Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 3
Final Context Length: 1
Final Context Length: 3
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 3
Final Context Length: 1
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Generations Done!


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=9f1065c2-ca17-40c7-98ee-e6abff775453; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=283cb2c0-5244-459b-aeb1-08a2dc2d8fa9; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=bdb904a7-47ed-40db-b663-703badd5ee3b; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=2f44cce9-ed13-470b-8008-da5ef4dae8d8; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=8f9e7568-431f-4f76-b645-2d90b75a082a; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=2d094ee3-8adc-4a70-84ea-c8b99004cabc; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=8cbd283e-486c-4447-acd6-d862345b1f06; trace=9f1065c2-ca17-40c7-98ee-e6abff775453,id=f8ed4961-a257-4e6f-b64a-6c13f97d5051; trace=9f1065c2-ca17-40c7

In [25]:
# Evaluation
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[0]])

Starting Evaluation...


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=6c5ffc0a-f9ea-4eba-9928-c7585877afe3,id=6c5ffc0a-f9ea-4eba-9928-c7585877afe3
Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=b5d3d559-ee1f-4fd0-866d-010239e0d97a,id=b5d3d559-ee1f-4fd0-866d-010239e0d97a; trace=febf2c39-6b24-4bcf-beb8-fa489ba20ab4,id=febf2c39-6b24-4bcf-beb8-fa489ba20ab4; trace=2f8aa9c4-d4c1-4c7d-b3d6-b402918b0770,id=2f8aa9c4-d4c1-4c7d-b3d6-b402918b0770; trace=3f120d6f-8717-4d2c-a278-9a724866e7f1,id=3f120d6f-87

Evaluation Finished!
{'input_query': 'Undergrad admission Contact details for general enquires?', 'LLMContextRecall': 0.6}
{'input_query': 'What is the cost of ASIP Program?', 'LLMContextRecall': 0.6}
{'input_query': 'What position does Dr. Stephen M. Mercier hold?', 'LLMContextRecall': 0.125}
{'input_query': 'Who is Dr. Sally Dwyer-McNulty?', 'LLMContextRecall': 1.0}
{'input_query': 'Who is the chair of the history department?', 'LLMContextRecall': 0.0}
{'input_query': 'What non-profit internships are there?', 'LLMContextRecall': 0.7666666666666667}
{'input_query': 'Cost of credit for Masters  of Arts in Museum Studies?', 'LLMContextRecall': 0.2}
{'input_query': 'Who is Dr. Pau-San Haruta?', 'LLMContextRecall': 0.6666666666666666}
{'input_query': 'About Fanfarelli', 'LLMContextRecall': 0.0}
{'input_query': 'Assistant dean of school of communications', 'LLMContextRecall': 0.7222222222222222}
{'input_query': 'Who is Subir Sengupta?', 'LLMContextRecall': 1.0}
{'input_query': 'What is the

Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=125b7755-0406-4462-a9ca-51d1b0bba921,id=125b7755-0406-4462-a9ca-51d1b0bba921; trace=125b7755-0406-4462-a9ca-51d1b0bba921,id=a4612d47-3186-46f7-a730-20608377214c; trace=125b7755-0406-4462-a9ca-51d1b0bba921,id=f7c15738-b276-442f-8cd9-11e5b3847892; trace=157e8561-7d18-48ce-8f53-fbd4169bd9bc,id=157e8561-7d18-48ce-8f53-fbd4169bd9bc; trace=157e8561-7d18-48ce-8f53-fbd4169bd9bc,id=f93d9950-a1af-41d6-9d23-75731c743289; trace=157e8561-7d18-48ce-8f53-fbd4169bd9bc,id=8a18993d-f83d-4a56-97b9-eedbfe496ae7; trace=ac49ccf5-ed5a-4541-846f-1b69c2a9c0d8,id=ac49ccf5-ed5a-4541-846f-1b69c2a9c0d8; trace=ac49ccf5-ed5a-4541-846f-1b69c2a9c0d8,id=e1509c53-d7d1-46c6-824c-fe9d9adad185; trace=ac49ccf5-ed5a-4541

In [26]:
# Evaluation for FactualCorrectness
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[1]])

Starting Evaluation...


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=c5e8245a-7ade-450c-98f4-48862cfd006e,id=c5e8245a-7ade-450c-98f4-48862cfd006e; trace=c5e8245a-7ade-450c-98f4-48862cfd006e,id=2c60ab47-a6cc-47b0-b8ce-33119b2d9081; trace=c5e8245a-7ade-450c-98f4-48862cfd006e,id=a95b5623-52aa-4684-b288-a7572f0acdf9; trace=647b80e6-b63b-4365-9a64-60c62038e663,id=647b80e6-b63b-4365-9a64-60c62038e663; trace=647b80e6-b63b-4365-9a64-60c62038e663,id=a0def91a-9b54-482b-9368-df693aa6b543; trace=647b80e6-b63b-4365-9a64-60c62038e663,id=7e69ad69-bf5b-4f49-92db-66734fe7281d; trace=0048d989-a1f3-4b02-a15e-280bda7bbbbe,id=0048d989-a1f3-4b02-a15e-280bda7bbbbe; trace=0048d989-a1f3-4b02-a15e-280bda7bbbbe,id=991f0623-2bf4-4881-9b66-2d1dfdafb6eb; trace=0048d989-a1f3-4b02

Evaluation Finished!
{'input_query': 'Undergrad admission Contact details for general enquires?', 'FactualCorrectness': 0.33}
{'input_query': 'What is the cost of ASIP Program?', 'FactualCorrectness': 0.26}
{'input_query': 'What position does Dr. Stephen M. Mercier hold?', 'FactualCorrectness': 0.0}
{'input_query': 'Who is Dr. Sally Dwyer-McNulty?', 'FactualCorrectness': 0.62}
{'input_query': 'Who is the chair of the history department?', 'FactualCorrectness': 0.0}
{'input_query': 'What non-profit internships are there?', 'FactualCorrectness': 0.1}
{'input_query': 'Cost of credit for Masters  of Arts in Museum Studies?', 'FactualCorrectness': 0.21}
{'input_query': 'Who is Dr. Pau-San Haruta?', 'FactualCorrectness': 0.29}
{'input_query': 'About Fanfarelli', 'FactualCorrectness': 0.0}
{'input_query': 'Assistant dean of school of communications', 'FactualCorrectness': 0.36}
{'input_query': 'Who is Subir Sengupta?', 'FactualCorrectness': 0.33}
{'input_query': 'What is the minimum gpa to ge

Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=e274578a-22ae-4e82-8154-18f8d91060f0,id=49ff8ed8-a235-4fee-a7c4-c87ee19d5dff; trace=e274578a-22ae-4e82-8154-18f8d91060f0,id=0bbbb8c3-2988-4dbd-b71e-83e717bf5d3c; trace=e274578a-22ae-4e82-8154-18f8d91060f0,id=3cc3af08-f3fb-42b2-8da7-7d523d5eb0a6; trace=e274578a-22ae-4e82-8154-18f8d91060f0,id=81849003-99dd-4fb5-b13d-57e983822993; trace=3f24bbd0-f416-4442-99e4-8cf13b98e1e1,id=3f24bbd0-f416-4442-99e4-8cf13b98e1e1; trace=3f24bbd0-f416-4442-99e4-8cf13b98e1e1,id=871a43d4-8efc-4194-8089-ba8d933f78f7; trace=3f24bbd0-f416-4442-99e4-8cf13b98e1e1,id=a55760f9-550e-4a28-88d2-b576a1164e12; trace=3f24bbd0-f416-4442-99e4-8cf13b98e1e1,id=4c394e11-923b-4775-8bbd-e999716782e5; trace=3f24bbd0-f416-4442

In [27]:
# Evaluation for Faithfulness
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[2]])

Starting Evaluation...


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd,id=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd; trace=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd,id=36bcf38a-ec76-4975-86d1-cf0aeb6e40dc; trace=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd,id=6532bea6-0add-4034-a087-adce8ecc9f3d
Failed to batch ingest runs: langsmith.utils.LangSmithConnectionError: Connection error caused failure to POST https://api.smith.langchain.com/runs/batch in LangSmith API. Please confirm your internet connection. SSLError(MaxRetryError("HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2406)')))"))
Content

Evaluation Finished!
{'input_query': 'Undergrad admission Contact details for general enquires?', 'Faithfulness': 1.0}
{'input_query': 'What is the cost of ASIP Program?', 'Faithfulness': 1.0}
{'input_query': 'What position does Dr. Stephen M. Mercier hold?', 'Faithfulness': 1.0}
{'input_query': 'Who is Dr. Sally Dwyer-McNulty?', 'Faithfulness': 1.0}
{'input_query': 'Who is the chair of the history department?', 'Faithfulness': 1.0}
{'input_query': 'What non-profit internships are there?', 'Faithfulness': 0.8888888888888888}
{'input_query': 'Cost of credit for Masters  of Arts in Museum Studies?', 'Faithfulness': 1.0}
{'input_query': 'Who is Dr. Pau-San Haruta?', 'Faithfulness': 1.0}
{'input_query': 'About Fanfarelli', 'Faithfulness': 0.8}
{'input_query': 'Assistant dean of school of communications', 'Faithfulness': 1.0}
{'input_query': 'Who is Subir Sengupta?', 'Faithfulness': 0.875}
{'input_query': 'What is the minimum gpa to get an internship?', 'Faithfulness': 1.0}
{'input_query': 

Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=0c1dd7a3-7a7a-44c5-8959-525c165151d2,id=38708c99-6698-4618-a502-815e6f243e47; trace=0c1dd7a3-7a7a-44c5-8959-525c165151d2,id=dc15e619-e0bd-4d02-be6d-6f0e3cc373be; trace=0c1dd7a3-7a7a-44c5-8959-525c165151d2,id=9cd7f29a-4512-4353-a9d7-7c9fe498ed67; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=110bf2a7-262c-44c3-94ce-6a238f55dad6; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=7561f627-d4f1-49bd-b6bb-f598bee1f589; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=9420f1b6-663d-4889-b482-3784bee68e28; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=2e398373-f0be-4a61-a7dc-65b9723780f9; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=63a5f99a-b051-4ad4-b105-d73b646db469; trace=89fa7083-a942-460c

In [24]:
# Evaluation for SemanticSimilarity
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[3]])

Starting Evaluation...
Evaluation Finished!
{'input_query': 'Undergrad admission Contact details for general enquires?', 'SemanticSimilarity': 0.9355849329864713}
{'input_query': 'What is the cost of ASIP Program?', 'SemanticSimilarity': 0.8725719598337792}
{'input_query': 'What position does Dr. Stephen M. Mercier hold?', 'SemanticSimilarity': 0.8035092985077776}
{'input_query': 'Who is Dr. Sally Dwyer-McNulty?', 'SemanticSimilarity': 0.9647212455250856}
{'input_query': 'Who is the chair of the history department?', 'SemanticSimilarity': 0.7795354215224463}
{'input_query': 'What non-profit internships are there?', 'SemanticSimilarity': 0.7575654237420436}
{'input_query': 'Cost of credit for Masters  of Arts in Museum Studies?', 'SemanticSimilarity': 0.8702278837860309}
{'input_query': 'Who is Dr. Pau-San Haruta?', 'SemanticSimilarity': 0.9525824258347897}
{'input_query': 'About Fanfarelli', 'SemanticSimilarity': 0.8647435165018217}
{'input_query': 'Assistant dean of school of communic