In [None]:
'''
Corrective RAG Implementation
Author: Christian Sarmiento
Purpose: This notebook is intended to get a Self-RAG implementation set up with LangChain/LangGraph.
Date Created: 12/4/24
Last Updated: 12/4/24
Data: Marist College Administrative Corpus Dataset
Sources:
- https://blog.langchain.dev/agentic-rag-with-langgraph/
- https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_crag.ipynb?ref=blog.langchain.dev
Note: Most of the code for graph implementation of CRAG was taken from the second source.
-----------------------------------------------------------------------------------------------------------------------
RAG Research             |               Machine Learning Independent Study             |              DR. EITEL LAURIA
'''

In [1]:
%pip install tavily-python

Collecting tavily-python
  Downloading tavily_python-0.5.0-py3-none-any.whl.metadata (11 kB)
Downloading tavily_python-0.5.0-py3-none-any.whl (14 kB)
Installing collected packages: tavily-python
Successfully installed tavily-python-0.5.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install -U duckduckgo-search

Collecting duckduckgo-search
  Downloading duckduckgo_search-7.2.0-py3-none-any.whl.metadata (17 kB)
Collecting primp>=0.9.3 (from duckduckgo-search)
  Downloading primp-0.9.3-cp38-abi3-macosx_10_12_x86_64.whl.metadata (12 kB)
Downloading duckduckgo_search-7.2.0-py3-none-any.whl (19 kB)
Downloading primp-0.9.3-cp38-abi3-macosx_10_12_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: primp, duckduckgo-search
Successfully installed duckduckgo-search-7.2.0 primp-0.9.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Imports
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from typing_extensions import TypedDict
from langgraph.graph import END, StateGraph, START
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools import DuckDuckGoSearchResults
from langchain.prompts import PromptTemplate


import asyncio
from queue import Queue
import threading 
import pandas as pd
import os
import gradio as gr  # easy frontend implementation
from pprint import pprint
import numpy as np
import json
from typing import List, Dict, Optional, Tuple
import sys
sys.path.append("/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Private Code")
from api_keys import openAIKey
from api_keys import langchainKey
from api_keys import tavilyKey  # web search packages



from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import SingleTurnSample


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)
  from .autonotebook import tqdm as notebook_tqdm




In [2]:
# LangChain Enviornment Variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = langchainKey()
os.environ["OPENAI_API_KEY"] = openAIKey()
os.environ["TAVILY_API_KEY"] = tavilyKey()

In [24]:
# Load Data
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Marist_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)

# To use RecursiveCharacterTextSplitter, we need a list of dictionaries
maristContext = [Document(page_content=text) for text in maristQA[1].tolist()]

In [25]:
# Split Documents into Chunks
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(maristContext)

In [26]:
# Store Documents in Vector DB (Chroma)
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

# Setup Retrieval System
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 documents

In [27]:
# Define Grader class for document grading in CRAG
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# LLM with function call
llmDocGrader = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structuredLLMGrader = llmDocGrader.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
   tic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
gradePrompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Grader Chain
retrievalGrader = gradePrompt | structuredLLMGrader

# Testing it
question = "agent memory"
docs = retriever.get_relevant_documents(question)
docTxt = docs[1].page_content
print(retrievalGrader.invoke({"question": question, "document": docTxt}))

binary_score='no'


In [28]:
# Generation Chain

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
ragChain = prompt | llm | StrOutputParser()

# Run
generation = ragChain.invoke({"context": docs, "question": question})
print(generation)

I don't know.


In [29]:
# Question Rewriter
llmRewriter = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Prompt
system = """You a question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
rewritePrompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

questionRewriter = rewritePrompt | llm | StrOutputParser()
questionRewriter.invoke({"question": question})

'What are the key concepts and techniques related to agent memory in artificial intelligence?'

In [30]:
# Web Search Component
webSearchTool = TavilySearchResults(k=3)

In [31]:
# Define metric evaluator

## Evaluation LLM & embeddings
evalLLM = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0))
evalEmbeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

## Initialize metrics with LLM and embeddings
contextRecall = LLMContextRecall(llm=evalLLM)
faithfulness = Faithfulness(llm=evalLLM)
factualCorrectness = FactualCorrectness(llm=evalLLM)
semanticSimilarity = SemanticSimilarity(embeddings=evalEmbeddings)

## Collect metrics
evalMetrics = [
    contextRecall,
    faithfulness,
    factualCorrectness,
    semanticSimilarity
]

In [41]:
# Define Graph structure for Self-RAG

# Graph State
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        question: question
        generation: LLM generation
        documents: list of documents
        web_search: whether to add search
        metrics: evaluation metrics for each generation 
    """
    question: str
    generation: str
    webSearch: str
    documents: List[str]
    metrics: Optional[Dict[str, float]]
    visitedInternet: bool

## Nodes

# Retrieval Node     
def retrieve(state):
    """
    Retrieve documents

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, documents, that contains retrieved documents
    """
    print("---RETRIEVE---")
    question = state["question"]

    # Retrieval
    documents = retriever.get_relevant_documents(question)
    return {"documents": documents, "question": question, "visitedInternet": state["visitedInternet"]}


# Generation Node
def generate(state):
    """
    Generate answer

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("---GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # RAG generation
    generation = ragChain.invoke({"context": documents, "question": question})

    # Return updated state
    updatedState = {"documents": documents, "question": question, "generation": generation, "visitedInternet": state["visitedInternet"]}

    return updatedState

# Grader Node
def gradeDocuments(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with only filtered relevant documents
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filteredDocs = []
    webSearch = "No"
    for d in documents:

        score = retrievalGrader.invoke(
            {"question": question, "document": d.page_content}
        )
        grade = score.binary_score
        if grade == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            filteredDocs.append(d)

        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")
            webSearch = "Yes"
            continue

    return {"documents": filteredDocs, "question": question, "webSearch": webSearch, "visitedInternet": state["visitedInternet"]}


# Rewriter node
def transformQuery(state):
    """
    Transform the query to produce a better question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates question key with a re-phrased question
    """

    print("---TRANSFORM QUERY---")
    question = state["question"]
    documents = state["documents"]

    # Re-write question
    betterQuestion = questionRewriter.invoke({"question": question})
    return {"documents": documents, "question": betterQuestion, "visitedInternet": state["visitedInternet"]}

def webSearch(state):
    """
    Web search based on the re-phrased question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with appended web results
    """

    print("---WEB SEARCH---")
    question = state["question"]
    documents = state["documents"]
    internetFlag = state["visitedInternet"]

    # Web search
    docs = webSearchTool.invoke({"query": question})
    web_results = "\n".join([d["content"] for d in docs])
    web_results = Document(page_content=web_results)
    documents.append(web_results)
    internetFlag = True

    return {"documents": documents, "question": question, "visitedInternet": internetFlag}

## Edges

# Generation edge
def decideToGenerate(state):
    """
    Determines whether to generate an answer, or re-generate a question.

    Args:
        state (dict): The current graph state

    Returns:
        str: Binary decision for next node to call
    """

    print("---ASSESS GRADED DOCUMENTS---")
    state["question"]
    webSearch = state["webSearch"]
    state["documents"]

    if webSearch == "Yes":
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---"
        )
        return "transformQuery"
    
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"
    
async def evaluateMetrics(state):
    """
    Evaluate metrics for the current RAG pipeline response.

    Args:
        state (dict): The current graph state.

    Returns:
        state (dict): Adds a 'metrics' key containing evaluation scores.
    """
    print("---EVALUATING METRICS---")
    question = state["question"]
    generation = state["generation"]
    documents = state["documents"]

    # Mock ground truth if unavailable (replace with actual reference if possible)
    groundTruth = state.get("groundTruth", "Expected answer based on context.")

    # Prepare retrieved contexts
    retrievedContexts = [doc.page_content for doc in documents]

    # Create a SingleTurnSample object
    sample = SingleTurnSample(
        user_input=question,
        response=generation,
        reference=groundTruth,
        retrieved_contexts=retrievedContexts,
    )

    # Evaluate metrics
    state["metrics"] = {
        "LLMContextRecall": await contextRecall.single_turn_ascore(sample),
        "Faithfulness": await faithfulness.single_turn_ascore(sample),
        "FactualCorrectness": await factualCorrectness.single_turn_ascore(sample),
        "SemanticSimilarity": await semanticSimilarity.single_turn_ascore(sample),
    }

    return state

In [42]:
workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("retrieve", retrieve)  
workflow.add_node("gradeDocuments", gradeDocuments)  
workflow.add_node("generate", generate)  
workflow.add_node("transformQuery", transformQuery)  
workflow.add_node("webSearchNode", webSearch)
workflow.add_node("evaluateMetrics", evaluateMetrics)  

# Build graph
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "gradeDocuments")
workflow.add_conditional_edges(
    "gradeDocuments",
    decideToGenerate,
    {
        "transformQuery": "transformQuery",
        "generate": "generate",
    },
)
workflow.add_edge("transformQuery", "webSearchNode")
workflow.add_edge("webSearchNode", "generate")
workflow.add_edge("generate", "evaluateMetrics")
workflow.add_edge("evaluateMetrics", END)

# Compile
app = workflow.compile()

In [16]:
# Inital test run
inputs = {"question": "Who is Carolyn Matheus?"}
async for output in app.astream(inputs):
    for key, value in output.items():

        # Print node
        pprint(f"Node '{key}':")

        # Optional: print full state at each node
        # pprint.pprint(value["keys"], indent=2, width=80, depth=None)

        # Print metrics
        if "metrics" in value:
            pprint("Metrics: ")
            pprint(value["metrics"])

    pprint("\n---\n")

# Final generation
pprint("Final Generation: ")
pprint(value["generation"])

# Final metrics
if "metrics" in value:
    pprint("Final Metrics: ")
    pprint(value["metrics"])

---RETRIEVE---
"Node 'retrieve':"
'\n---\n'
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
"Node 'gradeDocuments':"
'\n---\n'
---GENERATE---
"Node 'generate':"
'\n---\n'
---EVALUATING METRICS---
"Node 'evaluateMetrics':"
'Metrics: '
{'FactualCorrectness': 0.0,
 'Faithfulness': 1.0,
 'LLMContextRecall': 0.0,
 'SemanticSimilarity': 0.7259663840810067}
'\n---\n'
'Final Generation: '
('Dr. Carolyn C. Matheus is an Associate Professor of Information Systems and '
 'the Director of the Honors Program at Marist College. She holds a PhD in '
 'Organizational Studies with a focus on leadership from SUNY Albany and has '
 'received the National Society of Leadership and Success award for Excellence '
 'in Teaching. Dr. Matheus is involved in faculty-student research projects '
 'and offers seminars on authentic leadership and innovation.')
'Final Metrics

In [65]:
# Put CRAG into Gradio
evaluationSamples = []
ragResults = []
async def correctiveRAG(userQuery, history, correctAnswer=None):
    """
    Gradio-compatible function to process CRAG workflow.
    Args:
        userQuery (str): The user's question.
        history (list): Conversation history.
        correctAnswer (str): The ground truth answer for metrics (optional).

    Returns:
        tuple: (chatDisplay, history)
    """

    # Variables
    inputs = {"question": userQuery, "visitedInternet": False}
    finalOutput = None
    internetFlag = False

    # Start the workflow
    async for output in app.astream(inputs):

        # Saving final output for metric purposes
        finalOutput = output

        # Check if the process went to the internet
        if "webSearchNode" in output:
            internetFlag = True

        # Printing out each node state for clarity
        for key, value in output.items():

            # Print node
            pprint(f"Node '{key}':")

            # Print metrics
            if "metrics" in value:
                pprint("Metrics: ")
                pprint(value["metrics"])
    
    # Get the generation and its metrics
    finalNodeKey = list(finalOutput.keys())[-1]  # Get the key of the last executed node
    nodeOutput = finalOutput[finalNodeKey]  # Access the nested state
    generation = nodeOutput.get("generation", "No generation produced.")
    metrics = nodeOutput.get("metrics", {})

    # Update history
    if history is None:
        history = []
    
    history.extend([
        {"role": "user", "content": userQuery},
        {"role": "llm", "content": generation}
    ])

    # Display output for gradio
    #chatDisplay = [(msg["content"], "User" if msg["role"] == "user" else "LLM") for msg in history]

    # Append metrics to evaluationSamples for tracking (if correctAnswer is provided)
    if correctAnswer:
        evaluationSamples.append({
            "user_input": userQuery,
            "retrieved_contexts": [doc.page_content for doc in finalOutput.get("documents", [])],
            "response": generation,
            "reference": correctAnswer,
            "metrics": metrics,
        })
    
    # Save the result of the query with the metrics 
    ragResults.append({
        "question": userQuery,
        "generation": generation,
        "retry": internetFlag
    })

    return history #, chatDisplay

In [None]:
# Gradio frontend
interface = gr.Interface(
    fn=correctiveRAG,
    inputs=[
        gr.Textbox(label="Ask a Question", placeholder="Enter your question here..."),
        gr.State(),  # Keeps track of conversation history
        gr.Textbox(label="Correct Answer (Optional)", placeholder="For evaluation purposes..."),
    ],
    outputs=[
        gr.Chatbot(label="CorrectiveRAG Conversation"),
        gr.State(),  # Updates conversation history
    ],
    title="CorrectiveRAG Implementation",
    description="Interact with the CRAG workflow for document-grounded question answering.",
)

# Launch the interface
interface.launch()



* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




---RETRIEVE---
"Node 'retrieve':"
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
"Node 'gradeDocuments':"
---GENERATE---
"Node 'generate':"
---EVALUATING METRICS---
"Node 'evaluateMetrics':"
'Metrics: '
{'FactualCorrectness': 0.0,
 'Faithfulness': 1.0,
 'LLMContextRecall': 0.0,
 'SemanticSimilarity': 0.7259663840810067}


In [61]:
# Function to evaluate our RAG pipeline when given ground truth
async def pipelineEvaluation(dataset, metrics):

    # Run through our runs
    results = []
    for run in dataset:

        # Save our inputs/outputs
        inputQuery = run["user_input"]
        groundTruthAnswer = run["reference"]
        contexts = run["retrieved_contexts"]
        response = run["response"]

        # Create a SingleTurnSample object
        sample = SingleTurnSample(
            user_input=inputQuery,
            response=response,
            reference=groundTruthAnswer,
            retrieved_contexts=contexts 
        )

        # Evaluate metrics
        runResults = {"question": inputQuery}
        for metric in metrics:

            # Get the score for the given metric
            try:

                score = await metric.single_turn_ascore(sample)
                runResults[type(metric).__name__] = score

            except Exception as e:
                # Catch errors for debugging
                runResults[type(metric).__name__] = f"Error: {str(e)}"
        
        # Save metric results
        results.append(runResults)
    
    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__
        scores = [result[metricName] for result in results if isinstance(result[metricName], (int, float))]
        
        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }
            
        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }
    
    return results, metricsStats

In [44]:
# Load metrics
evalMetrics = [LLMContextRecall(llm=LangchainLLMWrapper(llm)), 
               FactualCorrectness(llm=LangchainLLMWrapper(llm)), 
               Faithfulness(llm=LangchainLLMWrapper(llm)), 
               SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()))]

In [21]:
# Evaluate our pipeline responses
evalResults = await pipelineEvaluation(evaluationSamples, evalMetrics)
for result in evalResults:
    print(result)

{'input_query': 'Who is Carolyn Matheus?', 'LLMContextRecall': 0.0, 'FactualCorrectness': 0.0, 'Faithfulness': 0.0, 'SemanticSimilarity': 0.928909234587568}


In [74]:
# Sample 222 records from our dataset
maristTestSample = maristQA.sample(100, replace=False)
maristTestSample.head()

Unnamed: 0,0,1
586,How to avoid spams,"""\u00a0Impersonation emails\u00a0Robocalls cla..."
587,What is the phone number for student financial...,"""Transfer Student Admission Department Student..."
568,What happens to the funds in my Marist Money a...,"""Student Services Department Card Services Nam..."
103,When is the the office of financial services o...,"""Job Location and Development Information (Fox..."
105,tell me about marist counseling,"""Counseling Services Department Counseling Ser..."


In [None]:
# Run our chain with each question and evaluate
chatHistory = None
for row in maristTestSample.iterrows():
    chatHistory = await correctiveRAG(row[1][0], chatHistory, row[1][1])

## Evaluation
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, evalMetrics)
for result in evalResults:
    print(result)

for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

---RETRIEVE---
"Node 'retrieve':"
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
"Node 'gradeDocuments':"
---GENERATE---
"Node 'generate':"
---EVALUATING METRICS---
"Node 'evaluateMetrics':"
'Metrics: '
{'FactualCorrectness': 0.0,
 'Faithfulness': 1.0,
 'LLMContextRecall': 0.0,
 'SemanticSimilarity': 0.7596856019922399}
---RETRIEVE---
"Node 'retrieve':"
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
"Node 'gradeDocuments':"
---GENERATE---
"Node 'generate':"
---EVALUATING METRICS---
"Node 'evaluateMetrics':"
'Metrics: '
{'FactualCorrectness': 0.0,
 'Faithfulness': 1.0,
 'LLMContextRecall': 0.0,
 'SemanticSimilarity': 0.7045918562747744}
---RETRIEVE---
"Node 'retrieve':"
---CHECK DOCUMENT RELEVANCE TO QUE

The LLM did not return a valid classification.


"Node 'evaluateMetrics':"
'Metrics: '
{'FactualCorrectness': 0.0,
 'Faithfulness': 1.0,
 'LLMContextRecall': nan,
 'SemanticSimilarity': 0.7383440103663941}
---RETRIEVE---
"Node 'retrieve':"
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---
"Node 'gradeDocuments':"
---TRANSFORM QUERY---
"Node 'transformQuery':"
---WEB SEARCH---
"Node 'webSearchNode':"
---GENERATE---
"Node 'generate':"
---EVALUATING METRICS---
"Node 'evaluateMetrics':"
'Metrics: '
{'FactualCorrectness': 0.0,
 'Faithfulness': 1.0,
 'LLMContextRecall': 0.0,
 'SemanticSimilarity': 0.7349868099610156}
---RETRIEVE---
"Node 'retrieve':"
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: ALL D

Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=5dd6db36-90e3-4207-b4ac-c7d41cf5ba24,id=5dd6db36-90e3-4207-b4ac-c7d41cf5ba24; trace=5dd6db36-90e3-4207-b4ac-c7d41cf5ba24,id=5589d19a-9bc9-485e-bc90-ca775f49d5d7; trace=5dd6db36-90e3-4207-b4ac-c7d41cf5ba24,id=61a27edd-012b-497d-ac66-b1096d2dbae1; patch: trace=dabfbcec-c8bb-43c0-82d7-def740475b96,id=cbe9cc23-f9c6-4b37-98e9-7be1960274a6; trace=dabfbcec-c8bb-43c0-82d7-def740475b96,id=dabfbcec-c8bb-43c0-82d7-def740475b96; trace=dabfbcec-c8bb-43c0-82d7-def740475b96,id=b1101fbd-dbfb-4827-9b48-019d595c0283
Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests f

{'question': 'What GPA do I need for grad school?', 'LLMContextRecall': 1.0, 'FactualCorrectness': 0.11, 'Faithfulness': 0.0, 'SemanticSimilarity': 0.8139424641650321}
{'question': 'Who published Displacements and Transformations in Caribbean Cultures?', 'LLMContextRecall': 0.16666666666666666, 'FactualCorrectness': 0.14, 'Faithfulness': 0.0, 'SemanticSimilarity': 0.8410372116485767}
{'question': 'Who is the chair of the english department?', 'LLMContextRecall': 0.16666666666666666, 'FactualCorrectness': 0.0, 'Faithfulness': 0.0, 'SemanticSimilarity': 0.793254676322769}
{'question': 'What are some of the academic resources I can leverage?', 'LLMContextRecall': 0.0, 'FactualCorrectness': 0.74, 'Faithfulness': 0.0, 'SemanticSimilarity': 0.8964862653975958}
{'question': 'Andrew Tokash Classes?', 'LLMContextRecall': 0.0, 'FactualCorrectness': 0.07, 'Faithfulness': 0.0, 'SemanticSimilarity': 0.8689408453581797}
{'question': "Who's the person to go to for Latin American and Caribbean Studies

Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=a6fa4e94-d088-49e5-b7ac-e8fbd6522ee9; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=5f062a7b-982d-41fd-88ec-f7d7cf611cad; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=44007c99-0a77-4b1a-9c62-66257e6e8d80; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=870fcd51-3f00-48b9-97d9-ee86222ff501; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=7daee0f7-541e-447d-8eb1-be2e3ed8b89a; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=dd80434a-c0ad-4757-9a11-c87f717ff589; trace=e568f6a0-bbcd-45cd-9cbb-12462a59b0ec,id=06968678-e7f0-42ca-992b-db64dac1144c; trace=e568f6a0-bbcd-45cd

In [76]:
# Make our dataframe of results
processResults = pd.DataFrame(ragResults)
evalResultsDF = pd.DataFrame(evalResults)
finalResultsDF = processResults.merge(evalResultsDF, on="question", how="left")
finalResultsDF

Unnamed: 0,question,generation,retry,LLMContextRecall,FactualCorrectness,Faithfulness,SemanticSimilarity
0,What GPA do I need for grad school?,You typically need a minimum overall GPA of 3....,False,1.000000,0.11,0.0,0.813942
1,What GPA do I need for grad school?,You typically need a minimum overall GPA of 3....,False,0.500000,0.10,0.0,0.813942
2,Who published Displacements and Transformation...,"The publisher of the book ""Displacements and T...",True,0.166667,0.14,0.0,0.841037
3,Who is the chair of the english department?,The current chair of the English department is...,True,0.166667,0.00,0.0,0.793255
4,What are some of the academic resources I can ...,"You can leverage several academic resources, i...",False,0.000000,0.74,0.0,0.896486
...,...,...,...,...,...,...,...
173,Where is the advising and academic services ce...,The advising and academic services center is l...,False,0.000000,0.29,0.0,0.876235
174,How to use iLearn,"To use iLearn, start by watching the short ""On...",False,0.000000,0.50,0.0,0.911958
175,Janine Peterson job?,Janine Peterson currently works at the Nationa...,True,1.000000,0.00,0.0,0.806813
176,What about Blackridge technology,Blackridge technology focuses on cybersecurity...,True,0.200000,0.00,0.0,0.727944


In [77]:
# Averages & St. devs
print(f"LLM Context Recall - Mean: {finalResultsDF["LLMContextRecall"].mean()}, St. Dev: {finalResultsDF["LLMContextRecall"].std()}")
print(f"FactualCorrectness - Mean: {finalResultsDF["FactualCorrectness"].mean()}, St. Dev: {finalResultsDF["FactualCorrectness"].std()}")
print(f"Faithfulness - Mean: {finalResultsDF["Faithfulness"].mean()}, St. Dev: {finalResultsDF["Faithfulness"].std()}")
print(f"SemanticSimilarity - Mean: {finalResultsDF["SemanticSimilarity"].mean()}, St. Dev: {finalResultsDF["SemanticSimilarity"].std()}")


LLM Context Recall - Mean: 0.3502828980522305, St. Dev: 0.38078152337104065
FactualCorrectness - Mean: 0.229438202247191, St. Dev: 0.21680191980388025
Faithfulness - Mean: 0.0025280898876404497, St. Dev: 0.023930497237328208
SemanticSimilarity - Mean: 0.861302026325832, St. Dev: 0.05792963836449268


In [81]:
final100ResultsDF = finalResultsDF.tail(100)

In [82]:
# Dervive generations that didn't use the internet
trueContextsDF = final100ResultsDF[final100ResultsDF["retry"] == False]
trueContextsDF.describe()

Unnamed: 0,LLMContextRecall,FactualCorrectness,Faithfulness,SemanticSimilarity
count,55.0,55.0,55.0,55.0
mean,0.374025,0.356,0.0,0.888085
std,0.395259,0.213772,0.0,0.049715
min,0.0,0.0,0.0,0.77336
25%,0.0,0.195,0.0,0.85718
50%,0.25,0.36,0.0,0.892533
75%,0.65,0.5,0.0,0.925828
max,1.0,0.78,0.0,0.971669


In [83]:
trueContextsDF.head(100)

Unnamed: 0,question,generation,retry,LLMContextRecall,FactualCorrectness,Faithfulness,SemanticSimilarity
78,Do I have to research something specific for m...,"No, you do not have to research something spec...",False,0.571429,0.24,0.0,0.779964
82,What is accommodations and accessibility?,Accommodations and accessibility refer to the ...,False,0.0,0.57,0.0,0.871183
86,How many hours of clinical field experience is...,The Clinical Mental Health Counseling program ...,False,0.25,0.3,0.0,0.884294
88,Where can you do internships,You can do internships in New York City and th...,False,1.0,0.5,0.0,0.889595
89,Admissions contact?,You can contact the Office of Graduate Admissi...,False,0.0,0.0,0.0,0.857182
90,School of Communications internship opportunities,The School of Communication and the Arts offer...,False,0.9,0.35,0.0,0.919466
91,on what principles is the college's education ...,The college's education is grounded on princip...,False,0.454545,0.19,0.0,0.847089
93,How can I learn about the term abroad program?,"You can learn about the term ""abroad program"" ...",False,0.5,0.44,0.0,0.898406
94,Dr. Nicholas Marshall,Dr. Nicholas Marshall is an Associate Professo...,False,0.0,0.55,0.0,0.920979
95,MA clinical mental health,The Master of Arts in Clinical Mental Health C...,False,0.0,0.48,0.0,0.928664


# Corrective RAG Process w/ LangChain

In [3]:
# Create vector DB

## Load Data
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Cleaned_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)

## To use RecursiveCharacterTextSplitter, we need a list of dictionaries
maristContext = [Document(page_content=text) for text in maristQA[1].tolist()]

## Split Documents into Chunks
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(maristContext)

## Store Documents in Vector DB (Chroma)
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

In [4]:
# Define LLM 
llm = ChatOpenAI(model="gpt-4o-mini", max_tokens=1000, temperature=0)

In [5]:
# Search Tool
search = DuckDuckGoSearchResults()

In [6]:
# LLM chains for retrieval evaluator, knowledge refinement, & query rewriter

## Retrieval Evaluator
class RetrievalEvaluatorInput(BaseModel):
    relevance_score: float = Field(..., description="The relevance score of the document to the query. the score should be between 0 and 1.")

def retrieval_evaluator(query: str, document: str) -> float:
    prompt = PromptTemplate(
        input_variables=["query", "document"],
        template="On a scale from 0 to 1, how relevant is the following document to the query? Query: {query}\nDocument: {document}\nRelevance score:"
    )
    chain = prompt | llm.with_structured_output(RetrievalEvaluatorInput)
    input_variables = {"query": query, "document": document}
    result = chain.invoke(input_variables).relevance_score
    return result

## Knowledge Refinement
class KnowledgeRefinementInput(BaseModel):
    key_points: str = Field(..., description="The document to extract key information from.")

def knowledge_refinement(document: str) -> List[str]:
    prompt = PromptTemplate(
        input_variables=["document"],
        template="Extract the key information from the following document in bullet points:\n{document}\nKey points:"
    )
    chain = prompt | llm.with_structured_output(KnowledgeRefinementInput)
    input_variables = {"document": document}
    result = chain.invoke(input_variables).key_points
    return [point.strip() for point in result.split('\n') if point.strip()]

## Web Search Query Rewriter
class QueryRewriterInput(BaseModel):
    query: str = Field(..., description="The query to rewrite.")
    
def rewrite_query(query: str) -> str:
    prompt = PromptTemplate(
        input_variables=["query"],
        template="Rewrite the following query to make it more suitable for a web search:\n{query}\nRewritten query:"
    )
    chain = prompt | llm.with_structured_output(QueryRewriterInput)
    input_variables = {"query": query}
    return chain.invoke(input_variables).query.strip()

In [7]:
# Helper function to parse search results
def parse_search_results(results_string: str) -> List[Tuple[str, str]]:
    """
    Parse a JSON string of search results into a list of title-link tuples.

    Args:
        results_string (str): A JSON-formatted string containing search results.

    Returns:
        List[Tuple[str, str]]: A list of tuples, where each tuple contains the title and link of a search result.
                               If parsing fails, an empty list is returned.
    """
    try:
        # Attempt to parse the JSON string
        results = json.loads(results_string)
        # Extract and return the title and link from each result
        return [(result.get('title', 'Untitled'), result.get('link', '')) for result in results]
    except json.JSONDecodeError:
        # Handle JSON decoding errors by returning an empty list
        print("Error parsing search results. Returning empty list.")
        return []

In [8]:
# Define CRAG "nodes"
def retrieve_documents(query: str, vectorstore, k: int = 3) -> List[str]:
    """
    Retrieve documents based on a query using a FAISS index.

    Args:
        query (str): The query string to search for.
        faiss_index (FAISS): The FAISS index used for similarity search.
        k (int): The number of top documents to retrieve. Defaults to 3.

    Returns:
        List[str]: A list of the retrieved document contents.
    """
    docs = vectorstore.similarity_search(query, k=k)
    return [doc.page_content for doc in docs]

def evaluate_documents(query: str, documents: List[str]) -> List[float]:
    """
    Evaluate the relevance of documents based on a query.

    Args:
        query (str): The query string.
        documents (List[str]): A list of document contents to evaluate.

    Returns:
        List[float]: A list of relevance scores for each document.
    """
    return [retrieval_evaluator(query, doc) for doc in documents]

def perform_web_search(query: str) -> Tuple[List[str], List[Tuple[str, str]]]:
    """
    Perform a web search based on a query.

    Args:
        query (str): The query string to search for.

    Returns:
        Tuple[List[str], List[Tuple[str, str]]]: 
            - A list of refined knowledge obtained from the web search.
            - A list of tuples containing titles and links of the sources.
    """
    rewritten_query = rewrite_query(query)
    web_results = search.run(rewritten_query)
    web_knowledge = knowledge_refinement(web_results)
    sources = parse_search_results(web_results)
    return web_knowledge, sources

def generate_response(query: str, knowledge: str, sources: List[Tuple[str, str]]) -> str:
    """
    Generate a response to a query using knowledge and sources.

    Args:
        query (str): The query string.
        knowledge (str): The refined knowledge to use in the response.
        sources (List[Tuple[str, str]]): A list of tuples containing titles and links of the sources.

    Returns:
        str: The generated response.
    """
    response_prompt = PromptTemplate(
        input_variables=["query", "knowledge", "sources"],
        template="Based on the following knowledge, answer the query. Include the sources with their links (if available) at the end of your answer:\nQuery: {query}\nKnowledge: {knowledge}\nSources: {sources}\nAnswer:"
    )
    input_variables = {
        "query": query,
        "knowledge": knowledge,
        "sources": "\n".join([f"{title}: {link}" if link else title for title, link in sources])
    }
    response_chain = response_prompt | llm
    return response_chain.invoke(input_variables).content


In [9]:
# Corrective RAG Process w/ LangChain
def correctiveRAGLangChain(query: str, vectorstore) -> str:
    """
    Process a query by retrieving, evaluating, and using documents or performing a web search to generate a response.

    Args:
        query (str): The query string to process.
        vectorstore: The vector DB (Chroma) used for document retrieval.

    Returns:
        str: The generated response based on the query.
    """
    print(f"\nProcessing query: {query}")

    # Retrieve and evaluate documents
    retrieved_docs = retrieve_documents(query, vectorstore)
    eval_scores = evaluate_documents(query, retrieved_docs)
    
    print(f"\nRetrieved {len(retrieved_docs)} documents")
    print(f"Evaluation scores: {eval_scores}")

    # Determine action based on evaluation scores
    max_score = max(eval_scores)
    sources = []
    
    if max_score > 0.7:
        print("\nAction: Correct - Using retrieved document")
        best_doc = retrieved_docs[eval_scores.index(max_score)]
        final_knowledge = best_doc
        sources.append(("Retrieved document", ""))
    elif max_score < 0.3:
        print("\nAction: Incorrect - Performing web search")
        final_knowledge, sources = perform_web_search(query)
    else:
        print("\nAction: Ambiguous - Combining retrieved document and web search")
        best_doc = retrieved_docs[eval_scores.index(max_score)]
        # Refine the retrieved knowledge
        retrieved_knowledge = knowledge_refinement(best_doc)
        web_knowledge, web_sources = perform_web_search(query)
        final_knowledge = "\n".join(retrieved_knowledge + web_knowledge)
        sources = [("Retrieved document", "")] + web_sources

    print("\nFinal knowledge:")
    print(final_knowledge)
    
    print("\nSources:")
    for title, link in sources:
        print(f"{title}: {link}" if link else title)

    # Generate response
    print("\nGenerating response...")
    response = generate_response(query, final_knowledge, sources)

    print("\nResponse generated")
    return response

In [14]:
# Sample a record from our dataset
maristTestSample = maristQA.sample(1, replace=False)
maristTestSample.head()

Unnamed: 0,0,1
594,Are DMA members eligible for special tuition p...,Community Organizations Department Graduate Ad...


In [None]:
# Make sure this works by itself first

## Run our chain with a question
for row in maristTestSample.iterrows():
    response = correctiveRAGLangChain(row[1][0], vectorDB)
    print(f"Generation: {response}")




Processing query: Are DMA members eligible for special tuition pricing?

Retrieved 3 documents
Evaluation scores: [1.0, 1.0, 1.0]

Action: Correct - Using retrieved document

Final knowledge:
for innovators, artists, creators, writers, analysts, and strategists of all things related to marketing powered by data. They provide innovation that accelerates business growth and brand experiences as well as education that develops talent and insights into best practices.The DMA represents thousands of companies and nonprofit organizations that use and support data-driven marketing practices and techniques. Their members span the marketing ecosystem and have the passion for accelerating industry diversity and innovation. The Data & Marketing Association supports and drives their members to reach their fullest potential.As a way of saying \thank you\ to the many DMA members who reside in our community, Marist College is offering special tuition pricing for all members and their immediate adult

In [10]:
# Modify the function to support RAGAS evaluation
evaluationSamples = []
def correctiveRAGLangChain(query: str, vectorstore, correctAnswer: str) -> str:
    """
    Process a query by retrieving, evaluating, and using documents or performing a web search to generate a response.

    Args:
        query (str): The query string to process.
        vectorstore: The vector DB (Chroma) used for document retrieval.

    Returns:
        str: The generated response based on the query.
    """
    print(f"\nProcessing query: {query}")

    # Retrieve and evaluate documents
    retrieved_docs = retrieve_documents(query, vectorstore)
    eval_scores = evaluate_documents(query, retrieved_docs)
    
    print(f"\nRetrieved {len(retrieved_docs)} documents")
    print(f"Evaluation scores: {eval_scores}")

    # Determine action based on evaluation scores
    max_score = max(eval_scores)
    sources = []
    
    if max_score > 0.7:
        print("\nAction: Correct - Using retrieved document")
        best_doc = retrieved_docs[eval_scores.index(max_score)]
        final_knowledge = best_doc
        sources.append(("Retrieved document", ""))
    elif max_score < 0.3:
        print("\nAction: Incorrect - Performing web search")
        final_knowledge, sources = perform_web_search(query)
    else:
        print("\nAction: Ambiguous - Combining retrieved document and web search")
        best_doc = retrieved_docs[eval_scores.index(max_score)]
        # Refine the retrieved knowledge
        retrieved_knowledge = knowledge_refinement(best_doc)
        web_knowledge, web_sources = perform_web_search(query)
        final_knowledge = "\n".join(retrieved_knowledge + web_knowledge)
        sources = [("Retrieved document", "")] + web_sources

    print("\nFinal knowledge:")
    print(final_knowledge)
    
    print("\nSources:")
    for title, link in sources:
        print(f"{title}: {link}" if link else title)

    # Generate response
    print("\nGenerating response...")
    response = generate_response(query, final_knowledge, sources)

    # Save output
    if type(final_knowledge) == str:  # Needs to be in a list format for subsequent evaluation
        final_knowledge = [final_knowledge]

    if correctAnswer:
        evaluationSamples.append({
            "user_input": query,
            "retrieved_contexts": final_knowledge,
            "response": response,
            "reference": correctAnswer
        })

    print("\nResponse generated")
    return response

In [16]:
# Function to evaluate our RAG pipeline when given ground truth

async def evaluateSample(sample, metrics):
    '''Helper function for pipelineEvaluation that evaluates a sample using multithreading'''

    # Evaluate metrics
    runResults = {"question": sample.user_input}
    for metric in metrics:

        # Get the score for the given metric
        try:

            print(f"Data being passed: {sample}")
            score = await metric.single_turn_ascore(sample)
            runResults[type(metric).__name__] = score
            print(f"Score for {type(metric).__name__}: {score}")

        except Exception as e:
            # Catch errors for debugging
            runResults[type(metric).__name__] = f"Error: {str(e)}"
    
    return runResults

def multithreadedEvaluation(sample, metrics, queue):
    '''Helper function that runs the async evaluation within a thread'''

    # Allows for asynchronous thread compatibility
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    results = loop.run_until_complete(evaluateSample(sample, metrics))

    queue.put(results)

async def pipelineEvaluation(dataset, metrics):
    '''Function that takes in a dataset of input and output and metrics to evaluate 
    the quality of a RAG process using RAGAS
    '''
    
    # Run through our runs
    results = []
    resultQueue = Queue()  # intermediary queue for thread data
    threads = []  
    for run in dataset:

        # Save our inputs/outputs
        inputQuery = run["user_input"]
        groundTruthAnswer = run["reference"]
        contexts = run["retrieved_contexts"]
        response = run["response"]

        # Create a SingleTurnSample object
        sample = SingleTurnSample(
            user_input=inputQuery,
            response=response,
            reference=groundTruthAnswer,
            retrieved_contexts=contexts 
        )

        # Create and start a threat for paralleization of metrics
        thread = threading.Thread(target=multithreadedEvaluation, args=(sample, metrics, resultQueue))
        threads.append(thread)
        thread.start()
    
    # Wait for the threads to finish
    for thread in threads:
        thread.join()
    
    # Get the results
    while not resultQueue.empty():
        results.append(resultQueue.get())
    
    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__
        scores = [result[metricName] for result in results if isinstance(result[metricName], (int, float))]
        
        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }
            
        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }
    
    return results, metricsStats

In [11]:
# Function to evaluate our RAG pipeline when given ground truth
async def pipelineEvaluation(dataset, metrics):

    # Run through our runs
    results = []
    for run in dataset:

        # Save our inputs/outputs
        inputQuery = run["user_input"]
        groundTruthAnswer = run["reference"]
        contexts = run["retrieved_contexts"]
        response = run["response"]

        # Create a SingleTurnSample object
        sample = SingleTurnSample(
            user_input=inputQuery,
            response=response,
            reference=groundTruthAnswer,
            retrieved_contexts=contexts 
        )

        # Evaluate metrics
        runResults = {"question": inputQuery}
        for metric in metrics:

            # Get the score for the given metric
            try:

                score = await metric.single_turn_ascore(sample)
                runResults[type(metric).__name__] = score

            except Exception as e:
                # Catch errors for debugging
                runResults[type(metric).__name__] = f"Error: {str(e)}"
        
        # Save metric results
        results.append(runResults)
    
    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__
        scores = [result[metricName] for result in results if isinstance(result[metricName], (int, float))]
        
        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }
            
        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }
    
    return results, metricsStats

In [12]:
# Load metrics
evalMetrics = [LLMContextRecall(llm=LangchainLLMWrapper(llm)), 
               FactualCorrectness(llm=LangchainLLMWrapper(llm)), 
               Faithfulness(llm=LangchainLLMWrapper(llm)), 
               SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()))]

#evalMetrics = [LLMContextRecall(llm=LangchainLLMWrapper(llm)), Faithfulness(llm=LangchainLLMWrapper(llm)), SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()))]

In [13]:
# Sample records from our dataset
maristTestSample = maristQA.sample(50, replace=False)
maristTestSample.head()

Unnamed: 0,0,1
470,Who is Desiree Dighton?,Contact InformationAcademic SchoolOfficeEmailP...
359,How can I find out more about the Clinical hea...,Master of Arts in Clinical Mental Health Couns...
546,Who published Displacements and Transformation...,Contact InformationAcademic SchoolOfficeEmailP...
103,What is the number for financial services?,Transfer: Information for Accepted Transfer St...
111,What educational areas does Marist give master...,Master of Arts in Educational Psychology Depar...


In [14]:
# Run our chain with a question and evaluate
for row in maristTestSample.iterrows():
    response = correctiveRAGLangChain(row[1][0], vectorDB, row[1][1])
    print(f"Generation: {response}")
    
print("Generations done!\n")


Processing query: Who is Desiree Dighton?

Retrieved 3 documents
Evaluation scores: [0.1, 0.1, 0.1]

Action: Incorrect - Performing web search


  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Dr. Desiree Dighton is an assistant professor in the English department.', '- A major concern for Dighton is building student confidence in using generative AI tools.', '- Dighton emphasizes the importance of critical thinking and writing skills for students.', '- Ethan Moseley, a BTC certificate student, received the Gold Scholarship from the ECU College of Business.', '- Moseley will start the MA in English program alongside the MBA program in Spring 2025.', '- Dighton and Brent Henze have mentored Moseley and provided feedback on his scholarship materials.', '- Dighton worked with students on designing book reviews for the NCLR winter issue.', '- The winter issue includes over a hundred pages of content, including poetry and creative nonfiction.', "- Dighton praised Sylvia's research and writing on data and society, calling it relevant and approachable for undergraduates."]

Sources:

Generating response...

R

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Title: Transformations in Caribbean Cultures
- Publisher: University Press of Florida
- Publication Date: Spring 2008
- Title: Travelers Possessed: Generic Hybrids and the Caribbean
- Context: Between Anthropology and Literature: Interdisciplinary Discourse
- Publisher: Routledge Press
- Publication Date: 2002
- Pages: 248-266
- Title: El drama del silencio: Renacimiento ritual en la literatura caribena
- Context: Cultural (Con) fusion? Trans-Caribbean Performers and Performance
- Publisher: Caribe 2000 Series
- Publication Date: 2002
- Pages: 49-63
- Title: Women Adrift: Madwomen, Matriarchs, and the Caribbean
- Context: Women at Sea: Travel Writing and the Margins of Caribbean Discourse
- Publisher: Palgrave/St. Martin's Press
- Publication Date: 2001
- Pages: 135-160
- Title: Sorcerers, She-Devils, and Shipwrecked Women: Writing Religion in French Caribbean Literature
- Context: Sacred Possessions: Vodou, Santer

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Department: Student Financial Services
- Email: studentfinancialservices@marist.edu
- Phone: 1-800-436-5483 or 845-575-3230
- Fax: 845-575-3099
- Location: Donnelly Hall 200
- Office Hours:
- Monday - Thursday: 8:00 am - 5:00 pm
- Friday: 8:00 am - 4:00 pm
- Focus on providing best value for graduate education investment
- Commitment to affordability options and equitable distribution of student aid funds
- Coordination of student employment on and off-campus
- Responsible for student billing
- Dedicated to promoting accessibility through personalized service and integrity.
- Membership in regulated DTCC subsidiaries is generally required to use most services.
- Alphabetical listings of participants, settling banks, and depository facilities are publicly available for free.
- Information updated as of January 2, 2025.
- Federal Reserve Financial Services announced 2025 fees and payment system enhancements.
- 2025 h

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Marist College offers advanced degrees (masters and doctorate) and several advanced certificates.', '- Areas of study include accounting, allied health, business, computer science, communication, education, psychology, and public administration.', '- Prospective students can find a complete list of programs on the Graduate Admission web page.', '- More than 40,000 alumni and alumnae are associated with Marist College.', "- The college offers 47 bachelor's degree programs, 11 master's degree programs, 1 doctoral program, and 4 professional certificate programs.", '- Marist College is committed to providing educational opportunities for adults that accommodate their work schedules.', "- Graduate education programs are linked to the College's Mission and focus on teacher preparation.", '- Marist College is a private graduate school located in Poughkeepsie, New York.', '- The graduate student body consists of 954 stu

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Students from various departments have completed internships in their chosen fields.
- Departments include: Art & Digital Media, Communication, Fashion, Film, TV, Games, and Emerging Media.
- Internships provide opportunities to apply studies in a professional environment.
- Notable companies where students have interned:
- Hearst Corporation
- Harpers Bazaar
- Conde Nast
- Glamour Magazine
- Allure Magazine
- Madison Square Garden Company
- Marvel Entertainment
- MTV Networks
- Nickelodeon
- ABC News
- CBS Sports
- Dateline NBC
- BCBG
- Chanel
- Coach
- Michael Kors
- The School of Computer Science and Mathematics has established corporate partnerships and professional relationships with various organizations.
- Delta Airlines is offering a paid Summer 2025 internship for undergraduate students in the Commercial Strategy Analyst (CSA) Program.
- The internship aims to develop high-potential analysts by enhancing t

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- The Gallup Resident Agency of the FBI investigated a case with assistance from the Navajo Police Department, New Mexico State Police, and Gallup Police Department.', '- Assistant U.S. Attorney Nicholas Marshall is prosecuting the case.', '- Torrez was described by Nicholas Marshall as "arguably the second most culpable" in a sentencing memorandum.', '- Torrez did not admit to any involvement during interrogation.', '- A man was sentenced to 17 years for a 2018 killing, with the sentence set in 2023.', '- Nicholas Marshall also mentioned Torrez in a sentencing memorandum for Luis Mariscal-Lopez.', '- Another individual was sentenced to 12 years in prison for a 2018 kidnapping.', '- Nicholas Marshall is a second-year architecture major and director of student organizations at NJIT, working on club system reforms.']

Sources:

Generating response...

Response generated
Generation: Nicholas Marshall is an Assistant U

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Confirmed positive COVID-19 result by a viral test or clinical diagnosis required for reporting.', '- High-risk exposure period defined as two days before symptoms develop until certain conditions are met.', '- Students encouraged to complete the COVID-19 Report Form after testing positive.', '- Students not required to notify others about a positive test; notifications handled per government protocols.', '- Isolation and exposure guidelines provided for campus members not affiliated with UI Health.', '- UI Health faculty, staff, and students on clinical rotations must follow specific guidance.', '- Employees and students should report if they receive a positive test result or are a Close Contact to a COVID-19 positive person within the past 10 days.']

Sources:

Generating response...

Response generated
Generation: To report a COVID-19 issue on campus, you should follow these steps:

1. **Confirm Positive Resul

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Kathleen Rita McNulty was born on February 12, 1921, in Creeslough, Ireland.', '- She was the third of six children of Anne Nelis and James McNulty.', '- McNulty graduated as one of the few female math majors in 1942.', '- She was a programmer and computer scientist, known as Kathleen "Kay" McNulty.', '- McNulty initially spoke only Irish when she came to the US.', '- She became a pioneering computer programmer in the 20th century.', '- Her family was from a Gaeltacht region, which is an Irish-speaking area.', '- Kathleen was married first to Mauchly and later to Antonelli.']

Sources:

Generating response...

Response generated
Generation: Kathleen McNulty graduated from the College of Saint Elizabeth in 1942, where she was one of the few female math majors at the time. 

Sources:
- [Wikipedia - Kathleen McNulty](https://en.wikipedia.org/wiki/Kathleen_McNulty)

Processing query: Tell me about women leadership.



  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Marist College is dedicated to preparing students for challenges in a changing global society.
- The Marist Core equips students with essential 21st-century skills.
- Key components of the curriculum include:
- First Year Seminar
- Philosophical Perspectives Course
- Ethics and Justice Course
- Pathway Capping Course
- Writing for College Course
- Public Presentation-intensive Course
- Mathematics Requirement (Quantitative FYS and/or Pathway Courses)
- Natural Science Requirement (Science-related FYS and Pathway Courses)
- Technology-intensive Requirement (First Year Seminar, Writing for College Course, Writing-intensive Requirement, Capping Course)
- Technology-intensive courses help students understand the impact of technology on their lives and the world.
- Sepsis is defined as life-threatening organ dysfunction caused by a dysregulated host response to infection.
- Sepsis can result from infections of any etiol

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Full-time undergraduate students typically enroll in at least 15 semester credit hours during fall and spring semesters.', '- Summer term is shorter, and students are advised not to take more than 12 credit hours during this period.', '- Maximum registration limits: 20 credit hours in Spring and Fall semesters, 16 credit hours in Summer semester (with exceptions for certain students).', '- Students at Fort Campbell campus can qualify for academic distinction by completing at least 12 hours in one term with a minimum GPA of 3.50, or by completing 12 credits in consecutive Fall or Spring terms.', '- Full-time status is defined as enrollment in 12 or more credit hours per term; half-time status is 6 to 11 credit hours.', '- Students cannot register for more than 18 credit hours in a 17-week period without written permission.']

Sources:

Generating response...

Response generated
Generation: The MSPAccy program, lik

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Students will explore areas relevant to physical therapy through a clinical research question.
- Projects involve extensive literature review and may include primary data collection and analysis.
- The course culminates in a substantial final written report and presentation in PHTH 714 Capstone.
- PHTH 709 is a Doctoral Project worth 3 credits.
- The DPT degree at Marist College is a clinical doctorate.
- Students must complete a doctoral project that allows exploration of interests beyond didactic or clinical education.
- The doctoral project is an intensive active learning experience requiring significant planning and implementation effort.
- Each student will be mentored by faculty with expertise in their project area.
- Cellular Medicine is a concept by Dr. Matthias Rath.
- It identifies chronic deficiency of vitamins, minerals, amino acids, and nutrients as the primary cause of common chronic diseases.
- Affec

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- The HSF Scholar Program provides financial aid and comprehensive support services for academic and professional success.', '- Annually, 10,000 students are selected for the HSF Scholar Program.', '- HSF Scholars can receive annual scholarships ranging from $500 to $5,000.', '- Additional support services include networking events, leadership training, career development tools, and mentorship programs.', '- These services aim to help students navigate their academic journeys and prepare for future careers.', '- HSF was co-founded by Cameron Dang and Charles in 2020 to provide educational opportunities.', '- HSF offers various scholarships with different eligibility criteria and award amounts.']

Sources:

Generating response...

Response generated
Generation: HSF stands for the Hispanic Scholarship Fund, which is dedicated to providing financial aid and comprehensive support services to help students achieve acade

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Marist is committed to being an academic resource for students, faculty, and staff.
- Essential tools are provided for students to excel in their education.
- Recognizes that every student learns differently.
- Offers services like the Writing Center for essay and paper assistance.
- Provides classroom accommodations and accessibility for diverse learning styles.
- Includes career services and advising for student support.
- Offers resources for faculty to enhance their teaching and research.
- A student must request a leave of absence or withdraw from Marist College through the Center for Advising and Academic Services.
- Summer Pre-College is a rigorous academic program for rising high school juniors and seniors.
- Marian College was renamed Marist College in 1960.
- In 1969, ownership was transferred to the Marist College Educational Corporation with an independent board of trustees.
- Dr. Dennis J. Murray becam

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- CaaS (Containers as a Service) is sometimes confused with IaaS (Infrastructure as a Service).', '- Both CaaS and IaaS allow users to request and provision computing resources.', '- CaaS architecture consists of distinct layers, with the infrastructure layer providing the necessary physical or virtual resources for running containers.', '- CaaS promotes collaboration and agility in DevOps environments, allowing development and operations teams to work simultaneously on issues.', '- Advantages of CaaS include:', '- Scalability and flexibility for easy packaging, distribution, and management of containers.', '- Cost-effectiveness through optimized resource utilization and a pay-as-you-go model.']

Sources:

Generating response...

Response generated
Generation: CaaS, or Containers as a Service, is a cloud service model that allows users to manage and deploy containerized applications. It is often compared to Infrast

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Students can complete the remaining 12 credits on a part-time distance learning basis in the fall and/or spring.
- Accounting firms hire graduates to start their careers in early September.
- The MSPAccy program addresses the 150-credit requirement for entering accounting professionals.
- Students admitted in the summer can complete up to 60% of degree requirements before starting their accounting careers.
- Remaining credits can be completed through online classes.
- On-campus housing is available for students enrolled in summer courses for the MS in Professional Accountancy program.
- An application for housing will be available upon admission to the program.
- AICPA 2023 report indicates a decline in college accounting graduates during the 2021-2022 academic year.
- Report titled "2023 Trends: A Report on Accounting Education, the CPA Exam and Public Accounting Firms' Hiring of Recent Graduates".
- Approximately

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Students can choose a Pathway to complement their major, such as Spanish for Biology majors or Global Studies for Accounting majors.
- Pathways can help fulfill requirements for minors or second majors.
- Courses in Pathways can enrich the overall academic experience.
- Each Pathway must include 4 courses from at least 3 different disciplinary areas.
- Honors, special-topics, and study-abroad courses can be included in Pathways if approved.
- Steps to develop a career path include outlining career goals through self-reflection and guided questions.
- Importance of active reflection to narrow down career choices.
- Guide provides 15 vital steps to find the right career path in 2024, starting with taking a career test.
- Choosing a career is compared to a jigsaw puzzle, where many pieces must fit together.
- When selecting a university course, ensure it aligns with your career path and interests by reviewing the cour

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- The Channel Tunnel, also known as the Chunnel, is a 50.46 km (31.35-mile) undersea railway tunnel.', '- Opened in 1994, it connects Folkestone (Kent, England) with Coquelles (Pas-de-Calais, France).', '- It is the only fixed link between Great Britain and the European mainland.', '- The tunnel consists of three tunnels: two for rail traffic and one central tunnel for services and security.', '- The Channel Tunnel celebrates its 30th anniversary in 2024.', '- The excavation for the tunnel produced 4.9 million cubic meters of chalk marl and shale, enough to fill Wembley Stadium seven times.', '- The spoil from the excavation contributed to the creation of the Samphire Hoe nature reserve, increasing the size of the UK by 90 acres.']

Sources:

Generating response...

Response generated
Generation: The Channel Tunnel, also known as the Chunnel, was opened in 1994. It connects Folkestone in England with Coquelles in F

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Emanuel Ringelblum Chair in Jewish History recruitment at UC Davis.', '- History Department located at 2216 Social Science and Humanities Building, 1 Shields Avenue, Davis, CA 95616.', '- Undergrad advising and graduate application information available.', '- Charles D. Johnson, Ph.D., is the chair of the history department.', '- Notable historical development: North Carolina Central University expanded its programs, including a graduate program in history.', '- Kathryn Olmsted served as chair of the history department from 2013-2016 and has held various academic positions since 2001.', '- Texas Tech University has an Associate Professor and Chair of the Department of History.', '- Social media presence for the Department of History at Texas Tech University.']

Sources:

Generating response...

Response generated
Generation: The chair of the history department is Charles D. Johnson, Ph.D. 

Sources:
- [UC Davis H

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Opportunity to earn a degree while working full-time.
- Five start opportunities throughout the year:
- Fall Start: August
- Fall Start: October
- Spring Start: January
- Spring Start: March
- Summer Start: June
- Marist MBA program defined by student achievements.
- Aims to help students advance in their current careers or transition to new ones.
- Marist MBA alumni work in prestigious organizations.
- MBA curriculum is cutting-edge and aligned with industry trends.
- Faculty consists of active industry experts.
- Dual degrees are available at most business schools, combining MBA with specialized master's programs or JD MBA programs.
- MBA curriculum is divided into core courses and electives.
- Core MBA courses cover foundational business concepts.
- Managerial accounting is a required course, focusing on costing methods and profitability analysis.
- Topics include Program Evaluation and Review Technique (PERT) f

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- In 1995, the campus theatre in the Student Center was renovated and renamed the Nelly Goletti Theatre in memory of Frank Fusco's late wife.
- The Nelly Goletti Theatre is a notable venue for performances in the U.S. and Europe.
- The campus green adjacent to the Student Center was completed in 1995, serving as a venue for outdoor performances and student activities, and overlooks the Hudson River.
- The campus green is the site of Commencement ceremonies each May and hosted the opening ceremonies of the 2005 Empire State Games.
- The McCann Center was expanded and renovated in 1997, adding 20,000 square feet to accommodate the growing student population.
- The expansion of the McCann Center includes a multi-purpose gym, cardiovascular center, weight training facility, and locker rooms.
- A new office complex was added in the original structure of the McCann Center to house team offices.
- Marist College is located 

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Graduate school offers advanced programs beyond a bachelor's degree.
- Decision to attend graduate school requires time, planning, and support from faculty and staff.
- Important questions to consider:
- Why should I go to graduate school?
- Where do I want to attend graduate school?
- What might I contribute to a graduate program?
- Benefits of graduate school include:
- Higher pay scale for those with advanced degrees compared to those with only a bachelor's degree.
- Mandatory graduate training for certain occupations (e.g., law, medicine, teaching).
- Potential for enhanced career opportunities.
- Graduate school should be pursued with a clear purpose, not as a means to delay job searching.
- Choose a program that facilitates career advancement.
- Attending grad school immediately after undergrad can provide an advantage in securing roles that require a master's degree.
- Graduate school offers networking oppor

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
['- Rover Environmental Monitoring Station (REMS) data certified from Aug 7, 2012, to July 28, 2024 (ongoing).', '- REMS includes sensors for wind, temperature, humidity, and pressure.', '- Artistic representation of Curiosity rover with REMS Booms and Ultraviolet Sensor.', '- Link to REMS data: https://atmos.nmsu.edu/data_and_services/atmospheres_data/MARS/curiosity/rems.html', '- AirData Air Quality Monitors app available on web and mobile for mapping monitor locations and information.', '- Link to Air Quality Monitors interactive map: https://www.epa.gov/outdoor-air-quality-data/interactive-map-air-quality-monitors', '- Lunar Environment Monitoring Station (LEMS) is part of Artemis III mission to explore lunar South Pole, planned for launch in 2026.', '- Artemis III marks the return of humans to the Moon since Apollo program (1969-1972).', '- Link to LEMS information: https://science.nasa.gov/lems/', '- Environmen

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- School of Professional Programs and Graduate & Adult Enrollment established.
- In 1994, a $27 million project completed to create a new Student Center.
- The Student Center features:
- Expansive bookstore
- New dining facilities
- Cabaret
- Dramatic rotunda
- Admissions Office
- A mid-rise residence hall for 382 students in suites was included.
- New offices for Student Affairs and Student Government were part of the project.
- A new set of townhouses for 144 students opened in the same year.
- On-campus graduate housing is more affordable in ultra-urban areas like Chicago, New York, Washington, D.C., and Los Angeles.
- Doctoral students in the Arts and Sciences are eligible for five consecutive academic years of guaranteed student housing if they apply by the deadline, remain registered, and maintain good academic and administrative standing.
- On-campus housing for graduate students at the University at Buffalo i

  ddgs_gen = ddgs.text(


Error parsing search results. Returning empty list.

Final knowledge:
- Marist College's Strategic Plan covers the period 2018-2023.
- The plan is titled "Student Success, Innovation, and the Social Good."
- Developed through a year-long, broad-based, and inclusive process.
- Aims to build on Marist's successes and enter a new chapter.
- Principles from Marist Brother founders include excellence in education, community, and service.
- Emphasizes the need for innovation and institutional agility in higher education.
- Focuses on three essential goals: Ensuring Student Success, Promoting Innovation, and Advancing the Social Good.
- The FY23-27 Strategic Plan outlines the missions and goals of the U.S. Department of Homeland Security (DHS).
- It fulfills the GPRA Modification Act of 2010 and OMB Circular A-11, Part 6 (2013).
- DHS aims to build a resilient nation capable of withstanding current and future threats.
- Progress was made in the goals set out in the 2022-2026 Strategic Plan by

In [15]:
# Evaluation
print("Starting Evaluation...")
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, [LLMContextRecall(llm=LangchainLLMWrapper(llm))])
print("Evaluation Finished!")
for result in evalResults:
    print(result)

print("+-+-+-+-+-+-+-+-+-+-+-+FINAL RESULTS+-+-+-+-+-+-+-+-+-+-+-+")
for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

Starting Evaluation...
Evaluation Finished!
{'question': 'Who is Desiree Dighton?', 'LLMContextRecall': 0.3333333333333333}
{'question': 'How can I find out more about the Clinical health counseling program?', 'LLMContextRecall': 0.75}
{'question': 'Who published Displacements and Transformations in Caribbean Cultures?', 'LLMContextRecall': 0.25}
{'question': 'What is the number for financial services?', 'LLMContextRecall': 0.5}
{'question': 'What educational areas does Marist give masters in?', 'LLMContextRecall': 0.2}
{'question': 'School of Communications internship opportunities', 'LLMContextRecall': 0.0}
{'question': 'Nicholas Marshall', 'LLMContextRecall': 0.0}
{'question': 'About Fanfarelli', 'LLMContextRecall': 'Error: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.'}
{'question': 'How do a report a covid 19 issue on campus?', 'LLMContextRecall': 0.42857142857142855}
{'question': 'how many distribution credits do I need?', 'LLM

In [16]:
# Evaluation
print("Starting Evaluation...")
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, [FactualCorrectness(llm=LangchainLLMWrapper(llm))])
print("Evaluation Finished!")
for result in evalResults:
    print(result)

print("+-+-+-+-+-+-+-+-+-+-+-+FINAL RESULTS+-+-+-+-+-+-+-+-+-+-+-+")
for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

Starting Evaluation...
Evaluation Finished!
{'question': 'Who is Desiree Dighton?', 'FactualCorrectness': 'Error: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.'}
{'question': 'How can I find out more about the Clinical health counseling program?', 'FactualCorrectness': 0.67}
{'question': 'Who published Displacements and Transformations in Caribbean Cultures?', 'FactualCorrectness': 'Error: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.'}
{'question': 'What is the number for financial services?', 'FactualCorrectness': 'Error: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.'}
{'question': 'What educational areas does Marist give masters in?', 'FactualCorrectness': 'Error: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.'}
{'question': 'School of Communications internship opportunities'

In [17]:
# Evaluation
print("Starting Evaluation...")
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, [Faithfulness(llm=LangchainLLMWrapper(llm))])
print("Evaluation Finished!")
for result in evalResults:
    print(result)

print("+-+-+-+-+-+-+-+-+-+-+-+FINAL RESULTS+-+-+-+-+-+-+-+-+-+-+-+")
for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

Starting Evaluation...
Evaluation Finished!
{'question': 'Who is Desiree Dighton?', 'Faithfulness': 1.0}
{'question': 'How can I find out more about the Clinical health counseling program?', 'Faithfulness': 0.6666666666666666}
{'question': 'Who published Displacements and Transformations in Caribbean Cultures?', 'Faithfulness': 0.25}
{'question': 'What is the number for financial services?', 'Faithfulness': 1.0}
{'question': 'What educational areas does Marist give masters in?', 'Faithfulness': 1.0}
{'question': 'School of Communications internship opportunities', 'Faithfulness': 1.0}
{'question': 'Nicholas Marshall', 'Faithfulness': 1.0}
{'question': 'About Fanfarelli', 'Faithfulness': 0.9333333333333333}
{'question': 'How do a report a covid 19 issue on campus?', 'Faithfulness': 0.7}
{'question': 'how many distribution credits do I need?', 'Faithfulness': 1.0}
{'question': 'Dr. Elizabeth Kaknes?', 'Faithfulness': 0.9090909090909091}
{'question': 'Who is Dr. Lynn Eckert?', 'Faithfulne

In [18]:
# Evaluation
print("Starting Evaluation...")
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, [SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings()))])
print("Evaluation Finished!")
for result in evalResults:
    print(result)

print("+-+-+-+-+-+-+-+-+-+-+-+FINAL RESULTS+-+-+-+-+-+-+-+-+-+-+-+")
for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

Starting Evaluation...
Evaluation Finished!
{'question': 'Who is Desiree Dighton?', 'SemanticSimilarity': 0.9210551869163699}
{'question': 'How can I find out more about the Clinical health counseling program?', 'SemanticSimilarity': 0.9442127353057344}
{'question': 'Who published Displacements and Transformations in Caribbean Cultures?', 'SemanticSimilarity': 0.8268826685361922}
{'question': 'What is the number for financial services?', 'SemanticSimilarity': 0.8807925079739356}
{'question': 'What educational areas does Marist give masters in?', 'SemanticSimilarity': 0.8429485289235374}
{'question': 'School of Communications internship opportunities', 'SemanticSimilarity': 0.8966025720243086}
{'question': 'Nicholas Marshall', 'SemanticSimilarity': 0.8072944692808861}
{'question': 'About Fanfarelli', 'SemanticSimilarity': 0.9568380732491247}
{'question': 'How do a report a covid 19 issue on campus?', 'SemanticSimilarity': 0.7606187431012537}
{'question': 'how many distribution credits d

### Full CRAG Results:
- LLMContextRecall - Mean: 0.410, St. Dev: 0.325
- FactualCorrectness - Mean: 0.396, St. Dev: 0.276
- Faithfulness - Mean: 0.824, St. Dev: 0.213
- SemanticSimilarity - Mean: 0.881, St. Dev: 0.065