In [None]:
'''
Simple RAG Implementation
Author: Christian Sarmiento
Purpose: This notebook is intended to get a simple implementation of RAG set up with LangChain.
Date Created: 10/1/24
Last Updated: 11/28/24
Data: https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis
Sources:
- https://python.langchain.com/docs/tutorials/rag/
- https://python.langchain.com/docs/tutorials/llm_chain/
- https://medium.com/@dinabavli/rag-basics-basic-implementation-of-retrieval-augmented-generation-rag-e80e0791159d
- ChatGPT: o1-preview
-----------------------------------------------------------------------------------------------------------------------
RAG Research             |               Machine Learning Independent Study             |              DR. EITEL LAURIA
'''

In [44]:
# Install RAGAS (used below for the RAG evaluation metrics) and NLTK.
# %pip is used so the packages are installed into the environment of the running kernel.
%pip install ragas
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Imports
import sys
sys.path.append("/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Private Code")
from api_keys import openAIKey
from api_keys import langchainKey
from langchain_openai import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain import hub  # for RAG prompt
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.docstore.document import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter


import pandas as pd
import os
import gradio as gr  # easy frontend implementation
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")


from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity, LLMContextPrecisionWithoutReference
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import SingleTurnSample


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# LangChain Environment Variables
# The tracing flag is set to "false" explicitly for this session.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
# Keys are read from the private api_keys module so they never appear in the notebook itself.
os.environ["LANGCHAIN_API_KEY"] = langchainKey()
os.environ["OPENAI_API_KEY"] = openAIKey()

In [3]:
# Load OpenAI model
# `llm` is the chat model used by the RAG chains and by the RAGAS metric wrappers further down.
llm = ChatOpenAI(model="gpt-4o-mini")

In [13]:
# Load Data
# Every regular, non-hidden file in folderPath is loaded as one article and collected in sportsArticles.
folderPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/sports_articles_corpus/Raw data"
sportsArticles = []

# sorted() makes the load order deterministic; os.listdir returns entries in arbitrary order
for fileName in sorted(os.listdir(folderPath)):
    filePath = os.path.join(folderPath, fileName)

    # Skip sub-directories and hidden files (e.g. macOS ".DS_Store"); latin1 decodes any bytes,
    # so such files would otherwise be loaded silently as garbage "articles"
    if fileName.startswith(".") or not os.path.isfile(filePath):
        continue

    loader = TextLoader(filePath, encoding='latin1')  # UTF-8 not working for the files
    doc = loader.load()
    sportsArticles.extend(doc)

In [14]:
# Split Documents into Chunks
# Chunks are at most 1000 characters with 200 characters of overlap between neighbours;
# add_start_index=True asks the splitter to record where each chunk starts in its source document.
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(sportsArticles)

In [15]:
# Store Documents in Vector DB (Chroma)
# The chunks in `texts` are embedded with OpenAIEmbeddings and indexed in a Chroma store.
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

In [16]:
# Setup Retrieval System
# Similarity search over the vector store; k=3 limits every query to three chunks.
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 documents

# To get retrieved documents:
# retrievedDocuments = retriever.invoke("query")

In [None]:
# Setting up the RAG Chain

def formatDocs(docs):
    '''
    Joins the page_content of every document in docs into one string, with a blank line between
    consecutive documents, so the result can be inserted into the prompt as context.
    '''
    pageTexts = [doc.page_content for doc in docs]
    return "\n\n".join(pageTexts)

# Setup RAG Chain
# The incoming question is passed through unchanged and is also sent to the retriever, whose
# documents are flattened by formatDocs; both feed the prompt pulled from the hub.
prompt = hub.pull("rlm/rag-prompt")
ragInputs = {"context": retriever | formatDocs, "question": RunnablePassthrough()}
ragChain = ragInputs | prompt | llm | StrOutputParser()

In [22]:
# Results
# The answer is streamed: each chunk is printed as soon as it arrives, without a newline,
# and flush=True pushes it to the output immediately.
for chunk in ragChain.stream("Explain the offside rule in soccer."):
    print(chunk, end="", flush=True)

The offside rule in soccer states that a player is in an offside position if they are nearer to the opponent's goal line than both the ball and the second-to-last opponent when the ball is played to them, unless they are in their own half or level with the second-to-last opponent. Being in an offside position is not an offense in itself; the player must become involved in active play to be penalized. The rule aims to prevent players from gaining an unfair advantage by lingering near the opponent's goal.

In [2]:
# Test if answers are coming from the llm or from the documents
# Try giving documents that aren't real then asking questions on things off of that
# Avoids the model relying on trained info 
# Play with the system prompt

# Next step after QA - feed answers into the system to make it more conversational
# Implement Gradio
# Knowledge Graph 
# Identifying Metrics - do research!!


# Implement with Marist Data

In [4]:
# Load Data
# The CSV is read without a header row, so its columns are addressed by position (0, 1, ...).
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Cleaned_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)

# The text splitter works on Document objects, so every entry of column 1 is wrapped in a Document.
# Column 1 is the text used as retrieval corpus; column 0 is presumably the matching question (verify against the CSV).
maristContext = [Document(page_content=text) for text in maristQA[1].tolist()]

In [5]:
# Split Documents into Chunks
# Same splitter settings as for the sports articles; note that `textSplitter` and `texts` are re-bound here
# and now refer to the Marist data.
textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
texts = textSplitter.split_documents(maristContext)

In [6]:
# Store Documents in Vector DB (Chroma)
# `vectorDB` is re-bound to a store built from the Marist chunks.
vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

In [7]:
# Setup Retrieval System
# `retriever` is re-bound to the Marist vector store; it is the retriever used by the conversational chain below.
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 documents

In [8]:
# Prompts

# System instructions for answering; the retrieved documents are substituted for {context}.
systemPrompt = (
    
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
    
)

# Single-turn prompt (no chat history). The chains built later in this notebook use qaPrompt instead.
prompt = ChatPromptTemplate.from_messages(
    
    [
        ("system", systemPrompt),
        ("human", "{input}"),
    ]
    
)

# Instructions for rewriting a follow-up question into a standalone question.
contextualizeSystemPrompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Rewrite prompt: system instructions, then the prior messages, then the latest user input.
contextualizePrompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualizeSystemPrompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Retriever wrapper built from the model, the base retriever and the rewrite prompt.
historyAwareRetriever = create_history_aware_retriever(
    llm, retriever, contextualizePrompt
)

# Answering prompt: like `prompt`, but with the chat history inserted before the latest user input.
qaPrompt = ChatPromptTemplate.from_messages(
    [
        ("system", systemPrompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [9]:
# Make chains
# questionAnswerChain answers from the documents it is given using qaPrompt;
# ragChain puts the history-aware retriever in front of it. It is invoked below with
# "input" and "chat_history" keys and its result is read through the "answer" key.
questionAnswerChain = create_stuff_documents_chain(llm, qaPrompt)
ragChain = create_retrieval_chain(historyAwareRetriever, questionAnswerChain)

In [11]:
# Talking to ChatGPT, making RAG conversational
# Console chat: prompts repeat until the user enters 0; every exchange is stored in
# conversationHistory and handed back to the chain on the next turn.
conversationHistory = []
while True:

    userQuery = input("Prompt (0 to quit): ")
    if userQuery == '0':
        break

    # Echo the input - this is just for a VSCode environment to see I/O together, feel free to comment out in Jupyter
    print(f"User: {userQuery}")

    # Call ChatGPT using RAG chain
    llmResponse = ragChain.invoke({"input": userQuery, "chat_history": conversationHistory})
    print(f"LLM: {llmResponse['answer']}")
    print()

    # Remember both sides of the exchange for the next turn
    conversationHistory.append(HumanMessage(content=userQuery))
    conversationHistory.append(AIMessage(content=llmResponse["answer"]))

Marist College is located on the banks of the Hudson River and also has a campus in Florence, Italy.

Marist College is in Poughkeepsie, New York, situated along the Hudson River. The Florence campus is located in Florence, Italy.



In [10]:
# Frontend with Gradio

# Samples collected by simpleRAG (query, contexts, response, ground truth) for later RAGAS evaluation
evaluationSamples = []
def simpleRAG(userQuery, history, correctAnswer):
    '''
    Runs one conversational turn through the RAG chain.

    Parameters:
        userQuery - the question asked by the user
        history - list of HumanMessage/AIMessage objects from earlier turns, or None on the first turn
        correctAnswer - ground-truth answer for userQuery, saved as the evaluation reference

    Returns the updated history list (a list that is passed in is extended in place). As a side effect
    the query, the retrieved contexts, the generated answer and the ground truth are appended to the
    global evaluationSamples list.
    '''

    # Ensure there is a list to use for the conversation history
    if history is None:
        history = []

    # Call ChatGPT using RAG chain
    llmResponse = ragChain.invoke({"input": userQuery, "chat_history": history})

    # Get response and context for evaluation. The chain result already carries the documents that
    # were actually given to the model, so they are read from there instead of querying the retriever
    # again with the raw (non-contextualized) question.
    responseText = llmResponse["answer"]
    retrievedContexts = [context.page_content for context in llmResponse["context"]]

    # Save information for RAG metrics
    evaluationSamples.append({
        "user_input": userQuery,
        "retrieved_contexts": retrievedContexts,
        "response": responseText,
        "reference": correctAnswer    # ground truth 
    })

    # Save chat history for conversational aspect (exactly once per turn)
    history.extend([
        HumanMessage(content=userQuery),
        AIMessage(content=responseText),
    ])

    return history


In [None]:
# Frontend
def gradioRAG(userQuery, history, correctAnswer):
    '''
    Adapter between simpleRAG and the Gradio interface. The interface declares two outputs
    (chatbot display and state) while simpleRAG returns only the message history, so this
    function derives the chatbot display from the history and returns both values.
    '''
    history = simpleRAG(userQuery, history, correctAnswer)

    # Pair every human message with the AI message that follows it
    # NOTE(review): (user, assistant) pairs are the classic Chatbot value format - confirm for the installed Gradio version
    chatDisplay = []
    lastQuestion = None
    for message in history:
        if isinstance(message, HumanMessage):
            lastQuestion = message.content
        elif lastQuestion is not None:
            chatDisplay.append((lastQuestion, message.content))
            lastQuestion = None

    return chatDisplay, history

interface = gr.Interface(
    fn=gradioRAG,
    inputs=["text", "state", gr.Textbox(label="Correct Answer")],
    outputs=["chatbot", "state"],
    title="Simple RAG",
    description="Initial setup for a simple conversational RAG process."
)

# Launch the frontend
interface.launch()

In [11]:
# Function to evaluate our RAG pipeline
async def pipelineEvaluation(dataset, metrics):
    '''
    Scores every recorded RAG run with every given metric.

    Parameters:
        dataset - iterable of dictionaries with the keys "user_input", "reference",
                  "retrieved_contexts" and "response"
        metrics - list of metric objects exposing an async single_turn_ascore(sample) method

    Returns a tuple (results, metricsStats). results holds one dictionary per run with the input query
    and, under each metric's class name, either its score or an "Error: ..." string. metricsStats maps
    each metric's class name to the mean and standard deviation of its valid scores.
    '''

    # Run through our runs
    results = []
    for run in dataset:

        # Create a SingleTurnSample object from the saved inputs/outputs
        sample = SingleTurnSample(
            user_input=run["user_input"],
            response=run["response"],
            reference=run["reference"],
            retrieved_contexts=run["retrieved_contexts"]
        )

        # Evaluate metrics
        runResults = {"input_query": run["user_input"]}
        for metric in metrics:
            metricName = type(metric).__name__

            # A failing metric must not abort the whole evaluation: record the error text instead
            try:
                runResults[metricName] = await metric.single_turn_ascore(sample)

            except Exception as e:
                runResults[metricName] = f"Error: {str(e)}"

        # Save metric results
        results.append(runResults)

    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__

        # Keep numeric scores only (error strings are skipped). NumPy scalars count as numbers, and NaN
        # is dropped because a single NaN would turn both the mean and the standard deviation into NaN.
        scores = [
            result[metricName] for result in results
            if isinstance(result[metricName], (int, float, np.number)) and not np.isnan(result[metricName])
        ]

        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }

        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }

    return results, metricsStats

In [12]:
# Load metrics
# All LLM-judged metrics share one wrapper around `llm`; SemanticSimilarity compares embeddings instead.
# The order matters: later cells pick individual metrics from evalMetrics by index.
evaluatorLLM = LangchainLLMWrapper(llm)
evaluatorEmbeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
evalMetrics = [
    LLMContextRecall(llm=evaluatorLLM),
    FactualCorrectness(llm=evaluatorLLM),
    Faithfulness(llm=evaluatorLLM),
    SemanticSimilarity(embeddings=evaluatorEmbeddings),
    LLMContextPrecisionWithoutReference(llm=evaluatorLLM),
]

In [13]:
# Evaluate our pipeline responses
evalResults = await pipelineEvaluation(evaluationSamples, evalMetrics)
for result in evalResults:
    print(result)

{'input_query': 'Who is Carolyn Matheus?', 'LLMContextRecall': 0.0, 'FactualCorrectness': 0.0, 'Faithfulness': 1.0, 'SemanticSimilarity': 0.9328844088386496}


In [13]:
# Sample records from our dataset
# random_state fixes the draw so a re-run evaluates the same questions; n is capped at the number of
# available rows because sampling without replacement raises when more rows are requested than exist.
maristTestSample = maristQA.sample(n=min(50, len(maristQA)), replace=False, random_state=42)
maristTestSample.head()

Unnamed: 0,0,1
486,When do you apply to the special education tea...,Education DepartmentDual Certification: Childh...
468,Eileen Curley Bio?,Contact InformationAcademic SchoolOfficeEmailP...
518,Dr. Joanne Myers role,Contact InformationAcademic SchoolOfficeEmailP...
553,Phone number Dr. Wermuth?,School of Liberal ArtsDr. Thomas S. WermuthVic...
14,Where is student financial services?,Graduate: Student Financial ServicesReceive th...


In [None]:
# Run our chain with each question and evaluate
chatHistory = None
for row in maristTestSample.iterrows():
    chatHistory = simpleRAG(row[1][0], chatHistory, row[1][1])

## Evaluation
evalResults, metricStats = await pipelineEvaluation(evaluationSamples, evalMetrics)
for result in evalResults:
    print(result)

for metric in metricStats.keys():
    print(f"{metric} - Mean: {metricStats[metric]['mean']}, St. Dev: {metricStats[metric]['std_dev']}")

# Define Simple RAG w/o Conversational Aspect
- Since the samples are evaluated independently here, there is no need for a conversational aspect
- This may help with runtimes since, as of right now, this is slower than the other two implementations

In [3]:
# Define function for RAG chain setup
def ragChainSetup(model, context):
    '''
    Helper method that sets up a simple RAG chain given a model object and a corpus of context documents.

    Parameters:
        model - chat model used for generation and wrapped for the LLM-based metrics
        context - list of Document objects that form the retrieval corpus

    Returns a tuple (chain, retrievalSystem, metrics): the RAG chain (takes a question string and returns
    the answer string), the retriever over the corpus, and the list of metrics in the order
    LLMContextRecall, FactualCorrectness, Faithfulness, SemanticSimilarity,
    LLMContextPrecisionWithoutReference.
    '''

    # Split Documents into Chunks
    textSplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    texts = textSplitter.split_documents(context)

    # Store Documents in Vector DB (Chroma)
    vectorDB = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

    # Setup Retrieval System
    retrievalSystem = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Retrieves 3 documents

    # Function to format documents into the prompt
    def formatDocs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Setup RAG Chain
    prompt = hub.pull("rlm/rag-prompt")
    chain = (
        {"context": retrievalSystem | formatDocs, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )

    ## Load metrics
    # Every LLM-based metric is built from the `model` argument. The last metric used to read the
    # notebook-level `llm` instead, which silently ignored the model passed in.
    evaluatorLLM = LangchainLLMWrapper(model)
    metrics = [LLMContextRecall(llm=evaluatorLLM),
               FactualCorrectness(llm=evaluatorLLM),
               Faithfulness(llm=evaluatorLLM),
               SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings())),
               LLMContextPrecisionWithoutReference(llm=evaluatorLLM)]

    return chain, retrievalSystem, metrics

In [8]:
# Define RAG chain

## Load data
# csvPath is kept for reference only; this cell reads evalDataPath (a ";"-separated file with a header row).
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Cleaned_QA.csv"  # use header=None when reading in this csv
evalDataPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/manual_eval.csv"
maristQA = pd.read_csv(evalDataPath, sep=";")
# The "ground_truth" column supplies the texts that become the retrieval corpus.
maristContext = [Document(page_content=text) for text in maristQA["ground_truth"].tolist()]

## Load OpenAI model 
llm = ChatOpenAI(model="gpt-4o-mini")

## Setting up the RAG Chain
# Re-binds the notebook-level ragChain, retriever and evalMetrics to the non-conversational setup.
ragChain, retriever, evalMetrics = ragChainSetup(llm, maristContext)

In [9]:
# SimpleRAG function

# Samples collected by simpleRAG; read later by the evaluation cells
evaluationSamples = []
def simpleRAG(userQuery, correctAnswer):
    '''
    Method that runs a simpleRAG chain and saves the input and output. This takes in a single question (userQuery)
    and the correct answer to the question (correctAnswer). Returns nothing, but saves the user query, the distinct
    retrieved contexts, the generation, and the correct answer to the global evaluationSamples list for future
    evaluation. Also prints how many distinct contexts were kept.
    '''

    # Call ChatGPT using RAG chain
    responseText = ragChain.invoke(userQuery)

    # Get context for evaluation
    retrievedContexts = [context.page_content for context in retriever.invoke(userQuery)]

    # Keep every distinct context exactly once, in retrieval order. Comparing only against the first
    # context let duplicates among the later ones through and failed on an empty retrieval.
    finalContexts = list(dict.fromkeys(retrievedContexts))

    print(f"Final Context Length: {len(finalContexts)}")

    # Save information for RAG metrics
    evaluationSamples.append({
        "user_input": userQuery,
        "retrieved_contexts": finalContexts,
        "response": responseText,
        "reference": correctAnswer    # ground truth 
    })

In [10]:
# Function to evaluate our RAG pipeline
async def pipelineEvaluation(dataset, metrics):
    '''
    Method that evaluates the performance of a RAG chain. This takes in a dataset that contains the input query,
    correct answer to the query, the generation for the query, and the contexts used for generation (keys
    "user_input", "reference", "response" and "retrieved_contexts") as well as a list of metrics to evaluate the
    dataset on. This returns a list of results that shows the final metric for each record in the dataset (or an
    "Error: ..." string where a metric failed) as well as a dictionary with the mean and standard deviation of
    the valid scores of each metric.
    '''

    # Run through our runs
    results = []
    for run in dataset:

        # Create a SingleTurnSample object from the saved inputs/outputs
        sample = SingleTurnSample(
            user_input=run["user_input"],
            response=run["response"],
            reference=run["reference"],
            retrieved_contexts=run["retrieved_contexts"]
        )

        # Evaluate metrics
        runResults = {"input_query": run["user_input"]}
        for metric in metrics:
            metricName = type(metric).__name__

            # A failing metric must not abort the whole evaluation: record the error text instead
            try:
                runResults[metricName] = await metric.single_turn_ascore(sample)

            except Exception as e:
                runResults[metricName] = f"Error: {str(e)}"

        # Save metric results
        results.append(runResults)

    # Calculate mean and standard deviation for each metric
    metricsStats = {}
    for metric in metrics:
        metricName = type(metric).__name__

        # Keep numeric scores only (error strings are skipped). NumPy scalars count as numbers, and NaN
        # is dropped because a single NaN would turn both the mean and the standard deviation into NaN.
        scores = [
            result[metricName] for result in results
            if isinstance(result[metricName], (int, float, np.number)) and not np.isnan(result[metricName])
        ]

        # Only calculate stats if there are valid scores
        if scores:
            metricsStats[metricName] = {
                "mean": np.mean(scores),
                "std_dev": np.std(scores),
            }

        else:
            metricsStats[metricName] = {
                "mean": "No valid scores",
                "std_dev": "No valid scores",
            }

    return results, metricsStats

In [11]:
# Function to clean up evaluation code
async def computeEvaluationMetrics(samples, metrics):
    '''
    Evaluates the collected samples with the given metrics and prints a report: every per-sample result,
    then the mean and standard deviation of each metric, then the elapsed wall-clock time (in seconds,
    minutes or hours depending on its size). "samples" is the list of dictionaries gathered while running
    a RAG process; "metrics" is the list of RAGAS metric objects to apply. Returns nothing.
    '''

    # Keep track of time
    startTime = time.time()

    # Evaluate every sample
    print("Starting Evaluation...")
    evalResults, metricStats = await pipelineEvaluation(samples, metrics)
    print("Evaluation Finished!")
    for sampleResult in evalResults:
        print(sampleResult)

    # Compute and output the mean and standard deviation for each metric
    print("+-+-+-+-+-+-+-+-+-+-+-+FINAL RESULTS+-+-+-+-+-+-+-+-+-+-+-+")
    for metricName, stats in metricStats.items():
        print(f"{metricName} - Mean: {stats['mean']}, St. Dev: {stats['std_dev']}")

    # Elapsed time, reported in the largest unit that applies
    elapsedTime = time.time() - startTime
    elapsedMinutes = elapsedTime / 60
    if elapsedTime <= 60:
        print(f"\nExecution Time: {(elapsedTime):.2f} sec")
    elif elapsedMinutes > 60:
        print(f"\nExecution Time: {(elapsedMinutes / 60):.2f} hrs")
    else:
        print(f"\nExecution Time: {elapsedMinutes:.2f} min")

In [8]:
# Sample records from our dataset
# random_state fixes the draw so a re-run evaluates the same questions; n is capped at the number of
# available rows because sampling without replacement raises when more rows are requested than exist.
maristTestSample = maristQA.sample(n=min(50, len(maristQA)), replace=False, random_state=42)
maristTestSample.head()

Unnamed: 0,0,1
43,What Community Based Learning classes are avai...,Academics Department Center for Civic Engageme...
134,Is there a strategic committee at the school?,About Marist CollegeStrategic Plan 2018-2023Ja...
528,Who's in charge FYS?,Contact InformationAcademic SchoolOfficeEmailP...
87,Online graduate program information,Graduate: Information SessionsThe best way to ...
660,The MSPAccy program has how many credits for f...,Master of Science in Professional Accountancy ...


In [12]:
# Run simple RAG on samples
for _, row in maristQA.iterrows():
    # .iloc makes the positional access explicit (first column = question, second column = correct answer).
    # Plain row[0] on string-labelled columns only works through a deprecated positional fallback.
    simpleRAG(row.iloc[0], row.iloc[1])

print("Generations Done!")

Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 2
Final Context Length: 2
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Le

In [None]:
# Evaluation for LLMContextRecall
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[0]])

Starting Evaluation...


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=9c6af390-08d1-4307-a10e-867c409eaf60,id=9c6af390-08d1-4307-a10e-867c409eaf60; trace=9c6af390-08d1-4307-a10e-867c409eaf60,id=927f6c59-0246-4fed-b407-4c7718811a8d; trace=9c6af390-08d1-4307-a10e-867c409eaf60,id=98d4a2ee-7402-4870-8e0c-085a432398e1; trace=9c6af390-08d1-4307-a10e-867c409eaf60,id=278c023d-aebd-413f-b40c-63342b6b053d; trace=9c6af390-08d1-4307-a10e-867c409eaf60,id=9ab8fbf0-2ea6-4c90-a64f-712b8febfd4d
Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usag

Evaluation Finished!
{'input_query': 'How to contact Safety and security?', 'LLMContextRecall': 0.5}
{'input_query': 'NASPAA', 'LLMContextRecall': 0.4}
{'input_query': 'Can I appeal for financial aid?', 'LLMContextRecall': 0.4}
{'input_query': 'Who is Eileen Curley?', 'LLMContextRecall': 0.8333333333333334}
{'input_query': "How long has Michael E. O'Sullivan been at marist?", 'LLMContextRecall': 0.25}
{'input_query': 'When is the language center open?', 'LLMContextRecall': 0.8}
{'input_query': 'James Lucina email', 'LLMContextRecall': 0.875}
{'input_query': 'Professional Accountancy graduate success rate', 'LLMContextRecall': 0.8333333333333334}
{'input_query': 'From where did Donise English graduate?', 'LLMContextRecall': 0.3333333333333333}
{'input_query': 'Fox Quest', 'LLMContextRecall': 0.0}
{'input_query': 'What are the other disciplinary approaches included in a pathway?', 'LLMContextRecall': 0.8461538461538461}
{'input_query': 'How do you get academic accommodation?', 'LLMContex

In [10]:
# Evaluation for FactualCorrectness
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[1]])

Starting Evaluation...
Evaluation Finished!
{'input_query': 'What Community Based Learning classes are available?', 'FactualCorrectness': 0.0}
{'input_query': 'Is there a strategic committee at the school?', 'FactualCorrectness': 0.26}
{'input_query': "Who's in charge FYS?", 'FactualCorrectness': 0.0}
{'input_query': 'Online graduate program information', 'FactualCorrectness': 0.29}
{'input_query': 'The MSPAccy program has how many credits for full time classes and  also during fall or spring ?', 'FactualCorrectness': 0.38}
{'input_query': 'MA clinical mental health', 'FactualCorrectness': 0.59}
{'input_query': 'When do accounting firms hire accounting graduates?', 'FactualCorrectness': 0.4}
{'input_query': 'Who is James Lucina', 'FactualCorrectness': 0.61}
{'input_query': 'What is the email for Marist technical support?', 'FactualCorrectness': 0.0}
{'input_query': 'How many credits are required for the physical therapy program?', 'FactualCorrectness': 0.04}
{'input_query': 'Dr. Gregor

In [27]:
# Evaluation for Faithfulness
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[2]])

Starting Evaluation...


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd,id=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd; trace=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd,id=36bcf38a-ec76-4975-86d1-cf0aeb6e40dc; trace=0a0b3723-f868-4e4d-bfaf-6c4b6c1a7ccd,id=6532bea6-0add-4034-a087-adce8ecc9f3d
Failed to batch ingest runs: langsmith.utils.LangSmithConnectionError: Connection error caused failure to POST https://api.smith.langchain.com/runs/batch in LangSmith API. Please confirm your internet connection. SSLError(MaxRetryError("HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2406)')))"))
Content

Evaluation Finished!
{'input_query': 'Undergrad admission Contact details for general enquires?', 'Faithfulness': 1.0}
{'input_query': 'What is the cost of ASIP Program?', 'Faithfulness': 1.0}
{'input_query': 'What position does Dr. Stephen M. Mercier hold?', 'Faithfulness': 1.0}
{'input_query': 'Who is Dr. Sally Dwyer-McNulty?', 'Faithfulness': 1.0}
{'input_query': 'Who is the chair of the history department?', 'Faithfulness': 1.0}
{'input_query': 'What non-profit internships are there?', 'Faithfulness': 0.8888888888888888}
{'input_query': 'Cost of credit for Masters  of Arts in Museum Studies?', 'Faithfulness': 1.0}
{'input_query': 'Who is Dr. Pau-San Haruta?', 'Faithfulness': 1.0}
{'input_query': 'About Fanfarelli', 'Faithfulness': 0.8}
{'input_query': 'Assistant dean of school of communications', 'Faithfulness': 1.0}
{'input_query': 'Who is Subir Sengupta?', 'Faithfulness': 0.875}
{'input_query': 'What is the minimum gpa to get an internship?', 'Faithfulness': 1.0}
{'input_query': 

Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=0c1dd7a3-7a7a-44c5-8959-525c165151d2,id=38708c99-6698-4618-a502-815e6f243e47; trace=0c1dd7a3-7a7a-44c5-8959-525c165151d2,id=dc15e619-e0bd-4d02-be6d-6f0e3cc373be; trace=0c1dd7a3-7a7a-44c5-8959-525c165151d2,id=9cd7f29a-4512-4353-a9d7-7c9fe498ed67; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=110bf2a7-262c-44c3-94ce-6a238f55dad6; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=7561f627-d4f1-49bd-b6bb-f598bee1f589; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=9420f1b6-663d-4889-b482-3784bee68e28; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=2e398373-f0be-4a61-a7dc-65b9723780f9; trace=110bf2a7-262c-44c3-94ce-6a238f55dad6,id=63a5f99a-b051-4ad4-b105-d73b646db469; trace=89fa7083-a942-460c

In [None]:
# Evaluation for SemanticSimilarity
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[3]])

Starting Evaluation...
Evaluation Finished!
{'input_query': 'Undergrad admission Contact details for general enquires?', 'SemanticSimilarity': 0.9355849329864713}
{'input_query': 'What is the cost of ASIP Program?', 'SemanticSimilarity': 0.8725719598337792}
{'input_query': 'What position does Dr. Stephen M. Mercier hold?', 'SemanticSimilarity': 0.8035092985077776}
{'input_query': 'Who is Dr. Sally Dwyer-McNulty?', 'SemanticSimilarity': 0.9647212455250856}
{'input_query': 'Who is the chair of the history department?', 'SemanticSimilarity': 0.7795354215224463}
{'input_query': 'What non-profit internships are there?', 'SemanticSimilarity': 0.7575654237420436}
{'input_query': 'Cost of credit for Masters  of Arts in Museum Studies?', 'SemanticSimilarity': 0.8702278837860309}
{'input_query': 'Who is Dr. Pau-San Haruta?', 'SemanticSimilarity': 0.9525824258347897}
{'input_query': 'About Fanfarelli', 'SemanticSimilarity': 0.8647435165018217}
{'input_query': 'Assistant dean of school of communic

In [13]:
# Evaluation for ContextPrecision
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[4]])

Starting Evaluation...
Evaluation Finished!
{'input_query': 'What is a Marist ally?', 'LLMContextPrecisionWithoutReference': 0.5833333333041666}
{'input_query': 'Are flu shots required for students?', 'LLMContextPrecisionWithoutReference': 0.8333333332916666}
{'input_query': 'Are there career fairs on campus?', 'LLMContextPrecisionWithoutReference': 0.9999999999666667}
{'input_query': 'Juan-Manuel Olivera-Silvera about', 'LLMContextPrecisionWithoutReference': 0.99999999995}
{'input_query': 'Studying philosophy.', 'LLMContextPrecisionWithoutReference': 0.99999999995}
{'input_query': 'Marist social media', 'LLMContextPrecisionWithoutReference': 0.3333333333}
{'input_query': 'Who teaches the game of thrones course?', 'LLMContextPrecisionWithoutReference': 0.9999999999}
{'input_query': 'What is the multicultural affairs center?', 'LLMContextPrecisionWithoutReference': 0.99999999995}
{'input_query': 'How much may be given in a research grant?', 'LLMContextPrecisionWithoutReference': 0.99999

# Rerun with Anthropic LLM

In [None]:
# Load OpenAI model 
# NOTE(review): the section heading announces an Anthropic rerun, but this cell still loads the
# OpenAI gpt-4o-mini model - swap in the intended chat model here if that is the goal.
llm = ChatOpenAI(model="gpt-4o-mini")

# Setting up the RAG Chain
ragChain, retriever, evalMetrics = ragChainSetup(llm, maristContext)

# Reset evaluation samples list to keep track of the samples for this run
evaluationSamples = []

In [None]:
# Sample records from our dataset
# random_state fixes the draw so a re-run evaluates the same questions; n is capped at the number of
# available rows because sampling without replacement raises when more rows are requested than exist.
maristTestSample = maristQA.sample(n=min(50, len(maristQA)), replace=False, random_state=42)
maristTestSample.head()

In [None]:
# Run simple RAG on samples
for _, row in maristTestSample.iterrows():
    # .iloc makes the positional access explicit (first column = question, second column = correct answer).
    # Plain row[0] on string-labelled columns only works through a deprecated positional fallback.
    simpleRAG(row.iloc[0], row.iloc[1])

print("Generations Done!")

In [None]:
# Evaluation for LLMContextRecall
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[0]])

In [None]:
# Evaluation for FactualCorrectness
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[1]])

In [None]:
# Evaluation for Faithfulness
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[2]])

In [None]:
# Evaluation for SemanticSimilarity
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[3]])

# Visual Inspection of Generations

In [4]:
# Load data
# The CSV is read without a header row; column 1 supplies the texts used as retrieval corpus.
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Cleaned_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)
maristContext = [Document(page_content=text) for text in maristQA[1].tolist()]

# Load OpenAI model 
llm = ChatOpenAI(model="gpt-4o-mini")

# Setting up the RAG Chain
# Re-binds the notebook-level ragChain, retriever and evalMetrics for this section.
ragChain, retriever, evalMetrics = ragChainSetup(llm, maristContext)

# Reset evaluation samples list to keep track of the samples for this run
evaluationSamples = []

In [11]:
# SimpleRAG function - editing to build a pandas dataframe out of it

# One row per question; TP/FP/FN start at -1 as placeholders
samplesOutputDF = pd.DataFrame(columns=["question", "ground_truth", "context", "generation", "TP", "FP", "FN"])
def simpleRAG(userQuery, correctAnswer):
    '''
    Method that runs a simpleRAG chain and saves the input and output. This takes in a single question (userQuery)
    and the correct answer to the question (correctAnswer). Returns nothing, but saves the user query, the distinct
    retrieved contexts, the generation, and the correct answer to the global evaluationSamples list, and appends
    one row with the same information to the global samplesOutputDF dataframe. Also prints the number of distinct
    contexts and the generated response.
    '''

    # Call ChatGPT using RAG chain
    responseText = ragChain.invoke(userQuery)

    # Get context for evaluation
    retrievedContexts = [context.page_content for context in retriever.invoke(userQuery)]

    # Keep every distinct context exactly once, in retrieval order. Comparing only against the first
    # context let duplicates among the later ones through and failed on an empty retrieval.
    finalContexts = list(dict.fromkeys(retrievedContexts))

    print(f"Final Context Length: {len(finalContexts)}")

    # Save information for RAG metrics
    evaluationSamples.append({
        "user_input": userQuery,
        "retrieved_contexts": finalContexts,
        "response": responseText,
        "reference": correctAnswer    # ground truth 
    })

    # The dataframe needs one string per row. join() puts ", " only between contexts, whereas stripping
    # a trailing ", " afterwards would also remove commas and spaces that belong to the last context.
    fullContext = ", ".join(finalContexts)

    # Add to the dataframe
    samplesOutputDF.loc[len(samplesOutputDF)] = [userQuery, correctAnswer, fullContext, responseText, -1, -1, -1]

    print(responseText)

In [6]:
# Sample records from our dataset
# A fixed seed makes the drawn questions reproducible after a kernel restart, and the size is
# capped at the number of available rows so sampling without replacement cannot raise.
SAMPLE_SIZE = 50
SAMPLE_SEED = 42
maristTestSample = maristQA.sample(
    min(SAMPLE_SIZE, len(maristQA)),
    replace=False,
    random_state=SAMPLE_SEED,
)
maristTestSample.head()

Unnamed: 0,0,1
251,What is a Marist ally?,The Marist Community Department LGBTQ+ Name Ma...
378,Are flu shots required for students?,Health Services Department Health Services Pho...
151,Are there career fairs on campus?,Center for Career ServicesYour Path to Success...
439,Juan-Manuel Olivera-Silvera about,Contact InformationAcademic SchoolOfficeEmailP...
70,Studying philosophy.,Philosophy & Religious Studies DepartmentPhilo...


In [7]:
# Generate an answer for every sampled row: column 0 is passed as the query, column 1 as the correct answer
for rowLabel, qaRecord in maristTestSample.iterrows():
    simpleRAG(qaRecord[0], qaRecord[1])

print("Generations Done!")

Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 3
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 2
Final Context Length: 3
Final Context Length: 2
Final Context Length: 3
Final Context Length: 3
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 1
Final Context Length: 2
Final Context Length: 1
Final Context Le

In [20]:
# Evaluation for ContextPrecision 
await computeEvaluationMetrics(evaluationSamples, [evalMetrics[4]])

IndexError: list index out of range

In [13]:
# Run simple RAG on the rows whose column 0 matches one specific question
question = "Who is the chair of the Modern Languages and Cultures Department?"
sample = maristQA.loc[maristQA[0] == question]
for rowLabel, qaRecord in sample.iterrows():
    simpleRAG(qaRecord[0], qaRecord[1])

print("Generations Done!")

Final Context Length: 1
The chair of the Modern Languages and Cultures Department is Claire Keith, Ph.D.
Generations Done!


In [8]:
# Write the collected generations to a semicolon-separated CSV, without the index column
samplesOutputDF.to_csv(
    "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/manual_eval.csv",
    index=False,
    sep=";",
)