In [3]:
import pandas as pd
from openai import OpenAI
import re

# Do not hard-code credentials in the notebook. With no `api_key` argument the OpenAI
# client reads the key from the OPENAI_API_KEY environment variable, so export that
# variable before starting the kernel.
client = OpenAI()

In [7]:
# Single worked example used to smoke-test the judge: the user's query, the agent's
# answer under evaluation, the human-written reference answer, and the retrieved RAG passage.
query = 'What are the main topics covered in the NLP class?'
agent_answer = "The main topics covered in the NLP class may vary depending on the specific syllabus of the course, but typically include the following: 1. Introduction to NLP: Overview of NLP, its history, and applications. 2. Language Models: Basics of language models, such as n-gram models, n-gram smoothing, and hidden Markov models. 3. Text Preprocessing: Text cleaning, tokenization, stemming, lemmatization, and stop word removal. 4. Part-of-Speech Tagging: Rules-based and statistical methods for identifying parts of speech in a sentence. 5. Named Entity Recognition: Techniques for identifying and classifying named entities in text, such as people, locations, and organizations. 6. Sentiment Analysis: Methods for determining the sentiment or opinion expressed in a text. 7. Machine Translation: Overview of machine translation, rule-based and statistical methods, and recent advancements in neural machine translation. 8. Information Retrieval: Introduction to information retrieval systems, relevance ranking, and indexing. 9. Text Classification: Techniques for classifying text into predefined categories, such as spam detection and sentiment classification. 10. Dialogue Systems: Introduction to dialogue systems, including rule-based and statistical approaches. 11. Question Answering: Methods for building question-answering systems, including information retrieval-based and retrieval-based approaches. 12. Summarization: Overview of summarization techniques, such as extractive and abstractive summarization. 13. Text Generation: Introduction to text generation techniques, including rule-based and statistical methods. 14. NLP Applications: Case studies and applications of NLP in various domains, such as healthcare, finance, and customer service. These topics may be covered in different ways and with varying depth, depending on the specific goals and focus of the NLP course."
reference_answer = "The main topics covered in this course include Natural Language Processing (NLP) with a focus on language models. It covers both fundamental and advanced NLP topics, large-scale language models, real-world implications such as ethics, classical NLP practices, modern approaches, and hands-on experience in training and evaluating language models."
rag_section = "Course Description:\nThe course delves into Natural Language Processing (NLP), emphasizing Language Models. It covers fundamentals and advanced NLP topics, including large-scale language models and their real-world implications, such as ethics. Students will develop skills in classical and modern NLP practices, along with hands-on experience in training and evaluating language models."

# prompt = (
#         "Evaluate the relevance and correctness of the answers generated by our agent that reads the information retrieved by RAG. And we will provide you the following information:\n\n"

#         "The user's query:\n{query}\n\n"
#         "Information retrieved by the RAG model:\n{rag_section}\n\n"
#         "Our agent's answer:\n{agent_answer}\n\n"
#         "The reference answer generated by human:\n{reference_answer}\n\n"

#         "Please base your rating on the following three points:\n"
#         "1. How relevant is the agent's answer to the user's query, given the information retrieved by the RAG?\n"
#         "2. Was the agent's answer consistent with the information retrieved and did the agent not fabricate any information?\n"
#         "3. Making things up is not allowed. According to the retrieved information from RAG and the reference answer, if you find out our agent is making up something, you should note that which is forbidden.\n\n"

#         "It's important to note that don't be petty when scoring. As long as our agent correctly answers the user's query and doesn't make up anything, or correctly mentions that there are no such information in RAG's result, Then you should give it a high score.\n\n"

#         "Please only return your rating result, which is a score between 1 and 10, where 10 indicates the highest relevance and correctness, and 1 indicates no relevance or correctness.\n\n"

#         "Again, please only return a number, and nothing else. Thank you.").format(query=query, rag_section=rag_section, agent_answer=agent_answer, reference_answer=reference_answer)

# Judge prompt template. The {query}, {rag_section}, {agent_answer} and {reference_answer}
# placeholders are filled in later with str.format() inside eval(); the text must therefore
# not contain any other literal curly braces.
prompt ="""
        Evaluate the relevance and correctness of the answers generated by our agent based on the information retrieved by the RAG model. The evaluation will consider the following details:

        1. User's Query:
        {query}
        
        2. Information retrieved by the RAG model:
        {rag_section}
        
        3. Agent's Answer:
        {agent_answer}
        
        4. Reference Answer (generated by humans):
        {reference_answer}
        
        **Rating Guidelines:**
        
        Rate the agent's answer on a scale of 1 to 10, where:
        - **10**: Highly relevant and correct, fully addressing the user's query without fabricating information.
        - **1**: Completely irrelevant or incorrect, with significant issues in relevance or correctness.
        
        **Key Considerations for Rating:**
        1. Relevance: How well does the agent's answer address the user's query based on the retrieved RAG information?
        2. Consistency: Does the agent's answer align with the retrieved RAG information without introducing fabricated details?
        3. Adherence to Facts: If the RAG information or reference answer lacks specific details, the agent must acknowledge it rather than make up information.
        
        **Important Notes:**
        - Be fair and avoid petty deductions. If the agent answers correctly based on the RAG information and does not fabricate, it should receive a high score.
        - If the agent correctly notes that the retrieved RAG information lacks sufficient details, this should not result in a lower score.
        
        **Response Format:**
        Please respond with a single number between 1 and 10. Provide no additional text or explanation. Thank you.
        """
# Prints the raw template: the placeholders are still unfilled at this point.
print(prompt)


        Evaluate the relevance and correctness of the answers generated by our agent based on the information retrieved by the RAG model. The evaluation will consider the following details:

        1. User's Query:
        {query}
        
        2. Information retrieved by the RAG model:
        {rag_section}
        
        3. Agent's Answer:
        {agent_answer}
        
        4. Reference Answer (generated by humans):
        {reference_answer}
        
        **Rating Guidelines:**
        
        Rate the agent's answer on a scale of 1 to 10, where:
        - **10**: Highly relevant and correct, fully addressing the user's query without fabricating information.
        - **1**: Completely irrelevant or incorrect, with significant issues in relevance or correctness.
        
        **Key Considerations for Rating:**
        1. Relevance: How well does the agent's answer address the user's query based on the retrieved RAG information?
        2. Consistency: Does the age

In [27]:
def match_re(text):
    """Extract the numeric score from the judge model's reply.

    The judge is asked for a bare number from 1 to 10, but replies such as
    "7/10" or "8 out of 10" also occur. Taking the last number in those replies
    would return the scale's upper bound ("10") instead of the score, so an
    explicit "<score>/10" or "<score> out of 10" is checked first.

    Args:
        text: The reply text to parse.

    Returns:
        The score as a string (e.g. "7" or "8.5"). If the text contains no
        number at all, the original text is returned unchanged.
    """
    # An explicit "<score>/10" or "<score> out of 10" takes priority over the fallback.
    scaled = re.search(r'\b(\d+(?:\.\d+)?)\s*(?:/|out of)\s*10\b', text, flags=re.IGNORECASE)
    if scaled:
        return scaled.group(1)

    # Fallback: the last number in the text (integers, thousands separators and
    # decimals are recognised).
    matches = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', text)
    if matches:
        return matches[-1]
    return text

def eval(query, agent_answer, reference_answer, rag_section, prompt=prompt):
    """Ask the judge model to score one agent answer.

    NOTE: this name shadows Python's built-in ``eval`` for every cell that runs
    after this definition. It is kept because other cells call it by this name.

    Args:
        query: The user's query.
        agent_answer: The agent's answer under evaluation.
        reference_answer: The human-written reference answer.
        rag_section: The information retrieved by RAG.
        prompt: Template containing ``{query}``, ``{rag_section}``,
            ``{agent_answer}`` and ``{reference_answer}`` placeholders. The
            default is bound to the module-level ``prompt`` at the moment this
            ``def`` executes, so re-run this cell after editing the template.

    Returns:
        A ``(score, raw_reply)`` tuple. ``score`` is ``match_re`` applied to the
        stripped reply; ``raw_reply`` is the model's reply as received. Both are
        ``""`` when the API returns no text content.
    """
    filled_prompt = prompt.format(
        query=query,
        rag_section=rag_section,
        agent_answer=agent_answer,
        reference_answer=reference_answer,
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a fair evaluator following specific guidelines."},
            {"role": "user", "content": filled_prompt}
        ],
        temperature=0.0,        # Minimise sampling randomness so ratings are repeatable
        top_p=1.0,              # Allow full consideration of relevant tokens
        max_tokens=50,          # Short cap: only a numeric score is expected
    )

    # `content` is optional in the API response; normalise None to "" so that
    # .strip() cannot raise AttributeError in the middle of a long batch run.
    raw_reply = response.choices[0].message.content or ""

    return match_re(raw_reply.strip()), raw_reply

In [28]:
# Smoke test: rate the single worked example defined above and show both the parsed
# score and the judge's raw reply.
score, reason = eval(query, agent_answer, reference_answer, rag_section)
print(score, reason)

7 7


In [29]:
from tqdm import tqdm

def gpt_rate_answers(read_path, write_path):
    """Rate every row of a responses CSV with the judge and save the scores.

    Args:
        read_path: CSV with the columns ``query``, ``Agent_responds``,
            ``reference_answer`` and ``RAG_retrieval_results``.
        write_path: Destination CSV; it contains the input columns plus a new
            ``score`` column and is written without the index.
    """
    content = pd.read_csv(read_path)

    rows = zip(
        content['query'].tolist(),
        content['Agent_responds'].tolist(),
        content['reference_answer'].tolist(),
        content['RAG_retrieval_results'].tolist(),
    )

    scores = []
    for row_query, row_answer, row_reference, row_rag in tqdm(rows, total=len(content)):
        # The raw reply is discarded. The earlier code appended it to a `reasons`
        # list that was never defined, which raises NameError on a fresh kernel.
        score, _raw_reply = eval(row_query, row_answer, row_reference, row_rag)
        scores.append(score)
    content['score'] = scores

    content.to_csv(write_path, index=False)

In [30]:
# Rate the Qwen responses; reads ./Qwen_responds.csv and writes ./Qwen_ratings.csv.
gpt_rate_answers('./Qwen_responds.csv', './Qwen_ratings.csv')

100%|█████████████████████████████████████████| 305/305 [02:48<00:00,  1.81it/s]


In [62]:
# NOTE(review): compute_average_score is defined in a later cell of this notebook, so on
# a fresh kernel this call raises NameError unless that cell has been run first.
compute_average_score("./Qwen_ratings.csv")

Mean: 9.081967213114755


In [32]:
# Rate the GPT responses; reads ./GPT_responds.csv and writes ./GPT_ratings.csv.
gpt_rate_answers('./GPT_responds.csv', './GPT_ratings.csv')

100%|█████████████████████████████████████████| 305/305 [02:34<00:00,  1.97it/s]


In [33]:
# NOTE(review): compute_average_score is defined in a later cell of this notebook, so on
# a fresh kernel this call raises NameError unless that cell has been run first.
compute_average_score("./GPT_ratings.csv")

Mean: 9.39016393442623


In [34]:
# Rate the Llama responses; reads ./Llama_responds.csv and writes ./Llama_ratings.csv.
gpt_rate_answers('./Llama_responds.csv', './Llama_ratings.csv')

100%|█████████████████████████████████████████| 305/305 [02:22<00:00,  2.13it/s]


In [60]:
# NOTE(review): compute_average_score is defined in a later cell of this notebook, so on
# a fresh kernel this call raises NameError unless that cell has been run first.
compute_average_score("./Llama_ratings.csv")

Mean: 9.160655737704918


In [64]:
# Rate the GLM responses; reads ./GLM_responds.csv and writes ./GLM_ratings.csv.
gpt_rate_answers('./GLM_responds.csv', './GLM_ratings.csv')

100%|█████████████████████████████████████████| 305/305 [02:38<00:00,  1.92it/s]


In [68]:
# NOTE(review): compute_average_score is defined in a later cell of this notebook, so on
# a fresh kernel this call raises NameError unless that cell has been run first.
compute_average_score("./GLM_ratings.csv")

Mean: 9.039344262295081


In [69]:
import pandas as pd

def compute_average_score(csv_path):
    """Print the mean of the ``score`` column of a ratings CSV.

    Scores that cannot be parsed as numbers are excluded from the mean and
    reported in a warning line, so one malformed reply does not abort the whole
    computation. Such rows can occur when the judge's reply contained no number
    and the raw text was stored instead. For a file whose scores are all
    numeric, the output is exactly ``Mean: <value>``.

    Args:
        csv_path: Path to a CSV file that has a ``score`` column.
    """
    df = pd.read_csv(csv_path)
    scores = pd.to_numeric(df["score"], errors="coerce")

    # Count only values that were present but not numeric; cells that were already
    # empty are not reported (mean() skips them by default).
    unparsable = int((scores.isna() & df["score"].notna()).sum())
    if unparsable:
        print(f"Warning: ignored {unparsable} non-numeric score(s) in {csv_path}")

    print("Mean:", scores.mean())

# Print the mean judge score for each model's ratings file, one line per model.
compute_average_score("./Qwen_ratings.csv")
compute_average_score("./GPT_ratings.csv")
compute_average_score("./Llama_ratings.csv")
compute_average_score("./GLM_ratings.csv")

Mean: 9.081967213114755
Mean: 9.39016393442623
Mean: 9.160655737704918
Mean: 9.039344262295081


In [None]:
# Ranking by mean judge score (highest first): 1. GPT (9.39)  2. LLaMA (9.16)  3. Qwen (9.08)  4. GLM (9.04)