In [11]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.30.5
    Uninstalling openai-1.30.5:
      Successfully uninstalled openai-1.30.5
Successfully installed openai-0.28.0


In [1]:
import json
import openai
import os
from src.utils import extract_gpt_scores, load_json

In [2]:
# Read the API key from the environment so no credential is stored in the
# notebook; the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Load the evaluation data

In [None]:
# Locations of the three evaluation sets that are compared below.
# NOTE(review): the paths carry no file extension - presumably `load_json`
# resolves them itself; confirm against src.utils.
baseline, ft, ft2x = (
    "data/evaluation_baseline",
    "data/evaluation_ft",
    "data/evaluation_ft2x",
)

# Load the sets in the same order: baseline, ft, ft2x.
data_baseline, data_ft, data_ft2x = [
    load_json(path) for path in (baseline, ft, ft2x)
]
    

# Evaluate the answers using GPT-4

In [16]:
# Open the file that collects the raw GPT-4 evaluations; the handle is kept
# open on purpose and is closed by the evaluation-loop cell.
# The directory is created first so a fresh checkout does not fail with
# FileNotFoundError, and the encoding is pinned because model output may
# contain non-ASCII characters that a platform default codec cannot encode.
os.makedirs("outputs", exist_ok=True)
output_file = open("outputs/evaluation_chatgpt.txt", "w", encoding="utf-8")


In [None]:
# Ask GPT-4 to grade the three candidate answers to every question and write
# each raw evaluation to `output_file` (opened in the previous cell).
#
# The three datasets are indexed in lockstep; the question text is taken from
# `data_ft`.
# NOTE(review): this assumes all three lists are aligned and at least as long
# as `data_baseline` - confirm.
# NOTE(review): the prompt asks for a score "from 1 to 5" while the rubric sums
# to 0-5 and the requested format says "[0-5]"; "Superior: 0" also stands for
# both "answer 0 wins" and "all are equal". The prompt text is deliberately
# left unchanged so already-collected results stay comparable - decide whether
# to fix it before the next full run.
try:
    for i in range(len(data_baseline)):
        question = data_ft[i]["instruction"]
        answer0 = data_baseline[i]["model_response"]
        answer1 = data_ft[i]["model_response"]
        answer2 = data_ft2x[i]["model_response"]

        # Prepare the prompt for ChatGPT
        prompt = f"""Please evaluate the following three answers to the same question using the rubric below. For each answer, provide a score from 1 to 5 (5 being the highest) based on how well it meets the criteria. Indicate which answer (0 or 1 or 2) you believe is superior.

Rubric:
1. Comprehensiveness and Depth
    - 0: Incomplete explanation, many key aspects missing.
    - 0.5: Partial explanation, covers some key aspects but lacks depth.
    - 1: Complete and detailed explanation, covers all key aspects thoroughly.
2. Accuracy and Terminology
    - 0: Incorrect use of terminology, several inaccuracies.
    - 0.5: Basic use of terminology, some inaccuracies.
    - 1: Precise use of terminology, accurate definitions and explanations.
3. Clarity and Engagement
    - 0: Unclear explanation, difficult to follow.
    - 0.5: Somewhat clear, with parts that are hard to follow.
    - 1: Clear and engaging, easy to follow.
4. Self-Containment
    - 0: Lacks necessary context, difficult to understand.
    - 0.5: Provides some context, but may need additional information.
    - 1: Fully self-contained, no additional context needed.
5. Logical Structure and Flow
    - 0: Poorly organized, with a confusing structure and flow.
    - 0.5: Some organization, but with occasional lapses in structure and flow.
    - 1: Well-organized, with a logical structure and smooth flow.

Question: {question}

Answer 0:
{answer0}

Answer 1:
{answer1}

Answer 2:
{answer2}

Please provide your evaluation in the following format:

A0: [0-5]
A1: [0-5]
A2: [0-5]
Superior: [0 or 1 or 2] (0 if all are equal)
"""

        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )

        # Extract the evaluation text from the first returned choice
        evaluation = response.choices[0].message['content'].strip()

        # Write the evaluation and flush straight away, so the results gathered
        # so far are on disk even if a later request fails
        output_file.write(f"Question {i+1}:\n{evaluation}\n\n")
        output_file.flush()

        print(f"Question {i+1} completed.")
finally:
    # Close the output file even when an API call raises part-way through
    output_file.close()


In [37]:
# Build a list with one record per question, pairing the question text with
# the three model answers (baseline, ft, ft2x).
# The append has to sit inside the loop: when it is placed after the loop,
# only the last question ends up in the list.
questions = []
for i in range(len(data_baseline)):
    question = data_ft[i]["instruction"]
    answer0 = data_baseline[i]["model_response"]
    answer1 = data_ft[i]["model_response"]
    answer2 = data_ft2x[i]["model_response"]

    questions.append(
        {
            "Question Number": f"Question {i+1}",
            "Question": question,
            "Answer 0": answer0,
            "Answer 1": answer1,
            "Answer 2": answer2,
        }
    )

In [39]:
# Serialise the question/answer records and store them as a JSON file
with open("outputs/evaluation_chatgpt_questions_dict.json", "w") as json_file:
    json_file.write(json.dumps(questions))

# Extract the evaluation scores

In [None]:
# Load the whole reviewed evaluation file into memory as one string
with open('outputs/evaluation_chatgpt_reviewed.txt') as reviewed_file:
    file_content = reviewed_file.read()

# Parse the text with the project helper; the result is used as a DataFrame
# with a 'Superior' column by the cells that follow
df = extract_gpt_scores(file_content)

In [34]:
# Count how often each value appears in the 'Superior' column, then turn the
# counts into a plain {value: number_of_wins} dictionary
superior_counts = df['Superior'].value_counts()
wins = superior_counts.to_dict()

In [35]:
# Show the win counts per 'Superior' value.
# NOTE(review): the recorded output contains a key 9, which is not one of the
# answer indices 0/1/2 the prompt asks for - presumably a marker set during
# manual review or by `extract_gpt_scores` (ties / unparsable?); confirm.
wins

{1: 182, 2: 152, 0: 62, 9: 43}

In [36]:
# Express every win count as a share (in percent) of all counted evaluations
total = sum(wins.values())
win_percentages = {
    answer: count / total * 100 for answer, count in wins.items()
}
win_percentages

{1: 41.45785876993166,
 2: 34.62414578587699,
 0: 14.123006833712983,
 9: 9.79498861047836}