In [3]:
from datasets import load_dataset
import numpy as np
import pandas as pd

from arthur_bench.run.testsuite import TestSuite
from arthur_bench.scoring import PythonUnitTesting

# Load python coding benchmark dataset

In [4]:
humaneval_code_dataset = load_dataset("openai_humaneval")
humaneval_df = pd.DataFrame(humaneval_code_dataset["test"])

Found cached dataset openai_humaneval (/Users/maxcembalest/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)


  0%|          | 0/1 [00:00<?, ?it/s]

# Generate solutions from GPT and Claude

In [5]:
import openai
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
anthropic = Anthropic()
def chatgpt(input_text):
    return openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=[
            {"role" : "system", "content" : "You are a helpful assistant."},
            {"role" : "user", "content" : input_text}
        ]
    )['choices'][0]['message']['content']

def claude(input_text):
    return anthropic.completions.create(
        model="claude-2",
        max_tokens_to_sample=300,
        prompt=f"{HUMAN_PROMPT} {input_text} {AI_PROMPT}",
    ).completion

prompt_template = """
You are a bot that gives answers to coding tasks only.
If the task is a coding task, give an expert python solution.
If the task is unrelated, give the response "I don't know."
ALWAYS mark the beginning and end of your solution with 
```python 
and 
```
Without these markers, the code cannot be extracted. Therefore the markers are required.
===
<text>
===
Solution:
"""

def generate_solutions(df):
    res = df.copy()
    res['chatgpt_solution'] = ['' for _ in range(len(res))]
    res['claude_solution'] = ['' for _ in range(len(res))]
    for i in range(len(df)):
        code_prompt = df[i]['prompt']
        filled_prompt = prompt_template.replace("<text>", code_prompt)
        chatgpt_response = chatgpt(filled_prompt)
        claude_response = claude(filled_prompt)
        bar = "-----------"
        print(f"code prompt:\n\n{code_prompt}\n{bar}\nchatgpt:\n{chatgpt_response}\n{bar}\nclaude:\n{claude_response}\n\n{bar}{bar}\n\n")
        code_gen_df.loc[i, 'chatgpt_solution'] = chatgpt_response
        code_gen_df.loc[i, 'claude_solution'] = claude_response
        code_gen_df.to_csv('human_eval_solutions.csv')        

Uncomment and run this cell to generate solutions to the coding benchmark dataset from `chatgpt` and `claude`

In [6]:
# generate_solutions(humaneval_df)

# Evaluate the LLM-generated coding solutions

Load the LLM-generated solutions to the humaneval dataset

In [7]:
humaneval_results = pd.read_csv("python_unit_testing/human_eval_solutions.csv", index_col=0).dropna()

Parse the part of the output which is python from between the  \` \` \` python... \` \` \`  markers

In [8]:
extract_python = lambda x : x.replace('python\n', '').replace('```', '').replace(' def', 'def')
chatgpt_solutions = list(humaneval_results.chatgpt_solution.apply(extract_python))
claude_solutions = list(humaneval_results.claude_solution.apply(extract_python))

### Create the test suite

First we create the list of unit tests from the humaneval dataset

Each unit test should contain `def check(candidate):...` and then at the end of the unit test invoke the unit test with `check(<candidate_function_name>)`

In [9]:
unit_tests = [
    f'\n{humaneval_df.loc[i]["test"]}\ncheck({humaneval_df.loc[i]["entry_point"]})' 
    for i in range(len(humaneval_df))
]

We create a PythonUnitTesting scorer with our unit tests as a parameter. 

In [17]:
python_scorer = PythonUnitTesting(unit_tests=unit_tests)

Now we can create our test suite with our scorer, input prompts, and the golden/canonical solutions from the dataset

In [16]:
python_suite = TestSuite(
    "humaneval", 
    python_scorer, 
    input_text_list=list(humaneval_df.prompt.values), 
    reference_output_list=list(humaneval_df.canonical_solution.values),
)

### Run the test suite on the `chatgpt` and `claude` solutions

In [11]:
report_scores = lambda scores : f"{sum(scores)/len(scores) * 100}%"

In [12]:
run_chatgpt = python_suite.run(
    "chatgpt", 
    candidate_output_list=chatgpt_solutions, 
)

100%|████████████████████| 164/164 [00:37<00:00,  4.43it/s]


In [13]:
print("ChatGPT:", report_scores([x.score for x in run_chatgpt.test_cases]))

ChatGPT: 54.268292682926834%


In [14]:
run_claude = python_suite.run(
    "claude", 
    candidate_output_list=claude_solutions, 
)

100%|████████████████████| 164/164 [00:35<00:00,  4.57it/s]


In [15]:
print("Claude:", report_scores([x.score for x in run_claude.test_cases]))

Claude: 48.170731707317074%
