# Imports

In [1]:
from datasets import load_dataset, load_metric
import json
import numpy as np
import pandas as pd
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1" # required for executing code using the HuggingFace code_eval metric

from arthur_bench.run.testsuite import TestSuite

# Load HumanEval dataset

In [6]:
# Load the "openai_humaneval" dataset; later cells read its 'test' split.
code_eval_data = load_dataset("openai_humaneval")

Found cached dataset openai_humaneval (/Users/maxcembalest/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)


  0%|          | 0/1 [00:00<?, ?it/s]

# Get `gpt-3.5-turbo` and `claude-2` solutions

In [9]:
from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import HumanMessage

In [10]:
# Chat clients for the two models being compared. Each one is built with
# streaming enabled, its own stdout streaming callback handler, and
# temperature 0.0 (presumably to keep generations as repeatable as possible).
chatgpt = ChatOpenAI(
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0.0,
    streaming=True,
)
claude = ChatAnthropic(
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0.0,
    streaming=True,
)

In [11]:
# Prompt sent verbatim to both models. The literal "<text>" placeholder is
# replaced with each HumanEval task prompt inside generate_solutions().
# NOTE(review): in the worked example, the `def json_to_df(...)` line has no
# trailing colon and declares a `threshold` parameter the solution never uses,
# and the example solution is a bare function body rather than a full
# definition. Confirm whether this is intended before editing: the solutions
# cached in 'human_eval_solutions.csv' were presumably generated with this
# exact wording.
prompt_template = """
You are a bot that gives answers to coding tasks only.
If the task is a coding task, give an expert python solution.
If the task is unrelated, give the response "I don't know."
ALWAYS mark the beginning and end of your solution with 
```python 
and 
```
Without these markers, the code cannot be extracted. Therefore the markers are required.
===
Example:
===
import json
import pandas as pd

def json_to_df(filename: str, threshold: float) -> pd.DataFrame
    ''' Write a python function that loads a json file into a dataframe
    '''
===
Solution:
===
```python
return pd.DataFrame([json.loads(line) for line in open(filename, 'r')])
```
===
Now the real task:
===
<text>
===
Solution:
"""

In [12]:
def generate_solutions():
    """Ask both chat models to solve every HumanEval task and checkpoint the replies.

    Reads the module-level ``code_eval_data``, ``prompt_template``, ``chatgpt``
    and ``claude`` objects. For each row of the 'test' split, the task prompt is
    substituted for ``<text>`` in the template and sent to each model as a single
    human message; the reply text is stored in the 'chatgpt_solution' or
    'claude_solution' column of a frame built from the split.

    The frame is rewritten to 'human_eval_solutions.csv' after every task.
    Returns nothing.
    """
    tasks = code_eval_data['test']
    code_gen_df = pd.DataFrame(tasks)
    n_rows = len(code_gen_df)
    code_gen_df['chatgpt_solution'] = [''] * n_rows
    code_gen_df['claude_solution'] = [''] * n_rows

    # (banner printed before the call, model, destination column,
    #  separator printed after the reply has been stored)
    model_plan = (
        ('$ chatgpt $:', chatgpt, 'chatgpt_solution', "\n-----------\n"),
        ('$ claude $:', claude, 'claude_solution', "\n\n\n>>>>>>>>>>>>>>>>\n\n\n"),
    )

    for row_idx in range(len(tasks)):
        code_prompt = tasks[row_idx]['prompt']
        print('code prompt:\n\n', code_prompt)
        filled_prompt = prompt_template.replace("<text>", code_prompt)
        for banner, model, column, separator in model_plan:
            print(banner)
            reply = model([HumanMessage(content=filled_prompt)])
            code_gen_df.loc[row_idx, column] = reply.content
            print(separator)
        # Written inside the loop, so the file on disk always holds the replies
        # collected so far.
        code_gen_df.to_csv('human_eval_solutions.csv')
# generate_solutions()

# Load / Preprocess generated solutions

In [2]:
# Rows with any missing value are dropped (e.g. tasks whose solution cell is
# empty in the CSV, which pandas reads back as NaN). The index is then
# renumbered 0..n-1 because later cells address rows by position with
# range(len(df)); without the reset, a dropped row would leave a gap in the
# labels.
code_gen_df = (
    pd.read_csv('human_eval_solutions.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# extract code from the solution strings
# combine imports from prompt and solution code into candidate_outputs
def prepare_candidates(df):
    """Return a copy of ``df`` with cleaned solutions and prompt-prefixed candidates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'prompt', 'chatgpt_solution' and
        'claude_solution'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` (same index) in which:
        * 'chatgpt_solution' / 'claude_solution' have every 'python\\n', every
          '```' and the space in every ' def' removed;
        * 'chatgpt_with_imports' / 'claude_with_imports' hold the row's prompt,
          a newline, and the cleaned solution.

    Rows are paired by position, so the result is correct for any index
    (for example one with gaps left by ``dropna``), not only 0..n-1.
    """
    res = df.copy()
    for s in ['chatgpt_solution', 'claude_solution']:
        # NOTE(review): ' def' -> 'def' is applied to every occurrence, so an
        # indented (nested) ``def`` also loses one leading space. Kept as-is to
        # preserve the existing cleaning behavior.
        res[s] = [x.replace('python\n', '').replace('```', '').replace(' def', 'def') for x in res[s]]
    # zip() walks the column values in row order and never looks rows up by
    # index label, unlike ``res['prompt'][i]``.
    res['chatgpt_with_imports'] = [
        prompt + '\n' + solution
        for prompt, solution in zip(res['prompt'], res['chatgpt_solution'])
    ]
    res['claude_with_imports'] = [
        prompt + '\n' + solution
        for prompt, solution in zip(res['prompt'], res['claude_solution'])
    ]
    return res
# Rebinds code_gen_df to the prepared copy. Re-running this cell feeds the
# already-cleaned solutions back through the same replacements, so each extra
# run strips one more space in front of any still-indented "def".
code_gen_df = prepare_candidates(code_gen_df)

In [7]:
# combine unit test code and entrypoint name into test func script
def prepare_test_func(df):
    """Return a copy of ``df`` with a 'test_func' column of runnable test scripts.

    Each script is: a newline, the task's unit-test source, a newline, and a
    ``check(<entry_point>)`` call.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per task. When it has 'test' and 'entry_point' columns, those
        are used, so every row gets its own task's test even if rows were
        filtered out or reordered. Otherwise row ``i`` falls back to item ``i``
        of the module-level ``code_eval_data['test']`` split, which is only
        correct when the frame's rows line up with the split one-to-one from
        the start.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added 'test_func' column; ``df`` is unchanged.
    """
    res = df.copy()
    if {'test', 'entry_point'}.issubset(res.columns):
        pairs = zip(res['test'], res['entry_point'])
    else:
        split = code_eval_data['test']
        pairs = ((split[i]['test'], split[i]['entry_point']) for i in range(len(res)))
    # A list is assigned positionally, so the frame's index labels do not matter.
    res['test_func'] = [
        "\n" + test_code + "\n" + f"check({entry_point})"
        for test_code, entry_point in pairs
    ]
    return res
# Add the 'test_func' column, which the test suite below uses as its reference column.
code_gen_df = prepare_test_func(code_gen_df)

In [5]:
# Preview the working frame (the rendered output elides the middle rows).
code_gen_df

Unnamed: 0,task_id,prompt,canonical_solution,test,entry_point,chatgpt_solution,claude_solution,chatgpt_with_imports,claude_with_imports
0,HumanEval/0,from typing import List\n\n\ndef has_close_ele...,"for idx, elem in enumerate(numbers):\n ...","\n\nMETADATA = {\n 'author': 'jt',\n 'da...",has_close_elements,"def has_close_elements(numbers: List[float], t...","def has_close_elements(numbers: List[float], t...",from typing import List\n\n\ndef has_close_ele...,from typing import List\n\n\ndef has_close_ele...
1,HumanEval/1,from typing import List\n\n\ndef separate_pare...,result = []\n current_string = []\n ...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",separate_paren_groups,def separate_paren_groups(paren_string: str) -...,def separate_paren_groups(paren_string: str) -...,from typing import List\n\n\ndef separate_pare...,from typing import List\n\n\ndef separate_pare...
2,HumanEval/2,\n\ndef truncate_number(number: float) -> floa...,return number % 1.0\n,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",truncate_number,return number % 1\n,I don't know.,\n\ndef truncate_number(number: float) -> floa...,\n\ndef truncate_number(number: float) -> floa...
3,HumanEval/3,from typing import List\n\n\ndef below_zero(op...,balance = 0\n\n for op in operations:\n...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",below_zero,def below_zero(operations: List[int]) -> bool:...,def below_zero(operations: List[int]) -> bool:...,from typing import List\n\n\ndef below_zero(op...,from typing import List\n\n\ndef below_zero(op...
4,HumanEval/4,from typing import List\n\n\ndef mean_absolute...,mean = sum(numbers) / len(numbers)\n re...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",mean_absolute_deviation,def mean_absolute_deviation(numbers: List[floa...,def mean_absolute_deviation(numbers: List[floa...,from typing import List\n\n\ndef mean_absolute...,from typing import List\n\n\ndef mean_absolute...
...,...,...,...,...,...,...,...,...,...
69,HumanEval/69,\ndef search(lst):\n '''\n You are given...,frq = [0] * (max(lst) + 1)\n for i in l...,def check(candidate):\n\n # manually genera...,search,def search(lst):\n freq_dict = {}\n for ...,def search(lst):\n freq = {}\n for num i...,\ndef search(lst):\n '''\n You are given...,\ndef search(lst):\n '''\n You are given...
70,HumanEval/70,\ndef strange_sort_list(lst):\n '''\n Gi...,"res, switch = [], True\n while lst:\n ...",def check(candidate):\n\n # Check some simp...,strange_sort_list,def strange_sort_list(lst):\n sorted_lst = ...,def strange_sort_list(lst):\n if not lst:\n...,\ndef strange_sort_list(lst):\n '''\n Gi...,\ndef strange_sort_list(lst):\n '''\n Gi...
71,HumanEval/71,"\ndef triangle_area(a, b, c):\n '''\n Gi...",if a + b <= c or a + c <= b or b + c <= a:...,def check(candidate):\n\n # Check some simp...,triangle_area,"import math\n\ndef triangle_area(a, b, c):\n ...","def triangle_area(a, b, c):\n if a + b > c ...","\ndef triangle_area(a, b, c):\n '''\n Gi...","\ndef triangle_area(a, b, c):\n '''\n Gi..."
72,HumanEval/72,"\ndef will_it_fly(q,w):\n '''\n Write a ...",if sum(q) > w:\n return False\n\n ...,def check(candidate):\n\n # Check some simp...,will_it_fly,"def will_it_fly(q, w):\n if q == q[::-1] an...",I don't know.,"\ndef will_it_fly(q,w):\n '''\n Write a ...","\ndef will_it_fly(q,w):\n '''\n Write a ..."


# Create CodeEval test suite

In [8]:
# Build the suite from code_gen_df: the 'prompt' column is the input and the
# 'test_func' column (unit-test source plus the check(...) call) is the reference.
# The two positional arguments are presumably the suite name and the scoring
# method - confirm against the TestSuite signature.
my_test_suite = TestSuite(
    'humaneval', 
    "code_eval",
    reference_data=code_gen_df, 
    input_column='prompt', 
    reference_column='test_func')

# Evaluate ChatGPT

In [9]:
# Run the suite on the cleaned ChatGPT replies alone (column 'chatgpt_solution').
# The first positional argument is presumably the run name.
my_test_run_chatgpt = my_test_suite.run(
    "chatgpt", 
    candidate_data=code_gen_df, 
    candidate_column='chatgpt_solution'
)

  self.scorer = load_metric("code_eval") # type: ignore
100%|███████████████████████████████████████████| 74/74 [00:13<00:00,  5.59it/s]


In [16]:
print("PASS@1 SCORE ON HUMANEVAL")
# Mean of the per-test-case scores, shown as a percentage with two decimals.
# The value is formatted after scaling to 0-100; rounding first and then
# multiplying by 100 can leak float artefacts such as '55.410000000000004%'.
f"{np.mean([case.score for case in my_test_run_chatgpt.test_case_outputs]) * 100:.2f}%"
                            

PASS@1 SCORE ON HUMANEVAL


'55.410000000000004%'

# Evaluate ChatGPT (prepending the task prompt, including its `import` statements, to help)

In [13]:
# Run the suite on the ChatGPT replies with the task prompt prepended
# (column 'chatgpt_with_imports').
my_test_run_chatgpt_with_imports = my_test_suite.run(
    "chatgpt_with_imports", 
    candidate_data=code_gen_df, 
    candidate_column='chatgpt_with_imports'
)

100%|███████████████████████████████████████████| 74/74 [00:13<00:00,  5.47it/s]


In [17]:
print("PASS@1 SCORE ON HUMANEVAL")
# Mean of the per-test-case scores, shown as a percentage with two decimals.
# The value is formatted after scaling to 0-100, because rounding first and
# then multiplying by 100 can leave float noise in the printed string.
f"{np.mean([case.score for case in my_test_run_chatgpt_with_imports.test_case_outputs]) * 100:.2f}%"

PASS@1 SCORE ON HUMANEVAL


'75.68%'

# Evaluate Claude

In [19]:
# Run the suite on the cleaned Claude replies alone (column 'claude_solution').
# The first positional argument is presumably the run name.
my_test_run_claude = my_test_suite.run(
    "claude", 
    candidate_data=code_gen_df, 
    candidate_column='claude_solution'
)

In [20]:
print("PASS@1 SCORE ON HUMANEVAL")
# Mean of the per-test-case scores, shown as a percentage with two decimals.
# The value is formatted after scaling to 0-100, because rounding first and
# then multiplying by 100 can leave float noise in the printed string.
f"{np.mean([case.score for case in my_test_run_claude.test_case_outputs]) * 100:.2f}%"

PASS@1 SCORE ON HUMANEVAL


'5.41%'

# Evaluate Claude (prepending the task prompt, including its `import` statements, to help)

In [21]:
# Run the suite on the Claude replies with the task prompt prepended
# (column 'claude_with_imports').
my_test_run_claude_with_imports = my_test_suite.run(
    "claude_with_imports", 
    candidate_data=code_gen_df, 
    candidate_column='claude_with_imports'
)

100%|███████████████████████████████████████████| 74/74 [00:13<00:00,  5.36it/s]


In [22]:
print("PASS@1 SCORE ON HUMANEVAL")
# Mean of the per-test-case scores, shown as a percentage with two decimals.
# The value is formatted after scaling to 0-100, because rounding first and
# then multiplying by 100 can leave float noise in the printed string.
f"{np.mean([case.score for case in my_test_run_claude_with_imports.test_case_outputs]) * 100:.2f}%"

PASS@1 SCORE ON HUMANEVAL


'18.92%'