# Create LBPP Test Set to Use with Human-Eval Evaluation Package
This code transforms the LBPP test set to use with the human_eval package:
* Original package: https://github.com/openai/human-eval
* Same package which I modified for use with LBPP in addition to the HumanEval dataset: https://github.com/agnedil/code-generation/tree/main/modified-openai-human-eval-code.

In [68]:
import datasets
import json
import ast
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load LBPP

In [2]:
lbpp = datasets.load_dataset("CohereForAI/lbpp")

In [5]:
lbpp

DatasetDict({
    test: Dataset({
        features: ['task_id', 'language', 'title', 'instruction', 'completion', 'test_setup', 'test_list', 'signature', 'categories'],
        num_rows: 162
    })
})

In [12]:
counter = 0
max_num = 5
for ddict in lbpp['test']:
    if counter == max_num:
        break
    for k2, v2 in ddict.items():
        print('Key:\n', k2, sep='')
        print('Value:')
        if k2 == 'test_list':
            alist = ast.literal_eval(v2)
            print('\n'.join(alist))
        else:
            print(v2)
        print('\n', '-'*75, '\n', sep='')
    counter += 1
    print('\n', '='*100, '\n', sep='')

Key:
task_id
Value:
lbpp/0

---------------------------------------------------------------------------

Key:
language
Value:
python

---------------------------------------------------------------------------

Key:
title
Value:
add_avg_and_std_cols_numpy

---------------------------------------------------------------------------

Key:
instruction
Value:
Write a python function `add_avg_and_std_cols_numpy(ar: np.ndarray) -> np.ndarray` that takes a 2D numpy array and returns a 2D numpy array with two additional columns appended to the end. The first column should contain the average of each row, and the second column should contain the standard deviation of each row. You may assume that the input array has at least one row and one column.

---------------------------------------------------------------------------

Key:
completion
Value:
import numpy as np


def add_avg_and_std_cols_numpy(ar: np.ndarray) -> np.ndarray:
    avg = np.mean(ar, axis=1).reshape(-1, 1)
    std = np.std(ar, 

In [34]:
# explore test_setup
counter = 0
max_num = 1000
for ddict in lbpp['test']:
    if counter == max_num:
        break
    counter += 1
    print('Test setup:\n', ddict['test_setup'], sep='')
    print('\n', '-'*75, '\n', sep='')
    print('Signature:\n', ddict['signature'], sep='')
    print('\n', '-'*75, '\n', sep='')
    print('Title:\n', ddict['title'], sep='')
    print('\n', '='*100, '\n', sep='')

Test setup:
from code import add_avg_and_std_cols_numpy
import numpy as np



---------------------------------------------------------------------------

Signature:
def add_avg_and_std_cols_numpy(ar: np.ndarray) -> np.ndarray:

---------------------------------------------------------------------------

Title:
add_avg_and_std_cols_numpy


Test setup:
from code import anagram_combos



---------------------------------------------------------------------------

Signature:
def anagram_combos(sub_words: list[str], whole_word: str) -> int:

---------------------------------------------------------------------------

Title:
anagram_combos


Test setup:
from code import form_words_from_magazine



---------------------------------------------------------------------------

Signature:
def form_words_from_magazine(word: str, magazine: list[str]) -> int | None:

---------------------------------------------------------------------------

Title:
anonymous_letter


Test setup:
from code import a

In [31]:
# explore test_setup
from_code, other = [], []
for ddict in lbpp['test']:
    lines = ddict['test_setup'].splitlines()
    temp = []
    for line in lines:
        if line.startswith('from code'):
            from_code.append(line)
        else:
            temp.append(line)
    other.append('\n'.join(temp))
print(f'From code imports (a total of {len(from_code)}):\n')
print('\n'.join(from_code))

from_code_second_imports = [item.split(',')[1:] for item in from_code]
from_code_second_imports = sorted(set([item for sublist in from_code_second_imports for item in sublist]))

print('\n\nThis is also imported from code:\n')
print('\n'.join(from_code_second_imports))

print('\n\nOther imports:\n')
print('\n'.join(sorted(set(other))))

From code imports (a total of 162):

from code import add_avg_and_std_cols_numpy
from code import anagram_combos
from code import form_words_from_magazine
from code import apply_discount, Products
from code import are_all_int_present
from code import arrange_grades
from code import at_least_this_fast
from code import are_targets_obtainable
from code import average_grades
from code import average_salary_by_department
from code import cheapest_connection
from code import has_diff_path, TreeNode
from code import get_all_elements_in_range, TreeNode
from code import binary_representations
from code import can_reach_exit
from code import bonus_calculation
from code import BowlingAlley
from code import build_prompt
from code import LRUCache
from code import calculateCommand
from code import can_exit_maze
from code import cheapest_connection
from code import check_urgent_messages
from code import max_reactions
from code import chess_attacks
from code import clockwise_spiral
from code import Cl

### It is easier to convert LBPP into the MBPP Test format as done in `2. Create_MBPP_Test_Set.ipynb`
* __When adding `test_setup`, disregard `from code import` statements - they import the function to be written.
* The purpose of and impact of not having `second imports from code` is not clear: import ListNode, Node, Products, TreeNode, find_equilibrium_node, find_path, get_double_median, is_k_weight_balanced, merge_partially_sorted_list,  shortestRootToLeafPath, swapIthNodeAtKthLevel, swap_outer_inner_nodes. We may be fine if we assume that these should be generated by the LLM when it generates the code to implement the task from the verbal description.
* Add `other imports` to assertions.
* To avoid unexpected impact, need to test assertions to see if there are other errors raised besides the assertion error.

## Create and Save LBPP_TEST Dataset for Use with Human-Eval Package

In [60]:
problems_lbpp = []

for lbpp_item in lbpp['test']:
    
    task_id = lbpp_item["task_id"]

    # (1) test_setup, if present
    test_setup = lbpp_item['test_setup'].splitlines()
    test_setup = [ line for line in test_setup if line.strip() and
                   not line.startswith('from code') and
                   not line.startswith('#') ]
    
    # (2) grab import statements from the solution, if any
    solution_imports = lbpp_item['completion'].splitlines()
    solution_imports = [ line for line in solution_imports if line.startswith('import') or
                         line.startswith('from ') ]
    test_setup = [ line for line in test_setup if line not in solution_imports ]
    test_setup = '\n'.join(test_setup + solution_imports) + '\n\n'
    
    # (3) test_list (required LBPP tests)
    test_list = ast.literal_eval( lbpp_item['test_list'] )
    test_list = '\n'.join(test_list)
    
    # (4) combine test_setup, solution_imports, and test_list
    test_code = test_setup + test_list

    problem = {
        'task_id': task_id,
        'test': test_code,
        'prompt':  lbpp_item['instruction'],
        'canonical_solution': lbpp_item['completion'],
        'entry_point': None,
        'title': lbpp_item['title'],
        'signature': lbpp_item['signature'],
        'categories': lbpp_item['categories'],
    }

    problems_lbpp.append(problem)
len(problems_lbpp)

162

In [63]:
for i in range(140,180):
    for k,v in problems_lbpp[i].items():
        print(k)
        print(v)
        print('-'*100)
    print('\n', '='*100, '\n', sep='')

task_id
lbpp/140
----------------------------------------------------------------------------------------------------
test
from datetime import datetime
from typing import Any, Literal

message_list = [
    {"text": "Hey how is it going?", "author": "Thomas", "date": datetime(2021, 1, 2)},
    {"text": "Hey it's Jordan", "author": "Jordan", "date": datetime(2021, 1, 3)},
    {"text": "Hi, it's Paul!!!", "author": "Paul", "date": datetime(2021, 1, 6)},
]
assert sort_messages(message_list) == [
    "Hey it's Jordan",
    "Hi, it's Paul!!!",
    "Hey how is it going?",
]
message_list = [
    {"text": "Hey how is it going?", "author": "Thomas", "date": datetime(2021, 1, 2)},
    {"text": "Hey it's Jordan", "author": "Jordan", "date": datetime(2021, 1, 3)},
    {"text": "Hi, it's Paul!!!", "author": "Paul", "date": datetime(2021, 1, 6)},
    {
        "text": "Hi, it's Paul from 2020!!!",
        "author": "Paul",
        "date": datetime(2020, 1, 6),
    },
    {
        "text": "Hi, it's 

IndexError: list index out of range

In [77]:
# is it always a function that needs to be generated? Yes, even if they forgot to use 'def'
signatures = [item['signature'] for item in problems_lbpp]
print('No def:', [i for i in signatures if not i.startswith('def ')], '\n')
print('\n'.join(signatures))

No def: ['findTargetSumNodes(root: TreeNode, B: int) -> list[int]:', 'min_g_c_d_subsets(nums: list[int]) -> int:', 'swapIthNodeAtKthLevel(root: TreeNode, i: int, k: int) -> TreeNode:'] 

def add_avg_and_std_cols_numpy(ar: np.ndarray) -> np.ndarray:
def anagram_combos(sub_words: list[str], whole_word: str) -> int:
def form_words_from_magazine(word: str, magazine: list[str]) -> int | None:
def apply_discount(order_list: list[Products], category: str, discount: float) -> float:
def are_all_int_present(numbers: tuple[int]) -> bool:
def arrange_grades(students: list[list[int]]) -> bool:
def at_least_this_fast(input_list: list[tuple[float, float]]) -> float:
def are_targets_obtainable(targets: list[int], coin_values: list[int], k: int) -> list[bool]:
def average_grades(grades: pd.DataFrame) -> list[tuple[float, str]]:
def average_salary_by_department(df: pd.DataFrame) -> list[tuple[str, float]]:
def cheapest_connection(flights: list[list[str]], costs: list[int], times: list[int]) -> list[str

In [65]:
from human_eval.data import write_jsonl
file = 'data/LBPP_Test.jsonl.gz'
write_jsonl(file, problems_lbpp)

## Test new package

In [69]:
from human_eval.data import read_problems, HUMAN_EVAL, MBPP_TEST, LBPP_TEST

In [70]:
problems = read_problems(LBPP_TEST)

In [71]:
problems

{'lbpp/0': {'task_id': 'lbpp/0',
  'test': 'import numpy as np\n\nar = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\nexpected_output = np.array([[1, 2, 3, 2.0, 0.81649658], [4, 5, 6, 5.0, 0.81649658], [7, 8, 9, 8.0, 0.81649658]])\nassert np.allclose(add_avg_and_std_cols_numpy(ar), expected_output)\nar = np.array([[1], [1], [2]])\nexpected_output = np.array([[1, 1, 0.0], [1, 1, 0.0], [2, 2, 0.0]])\nassert np.allclose(add_avg_and_std_cols_numpy(ar), expected_output)\nar = np.array([[1, 2, 3, 4, 5]])\nexpected_output = np.array([[1, 2, 3, 4, 5, 3.0, 1.41421356]])\nassert np.allclose(add_avg_and_std_cols_numpy(ar), expected_output)',
  'prompt': 'Write a python function `add_avg_and_std_cols_numpy(ar: np.ndarray) -> np.ndarray` that takes a 2D numpy array and returns a 2D numpy array with two additional columns appended to the end. The first column should contain the average of each row, and the second column should contain the standard deviation of each row. You may assume that the input a