### setup

In [3]:
import gzip
import json
from typing import List
from collections import Counter, defaultdict
from tqdm import tqdm
import os
import inspect_ai
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from dotenv import load_dotenv
import torch as t
import subprocess
import contextlib
import shutil

In [4]:
from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.util import ExecResult, sandbox
from inspect_ai.scorer import CORRECT, INCORRECT, Score, Scorer, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate
from inspect_ai.model import get_model
import re

In [5]:
load_dotenv()
login(token = os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
IMPORT_HELPER = {
    "python": [
        "import math",
        "import re",
        "import sys",
        "import copy",
        "import datetime",
        "import itertools",
        "import collections",
        "import heapq",
        "import statistics",
        "import functools",
        "import hashlib",
        "import numpy",
        "import numpy as np",
        "import string",
        "from typing import *",
        "from collections import *",
    ],
    "go"    : [
        "math",
        "strings",
        "fmt",
        "strconv",
        "time",
        "bytes",
        "regexp",
        "sort",
        "math/rand",
        "crypto/md5",
    ],
    "cpp"   : [
        "#include<stdlib.h>",
        "#include<algorithm>",
        "#include<math.h>",
        "#include<stdio.h>",
        "#include<vector>",
        "#include<string>",
        "#include<climits>",
        "#include<cstring>",
        "#include<iostream>",
    ],
}

### Data Processing

In [7]:
def stream_jsonl_all(filename: str):
    results = []
    fp = gzip.open(open(filename, "rb"), "rt")
    for line in fp:
        if any(not x.isspace() for x in line):
            results.append(json.loads(line))
    fp.close()

    return results

In [8]:
python_content = stream_jsonl_all('data/python_data.gz')
cpp_content = stream_jsonl_all('data/cpp_data.gz')
go_content = stream_jsonl_all('data/go_data.gz')
java_content = stream_jsonl_all('data/java_data.gz')
js_content = stream_jsonl_all('data/js_data.gz')
content = [python_content, cpp_content, go_content, java_content, js_content]

In [9]:
generations = stream_jsonl_all('data/python_generations.gz')
generations[0]['generation']

'    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n'

In [10]:
for lang in content:
    print(lang[0].keys())
    print()

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'text', 'declaration', 'example_test'])

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'declaration', 'example_test'])

dict_keys(['task_id', 'prompt', 'import', 'docstring', 'declaration', 'canonical_solution', 'test', 'test_setup', 'example_test'])

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'text', 'declaration', 'example_test'])

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'declaration', 'example_test'])



### LLM output => code

#### Inspect output

In [21]:
openai_log = inspect_ai.log.read_eval_log('../logs/HumanEval_4o-mini.eval')
llama_log = inspect_ai.log.read_eval_log('../logs/HumanEval_Llama-31-8b.eval')

In [26]:
llama_log.samples[1].scores

{'verify': Score(value='C', answer='    """\n    Separate groups of balanced parentheses from a string.\n\n    Args:\n    paren_string: A string containing multiple groups of nested parentheses.\n\n    Returns:\n    A list of strings where each string represents a separate group of balanced parentheses.\n    """\n    result = []\n    current_group = \'\'\n    balance = 0\n    for char in paren_string:\n        if char == \'(\':\n            balance += 1\n            current_group += char\n        elif char == \')\':\n            balance -= 1\n            if balance == 0:\n                result.append(current_group + char)\n                current_group = \'\'\n            else:\n                current_group += char\n        if balance < 0:\n            raise ValueError("Unbalanced parentheses in the input string")\n    if balance != 0:\n        raise ValueError("Unbalanced parentheses in the input string")\n    return result\n', explanation='', metadata=None)}

In [37]:
# print(f'gpt 4o-mini:\n\n{openai_log.samples[0].output.choices[0].message.content}')
# print('\n\n')
print(f'llama 3.1 8B:\n\n{llama_log.samples[0].output.choices[0].message.content}')

llama 3.1 8B:

```python
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """
    Check if in given list of numbers, are any two numbers closer to each other than given threshold.

    Args:
        numbers (List[float]): A list of floating point numbers.
        threshold (float): The minimum distance between two numbers to be considered close.

    Returns:
        bool: True if any two numbers are closer than the given threshold, False otherwise.
    """
    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) <= threshold:
                return True
    return False
```


In [None]:
print(openai_log.samples[0].input)

#### HumanEval-X output
*now i have prompt template, feed it into model & look at output*

In [7]:
INSTRUCTION = """
Read the following function signature and docstring, and fully implement
the function described. Your response should only contain the code for
this function.\n
"""

LANG_PREFIX = {
    "cpp"          : "// language: C++",
    "java"         : "// language: Java",
    "js"           : "// language: JavaScript",
    "javascript"   : "// language: JavaScript",
    "go"           : "// language: Go",
    "python"       : "# language: Python",
}

In [31]:
my_lang = 'python'
model_input = INSTRUCTION + LANG_PREFIX[my_lang] + '\n' + python_content[0]['prompt'] 

In [36]:
client = OpenAI()

response = client.responses.create(
  model="gpt-4o-mini",
  input=model_input
)

response_text = response.output[0].content[0].text

print(response_text)

```python
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False
```


In [None]:
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-3.1-8B-Instruct', device_map='auto', torch_dtype=t.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct', padding_side="left", torch_dtype=t.bfloat16)
tokenizer.pad_token_id = tokenizer.eos_token_id
model.eval()

In [33]:
tokenized_input = tokenizer([model_input], return_tensors='pt').to('cuda')
generated_ids = model.generate(**tokenized_input, max_new_tokens=800)
out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [34]:
print(model_input)


Read the following function signature and docstring, and fully implement
the function described. Your response should only contain the code for
this function.

# language: Python
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



In [35]:
print(out)


Read the following function signature and docstring, and fully implement
the function described. Your response should only contain the code for
this function.

# language: Python
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    pass

# The function should return True if there are any two numbers closer to each other
# than the threshold, and False otherwise. The function should not consider
# duplicates in the list. It should also not consider a number with itself.

# The function should work for any list of numbers and any positive threshold.
# The function should also work for lists with less than two elements.
# The function should also work for lists with no elements.

# The function

In [None]:
# generate this from model now lol.
# dont be pussy.
# two things. one generate from openai model
# two, generate from hf model

relevant?
- task id 
- prompt
- canonical solution

In [28]:
python_content[0]

{'task_id': 'Python/0',
 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n',
 'canonical_solution': '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(has_close_elements):\n    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert has_close_elements([1.0, 2.0, 5.9, 4.0,

In [None]:
# testing, python stuff
for sample in python_content:
    task_id = sample['task_id']
    language = task_id.split('/')[0].lower()

    prompt = sample['prompt']
    test = python_content[0]['test']
    break
    # code = 



METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(has_close_elements):
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert has_close_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False

check(has_close_elements)


### Execute Code  
*assuming I have code, ensure I can execute it*

In [33]:
java_content[0]['text']

'    Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n    >>> hasCloseElements(Arrays.asList(1.0, 2.0, 3.0), 0.5)\n    false\n    >>> hasCloseElements(Arrays.asList(1.0, 2.8, 3.0, 4.0, 5.0, 2.0), 0.3)\n    true'

In [None]:
def get_executable_code(records):
    code_list = []
    for record in records:
        code = [
            record['prompt'],
            record['canonical_solution'],
            '\n',
            record['test'],
        ]
        code_list.append(''.join(code))
    
    return code_list

def get_code_idx(idx):
    out = get_executable_code(java_content[idx:idx+1])
    return out[0]

In [None]:
@scorer(metrics=[accuracy(), stderr()])
def java_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        code = get_code_idx(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/java_tmp/test_{idx}/'
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)
        file = os.path.join(tmp_dir, 'Main.java')

        with contextlib.chdir(tmp_dir):
            open(file, 'w').write(code)
        
        try:
            compile_proc = subprocess.run(
                ["javac", "Main.java"],
                cwd=tmp_dir,
                capture_output=True,
                text=True,
                timeout=30  # seconds, adjust as needed
            )
            if compile_proc.returncode != 0:
                print("Compilation failed!")
                print("stderr:", compile_proc.stderr)
        except subprocess.TimeoutExpired:
            print("Compilation timed out!")
        except Exception as e:
            print("Compilation error:", e)

        try:
            result = await sandbox().exec(
                cmd=["java", "-cp", tmp_dir, "Main"],
                timeout=30,
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        shutil.rmtree(tmp_dir)

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

In [None]:
def humaneval_record_to_sample(record):
    model_input = 'hello world'
    
    return Sample(
        id=record["task_id"],
        input=model_input,
        target=record["canonical_solution"],
        metadata={
            "prompt": record["prompt"],
            "test": record["test"],
            "entry_point": record["entry_point"],
        },
    )

humaneval_dataset = hf_dataset(
    path = 'openai_humaneval',
    split = 'test',
    sample_fields = humaneval_record_to_sample,
    trust = True,
)

@task
def humaneval():
    return Task(
        dataset = humaneval_dataset,
        solver = generate(),
        scorer = java_scorer(),
        sandbox = 'local',
    )

In [None]:
inspect_ai.eval(humaneval(), model = 'openai/gpt-4o-mini', epochs = 1)

#### Working Scorers

In [None]:
# JAVA SCORER

@scorer(metrics=[accuracy(), stderr()])
def java_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        code = get_code_idx(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/java_tmp/test_{idx}/'
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)
        file = os.path.join(tmp_dir, 'Main.java')

        with contextlib.chdir(tmp_dir):
            open(file, 'w').write(code)
        
        try:
            compile_proc = subprocess.run(
                ["javac", "Main.java"],
                cwd=tmp_dir,
                capture_output=True,
                text=True,
                timeout=30  # seconds, adjust as needed
            )
            if compile_proc.returncode != 0:
                print("Compilation failed!")
                print("stderr:", compile_proc.stderr)
        except subprocess.TimeoutExpired:
            print("Compilation timed out!")
        except Exception as e:
            print("Compilation error:", e)

        try:
            result = await sandbox().exec(
                cmd=["java", "-cp", tmp_dir, "Main"],
                timeout=30,
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        shutil.rmtree(tmp_dir)

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

JAVA NOTES  
- run the following code to install Java:  
    `apt-get update`  
    `apt-get install -y openjdk-17-jdk`  
  verify with:  
    `java -version`  
    `javac -version`  

In [None]:
# GO SCORER

@scorer(metrics=[accuracy(), stderr()])
def go_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        code = get_go_stuff_new(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/go_tmp/test_{idx}/'
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)
        file = os.path.join(tmp_dir, 'main_test.go')

        with contextlib.chdir(tmp_dir):
            open(file, 'w').write(code)
            if not os.path.exists('go.mod'):
                subprocess.run(['/usr/local/go/bin/go', 'mod', 'init', f'example.com/tmpmod_{idx}'], check=True)
            subprocess.run(['/usr/local/go/bin/go', 'mod', 'tidy'], check=True)
        
        try:
            result = await sandbox().exec(
                cmd=["/usr/local/go/bin/go", "test", file],
                timeout=30,
                cwd=tmp_dir
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        # shutil.rmtree(tmp_dir)

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

In [None]:
# GO PARSER

def get_go_stuff_new(idx):
    import_string = go_content[idx]['import']
    prompt = go_content[idx]['prompt'].replace(import_string, '')
    code = go_content[idx]['canonical_solution']        # FYI, change this to *generated code* later!

    test = go_content[idx]['test']
    test_setup = go_content[idx]['test_setup']
    other_pkgs = []

    for pkg in IMPORT_HELPER['go']:
        if pkg not in test_setup:
            p = pkg.split('/')[-1]
            if p + '.' in code:    
                other_pkgs.append(f"\"{pkg}\"")
    if other_pkgs:
        import_other_pkgs = "import (\n" + "    ".join([p + "\n" for p in other_pkgs]) + ")"
        test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
    else:
        test_string = test_setup + "\n" + prompt + code + "\n" + test

    return test_string


GO NOTES  
- run the following shell code to install go  
    `cd ~/`  
    `GO_VERSION=1.24.5`  
    `wget https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz`  
    `rm -rf /usr/local/go`  
    `tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz`  
    `echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc`  
    `export PATH=$PATH:/usr/local/go/bin`  
    `go version`  
- you need to create separate Go package for *every* test case. Otherwise the `go.mod` files will interfere (cuz multi threading)  
- also make sure you're running eval from the same directory (ie. specify the `cwd` flag)

In [62]:
# JS SCORER

@scorer(metrics=[accuracy(), stderr()])
def js_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        ans = get_code_idx(idx)

        try:
            result = await sandbox().exec(
                cmd=["node", "-e", ans],
                timeout=30,
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + [ans]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

JS NOTES  
- remember to install node.js!  
    `apt-get update`  
    `apt-get install -y curl`  
    `curl -fsSL https://deb.nodesource.com/setup_lts.x | bash -`  
    `apt-get install -y nodejs`  
    verify installation: `node -v` // `npm -v` // `node -e "console.log('Node.js is working')"`  
- 

In [63]:
# PYTHON SCORER

@scorer(metrics=[accuracy(), stderr()])
def python_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        ans = get_code_idx(idx)

        try:
            result = await sandbox().exec(
                cmd=["python", "-c", ans],
                timeout=30,
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + [ans]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

In [None]:
# PARSER: PYTHON + JS + JAVA 

def get_executable_code(records):
    code_list = []
    for record in records:
        code = [
            record['prompt'],
            record['canonical_solution'],
            '\n',
            record['test'],
        ]
    
    return code_list

def get_code_idx(idx):
    out = get_executable_code(go_content[idx:idx+1])
    return out[0]

### Ignore RN

In [None]:
def process_humaneval_test(sample, problems):
    task_id = sample["task_id"]
    language = task_id.split("/")[0].lower()

    prompt = sample["prompt"]
    test = problems[task_id]["test"]
    code = sample["generation"]

    # Pre-process for different languages
    if language == "python":
        code_ = []
        for line in code.split("\n"):
            if (len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t'):
                break
            code_.append(line)
        code = "\n".join(code_)
        test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
        test_string = test_setup + prompt + code + "\n" + test + "\n"
    elif language == "cpp":
        test_set_up = ""
        for s in IMPORT_HELPER["cpp"]:
            if s not in prompt:
                test_set_up += s + "\n"
        test_string = test_set_up + "\n" + prompt + code + "\n" + test
    elif language == "java":
        test_string = prompt + code + "\n" + test
    elif language == "js" or language == "javascript":
        test_string = prompt + code + "\n" + test
    elif language == "go":
        import_string = problems[task_id]["import"]
        prompt = prompt.replace(import_string, "")
        test = problems[task_id]["test"]
        test_setup = problems[task_id]["test_setup"]
        other_pkgs = []
        for pkg in IMPORT_HELPER["go"]:
            if pkg not in test_setup:
                p = pkg.split("/")[-1]
                if p + "." in code:
                    other_pkgs.append(f"\"{pkg}\"")
        if other_pkgs:
            import_other_pkgs = "import (\n" + "    ".join([p + "\n" for p in other_pkgs]) + ")"
            test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
        else:
            test_string = test_setup + "\n" + prompt + code + "\n" + test

    return test_string

In [None]:
sample_jsonl = python_content
for sample in tqdm(sample_jsonl):
    task_id = sample['task_id']
    lang = task_id.split('/')[0].lower()
    if lang == 'javascript': 
        lang = 'js'
    
    sample[]

In [None]:
def evaluate_functional_correctness(
        input_file: str = None,
        tmp_dir: str = "./",
        n_workers: int = 32,
        timeout: float = 500.0,
        problem_file: str = "../data/humaneval_python.jsonl.gz",
        out_dir: str = None,
        k: List[int] = [1, 10, 100],
):
    
    problems = read_dataset(problem_file,
                            dataset_type="humaneval")
    sample_jsonl = stream_jsonl_all(input_file)

    suffix = "_results.jsonl"
    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_file = os.path.join(out_dir, input_file.split('/')[-1].replace(".jsonl", suffix))
    else:
        out_file = os.path.join(input_file.replace(".jsonl", suffix))

    if "/codegeex/benchmark/humaneval-x/" in input_file:
        test_groundtruth = True

    if "-to-" in input_file:
        translation_mode = True
    else:
        translation_mode = False

    with ThreadPoolExecutor(max_workers=n_workers) as executor:

        futures = []
        completion_id = Counter()
        n_samples = 0
        results = defaultdict(list)

        print("Reading samples...")
        for sample in tqdm(sample_jsonl):
            task_id = sample["task_id"]
            lang = task_id.split("/")[0].lower()
            if translation_mode:
                task_id = sample["task_id"].split("/")[-1]
                lang = regex.findall("-to-.*-", input_file)[0].split("-to-")[-1].rstrip("-")
                for l in LANGUAGE_NAME:
                    if l in lang:
                        lang = l
                        break
                task_id = f"{LANGUAGE_NAME[lang]}/{task_id}"
            if lang == "javascript":
                lang = "js"
            tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
            sample["task_id"] = task_id
            sample["test_code"] = process_humaneval_test(sample, problems, False)
            if sample["test_code"] is None:
                continue
            if "completion_id" in sample:
                completion_id_ = sample["completion_id"]
            else:
                completion_id_ = completion_id[task_id]
            args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
            future = executor.submit(check_correctness, *args)
            futures.append(future)
            completion_id[task_id] += 1
            n_samples += 1

        print(completion_id)
        if len(completion_id) == len(problems):
            evaluate_pass_at_k = True
        else:
            evaluate_pass_at_k = False

        print("Running test suites...")
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]].append((result["completion_id"], result))



    # Calculate pass@k.
    total, correct = [], []
    for result in results.values():
        passed = [r[1]["passed"] for r in result]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)
    if evaluate_pass_at_k:
        ks = k
        pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
                     for k in ks if (total >= k).all()}
        print(pass_at_k)
    else:
        print("Total:", np.sum(total))
        print("Correct:", np.sum(correct))

    print("Writing to: ", out_file)
    if out_file.endswith(".gz"):
        fp = gzip.GzipFile(fileobj=open(out_file, "wb"), mode="wb")
        for res in results.values():
            for r in res:
                fp.write((json.dumps(r[1]) + "\n").encode("utf-8"))
    else:
        fp = open(out_file, 'w')
        for res in results.values():
            for r in res:
                fp.write(json.dumps(r[1]) + "\n")
    fp.close()

    print("Evaluation finished.")

In [None]:
# GOAL – figure how humaneval-x generates / prompts model for code. compare w/ inspect.

# with inspect, they
#   a) prepend a default system prompt
#   b) add the dataset['prompt']
# 
# will this work for java, etc too? 
#   YA i think so. should be fine.
# 
# 


In [None]:
def stream_jsonl(filename: str):
    """
    Parses each jsonl line and yields it as a dictionary
    """
    with open(filename, "rb") as gzfp:
        with gzip.open(gzfp, "rt") as fp:
            for line in fp:
                if any(not x.isspace() for x in line):
                    yield json.loads(line)

def read_dataset(data_file: str = None):
    dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
    return dataset

In [None]:
path = 'data/humaneval_python.jsonl.gz'
out = read_dataset(path)