In [None]:
import subprocess

def format_heval(problem, solution):
    """Assemble a self-contained test script for one benchmark problem.

    The script consists of three sections separated by blank lines: the
    candidate solution, the problem's test code, and a final call that passes
    the entry point to ``check``.

    Args:
        problem: mapping that provides the ``'test'`` and ``'entry_point'`` keys.
        solution: source code of the candidate solution.

    Returns:
        The complete script as a single string.
    """
    candidate_source = f"{solution}"
    test_source = f"{problem['test']}"
    invocation = f"check({problem['entry_point']})"
    return "\n\n".join((candidate_source, test_source, invocation))

def execute_test(problem, solution):
    """Run a candidate solution against a problem's tests in a child interpreter.

    The script produced by ``format_heval`` is written to ``auto_eval.py`` in
    the current working directory and executed with a 10 second timeout. The
    child's stdout and stderr are discarded.

    Because the script path is fixed, concurrent calls would overwrite each
    other's file; call this function sequentially.

    NOTE(review): this executes model-generated code without any sandboxing.

    Args:
        problem: mapping with the ``'test'`` and ``'entry_point'`` keys.
        solution: source code of the candidate solution.

    Returns:
        True if the script exits with status 0, False if it exits non-zero or
        exceeds the timeout.
    """
    import sys  # function-scope import: only needed to locate the interpreter

    # Python reads source files as UTF-8, so write the script as UTF-8 instead
    # of relying on the platform's default text encoding.
    with open("auto_eval.py", "w", encoding="utf-8") as eval_file:
        eval_file.write(format_heval(problem, solution))

    # Use the interpreter running this notebook rather than whatever `python`
    # happens to be on PATH (which may be another version or missing).
    interpreter = sys.executable or 'python'

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(
            [interpreter, 'auto_eval.py'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
            timeout=10,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        return False

    return True

def get_markdown(solution):
    """Extract the first ```python fenced code block from a model response.

    Args:
        solution: text that may contain a fenced Python block.

    Returns:
        The stripped contents of the first block opened with ```python and
        closed with ```. An empty string is returned when the opening or the
        closing fence is missing, or when ``solution`` is not a string.
    """
    delimiter = "```python"

    try:
        start = solution.index(delimiter) + len(delimiter)
        end = solution.index("```", start)
        return solution[start:end].strip()
    except (ValueError, AttributeError, TypeError):
        # ValueError: a fence was not found. AttributeError/TypeError: the
        # input is not a string. Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate.
        return ""

In [None]:
from datasets import load_dataset

def prepare_mbpp(problem):
    """Convert an MBPP record into the shape the evaluation code expects.

    ``format_heval`` ends every script with ``check(<entry_point>)``, so the
    MBPP assertions are wrapped in a ``check(candidate)`` function; without
    the wrapper the generated script would fail on the undefined name
    ``check``. The assertions call the solution by its own name, so the
    ``candidate`` argument is unused.

    The task description is also exposed under ``"prompt"`` because the
    prompt-building code in this notebook reads ``problem['prompt']``.

    Args:
        problem: MBPP record providing ``'test_list'``, ``'text'`` and
            ``'task_id'``.

    Returns:
        Dict with the keys ``"test"``, ``"entry_point"``, ``"text"``,
        ``"prompt"`` and ``"task_id"``.

    Raises:
        IndexError: if ``test_list`` is empty or its first entry does not look
            like ``assert name(...)``.
    """
    test_list = problem['test_list']
    # The entry point is the token between "assert " and the first "(" of the
    # first test, e.g. "assert add(1, 2) == 3" -> "add".
    fn_name = test_list[0].split("(")[0].split(' ')[1]
    check_body = "\n".join(
        f"    {line}" for test in test_list for line in test.splitlines()
    )
    return {
        "test": f"def check(candidate):\n{check_body}",
        "entry_point": fn_name,
        "text": problem['text'],
        "prompt": problem['text'],
        "task_id": problem['task_id']
    }
    
# Active benchmark: the "test" split of HumanEval, converted to a plain Python list.
dataset = load_dataset("openai/openai_humaneval", split="test").to_list()
# Alternative benchmark (disabled): uncomment both lines below to replace
# `dataset` with MBPP records converted by prepare_mbpp.
# dataset = load_dataset("google-research-datasets/mbpp", split="test").to_list()
# dataset = [prepare_mbpp(d) for d in dataset]

In [None]:
import requests
import concurrent

def generate_local(tokenizer, prompt, url="http://192.168.1.14:7780/generate", timeout=None):
    """Request one greedy completion for ``prompt`` from the generation server.

    The user prompt is combined with a fixed system prompt through the
    tokenizer's chat template, then POSTed as JSON to ``url``.

    Args:
        tokenizer: object exposing ``apply_chat_template``.
        prompt: user message to send to the model.
        url: generation endpoint; defaults to the address this notebook was
            written against.
        timeout: seconds to wait for the server, passed to ``requests.post``.
            ``None`` (the default) waits indefinitely.

    Returns:
        The ``'completion'`` field of the server's JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    chat_prompt = tokenizer.apply_chat_template([
        {
            "role": "system",
            "content": "You are a world-class AI system, capable of complex reasoning and reflection. Reason through the query inside <thinking> tags, and then provide your final response inside <output> tags. If you detect that you made a mistake in your reasoning at any point, correct yourself inside <reflection> tags."
        },
        {
            "role": "user",
            "content": prompt
        }
    ], tokenize=False, add_generation_prompt=True)

    response = requests.post(url, json={
        "prompt": chat_prompt,
        "temperature": 0.0, # for greedy generation
        "top_p": 1.0, # for greedy generation
        "top_k": -1, # for greedy generation
        # NOTE(review): presumably the model's end-of-sequence special token
        # ids; confirm against the tokenizer in use.
        "stop_token_ids": [
            128001,
            128008,
            128009
        ]
    }, timeout=timeout)
    # Surface server-side failures as an HTTP error instead of an obscure
    # JSON decoding or KeyError on the error body.
    response.raise_for_status()

    return response.json()['completion']


def generate_local_batch(tokenizer, problems, max_workers=20):
    """Generate completions for many problems concurrently.

    One ``generate_local`` call per problem is submitted to a thread pool.
    Results are yielded as they finish, so the output order generally differs
    from the order of ``problems``.

    Args:
        tokenizer: forwarded to ``generate_local``.
        problems: iterable of dicts, each with a ``"prompt"`` key.
        max_workers: size of the thread pool.

    Yields:
        A copy of each problem dict with an added ``"solution"`` key holding
        the completion. An exception raised by a request is re-raised when its
        result is reached.
    """
    # `import concurrent` alone does not load the `futures` submodule; import
    # it explicitly so this does not depend on another library having done so.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as exe:
        futures = {
            exe.submit(generate_local, tokenizer, problem["prompt"]): problem
            for problem in problems
        }

        for future in concurrent.futures.as_completed(futures):
            yield {
                **futures[future],
                "solution": future.result()
            }

In [None]:
import transformers
from tqdm import tqdm

def get_output(completion):
    """Return the text the model placed inside its ``<output>`` tags.

    Args:
        completion: raw model completion.

    Returns:
        The text between the first ``<output>`` tag and the following
        ``</output>`` tag (or the end of the text when the closing tag is
        missing). If there is no ``<output>`` tag, or ``completion`` is not a
        string, a notice is printed and ``completion`` is returned unchanged.
    """
    try:
        answer = completion.split("<output>")[1]
    except (IndexError, AttributeError, TypeError):
        # IndexError: no opening tag. AttributeError/TypeError: not a string.
        print("model did not produce output tags")
        return completion

    # Drop the closing tag and anything after it so only the answer remains.
    return answer.split("</output>")[0]

# Extracted model answers, appended in the order completions arrive.
completions = []
# Passed to generate_local_batch as the thread-pool size.
max_concurrent = 50
# Despite the name, this holds one pass/fail boolean per evaluated problem.
solutions = []
# NOTE(review): absolute, machine-specific model path; this cell will not run
# elsewhere without editing it.
tokenizer = transformers.AutoTokenizer.from_pretrained('/home/anton/personal/models/ref_70_e3')

# Replace each problem's raw prompt with an instruction that embeds it in a
# fenced Python block; all other fields of the problem are kept.
prompted_problems = [{**problem, "prompt": f"Please provide a Python script that completes the following problem:\n```python\n{problem['prompt'].strip()}\n```"} for problem in dataset]
# Results are yielded as requests finish, so `completions` and `solutions`
# line up with each other but not with the order of `dataset`.
for completed_problem in tqdm(generate_local_batch(tokenizer, prompted_problems, max_concurrent), total=len(prompted_problems)):
    # Keep only the text after the <output> tag, then run its fenced code
    # block against the problem's tests.
    solution = get_output(completed_problem["solution"])
    passed = execute_test(completed_problem, get_markdown(solution))
    completions.append(solution)
    solutions.append(passed)

In [None]:
import numpy as np

# Pass rate as a percentage string: the mean of the pass/fail booleans
# collected in `solutions`, rounded to two decimals.
f"{round(np.mean(solutions) * 100, 2)}%"