### setup

In [None]:
import gzip
import json
from typing import List
from collections import Counter, defaultdict
from tqdm import tqdm
import os
import inspect_ai
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from dotenv import load_dotenv
import torch as t
import subprocess
import contextlib
import shutil
import ast
import textwrap

In [None]:
from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.util import ExecResult, sandbox
from inspect_ai.scorer import CORRECT, INCORRECT, Score, Scorer, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate
from inspect_ai.model import get_model
from inspect_ai.log import read_eval_log
import re

In [None]:
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment
# variable; indexing os.environ raises KeyError when the variable is unset.
login(token = os.environ['HF_TOKEN'])

In [None]:
# Per-language lists of common imports. In this notebook the "cpp" entries are
# prepended by get_cpp when the problem prompt does not already contain them,
# and the "go" entries are candidate packages that get_go_stuff_new imports
# when the solution code appears to use them.
IMPORT_HELPER = {
    "python": [
        "import math",
        "import re",
        "import sys",
        "import copy",
        "import datetime",
        "import itertools",
        "import collections",
        "import heapq",
        "import statistics",
        "import functools",
        "import hashlib",
        "import numpy",
        "import numpy as np",
        "import string",
        "from typing import *",
        "from collections import *",
    ],
    "go"    : [
        "math",
        "strings",
        "fmt",
        "strconv",
        "time",
        "bytes",
        "regexp",
        "sort",
        "math/rand",
        "crypto/md5",
    ],
    "cpp"   : [
        "#include<stdlib.h>",
        "#include<algorithm>",
        "#include<math.h>",
        "#include<stdio.h>",
        "#include<vector>",
        "#include<string>",
        "#include<climits>",
        "#include<cstring>",
        "#include<iostream>",
    ],
}

# Instruction text placed in front of every problem prompt that is sent to the
# model (humaneval_record_to_sample concatenates it with the language prefix
# and the problem prompt).
INSTRUCTION = """
Read the following function signature and docstring, and fully implement
the function described. Your response should only contain the code for
this function.\n
"""

# One-line "language: X" comment, written in the target language's own comment
# syntax, that is inserted between INSTRUCTION and the problem prompt.
LANG_PREFIX = {
    "cpp"          : "// language: C++",
    "java"         : "// language: Java",
    "js"           : "// language: JavaScript",
    "javascript"   : "// language: JavaScript",
    "go"           : "// language: Go",
    "python"       : "# language: Python",
}

In [None]:
# Model handle that is later passed to inspect_ai.eval; the device and
# torch_dtype keyword arguments are forwarded to get_model.
model = get_model(
        'hf/meta-llama/Llama-3.1-8B-Instruct', 
        device = 'auto',
        torch_dtype=t.bfloat16,
)

### Data Processing

In [None]:
def stream_jsonl_all(filename: str) -> list:
    """Load every record from a gzip-compressed JSON Lines file.

    Args:
        filename: Path to a ``.gz`` file containing one JSON document per line.

    Returns:
        The decoded JSON values in file order. Lines that are empty or contain
        only whitespace are skipped.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # gzip opens the path itself inside a context manager, so the decompressor
    # and the underlying file handle are both closed, even when a line fails
    # to parse. JSON text is decoded as UTF-8 rather than the locale default.
    with gzip.open(filename, "rt", encoding="utf-8") as fp:
        return [json.loads(line) for line in fp if line.strip()]

In [None]:
# Load the per-language problem records from the local data/ directory.
python_content = stream_jsonl_all('data/python_data.gz')
cpp_content = stream_jsonl_all('data/cpp_data.gz')
go_content = stream_jsonl_all('data/go_data.gz')
java_content = stream_jsonl_all('data/java_data.gz')
js_content = stream_jsonl_all('data/js_data.gz')
# All languages in one list, in the order python, cpp, go, java, js.
content = [python_content, cpp_content, go_content, java_content, js_content]

In [None]:
# Load previously saved Python generations and display the 'generation' field
# of the first record.
generations = stream_jsonl_all('data/python_generations.gz')
generations[0]['generation']

In [None]:
# Print the field names of the first record of every language's dataset.
# NOTE: this loop rebinds the module-level name `lang` to a list of records.
for lang in content:
    print(lang[0].keys())
    print()

### LLM output => code  
*I need to see LLM output & transform that into code*

#### Find Code

In [None]:
def find_code_new(completion: str) -> str:
    """Pull the function body out of a model completion.

    Fenced blocks tagged ``python`` are collected first, followed by untagged
    fenced blocks. The first collected block is handed to
    ``extract_function_body_new``; when the completion contains no fenced
    block at all, the completion itself is returned.

    Args:
        completion: Raw text produced by the model.

    Returns:
        The extracted text, always converted with ``str``.
    """
    fenced_blocks = []
    for fence in (r"```python\n(.*?)```", r"```\n(.*?)```"):
        fenced_blocks.extend(re.findall(fence, completion, re.DOTALL))

    if not fenced_blocks:
        return str(completion)
    return str(extract_function_body_new(fenced_blocks[0]))


def extract_function_body_new(code: str) -> str:
    """Return the source lines of the body of the first top-level function.

    The body spans from the line of the function's first statement to the last
    line of its final statement, so the ``def`` line and any decorators are
    dropped while the original indentation is kept.

    Args:
        code: Python source expected to define at least one top-level function.

    Returns:
        The body lines joined with newlines, or the sentinel ``"error msg"``
        when the source cannot be parsed or defines no top-level function.
    """
    try:
        tree = ast.parse(code)
    except Exception as e:
        # Best-effort boundary: model output is arbitrary text, so any parse
        # failure is reported through the sentinel instead of propagating.
        print(f"Error extracting function body: {e}")
        return "error msg"

    for node in tree.body:
        if isinstance(node, ast.FunctionDef):
            code_lines = code.splitlines()
            start = node.body[0].lineno - 1
            end = node.body[-1].end_lineno
            return "\n".join(code_lines[start:end])

    # Previously this path fell through and returned None, which the caller
    # turned into the literal string "None".
    print("Error extracting function body: no function definition found")
    return "error msg"
    

#### Run Eval

In [None]:
@scorer(metrics=[accuracy(), stderr()])
def verify() -> Scorer:
    """Build a scorer that checks whether code could be extracted from a completion.

    The scorer does not execute anything: a sample is CORRECT unless
    ``find_code_new`` reports an extraction failure. The raw completion, the
    extracted code, and the assembled program (prompt + extracted code + test)
    are stored in the score metadata for later inspection.
    """
    async def score(state: TaskState, target: Target) -> Score:
        idx = state.sample_id
        model_completion = state.output.completion
        processed_completion = find_code_new(model_completion)

        # Compare against the exact failure sentinel returned by the extractor.
        # A substring test for 'error' would also reject valid solutions whose
        # code merely mentions the word.
        extraction_failed = processed_completion == "error msg"
        if extraction_failed:
            print(f'error in sample: {idx}')
        val = INCORRECT if extraction_failed else CORRECT

        final_code = [
            state.metadata["prompt"],
            processed_completion,
            "\n",
            state.metadata["test"],
        ]

        return Score(
            value=val,
            metadata={
                'completion': model_completion,
                'processed': processed_completion,
                'final_code': ''.join(final_code),
                'idx': idx
            },
            answer=f'model completion: {model_completion}',
        )

    return score

In [None]:
# Target language; selects the LANG_PREFIX entry and the dataset configuration.
lang = 'python'

def humaneval_record_to_sample(record):
    """Convert one dataset record into an inspect_ai Sample.

    The model input is INSTRUCTION, the language prefix for the module-level
    ``lang``, a newline, and the record's prompt. The canonical solution is
    the target; the prompt and test are carried along as metadata.
    """
    model_input = "".join(
        [INSTRUCTION, LANG_PREFIX[lang], '\n', record['prompt']]
    )

    sample_kwargs = dict(
        id=record["task_id"],
        input=model_input,
        target=record["canonical_solution"],
        metadata={"prompt": record["prompt"], "test": record["test"]},
    )
    return Sample(**sample_kwargs)

# Test split of the THUDM/humaneval-x dataset for the configuration named by
# `lang`; every record is converted with humaneval_record_to_sample.
humaneval_dataset = hf_dataset(
    path = 'THUDM/humaneval-x',
    name = lang,
    split = 'test',
    sample_fields = humaneval_record_to_sample,
    trust = True,
)

In [None]:
@task
def humaneval():
    """Task that generates one completion per sample and scores it with verify()."""
    task_kwargs = dict(
        dataset=humaneval_dataset,
        solver=generate(),
        scorer=verify(),
        sandbox='local',
    )
    return Task(**task_kwargs)

In [None]:
# Run the task once (a single epoch) with the locally loaded model.
# for i in range(4):
inspect_ai.eval(humaneval(), model = model, epochs = 1)

#### Code Extraction Tests

In [None]:
# Open a saved eval log and unpack the metadata that the verify scorer stored
# for the sample at position 52.
test = read_eval_log('logs/llama.eval')
data = test.samples[52].scores['verify'].metadata
completion = data['completion']
processed = data['processed']
final = data['final_code']
idx = data['idx']

# Display the sample id as the cell output.
idx
# 34, 51, 52

In [None]:
# Positions (out of the first 164 samples) whose verify score value is not 'C'.
[i for i in range(164) if test.samples[i].scores['verify'].value != 'C' ]

In [None]:
# Show the raw model completion of the sample selected above.
print(completion)

In [None]:
# preprocess – get 5 evals files!
num_eval_logs = 5
eval_logs = []
for i in range(num_eval_logs):
    file_name = f'logs/test_{i}.eval'
    eval_logs.append(read_eval_log(file_name))

# Check that every run extracted exactly the same code for every sample.
samples_read = 164
for i in range(samples_read):
    processed_results = []
    for j in range(num_eval_logs):
        # Index by j so each log is actually compared; reading eval_logs[0]
        # every time made the assertion below trivially true.
        data = eval_logs[j].samples[i].scores['verify'].metadata
        processed_results.append(data['processed'])

    assert len(set(processed_results)) == 1, (
        f'extracted code differs across eval logs for sample {i}'
    )

# if this works, the extractor is deterministic across all 164 samples.

#### Working Extractors

In [None]:
# PYTHON

def find_code_new(completion: str) -> str:
    """Pull the function body out of a model completion.

    Fenced blocks tagged ``python`` are collected first, followed by untagged
    fenced blocks. The first collected block is handed to
    ``extract_function_body_new``; when the completion contains no fenced
    block at all, the completion itself is returned.

    Args:
        completion: Raw text produced by the model.

    Returns:
        The extracted text, always converted with ``str``.
    """
    fenced_blocks = []
    for fence in (r"```python\n(.*?)```", r"```\n(.*?)```"):
        fenced_blocks.extend(re.findall(fence, completion, re.DOTALL))

    if not fenced_blocks:
        return str(completion)
    return str(extract_function_body_new(fenced_blocks[0]))


def extract_function_body_new(code: str) -> str:
    """Return the source lines of the body of the first top-level function.

    The body spans from the line of the function's first statement to the last
    line of its final statement, so the ``def`` line and any decorators are
    dropped while the original indentation is kept.

    Args:
        code: Python source expected to define at least one top-level function.

    Returns:
        The body lines joined with newlines, or the sentinel ``"error msg"``
        when the source cannot be parsed or defines no top-level function.
    """
    try:
        tree = ast.parse(code)
    except Exception as e:
        # Best-effort boundary: model output is arbitrary text, so any parse
        # failure is reported through the sentinel instead of propagating.
        print(f"Error extracting function body: {e}")
        return "error msg"

    for node in tree.body:
        if isinstance(node, ast.FunctionDef):
            code_lines = code.splitlines()
            start = node.body[0].lineno - 1
            end = node.body[-1].end_lineno
            return "\n".join(code_lines[start:end])

    # Previously this path fell through and returned None, which the caller
    # turned into the literal string "None".
    print("Error extracting function body: no function definition found")
    return "error msg"
    

@scorer(metrics=[accuracy(), stderr()])
def verify() -> Scorer:
    """Build a scorer that checks whether code could be extracted from a completion.

    The scorer does not execute anything: a sample is CORRECT unless
    ``find_code_new`` reports an extraction failure. The raw completion, the
    extracted code, and the assembled program (prompt + extracted code + test)
    are stored in the score metadata for later inspection.
    """
    async def score(state: TaskState, target: Target) -> Score:
        idx = state.sample_id
        model_completion = state.output.completion
        processed_completion = find_code_new(model_completion)

        # Compare against the exact failure sentinel returned by the extractor.
        # A substring test for 'error' would also reject valid solutions whose
        # code merely mentions the word.
        extraction_failed = processed_completion == "error msg"
        if extraction_failed:
            print(f'error in sample: {idx}')
        val = INCORRECT if extraction_failed else CORRECT

        final_code = [
            state.metadata["prompt"],
            processed_completion,
            "\n",
            state.metadata["test"],
        ]

        return Score(
            value=val,
            metadata={
                'completion': model_completion,
                'processed': processed_completion,
                'final_code': ''.join(final_code),
                'idx': idx
            },
            answer=f'model completion: {model_completion}',
        )

    return score

### Execute Code  
*assuming I have code, ensure I can execute it*

#### Default

In [None]:
# Display the field names available on a C++ problem record.
cpp_content[0].keys()

In [None]:
def get_cpp(idx):
    """Assemble a complete C++ test program for problem ``idx``.

    The program consists of every IMPORT_HELPER['cpp'] include line that does
    not already occur in the prompt, a blank line, the prompt, the canonical
    solution, a newline, and the test code.
    """
    problem = cpp_content[idx]
    missing_includes = "".join(
        include + '\n'
        for include in IMPORT_HELPER['cpp']
        if include not in problem['prompt']
    )

    pieces = [
        missing_includes,
        "\n",
        problem['prompt'],
        problem['canonical_solution'],
        "\n",
        problem['test'],
    ]
    return "".join(pieces)

In [None]:
@scorer(metrics=[accuracy(), stderr()])
def cpp_scorer() -> Scorer:
    """Build a scorer that compiles and runs the C++ program for a sample.

    The program comes from ``get_cpp`` (prompt + canonical solution + test).
    It is written to a per-sample scratch directory, compiled with g++, and
    the resulting binary is executed in the sandbox. The sample is CORRECT
    only when compilation succeeds and the binary exits successfully; the
    scratch directory is always removed afterwards.
    """
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        # Assumes ids of the form "<prefix>/<number>"; the number indexes cpp_content.
        idx = int(task_id.split('/')[1])
        code = get_cpp(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/cpp_tmp/test_{idx}/'
        # makedirs creates missing parent directories and tolerates a leftover
        # directory from an earlier run.
        os.makedirs(tmp_dir, exist_ok=True)
        file = os.path.join(tmp_dir, 'test.cpp')
        executable = os.path.join(tmp_dir, 'test.out')

        try:
            with open(file, 'w') as source_file:
                source_file.write(code)

            # `result` stays None only while every earlier stage has succeeded.
            result = None
            try:
                compile_proc = subprocess.run(
                    ["g++", "-std=c++17", file, "-o", executable, '-lssl', '-lcrypto'],
                    capture_output=True,
                    text=True,
                    timeout=30  # seconds, adjust as needed
                )
            except subprocess.TimeoutExpired:
                print("Compilation timed out!")
                result = ExecResult(False, 1, "", "Compilation timed out.")
            else:
                if compile_proc.returncode != 0:
                    print(f"Compilation failed! task number: {idx}")
                    print("stderr:", compile_proc.stderr)
                    # Report the compiler output instead of trying to run a
                    # binary that was never produced.
                    result = ExecResult(
                        False,
                        compile_proc.returncode,
                        compile_proc.stdout,
                        compile_proc.stderr,
                    )

            if result is None:
                try:
                    result = await sandbox().exec(
                        cmd=[executable],
                        timeout=30
                    )
                except TimeoutError:
                    result = ExecResult(False, 1, "", "Verification timed out.")
                except Exception as e:
                    print(f'execution failed cuz of: {e}')
                    # Without this assignment `result` would be unbound below.
                    result = ExecResult(False, 1, "", f"Execution failed: {e}")
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

        if result.success:
            explanation = ""
        else:
            explanation = "".join(
                [
                    "The following verification code was executed:\n\n",
                    "```cpp\n\n",
                    code,
                    "\n```\n",
                    f"\nThe submission was incorrect\n\n{result.stderr}",
                ]
            )

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation=explanation,
        )

    return score

In [None]:
def humaneval_record_to_sample(record):
    """Convert one dataset record into an inspect_ai Sample.

    The model input is a fixed placeholder string. The canonical solution is
    the target; prompt, test and entry point are carried along as metadata.
    """
    placeholder_input = 'hello world'

    sample_kwargs = dict(
        id=record["task_id"],
        input=placeholder_input,
        target=record["canonical_solution"],
        metadata={
            "prompt": record["prompt"],
            "test": record["test"],
            "entry_point": record["entry_point"],
        },
    )
    return Sample(**sample_kwargs)

# Test split of the openai_humaneval dataset; every record is converted with
# humaneval_record_to_sample.
humaneval_dataset = hf_dataset(
    path = 'openai_humaneval',
    split = 'test',
    sample_fields = humaneval_record_to_sample,
    trust = True,
)

@task
def humaneval():
    """Task that generates one completion per sample and scores it with cpp_scorer()."""
    task_kwargs = dict(
        dataset=humaneval_dataset,
        solver=generate(),
        scorer=cpp_scorer(),
        sandbox='local',
    )
    return Task(**task_kwargs)

In [None]:
# Run the task once (a single epoch) against openai/gpt-4o-mini.
inspect_ai.eval(humaneval(), model = 'openai/gpt-4o-mini', epochs = 1)

#### Working Scorers

In [None]:
# CPP SCORER

@scorer(metrics=[accuracy(), stderr()])
def cpp_scorer() -> Scorer:
    """Build a scorer that compiles and runs the C++ program for a sample.

    The program comes from ``get_cpp`` (prompt + canonical solution + test).
    It is written to a per-sample scratch directory, compiled with g++, and
    the resulting binary is executed in the sandbox. The sample is CORRECT
    only when compilation succeeds and the binary exits successfully; the
    scratch directory is always removed afterwards.
    """
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        # Assumes ids of the form "<prefix>/<number>"; the number indexes cpp_content.
        idx = int(task_id.split('/')[1])
        code = get_cpp(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/cpp_tmp/test_{idx}/'
        # makedirs creates missing parent directories and tolerates a leftover
        # directory from an earlier run.
        os.makedirs(tmp_dir, exist_ok=True)
        file = os.path.join(tmp_dir, 'test.cpp')
        executable = os.path.join(tmp_dir, 'test.out')

        try:
            with open(file, 'w') as source_file:
                source_file.write(code)

            # `result` stays None only while every earlier stage has succeeded.
            result = None
            try:
                compile_proc = subprocess.run(
                    ["g++", "-std=c++17", file, "-o", executable, '-lssl', '-lcrypto'],
                    capture_output=True,
                    text=True,
                    timeout=30  # seconds, adjust as needed
                )
            except subprocess.TimeoutExpired:
                print("Compilation timed out!")
                result = ExecResult(False, 1, "", "Compilation timed out.")
            else:
                if compile_proc.returncode != 0:
                    print(f"Compilation failed! task number: {idx}")
                    print("stderr:", compile_proc.stderr)
                    # Report the compiler output instead of trying to run a
                    # binary that was never produced.
                    result = ExecResult(
                        False,
                        compile_proc.returncode,
                        compile_proc.stdout,
                        compile_proc.stderr,
                    )

            if result is None:
                try:
                    result = await sandbox().exec(
                        cmd=[executable],
                        timeout=30
                    )
                except TimeoutError:
                    result = ExecResult(False, 1, "", "Verification timed out.")
                except Exception as e:
                    print(f'execution failed cuz of: {e}')
                    # Without this assignment `result` would be unbound below.
                    result = ExecResult(False, 1, "", f"Execution failed: {e}")
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

        if result.success:
            explanation = ""
        else:
            explanation = "".join(
                [
                    "The following verification code was executed:\n\n",
                    "```cpp\n\n",
                    code,
                    "\n```\n",
                    f"\nThe submission was incorrect\n\n{result.stderr}",
                ]
            )

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation=explanation,
        )

    return score

In [None]:
# CPP PARSER

def get_cpp(idx):
    """Assemble a complete C++ test program for problem ``idx``.

    The program consists of every IMPORT_HELPER['cpp'] include line that does
    not already occur in the prompt, a blank line, the prompt, the canonical
    solution, a newline, and the test code.
    """
    problem = cpp_content[idx]
    missing_includes = "".join(
        include + '\n'
        for include in IMPORT_HELPER['cpp']
        if include not in problem['prompt']
    )

    pieces = [
        missing_includes,
        "\n",
        problem['prompt'],
        problem['canonical_solution'],
        "\n",
        problem['test'],
    ]
    return "".join(pieces)

In [None]:
# CPP NOTES

# apt-get update
# apt-get install -y g++
# apt-get install -y build-essential
# apt-get install libboost-all-dev
# apt-get install libssl-dev

In [None]:
# JAVA SCORER

@scorer(metrics=[accuracy(), stderr()])
def java_scorer() -> Scorer:
    """Build a scorer that compiles and runs a Java program for a sample.

    The program text comes from ``get_code_idx``. It is written to
    ``Main.java`` in a per-sample scratch directory, compiled with ``javac``,
    and class ``Main`` is run in the sandbox. The sample is CORRECT only when
    compilation succeeds and the run exits successfully; the scratch directory
    is always removed afterwards.

    NOTE(review): get_code_idx reads from go_content by default in this
    notebook — confirm it is pointed at the Java records before relying on
    these scores.
    """
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        # Assumes ids of the form "<prefix>/<number>".
        idx = int(task_id.split('/')[1])
        code = get_code_idx(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/java_tmp/test_{idx}/'
        # makedirs creates missing parent directories and tolerates a leftover
        # directory from an earlier run.
        os.makedirs(tmp_dir, exist_ok=True)
        file = os.path.join(tmp_dir, 'Main.java')

        try:
            # `file` is an absolute path, so no working-directory change is
            # needed (and changing it would affect the whole process).
            with open(file, 'w') as source_file:
                source_file.write(code)

            # `result` stays None only while every earlier stage has succeeded.
            result = None
            try:
                compile_proc = subprocess.run(
                    ["javac", "Main.java"],
                    cwd=tmp_dir,
                    capture_output=True,
                    text=True,
                    timeout=30  # seconds, adjust as needed
                )
            except subprocess.TimeoutExpired:
                print("Compilation timed out!")
                result = ExecResult(False, 1, "", "Compilation timed out.")
            except Exception as e:
                print("Compilation error:", e)
                result = ExecResult(False, 1, "", f"Compilation error: {e}")
            else:
                if compile_proc.returncode != 0:
                    print("Compilation failed!")
                    print("stderr:", compile_proc.stderr)
                    # Report the compiler output instead of running a class
                    # that was never produced.
                    result = ExecResult(
                        False,
                        compile_proc.returncode,
                        compile_proc.stdout,
                        compile_proc.stderr,
                    )

            if result is None:
                try:
                    result = await sandbox().exec(
                        cmd=["java", "-cp", tmp_dir, "Main"],
                        timeout=30,
                    )
                except TimeoutError:
                    result = ExecResult(False, 1, "", "Verification timed out.")
        finally:
            # Runs even when an unexpected exception propagates.
            shutil.rmtree(tmp_dir, ignore_errors=True)

        if result.success:
            explanation = ""
        else:
            explanation = "".join(
                [
                    "The following verification code was executed:\n\n",
                    "```java\n\n",
                    code,
                    "\n```\n",
                    f"\nThe submission was incorrect\n\n{result.stderr}",
                ]
            )

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation=explanation,
        )

    return score

JAVA NOTES  
- run the following code to install Java:  
    `apt-get update`  
    `apt-get install -y openjdk-17-jdk`  
  verify with:  
    `java -version`  
    `javac -version`  

In [None]:
# GO SCORER

@scorer(metrics=[accuracy(), stderr()])
def go_scorer() -> Scorer:
    """Build a scorer that runs ``go test`` on the Go program for a sample.

    The program comes from ``get_go_stuff_new``. It is written to
    ``main_test.go`` in a per-sample directory that gets its own Go module
    (``go mod init`` when no ``go.mod`` exists, then ``go mod tidy``), and
    ``go test`` is run in the sandbox with that directory as working
    directory. The sample is CORRECT when the test command succeeds.

    Raises (from the inner scorer):
        subprocess.CalledProcessError: If ``go mod init`` or ``go mod tidy``
            exits with a non-zero status.
    """
    go_bin = '/usr/local/go/bin/go'

    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        # Assumes ids of the form "<prefix>/<number>"; the number indexes go_content.
        idx = int(task_id.split('/')[1])
        code = get_go_stuff_new(idx)

        tmp_dir = f'/root/srf-project/test_humaneval-x/go_tmp/test_{idx}/'
        # makedirs creates missing parent directories and tolerates a leftover
        # directory from an earlier run.
        os.makedirs(tmp_dir, exist_ok=True)
        file = os.path.join(tmp_dir, 'main_test.go')

        with open(file, 'w') as source_file:
            source_file.write(code)

        # Pass cwd= to each command instead of changing the working directory
        # of the whole process, which is shared by every sample being scored.
        if not os.path.exists(os.path.join(tmp_dir, 'go.mod')):
            subprocess.run(
                [go_bin, 'mod', 'init', f'example.com/tmpmod_{idx}'],
                cwd=tmp_dir,
                check=True,
            )
        subprocess.run([go_bin, 'mod', 'tidy'], cwd=tmp_dir, check=True)

        try:
            result = await sandbox().exec(
                cmd=[go_bin, "test", file],
                timeout=30,
                cwd=tmp_dir
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        # NOTE: the scratch directory is left on disk; cleanup is disabled.
        # shutil.rmtree(tmp_dir)

        if result.success:
            explanation = ""
        else:
            explanation = "".join(
                [
                    "The following verification code was executed:\n\n",
                    "```go\n\n",
                    code,
                    "\n```\n",
                    f"\nThe submission was incorrect\n\n{result.stderr}",
                ]
            )

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation=explanation,
        )

    return score

In [None]:
# GO PARSER

def get_go_stuff_new(idx):
    """Assemble a complete Go test file for problem ``idx``.

    The record's own import block is stripped from the prompt. Any package
    from IMPORT_HELPER['go'] whose name does not occur in the test setup and
    whose last path component followed by a dot occurs in the solution is
    imported in an extra ``import (...)`` block placed after the test setup.
    """
    record = go_content[idx]
    import_block = record['import']
    declaration = record['prompt'].replace(import_block, '')
    solution = record['canonical_solution']        # FYI, change this to *generated code* later!
    tests = record['test']
    setup = record['test_setup']

    extra_imports = [
        f'"{pkg}"'
        for pkg in IMPORT_HELPER['go']
        if pkg not in setup and pkg.split('/')[-1] + '.' in solution
    ]

    sections = [setup, "\n"]
    if extra_imports:
        sections += [
            "import (\n",
            "    ".join(quoted + "\n" for quoted in extra_imports),
            ")",
            "\n",
        ]
    sections += [declaration, solution, "\n", tests]

    return "".join(sections)


GO NOTES  
- run the following shell code to install go  
    `cd ~/`  
    `GO_VERSION=1.24.5`  
    `wget https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz`  
    `rm -rf /usr/local/go`  
    `tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz`  
    `echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc`  
    `export PATH=$PATH:/usr/local/go/bin`  
    `go version`  
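- you need to create a separate Go module (its own directory with its own `go.mod`) for *every* test case; otherwise the `go.mod` files interfere with each other when several samples are scored at the same time  
- also make sure the tests run from inside that module's directory (i.e. pass the `cwd` argument when executing `go test`)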

In [None]:
# JS SCORER

@scorer(metrics=[accuracy(), stderr()])
def js_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        ans = get_code_idx(idx)

        try:
            result = await sandbox().exec(
                cmd=["node", "-e", ans],
                timeout=30,
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + [ans]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

JS NOTES  
- remember to install node.js!  
    `apt-get update`  
    `apt-get install -y curl`  
    `curl -fsSL https://deb.nodesource.com/setup_lts.x | bash -`  
    `apt-get install -y nodejs`  
    verify installation: `node -v` // `npm -v` // `node -e "console.log('Node.js is working')"`  
- 

In [None]:
# PYTHON SCORER

@scorer(metrics=[accuracy(), stderr()])
def python_scorer() -> Scorer:
    async def score(state: TaskState, target: Target) -> Score:
        task_id = state.sample_id
        idx = int(task_id.split('/')[1])
        ans = get_code_idx(idx)

        try:
            result = await sandbox().exec(
                cmd=["python", "-c", ans],
                timeout=30,
            )
        except TimeoutError:
            result = ExecResult(False, 1, "", "Verification timed out.")

        return Score(
            value=CORRECT if result.success else INCORRECT,
            explanation="".join(
                ["The following verification code was executed:\n\n"]
                + ["```python\n\n"]
                + [ans]
                + ["\n```\n"]
                + [f"\nThe submission was incorrect\n\n{result.stderr}"]
                if not result.success
                else [""]
            ),
        )

    return score

In [None]:
# PARSER: PYTHON + JS + JAVA 

def get_executable_code(records):
    """Build one runnable program string per problem record.

    Args:
        records: Iterable of mappings with 'prompt', 'canonical_solution' and
            'test' string fields.

    Returns:
        A list with one string per record: the prompt, the canonical solution,
        a newline, and the test code concatenated in that order.
    """
    # The assembled program was previously built and then discarded, so the
    # function always returned an empty list.
    return [
        ''.join(
            [
                record['prompt'],
                record['canonical_solution'],
                '\n',
                record['test'],
            ]
        )
        for record in records
    ]

def get_code_idx(idx, records=None):
    """Return the runnable program for the problem at position ``idx``.

    Args:
        idx: Position of the problem within ``records``.
        records: Problem records to read from. Defaults to ``go_content``,
            which is what this helper always used; pass ``python_content``,
            ``js_content`` or ``java_content`` to build programs for those
            languages.

    Returns:
        The program string produced by ``get_executable_code`` for that record.

    Raises:
        IndexError: If no program is produced for ``idx`` (for example when
            ``idx`` is out of range).
    """
    if records is None:
        records = go_content
    out = get_executable_code(records[idx:idx+1])
    return out[0]

### Ignore RN