In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import inspect_ai
from tqdm.notebook import tqdm
from huggingface_hub import login
import torch as t
import pynvml
import gc
import pandas as pd
from pprint import pprint
import os
from dotenv import load_dotenv

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cpu' if not t.cuda.is_available() else 'cuda'

In [4]:
# Load variables from a .env file into the environment, then log in to the
# Hugging Face Hub with the HF_TOKEN entry (KeyError if it is missing).
load_dotenv()
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Test Eval GPQA

In [1]:
from inspect_ai import Task, task
from inspect_ai.dataset import Sample, csv_dataset
from inspect_ai.model import GenerateConfig
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice

KeyboardInterrupt: 

In [4]:
# Read gpqa/gpqa_diamond.csv and keep its "Question" column as a plain list.
dataset = pd.read_csv("gpqa/gpqa_diamond.csv")
questions = dataset["Question"].tolist()

In [45]:
@task
def gpqa_diamond(cot: bool = True) -> Task:
    """Define the GPQA diamond multiple-choice evaluation task.

    Args:
        cot: Forwarded unchanged to ``multiple_choice(cot=...)``.

    Returns:
        A ``Task`` whose samples come from ``gpqa/updated_gpqa_diamond.csv``
        (converted row by row with ``record_to_sample``), solved with
        ``multiple_choice`` and scored with ``choice()``.
    """
    samples = csv_dataset(
        csv_file="gpqa/updated_gpqa_diamond.csv",
        sample_fields=record_to_sample,
    )
    # shuffle=True is requested from the solver because record_to_sample always
    # lists the correct answer first.
    mc_solver = multiple_choice(shuffle=True, cot=cot)
    return Task(
        dataset=samples,
        solver=[mc_solver],
        scorer=choice(),
        config=GenerateConfig(temperature=0.5),
        epochs=1,
    )

def record_to_sample(record) -> Sample:
    """Convert one CSV record into a multiple-choice ``Sample``.

    The correct answer is always the first choice, so the target is fixed to
    "A"; every choice is coerced to ``str``.
    """
    answer_columns = (
        "Correct Answer",
        "Incorrect Answer 1",
        "Incorrect Answer 2",
        "Incorrect Answer 3",
    )
    return Sample(
        input=record["Question"],
        choices=[str(record[column]) for column in answer_columns],
        target="A",
        id=record["Record ID"],
    )

In [47]:
# Run the GPQA task once against gpt-4o-mini.
inspect_ai.eval(
    gpqa_diamond(),
    model="openai/gpt-4o-mini",
    epochs=1,
)

Output()

### Eval Model

In [5]:
from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.util import ExecResult, sandbox
from inspect_ai.scorer import CORRECT, INCORRECT, Score, Scorer, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate
from inspect_ai.model import get_model
import re

In [6]:
# repeat each problem n times
NUM_EPOCHS = 1

# timeout for verification/scoring
VERIFY_TIMEOUT = 30

# instruction prepended to code problem
INSTRUCTION = """
Read the following function signature and docstring, and fully implement
the function described. Your response should only contain the code for
this function.\n
"""

LANG_PREFIX = {
    "cpp"          : "// language: C++",
    "java"         : "// language: Java",
    "js"           : "// language: JavaScript",
    "javascript"   : "// language: JavaScript",
    "go"           : "// language: Go",
    "python"       : "# language: Python",
}

In [7]:
@scorer(metrics=[accuracy(), stderr()])
def verify() -> Scorer:
    """Build a scorer that executes the model's answer against the sample's tests.

    The returned coroutine concatenates the sample's prompt, the code extracted
    from the completion, the sample's test code and a final
    ``check(<entry_point>)`` call, then runs the result with ``python -c`` in
    the sandbox. The sample is CORRECT when that execution succeeds and
    INCORRECT otherwise (including when it times out).

    Returns:
        The async ``score(state, target)`` function.
    """

    async def score(state: TaskState, target: Target) -> Score:
        # extract answer from completion and format for verification
        answer = find_code(state.output.completion)

        code = [
            state.metadata["prompt"],
            answer,
            "\n",
            state.metadata["test"],
            "\n",
            "".join(["check(", state.metadata["entry_point"], ")"]),
        ]

        # verify (returns error status if incorrect)
        try:
            result = await sandbox().exec(
                cmd=["python", "-c", "".join(code)],
                timeout=VERIFY_TIMEOUT,
            )
        except TimeoutError:
            # Represent a timeout as an ordinary failed execution.
            result = ExecResult(False, 1, "", "Verification timed out.")

        # Always show the code that was executed. Previously the trailing
        # `if ... else [""]` applied to the whole concatenation (a conditional
        # expression binds looser than `+`), so passing samples ended up with
        # an empty explanation.
        explanation_parts = [
            "The following verification code was executed:\n\n",
            "```python\n\n",
            *code,
            "\n```\n",
        ]
        if not result.success:
            explanation_parts.append(
                f"\nThe submission was incorrect\n\n{result.stderr}"
            )

        # return score w/ model answer + scoring details
        return Score(
            value=CORRECT if result.success else INCORRECT,
            answer=answer,
            explanation="".join(explanation_parts),
        )

    return score


def find_code(completion: str) -> str:
    """Extract the code from a completion, removing markdown and/or signature.

    Fenced blocks opened with ```` ```python ```` are collected first, followed
    by blocks opened with a bare ```` ``` ````. The first collected block is
    passed through ``extract_function_body``; when the completion contains no
    fenced block it is returned as-is.
    """
    fence_patterns = (r"```python\n(.*?)```", r"```\n(.*?)```")
    candidates = []
    for fence in fence_patterns:
        candidates.extend(re.findall(fence, completion, re.DOTALL))
    if not candidates:
        return str(completion)
    return str(extract_function_body(candidates[0]))


def extract_function_body(code: str) -> str:
    """Drop everything up to and including the first signature line of ``code``.

    The end of the signature is located as the first occurrence of a colon
    followed by a newline and four spaces. The returned text starts at the
    indented body (the four-space indent is kept).

    Args:
        code: Source text, typically a full function definition.

    Returns:
        The text after the signature line, or ``code`` unchanged when no
        signature marker is present.
    """
    marker = ":\n    "
    position = code.find(marker)
    if position == -1:
        # No marker: slicing from ``-1 + 2`` used to chop off the first
        # character of the code, so hand it back untouched instead.
        return code
    # Skip the colon and the newline, keep the body's indentation.
    return code[position + 2 :]

In [10]:
# Scratch check: print one fully assembled program (a function definition with
# its body, a METADATA dict, the check() definition and the final
# check(filter_by_substring) call) to eyeball its layout.
# NOTE(review): this looks like the kind of program verify() concatenates and runs — confirm.
print('from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n    """ Filter an input list of strings only for ones that contain given substring\n    >>> filter_by_substring([], \'a\')\n    []\n    >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n    [\'abc\', \'bacd\', \'array\']\n    """\n    return [s for s in strings if substring in s]\n\n\n\nMETADATA = {\n    \'author\': \'jt\',\n    \'dataset\': \'test\'\n}\n\n\ndef check(candidate):\n    assert candidate([], \'john\') == []\n    assert candidate([\'xxx\', \'asd\', \'xxy\', \'john doe\', \'xxxAAA\', \'xxx\'], \'xxx\') == [\'xxx\', \'xxxAAA\', \'xxx\']\n    assert candidate([\'xxx\', \'asd\', \'aaaxxy\', \'john doe\', \'xxxAAA\', \'xxx\'], \'xx\') == [\'xxx\', \'aaaxxy\', \'xxxAAA\', \'xxx\']\n    assert candidate([\'grunt\', \'trumpet\', \'prune\', \'gruesome\'], \'run\') == [\'grunt\', \'prune\']\n\ncheck(filter_by_substring)')

from typing import List


def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """
    return [s for s in strings if substring in s]



METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([], 'john') == []
    assert candidate(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']
    assert candidate(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']
    assert candidate(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']

check(filter_by_substring)


In [8]:
# Language key used to pick the header line from LANG_PREFIX.
lang = 'python'


def humaneval_record_to_sample(record):
    """Turn one HumanEval record into a ``Sample``.

    The model input is INSTRUCTION, the language header for ``lang``, a
    newline and the record's prompt. The canonical solution becomes the
    target, and the prompt, test code and entry point are kept in the sample
    metadata.
    """
    prompt_parts = (INSTRUCTION, LANG_PREFIX[lang], "\n", record["prompt"])
    return Sample(
        id=record["task_id"],
        input="".join(prompt_parts),
        target=record["canonical_solution"],
        metadata={key: record[key] for key in ("prompt", "test", "entry_point")},
    )

# Load the "test" split of openai_humaneval from Hugging Face and convert each
# record with humaneval_record_to_sample.
# Disabled alternative source: path='THUDM/humaneval-x' with name=lang.
humaneval_dataset = hf_dataset(
    path="openai_humaneval",
    split="test",
    sample_fields=humaneval_record_to_sample,
    trust=True,
)

Loading dataset openai_humaneval from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

openai_humaneval/test-00000-of-00001.par(…):   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/164 [00:00<?, ? examples/s]

In [9]:
@task
def humaneval():
    """Define the HumanEval task.

    Uses the preloaded ``humaneval_dataset``, a plain ``generate()`` solver and
    the ``verify()`` scorer, with the sandbox set to 'local'.
    """
    task_options = dict(
        dataset=humaneval_dataset,
        solver=generate(),
        scorer=verify(),
        sandbox="local",
    )
    return Task(**task_options)

In [8]:
# Disabled alternative: evaluate a locally loaded Hugging Face model instead.
# model = get_model(
#         'hf/meta-llama/Llama-3.1-8B-Instruct',
#         device = 'auto',
#         torch_dtype=t.bfloat16,
# )

# Run the HumanEval task against gpt-4o-mini for NUM_EPOCHS epochs.
inspect_ai.eval(
    humaneval(),
    model="openai/gpt-4o-mini",
    epochs=NUM_EPOCHS,
)

Output()

ERROR:tornado.general:SEND Error: Host unreachable


KeyboardInterrupt: 

### GPU Details

In [8]:
# Imports
free_memory, total_memory = t.cuda.mem_get_info()

# Convert bytes to GB
free_memory_gb = free_memory / (1024 * 1024 * 1024)
total_memory_gb = total_memory / (1024 * 1024 * 1024)
mem_used = t.cuda.device_memory_used() / (1024 ** 3)

print(f"Free GPU Memory: {free_memory_gb:.2f} GB")
print(f"Total GPU Memory: {total_memory_gb:.2f} GB")
print(f'Memory Used: {mem_used:.2f} GB')

Free GPU Memory: 47.17 GB
Total GPU Memory: 47.43 GB
Memory Used: 0.82 GB


In [18]:
# Display the properties PyTorch reports for the CUDA device (no explicit
# device argument is passed, so the library's default selection applies).
t.cuda.get_device_properties()

_CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48566MB, multi_processor_count=84, uuid=63cca492-796d-c7cf-69cf-c5dd0a3d9c8d, L2_cache_size=6MB)

In [13]:
# Report allocated vs. reserved CUDA memory, scaled from bytes by 2**20.
print(f"{t.cuda.memory_allocated() / 2**20} MB allocated")
print(f"{t.cuda.memory_reserved() / 2**20} MB reserved")

15316.50830078125 MB allocated
15318.0 MB reserved


In [13]:
# del model, tokenizer
# Run the garbage collector first so that tensors kept alive only by
# unreachable Python objects are freed, and only then ask PyTorch to release
# its unused cached blocks. In the opposite order, memory freed by the
# collector would stay in the cache until the next empty_cache() call.
unreachable = gc.collect()
t.cuda.empty_cache()
# clean()
unreachable  # display the number of unreachable objects the collector found

2215