In [None]:
## Benchpress Hackathon

The challenge comes with a Jupyter notebook for your implementation and various utilities.
We provide a development set and a validation set you can use to develop your solution.
The development set is for testing your code and consists of 300 problems with a varying number of test cases.
You are free to use all data provided with a problem. A sample has the following structure:

```python
{
    # Unique identifier for the problem in the APPS dataset.
    "problem_id": 4424,
    # The problem statement
    "question": "Given three integers ...",
    # The expected function name and the input/output examples
    # representing test cases.
    "input_output": {
        "fn_name": "expression_matter",
        "inputs": [ ... ],
        "outputs": [ ... ]
    },
    "url": "https://www.codewars.com/kata/5ae62fcf252e66d44d00008e",
    "difficulty": "introductory",
    # The starter code for the problem.
    "starter_code": "def expression_matter(a, b, c):\n\t"
}
```

The validation set consists of 200 problems and includes an additional key, `test_cases`, which is used to score your solution with the provided scoring function.

```python
{
    ...
    "test_cases": {
        "fn_name": "expression_matter",
        "inputs": [ ... ],
        "outputs": [ ... ]
    },
    ...
}
```

### Loading Problems

Use the `load_sample` function to load a problem from the development or validation set.

```python
from utilities import load_sample

problem = load_sample(index=0, dataset_path="./data/dev")
```

### Generating Code

Use the `aleph_alpha_client` to generate code.
Make sure your `AA_TOKEN` is set.

```python
from aleph_alpha_client import Client, CompletionRequest, Prompt

client = Client(AA_TOKEN)

request = CompletionRequest(
    prompt=Prompt.from_text("Your prompt."),
    maximum_tokens=256,
)

# API reference for the client:
# https://aleph-alpha-client.readthedocs.io/en/latest/
response = client.complete(request, model=MODEL)
```

### Running Tests

Use the `run_test_cases` function to run the generated code against the test cases.
The function returns a dictionary with the test results, including the expected output, the generated output, a boolean indicating whether the test passed and a traceback in case of an error.

```python
from utilities import run_test_cases

test_results = run_test_cases(
    problem=problem, 
    generation=response.completions[0].completion, 
    timeout=10,
)
```

### Scoring

Use the `score` function to score your solution on the validation set.
It expects a function that takes a problem and a client and returns a generation.

```python
from utilities import score

passed_problems, passed_test_cases = score(
    generation_func=generate_code, 
    client=client,
    dataset_path="./data/val", 
    length=50,
)
```

In [None]:
# Install the packages listed in requirements.txt via the %pip magic.
%pip install -r requirements.txt

In [None]:
import os

# Read the API token from the environment instead of pasting it into the
# notebook, so the secret is never saved or committed with the file.
# Set it before starting the kernel, e.g. `export AA_TOKEN=...`.
AA_TOKEN = os.environ.get("AA_TOKEN", "")

# Name of the model passed to `client.complete(...)`. The commented-out line is
# an alternative model name; swap the two lines to use it.
# MODEL = "llama-3.1-8b-instruct-long-context"
MODEL = "llama-3.1-70b-instruct-long-context"

# Fail early on a missing *or empty* token. The previous `is None` check could
# never trigger because the token defaulted to an empty string.
if not AA_TOKEN:
    raise ValueError("Aleph Alpha Playground token is not set.")


In [None]:
from aleph_alpha_client import Client, CompletionRequest, Prompt
from utilities import load_sample, run_test_cases


# Shared API client built from AA_TOKEN; it is passed to `generate_code` and to
# `score` in the cells below.
client = Client(AA_TOKEN)


def generate_prompt(problem: dict) -> str:
    """
    Build the completion prompt for a single problem.

    The prompt consists of the problem statement, the starter code, formatting
    instructions, a short example of the expected response format, and a final
    "CODE:" cue after which the model is expected to write its answer.

    Args:
        problem (dict): A problem sample. The "question" entry is required;
            "starter_code" is optional and treated as empty when it is missing
            or None.

    Returns:
        str: The prompt text for the given problem.

    Raises:
        KeyError: If `problem` has no "question" entry.
    """
    # The example is assembled from explicit lines starting at column 0 so that
    # it is valid, directly executable Python. Writing it as an indented
    # triple-quoted literal previously leaked the function's own indentation
    # into the prompt and left the `if` outside of the `while` body, i.e. the
    # model was shown code that does not compile.
    example_code = "\n".join(
        [
            "def climb(n):",
            "    seq = [1]",
            "    while seq[-1] < n:",
            "        if seq[-1] * 2 + 1 <= n:",
            "            seq.append(seq[-1] * 2 + 1)",
            "        else:",
            "            seq.append(seq[-1] * 2)",
            "    return seq if seq[-1] == n else []",
        ]
    )

    # Tolerate samples without starter code instead of failing on the lookup
    # or on concatenating None.
    starter_code = problem.get("starter_code") or ""

    sections = [
        "\nQUESTION:\n",
        problem["question"],
        "\n\nSTARTER CODE:\n",
        starter_code,
        "\n\nWrite only the code in Python, directly executable, no tests or other comments.",
        "\n\n Example of a proper response format:\n",
        example_code,
        "\n\n",
        " Don't use markdown code blocks.",
        "\n\nCODE:\n",
    ]
    return "".join(sections)


def _strip_code_fences(text: str) -> str:
    """Return the code in `text` with Markdown code-fence markup removed.

    Only fence markers (and a language tag on the opening fence line) are
    removed. Triple-quoted strings and occurrences of the word "python" inside
    the code itself are left untouched.
    """
    stripped = text.strip()
    segments = stripped.split("```")
    if len(segments) == 1:
        # No fence at all: the completion is already plain code.
        return stripped

    inside = segments[1]
    if not inside.strip():
        # Nothing follows the first fence, so it is a stray closing fence after
        # unfenced code; keep what came before it.
        return segments[0].strip()

    # Drop a language tag such as "python" that shares the opening fence's line.
    first_line, newline, remainder = inside.partition("\n")
    if newline and first_line.strip().lower() in ("", "python", "python3", "py"):
        inside = remainder
    return inside.strip()


def generate_code(problem: dict, client: Client, maximum_tokens: int = 512) -> str:
    """
    Generate a solution for `problem` with a single completion request.

    Args:
        problem (dict): The problem to solve; passed to `generate_prompt`.
        client (Client): The API client used to request the completion.
        maximum_tokens (int): Upper bound on the length of the completion.
            The earlier hard-coded value of 64 left room for only a few lines
            of code, so the default is larger.

    Returns:
        str: The generated code with surrounding whitespace and Markdown
        code-fence markup removed. Empty if the model returned no text.
    """
    request = CompletionRequest(
        prompt=Prompt.from_text(generate_prompt(problem=problem)),
        maximum_tokens=maximum_tokens,
    )

    # API reference for the client:
    # https://aleph-alpha-client.readthedocs.io/en/latest/
    response = client.complete(request, model=MODEL)

    # Guard against a missing completion text before post-processing it.
    completion = response.completions[0].completion or ""

    # The prompt asks for plain code, but the model may still wrap its answer in
    # a Markdown fence. Remove only that markup; blanket-deleting quotes or the
    # word "python" would corrupt otherwise valid code.
    return _strip_code_fences(completion)


# Smoke test: load the development-set problem at index 0, generate a solution
# for it and print the generated code for manual inspection.
problem = load_sample(index=0, dataset_path="./data/dev")
generated_code = generate_code(problem, client)
print(generated_code)

In [None]:
from utilities import score

# Score `generate_code` on the validation set.
# NOTE(review): `length=100` presumably limits how many problems are evaluated —
# confirm against `utilities.score`.
passed_problems, passed_test_cases = score(
    generation_func=generate_code, 
    client=client,
    dataset_path="./data/val", 
    length=100,
)

# Both values are multiplied by 100 for display, i.e. they are treated as
# fractions — TODO confirm that `score` returns fractions rather than counts.
print(f"Passed {passed_problems*100}% of problems")
print(f"Passed {passed_test_cases*100}% of test cases")