## Benchpress Hackathon

The challenge comes with a Jupyter notebook for your implementation and various utilities.
We provide a development set and a validation set you can use to develop your solution.
The development set is for testing your code and consists of 300 problems with a varying number of test cases.
You are free to use all data provided with a problem, a sample has the following structure:

```python
{
    # Unique identifier for the problem in the APPS dataset.
    "problem_id": 4424,
    # The problem statement
    "question": "Given three integers ...",
    # The expected function name and the input/output examples
    # representing test cases.
    "input_output": {
        "fn_name": "expression_matter",
        "inputs": [ ... ],
        "outputs": [ ... ]
    },
    "url": "https://www.codewars.com/kata/5ae62fcf252e66d44d00008e",
    "difficulty": "introductory",
    # The starter code for the problem.
    "starter_code": "def expression_matter(a, b, c):\n\t"
}
```

The validation set is consists of 200 problems, and includes an additional key `test_cases` which is used to score your solution with the provided scoring function.

```python
{
    ...
    "test_cases": {
        "fn_name": "expression_matter",
        "inputs": [ ... ],
        "outputs": [ ... ]
    },
    ...
}
```

### Loading Problems

Use the `load_sample` function to load a problem from the development or validation set.

```python
from utilities import load_sample

problem = load_sample(index=0, dataset_path="./data/dev")
```

### Generating Code

Use the `aleph_alpha_client` to generate code.
Make sure your `AA_TOKEN` is set.

```python
from aleph_alpha_client import Client, CompletionRequest, Prompt

client = Client(AA_TOKEN)

request = CompletionRequest(
    prompt=Prompt.from_text("Your prompt."),
    maximum_tokens=256,
)

# API reference for the client:
# https://aleph-alpha-client.readthedocs.io/en/latest/
response = client.complete(request, model=MODEL)
```

### Running Tests

Use the `run_test_cases` function to run the generated code against the test cases.
The function returns a dictionary with the test results, including the expected output, the generated output, a boolean indicating whether the test passed and a traceback in case of an error.

```python
from utilities import run_test_cases

test_results = run_test_cases(
    problem=problem, 
    generation=response.completions[0].completion, 
    timeout=10,
)
```

### Scoring

Use the `score` function to score your solution on the validation set.
It expects a function that takes a problem and a client and returns a generation.

```python
from utilities import score

passed_problems, passed_test_cases = score(
    generation_func=generate_code, 
    client=client,
    dataset_path="./data/val", 
    length=50,
)
```

In [8]:
import os
import ast
import inspect

AA_TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyX2lkIjoyNTk4OCwidG9rZW5faWQiOjY0MjB9.UjTE0uGz0u94aKi0MzFr6RJ9eCqEECyxE8bimbpfoyM"
# MODEL = "llama-3.1-8b-instruct-long-context"
MODEL = "llama-3.1-70b-instruct-long-context"

if AA_TOKEN is None:
    raise ValueError("Aleph Alpha Playground token is not set.")

from aleph_alpha_client import Client, CompletionRequest, Prompt
from utilities import load_sample, run_test_cases


client = Client(AA_TOKEN)


In [9]:

def generate_prompt(problem):
    prompt = "\nQUESTION:\n"
    prompt += problem["question"]
    prompt += "\n\nSTARTER CODE:\n"
    prompt += problem["starter_code"]
    
    # Add Input-Output Examples
    prompt += "\n\nInput-Output Examples:\n"
    for inp, out in zip(problem["input_output"]["inputs"], problem["input_output"]["outputs"]):
        # Assuming inputs and outputs are single-element lists
        input_str = inp[0]
        output_str = out[0]
        prompt += f"Input: \"{input_str}\"\nOutput: {output_str}\n\n"
    
    # Add output examples
    prompt += "An example of a proper response format:"
    prompt += "def expression_matter(a, b, c): \n return max(a*(b+c), a+b*c, (a+b)*c, a*b*c, a+b+c)"
    
    # Clear and Explicit Instructions
    prompt += "Write only the executable Python code that completes the function above. Do not include any comments, tests, or additional text.\n"
    prompt += "Do not use markdown code blocks.\n\n"
    
    # Indicate where the code should be inserted
    prompt += "CODE:\n"
    
    return prompt

In [10]:
def generate_prompt2(problem) -> str:

    prompt = "<|begin_of_text|>\n"
    prompt += "<|start_header_id|>system<|end_header_id|>\n"
    prompt += "You are a programming assistant. Generate Python code to complete functions as instructed. Follow these rules:\n"
    prompt += "1. Write only the executable Python code that completes the function.\n"
    prompt += "2. Do not use markdown or code blocks.\n"
    prompt += "3. Do not include any comments, tests, or additional text.\n"
    prompt += "<|start_header_id|>user<|end_header_id|>\n"
    prompt += problem["question"]
    prompt += "\nWrite only the Python code that completes the function:\n"
    prompt += "\n\nSTARTER CODE:\n"
    prompt += problem["starter_code"]
    prompt += "\n<|start_header_id|>assistant<|end_header_id|>"
    return prompt

In [11]:

def generate_code(problem: dict, client: Client) -> str:
    
    prompt = generate_prompt2(problem=problem)
    request = CompletionRequest(
        prompt=Prompt.from_text(prompt),
        maximum_tokens=350,
    )

    # API reference for the client:
    # https://aleph-alpha-client.readthedocs.io/en/latest/
    response = client.complete(request, model=MODEL)
    response = response.completions[0].completion
    
    # Run test cases on the generated code and iterate.
    # test_results = run_test_cases(
    #    problem=problem, 
    #    generation=response.completions[0].completion, 
    #    timeout=10,
    # )

    # response_string = response.completions[0].completion    
    # response_string = response_string.replace("'''", "").replace("```", "").replace('"""','').replace('python', '').strip()

    return response


In [12]:
# Test the generated code

problem = load_sample(index=0, dataset_path="./data/dev")
generated_code = generate_code(problem, client)
print(generated_code + "\n")

def test_code_given_str(problem, generated_code, inputs):
    namespace = {}
    exec(generated_code, namespace)
    
     # Step 4: Retrieve the function from the namespace
    # Assume only one function is defined in the generated code
    generated_func = None
    for obj in namespace.values():
        if callable(obj):
            generated_func = obj
            break
    if generated_func is None:
        raise ValueError("No callable function found in the generated code.")
    
    if generated_func is None:
        raise ValueError("No callable function found in the generated code.")
    
    result = generated_func(*inputs)
    print(f"Function Output: {result}")

inputs = (2, 1, 2)
test_code_given_str(problem, generated_code,inputs)



def expression_matter(a, b, c):
    return max(a * (b + c), a + b * c, (a + b) * c, a + b + c)

Function Output: 6


In [13]:
def delete_json_file(file_path):
    """
    Deletes a JSON file if it exists.

    Args:
        file_path (str): Path to the JSON file to be deleted.
    """
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"File '{file_path}' has been deleted.")
        else:
            print(f"File '{file_path}' does not exist.")
    except Exception as e:
        print(f"An error occurred while trying to delete the file: {e}")

In [14]:
from utilities import score
delete_json_file("./analysis_results.json")
passed_problems, passed_test_cases = score(
    generation_func=generate_code, 
    #generation_func=generate_code_syntax_coorected,
    client=client,
    dataset_path="./data/val", 
    length=100,
)

print(f"Passed {passed_problems*100}% of problems")
print(f"Passed {passed_test_cases*100}% of test cases")

File './analysis_results.json' has been deleted.


 17%|█▋        | 17/100 [00:20<01:36,  1.16s/it]

Standard input runtime error or time limit exceeded error = combinationSum3() missing 1 required positional argument: 'n'


 28%|██▊       | 28/100 [00:38<02:10,  1.82s/it]

type 0 compilation error = 'return' outside function (<string>, line 18)


 38%|███▊      | 38/100 [00:47<00:51,  1.19it/s]

type 0 compilation error = 'return' outside function (<string>, line 18)


 41%|████      | 41/100 [00:49<00:49,  1.18it/s]

Standard input runtime error or time limit exceeded error = list index out of range


 45%|████▌     | 45/100 [00:51<00:28,  1.93it/s]

Standard input runtime error or time limit exceeded error = float division by zero


 85%|████████▌ | 85/100 [01:30<00:15,  1.07s/it]

Standard input runtime error or time limit exceeded error = 'R'


100%|██████████| 100/100 [02:39<00:00,  1.60s/it]

Passed 69.0% of problems
Passed 69.6078431372549% of test cases





In [15]:
problem = load_sample(index=12, dataset_path="./data/val")
generated_code = generate_code(problem, client)
print(generated_code)



def shape_area(n):
    return n*n + (n-1)*(n-1)


In [16]:
problem = load_sample(index=43, dataset_path="./data/val")
generated_code = generate_code(problem, client)
print(generated_code)



def flatten(*args):
    result = []
    for arg in args:
        if isinstance(arg, list):
            result.extend(flatten(*arg))
        else:
            result.append(arg)
    return result
