# HumanEval dataset

In [None]:
import os
import json
from langchain_ollama.llms import OllamaLLM
from datasets import load_dataset
import re
from transformers import pipeline

# HumanEval "test" split; the main loop below indexes into it row by row.
humaneval = load_dataset("openai/openai_humaneval")["test"]

# Text-generation pipeline on the GPU. `llm` is the name the rest of the cell
# uses; `pipe` is kept as an alias for the same object.
llm = pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-Coder-0.5B-Instruct",
    device="cuda",
)

# Checkpoint files read by load_previous_results() and written by save_results().
# NOTE(review): other cells in this notebook reuse these same file names, so
# leftovers from a different run would be resumed rather than restarted.
final_results_file, prompt_results_file, results_list_file = (
    "final_eval_results.json",
    "prompt_results.json",
    "results_list.json",
)

# Function to load saved results
def load_previous_results():
    """Read the three checkpoint lists back from disk.

    Paths come from the module-level ``final_results_file``,
    ``prompt_results_file`` and ``results_list_file``. A file that does not
    exist yields an empty list.

    Returns:
        tuple: ``(final_eval_results, prompt_results, results_list)``.
    """
    def _read_or_empty(path):
        # A missing checkpoint means "nothing processed yet".
        if not os.path.exists(path):
            return []
        with open(path, "r") as handle:
            return json.load(handle)

    return (
        _read_or_empty(final_results_file),
        _read_or_empty(prompt_results_file),
        _read_or_empty(results_list_file),
    )

# Function to save results
def save_results(final_eval_results, prompt_results, results_list):
    """Overwrite the three checkpoint files with the current lists.

    Args:
        final_eval_results: Written to ``final_results_file``.
        prompt_results: Written to ``prompt_results_file``.
        results_list: Written to ``results_list_file``.
    """
    targets = (
        (final_results_file, final_eval_results),
        (prompt_results_file, prompt_results),
        (results_list_file, results_list),
    )
    for path, payload in targets:
        with open(path, "w") as out:
            json.dump(payload, out, indent=4)

# Modified evaluate_function
def evaluate_function(llm, iterations: int, prompt):
    """Generate a solution for ``prompt``, then ask the model to revise it.

    Args:
        llm: Callable invoked as ``llm(messages, max_length=2000)``; the reply
            text is read from ``[0]['generated_text'][-1]['content']``.
        iterations: Maximum number of revision rounds after the first attempt.
        prompt: Task description sent as the first user message.

    Returns:
        tuple[list[str], list[str]]: ``(results, prompt2_results)``.
        ``results`` holds the first fenced python block of each reply, in
        order. ``prompt2_results`` holds every revision prompt that was built.
        Both are empty when the first reply has no python block; revision
        stops early at the first reply without one.
    """
    pattern = r"```python\s+(.*?)\s+```"

    def _ask(content):
        # Every request is a fresh single-turn chat; no history is carried over.
        messages = [{"role": "user", "content": content}]
        return llm(messages, max_length=2000)[0]['generated_text'][-1]['content']

    results = []
    prompt2_results = []

    response = _ask(prompt)
    print("Response: ", response)

    python_blocks = re.findall(pattern, response, re.DOTALL)
    if not python_blocks:
        print("No valid Python code block found in the initial response.")
        return results, prompt2_results
    results.append(python_blocks[0])

    for _ in range(iterations):
        prompt2 = (
            f"This was the function you were previously asked to implement: \n{prompt}"
            f" and this was your solution: \n\n\n{results[-1]}"
            f"\n There may or may not be issues with your previous solution. Analyze it and generate a new solution. If you think your previous solution is wrong, it is ok to try a new approach."
        )
        prompt2_results.append(prompt2)

        # Send the revision prompt itself. The earlier code built it into an
        # unused `message` variable and re-sent the original `messages`, so the
        # model never saw its previous solution.
        response = _ask(prompt2)
        python_blocks = re.findall(pattern, response, re.DOTALL)

        if not python_blocks:
            print("No valid Python code block found in the response during iteration.")
            break
        results.append(python_blocks[0])
        print("Python Code (latest iteration): ", python_blocks[0])

    return results, prompt2_results

def evaluate_with_humaneval(function_code, test_code, entrypoint_function):
    """Run a candidate solution against its ``check`` test function.

    The candidate, the test source and a small entry point that calls
    ``check(<entrypoint_function>)`` are combined into one program and executed.

    Args:
        function_code: Source of the candidate solution.
        test_code: Source that defines ``check(candidate)``.
        entrypoint_function: Name of the function in ``function_code`` that is
            passed to ``check``.

    Returns:
        str: ``"Passed"`` if the program ran without raising, ``"Failed"`` on
        an ``AssertionError``, ``"ExecutionFailed"`` on any other ``Exception``.
    """
    full_code = f"""
{function_code}

{test_code}

# Entrypoint
if __name__ == "__main__":
    check({entrypoint_function})
    print("All tests passed!")
"""
    print(full_code)

    # Run in a dedicated namespace. With a bare exec() inside a function, names
    # the candidate creates at top level (imports, helpers, the function itself)
    # land in this function's locals, which the candidate's own functions cannot
    # see, so recursive or helper-using solutions raised NameError. Setting
    # __name__ keeps the generated entry-point guard true whatever module hosts
    # this function.
    # NOTE: this executes model-generated code in-process with no sandbox.
    namespace = {"__name__": "__main__"}
    try:
        exec(full_code, namespace)
        return "Passed"
    except AssertionError as e:
        print("Assertion failed:", e)
        return "Failed"
    except Exception as e:
        print("Execution error:", e)
        return "ExecutionFailed"

# Main loop with resume capability
final_eval_results, prompt_results, results_list = load_previous_results()
start_index = len(results_list)

print(f"Resuming from index {start_index}...")

# Ensure we iterate over rows as dictionaries
for i in range(start_index, len(humaneval)):
    # Access each row as a dictionary
    prompt = humaneval[i]

    # Pass the prompt dictionary to the evaluation function
    results, prompt2_values = evaluate_function(llm=llm, iterations=4, prompt=prompt["prompt"])
    results_list.append(results)  # Save the list of results for this prompt
    prompt_results.append(prompt2_values)

    eval_intermediate = []
    for res in results:
        eval_intermediate.append(
            evaluate_with_humaneval(
                function_code=res,
                test_code=prompt["test"],
                entrypoint_function=prompt["entry_point"]
            )
        )

    final_eval_results.append(eval_intermediate)

    # Save after processing each prompt
    save_results(final_eval_results, prompt_results, results_list)
    print(f"Processed prompt {i + 1}/{len(humaneval)} and saved results.")

print("All prompts processed.")


# 0.5B model, reworked to run generated code through safe_execute_code

In [None]:
import os
import json
from langchain_ollama.llms import OllamaLLM
from datasets import load_dataset
import re
from transformers import pipeline
import threading
import logging
import time

# INFO-level logging; `logger` is the module-level logger for this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HumanEval "test" split; the main loop below indexes into it row by row.
humaneval = load_dataset("openai/openai_humaneval")["test"]

# Text-generation pipeline on the GPU. `llm` is the name the rest of the cell
# uses; `pipe` is kept as an alias for the same object.
llm = pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-Coder-0.5B-Instruct",
    device="cuda",
)

# Checkpoint files read by load_previous_results() and written by save_results().
# NOTE(review): other cells in this notebook reuse these same file names, so
# leftovers from a different run would be resumed rather than restarted.
final_results_file, prompt_results_file, results_list_file = (
    "final_eval_results.json",
    "prompt_results.json",
    "results_list.json",
)

# Function to load saved results
def load_previous_results():
    """Read the three checkpoint lists back from disk.

    Paths come from the module-level ``final_results_file``,
    ``prompt_results_file`` and ``results_list_file``. A file that does not
    exist yields an empty list.

    Returns:
        tuple: ``(final_eval_results, prompt_results, results_list)``.
    """
    def _read_or_empty(path):
        # A missing checkpoint means "nothing processed yet".
        if not os.path.exists(path):
            return []
        with open(path, "r") as handle:
            return json.load(handle)

    return (
        _read_or_empty(final_results_file),
        _read_or_empty(prompt_results_file),
        _read_or_empty(results_list_file),
    )

# Function to save results
def save_results(final_eval_results, prompt_results, results_list):
    """Overwrite the three checkpoint files with the current lists.

    Args:
        final_eval_results: Written to ``final_results_file``.
        prompt_results: Written to ``prompt_results_file``.
        results_list: Written to ``results_list_file``.
    """
    targets = (
        (final_results_file, final_eval_results),
        (prompt_results_file, prompt_results),
        (results_list_file, results_list),
    )
    for path, payload in targets:
        with open(path, "w") as out:
            json.dump(payload, out, indent=4)

# Modified evaluate_function
def evaluate_function(llm, iterations: int, prompt):
    """Generate a solution for ``prompt``, then ask the model to revise it.

    Args:
        llm: Callable invoked as ``llm(messages, max_length=2000)``; the reply
            text is read from ``[0]['generated_text'][-1]['content']``.
        iterations: Maximum number of revision rounds after the first attempt.
        prompt: Task description sent as the first user message.

    Returns:
        tuple[list[str], list[str]]: ``(results, prompt2_results)``.
        ``results`` holds the first fenced python block of each reply, in
        order. ``prompt2_results`` holds every revision prompt that was built.
        Both are empty when the first reply has no python block; revision
        stops early at the first reply without one.
    """
    pattern = r"```python\s+(.*?)\s+```"

    def _ask(content):
        # Every request is a fresh single-turn chat; no history is carried over.
        messages = [{"role": "user", "content": content}]
        return llm(messages, max_length=2000)[0]['generated_text'][-1]['content']

    results = []
    prompt2_results = []

    response = _ask(prompt)
    print("Response: ", response)

    python_blocks = re.findall(pattern, response, re.DOTALL)
    if not python_blocks:
        print("No valid Python code block found in the initial response.")
        return results, prompt2_results
    results.append(python_blocks[0])

    for _ in range(iterations):
        prompt2 = (
            f"This was the function you were previously asked to implement: \n{prompt}"
            f" and this was your solution: \n\n\n{results[-1]}"
            f"\n There may or may not be issues with your previous solution. Analyze it and generate a new solution. If you think your previous solution is wrong, it is ok to try a new approach."
        )
        prompt2_results.append(prompt2)

        # Send the revision prompt itself. The earlier code built it into an
        # unused `message` variable and re-sent the original `messages`, so the
        # model never saw its previous solution.
        response = _ask(prompt2)
        python_blocks = re.findall(pattern, response, re.DOTALL)

        if not python_blocks:
            print("No valid Python code block found in the response during iteration.")
            break
        results.append(python_blocks[0])
        print("Python Code (latest iteration): ", python_blocks[0])

    return results, prompt2_results

def gen_human_eval_code(function_code, test_code, entrypoint_function):
    """Assemble one runnable program from a candidate and its tests.

    Args:
        function_code: Source of the candidate solution.
        test_code: Source that defines ``check``.
        entrypoint_function: Name passed to ``check`` in the entry point.

    Returns:
        str: The candidate, then the tests, then an
        ``if __name__ == "__main__":`` block that calls
        ``check(<entrypoint_function>)``.
    """
    program_lines = [
        "",
        f"{function_code}",
        "",
        f"{test_code}",
        "",
        "# Entrypoint",
        'if __name__ == "__main__":',
        f"    check({entrypoint_function})",
        '    print("All tests passed!")',
        "",
    ]
    return "\n".join(program_lines)

def safe_execute_code(code: str, test: str, bm: str, timeout: int = 5) -> bool:
    """
    Execute generated code in a worker thread and report whether it succeeded.

    Args:
        code (str): Source to execute. For benchmarks other than ``"mbpp"`` this
            is the whole program, including its ``if __name__ == "__main__":``
            entry point.
        test (str): Test source executed after ``code`` in the same namespace;
            only used when ``bm == "mbpp"``.
        bm (str): Benchmark name. ``"mbpp"`` runs ``code`` then ``test``; any
            other value runs ``code`` alone.
        timeout (int): Seconds to wait for the worker thread.

    Returns:
        bool: True only if execution finished within ``timeout`` without
        raising. False on an assertion failure, any other exception, or timeout.

    Note:
        The code runs in-process with no sandbox, and a worker that times out
        cannot be stopped; it keeps running as a daemon thread.
    """
    log = logging.getLogger(__name__)

    # With an empty globals dict, `__name__` resolves through builtins to
    # "builtins", so an `if __name__ == "__main__":` entry point would be
    # skipped and untested code reported as a success. Seed it explicitly.
    namespace = {"__name__": "__main__"}

    # Kept apart from `namespace` so the executed code cannot overwrite the flag.
    outcome = {}

    def target() -> None:
        try:
            exec(code, namespace)
            if bm == "mbpp":
                exec(test, namespace)
            outcome["success"] = True
        except AssertionError:
            outcome["success"] = False
            log.warning("Some Test Fail")
        except Exception as e:
            log.warning("Execution error: %s", e)
            outcome["success"] = False

    thread = threading.Thread(target=target, daemon=True)
    try:
        thread.start()
        thread.join(timeout)
    except Exception as e:
        log.error("Error during code execution thread: %s", e)
        return False

    if thread.is_alive():
        log.warning("Code execution timed out.")
        return False
    return outcome.get("success", False)

# Main loop with resume capability
final_eval_results, prompt_results, results_list = load_previous_results()
start_index = len(results_list)

print(f"Resuming from index {start_index}...")

# Ensure we iterate over rows as dictionaries
for i in range(start_index, len(humaneval)):
    # Access each row as a dictionary
    prompt = humaneval[i]

    # Pass the prompt dictionary to the evaluation function
    results, prompt2_values = evaluate_function(llm=llm, iterations=2, prompt=prompt["prompt"])
    results_list.append(results)  # Save the list of results for this prompt
    prompt_results.append(prompt2_values)

    eval_intermediate = []
    for res in results:
      full_code = gen_human_eval_code(function_code=res, test_code=prompt["test"], entrypoint_function=prompt["entry_point"])
      eval_intermediate.append(
          safe_execute_code(
              code=full_code,
              test="",
              bm="humaneval",
          )
      )

    final_eval_results.append(eval_intermediate)

    # Save after processing each prompt
    save_results(final_eval_results, prompt_results, results_list)
    print(f"Processed prompt {i + 1}/{len(humaneval)} and saved results.")

print("All prompts processed.")


# 1.5B model, 3 attempts total (1 initial + 2 revisions)

In [None]:
import os
import json
from langchain_ollama.llms import OllamaLLM
from datasets import load_dataset
import re
from transformers import pipeline

# HumanEval "test" split; the main loop below indexes into it row by row.
humaneval = load_dataset("openai/openai_humaneval")["test"]

# Text-generation pipeline on the GPU. `llm` is the name the rest of the cell
# uses; `pipe` is kept as an alias for the same object.
llm = pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-Coder-1.5B-Instruct",
    device="cuda",
)

# Checkpoint files read by load_previous_results() and written by save_results().
# NOTE(review): these are the same file names the 0.5B HumanEval cells use.
# Because the main loop resumes from whatever is on disk, leftover files from
# another model's run would be continued rather than restarted. Confirm they
# are cleared or renamed between runs.
final_results_file, prompt_results_file, results_list_file = (
    "final_eval_results.json",
    "prompt_results.json",
    "results_list.json",
)

# Function to load saved results
def load_previous_results():
    """Read the three checkpoint lists back from disk.

    Paths come from the module-level ``final_results_file``,
    ``prompt_results_file`` and ``results_list_file``. A file that does not
    exist yields an empty list.

    Returns:
        tuple: ``(final_eval_results, prompt_results, results_list)``.
    """
    def _read_or_empty(path):
        # A missing checkpoint means "nothing processed yet".
        if not os.path.exists(path):
            return []
        with open(path, "r") as handle:
            return json.load(handle)

    return (
        _read_or_empty(final_results_file),
        _read_or_empty(prompt_results_file),
        _read_or_empty(results_list_file),
    )

# Function to save results
def save_results(final_eval_results, prompt_results, results_list):
    """Overwrite the three checkpoint files with the current lists.

    Args:
        final_eval_results: Written to ``final_results_file``.
        prompt_results: Written to ``prompt_results_file``.
        results_list: Written to ``results_list_file``.
    """
    targets = (
        (final_results_file, final_eval_results),
        (prompt_results_file, prompt_results),
        (results_list_file, results_list),
    )
    for path, payload in targets:
        with open(path, "w") as out:
            json.dump(payload, out, indent=4)

# Modified evaluate_function
def evaluate_function(llm, iterations: int, prompt):
    """Generate a solution for ``prompt``, then ask the model to revise it.

    Args:
        llm: Callable invoked as ``llm(messages, max_length=2000)``; the reply
            text is read from ``[0]['generated_text'][-1]['content']``.
        iterations: Maximum number of revision rounds after the first attempt.
        prompt: Task description sent as the first user message.

    Returns:
        tuple[list[str], list[str]]: ``(results, prompt2_results)``.
        ``results`` holds the first fenced python block of each reply, in
        order. ``prompt2_results`` holds every revision prompt that was built.
        Both are empty when the first reply has no python block; revision
        stops early at the first reply without one.
    """
    pattern = r"```python\s+(.*?)\s+```"

    def _ask(content):
        # Every request is a fresh single-turn chat; no history is carried over.
        messages = [{"role": "user", "content": content}]
        return llm(messages, max_length=2000)[0]['generated_text'][-1]['content']

    results = []
    prompt2_results = []

    response = _ask(prompt)
    print("Response: ", response)

    python_blocks = re.findall(pattern, response, re.DOTALL)
    if not python_blocks:
        print("No valid Python code block found in the initial response.")
        return results, prompt2_results
    results.append(python_blocks[0])

    for _ in range(iterations):
        prompt2 = (
            f"This was the function you were previously asked to implement: \n{prompt}"
            f" and this was your solution: \n\n\n{results[-1]}"
            f"\n There may or may not be issues with your previous solution. Analyze it and generate a new solution. If you think your previous solution is wrong, it is ok to try a new approach."
        )
        prompt2_results.append(prompt2)

        # Send the revision prompt itself. The earlier code built it into an
        # unused `message` variable and re-sent the original `messages`, so the
        # model never saw its previous solution.
        response = _ask(prompt2)
        python_blocks = re.findall(pattern, response, re.DOTALL)

        if not python_blocks:
            print("No valid Python code block found in the response during iteration.")
            break
        results.append(python_blocks[0])
        print("Python Code (latest iteration): ", python_blocks[0])

    return results, prompt2_results

def evaluate_with_humaneval(function_code, test_code, entrypoint_function):
    """Run a candidate solution against its ``check`` test function.

    The candidate, the test source and a small entry point that calls
    ``check(<entrypoint_function>)`` are combined into one program and executed.

    Args:
        function_code: Source of the candidate solution.
        test_code: Source that defines ``check(candidate)``.
        entrypoint_function: Name of the function in ``function_code`` that is
            passed to ``check``.

    Returns:
        str: ``"Passed"`` if the program ran without raising, ``"Failed"`` on
        an ``AssertionError``, ``"ExecutionFailed"`` on any other ``Exception``.
    """
    full_code = f"""
{function_code}

{test_code}

# Entrypoint
if __name__ == "__main__":
    check({entrypoint_function})
    print("All tests passed!")
"""
    print(full_code)

    # Run in a dedicated namespace. With a bare exec() inside a function, names
    # the candidate creates at top level (imports, helpers, the function itself)
    # land in this function's locals, which the candidate's own functions cannot
    # see, so recursive or helper-using solutions raised NameError. Setting
    # __name__ keeps the generated entry-point guard true whatever module hosts
    # this function.
    # NOTE: this executes model-generated code in-process with no sandbox.
    namespace = {"__name__": "__main__"}
    try:
        exec(full_code, namespace)
        return "Passed"
    except AssertionError as e:
        print("Assertion failed:", e)
        return "Failed"
    except Exception as e:
        print("Execution error:", e)
        return "ExecutionFailed"

# Main loop with resume capability
final_eval_results, prompt_results, results_list = load_previous_results()
start_index = len(results_list)

print(f"Resuming from index {start_index}...")

# Ensure we iterate over rows as dictionaries
for i in range(start_index, len(humaneval)):
    # Access each row as a dictionary
    prompt = humaneval[i]

    # Pass the prompt dictionary to the evaluation function
    results, prompt2_values = evaluate_function(llm=llm, iterations=2, prompt=prompt["prompt"])
    results_list.append(results)  # Save the list of results for this prompt
    prompt_results.append(prompt2_values)

    eval_intermediate = []
    for res in results:
        eval_intermediate.append(
            evaluate_with_humaneval(
                function_code=res,
                test_code=prompt["test"],
                entrypoint_function=prompt["entry_point"]
            )
        )

    final_eval_results.append(eval_intermediate)

    # Save after processing each prompt
    save_results(final_eval_results, prompt_results, results_list)
    print(f"Processed prompt {i + 1}/{len(humaneval)} and saved results.")

print("All prompts processed.")


In [None]:
import json
# Load each checkpoint file and print it right away, so a missing later file
# does not hide the ones already read.
with open("final_eval_results.json", "r") as fe_handle:
    data_fe = json.load(fe_handle)
print(data_fe)

with open("prompt_results.json", "r") as pr_handle:
    data_pr = json.load(pr_handle)
print(data_pr)

with open("results_list.json", "r") as rl_handle:
    data_rl = json.load(rl_handle)
print(data_rl)



[['Passed', 'Passed', 'Failed'], ['Passed', 'Passed', 'Passed']]
[['This was the function you were previously asked to implement: \nWrite a python function to remove first and last occurrence of a given character from the string. Your Code should pass the following tests: assert remove_Occ("hello","l") == "heo" assert remove_Occ("abcda","a") == "bcd" assert remove_Occ("PHP","P") == "H" and this was your solution: \n\n\ndef remove_Occ(s, char):\n    # Replace the first occurrence of the character with an empty string\n    s = s.replace(char, "")\n    \n    # Replace the last occurrence of the character with an empty string\n    s = s.replace(char[::-1], "")\n    \n    return s\n\n# Test cases\nassert remove_Occ("hello", "l") == "heo"\nassert remove_Occ("abcda", "a") == "bcd"\nassert remove_Occ("PHP", "P") == "H"\n There may or may not be issues with your previous solution. Analyze it and generate a new solution. If you think your previous solution is wrong, it is ok to try a new approac

# MBPP dataset

In [None]:
import pandas as pd

# Parquet file of each split, relative to the dataset root on the Hub.
splits = {
    'train': 'sanitized/train-00000-of-00001.parquet',
    'test': 'sanitized/test-00000-of-00001.parquet',
    'validation': 'sanitized/validation-00000-of-00001.parquet',
    'prompt': 'sanitized/prompt-00000-of-00001.parquet',
}

# Only the "test" split is loaded here.
mbpp_df = pd.read_parquet(
    "hf://datasets/google-research-datasets/mbpp/" + splits["test"]
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# Preview the first rows of the loaded MBPP test split.
mbpp_df.head()

Unnamed: 0,source_file,task_id,prompt,code,test_imports,test_list
0,Benchmark Questions Verification V2.ipynb,11,Write a python function to remove first and la...,"def remove_Occ(s,ch): \n for i in range(len...",[],"[assert remove_Occ(""hello"",""l"") == ""heo"", asse..."
1,Benchmark Questions Verification V2.ipynb,12,Write a function to sort a given matrix in asc...,"def sort_matrix(M):\n result = sorted(M, ke...",[],"[assert sort_matrix([[1, 2, 3], [2, 4, 5], [1,..."
2,Benchmark Questions Verification V2.ipynb,14,Write a python function to find the volume of ...,"def find_Volume(l,b,h) : \n return ((l * b ...",[],"[assert find_Volume(10,8,6) == 240, assert fin..."
3,Benchmark Questions Verification V2.ipynb,16,Write a function to that returns true if the i...,import re\ndef text_lowercase_underscore(text)...,[],"[assert text_lowercase_underscore(""aab_cbbbc"")..."
4,Benchmark Questions Verification V2.ipynb,17,Write a function that returns the perimeter of...,def square_perimeter(a):\n perimeter=4*a\n r...,[],"[assert square_perimeter(10)==40, assert squar..."


In [None]:
!pip install datasets
!pip install transformers
!pip install langchain-ollama

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import os
import json
from langchain_ollama.llms import OllamaLLM
from datasets import load_dataset
import re
from transformers import pipeline

# # Load Dataset
# humaneval = load_dataset("openai/openai_humaneval")["test"]
import pandas as pd

# Parquet file of each split, relative to the dataset root on the Hub.
splits = {
    'train': 'sanitized/train-00000-of-00001.parquet',
    'test': 'sanitized/test-00000-of-00001.parquet',
    'validation': 'sanitized/validation-00000-of-00001.parquet',
    'prompt': 'sanitized/prompt-00000-of-00001.parquet',
}

# Only the "test" split is evaluated.
mbpp_df = pd.read_parquet(
    "hf://datasets/google-research-datasets/mbpp/" + splits["test"]
)

# Text-generation pipeline on the GPU. `llm` is the name the rest of the cell
# uses; `pipe` is kept as an alias for the same object.
llm = pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-Coder-0.5B-Instruct",
    device="cuda",
)

# MBPP-specific checkpoint files, read by load_previous_results() and written
# by save_results().
final_results_file, prompt_results_file, results_list_file = (
    "final_eval_results_mbpp.json",
    "prompt_results_mbpp.json",
    "results_list_mbpp.json",
)

# Function to load saved results
def load_previous_results():
    """Read the three checkpoint lists back from disk.

    Paths come from the module-level ``final_results_file``,
    ``prompt_results_file`` and ``results_list_file``. A file that does not
    exist yields an empty list.

    Returns:
        tuple: ``(final_eval_results, prompt_results, results_list)``.
    """
    def _read_or_empty(path):
        # A missing checkpoint means "nothing processed yet".
        if not os.path.exists(path):
            return []
        with open(path, "r") as handle:
            return json.load(handle)

    return (
        _read_or_empty(final_results_file),
        _read_or_empty(prompt_results_file),
        _read_or_empty(results_list_file),
    )

# Function to save results
def save_results(final_eval_results, prompt_results, results_list):
    """Overwrite the three checkpoint files with the current lists.

    Args:
        final_eval_results: Written to ``final_results_file``.
        prompt_results: Written to ``prompt_results_file``.
        results_list: Written to ``results_list_file``.
    """
    targets = (
        (final_results_file, final_eval_results),
        (prompt_results_file, prompt_results),
        (results_list_file, results_list),
    )
    for path, payload in targets:
        with open(path, "w") as out:
            json.dump(payload, out, indent=4)

# Modified evaluate_function
def evaluate_function(llm, iterations: int, prompt, test_list):
    """Generate a solution for an MBPP task, then ask the model to revise it.

    The task text sent to the model is ``prompt`` followed by the tests from
    ``test_list`` joined with spaces.

    Args:
        llm: Callable invoked as ``llm(messages, max_length=2000)``; the reply
            text is read from ``[0]['generated_text'][-1]['content']``.
        iterations: Maximum number of revision rounds after the first attempt.
        prompt: Task description.
        test_list: Iterable of test statements (strings) shown to the model.

    Returns:
        tuple[list[str], list[str]]: ``(results, prompt2_results)``.
        ``results`` holds the first fenced python block of each reply, in
        order. ``prompt2_results`` holds every revision prompt that was built.
        Both are empty when the first reply has no python block; revision
        stops early at the first reply without one.
    """
    pattern = r"```python\s+(.*?)\s+```"

    def _ask(content):
        # Every request is a fresh single-turn chat; no history is carried over.
        messages = [{"role": "user", "content": content}]
        return llm(messages, max_length=2000)[0]['generated_text'][-1]['content']

    results = []
    prompt2_results = []

    prompt_plus_test = prompt + " Your Code should pass the following tests: " + " ".join(test_list)
    response = _ask(prompt_plus_test)
    print("Response: ", response)

    python_blocks = re.findall(pattern, response, re.DOTALL)
    if not python_blocks:
        print("No valid Python code block found in the initial response.")
        return results, prompt2_results
    results.append(python_blocks[0])

    for _ in range(iterations):
        prompt2 = (
            f"This was the function you were previously asked to implement: \n{prompt_plus_test}"
            f" and this was your solution: \n\n\n{results[-1]}"
            f"\n There may or may not be issues with your previous solution. Analyze it and generate a new solution. If you think your previous solution is wrong, it is ok to try a new approach."
        )
        prompt2_results.append(prompt2)

        # Send the revision prompt itself. The earlier code built it into an
        # unused `message` variable and re-sent the original `messages`, so the
        # model never saw its previous solution.
        response = _ask(prompt2)
        python_blocks = re.findall(pattern, response, re.DOTALL)

        if not python_blocks:
            print("No valid Python code block found in the response during iteration.")
            break
        results.append(python_blocks[0])
        print("Python Code (latest iteration): ", python_blocks[0])

    return results, prompt2_results

def evaluate_with_humaneval(function_code, test_code):
    """Run a candidate solution followed by its test statements.

    Args:
        function_code: Source of the candidate solution.
        test_code: Either test source as a single string, or an iterable of
            test statements (one string each), which is joined with newlines.

    Returns:
        str: ``"Passed"`` if the program ran without raising, ``"Failed"`` on
        an ``AssertionError``, ``"ExecutionFailed"`` on any other ``Exception``.
    """
    # A list or array interpolated directly into the program is rendered as
    # its repr, which is one harmless list expression: none of the asserts
    # inside it would ever run. Turn a sequence into real statements first.
    if not isinstance(test_code, str):
        test_code = "\n".join(test_code)

    full_code = f"""
{function_code}

{test_code}

# Entrypoint
if __name__ == "__main__":
    print("All tests passed!")
"""
    print(full_code)

    # Run in a dedicated namespace. With a bare exec() inside a function, names
    # the candidate creates at top level (imports, helpers, the function itself)
    # land in this function's locals, which the candidate's own functions cannot
    # see, so recursive or helper-using solutions raised NameError.
    # NOTE: this executes model-generated code in-process with no sandbox.
    namespace = {"__name__": "__main__"}
    try:
        exec(full_code, namespace)
        return "Passed"
    except AssertionError as e:
        print("Assertion failed:", e)
        return "Failed"
    except Exception as e:
        print("Execution error:", e)
        return "ExecutionFailed"

# Main loop with resume capability
final_eval_results, prompt_results, results_list = load_previous_results()
start_index = len(results_list)

print(f"Resuming from index {start_index}...")

# Ensure we iterate over rows as dictionaries
for i in range(start_index, len(mbpp_df)):
    # Access each row as a dictionary
    prompt = mbpp_df.iloc[i]

    # Pass the prompt dictionary to the evaluation function
    results, prompt2_values = evaluate_function(llm=llm, iterations=2, prompt=prompt["prompt"], test_list=prompt["test_list"])
    results_list.append(results)  # Save the list of results for this prompt
    prompt_results.append(prompt2_values)

    eval_intermediate = []
    for res in results:
        eval_intermediate.append(
            evaluate_with_humaneval(
                function_code=res,
                test_code=prompt["test_list"]
            )
        )

    final_eval_results.append(eval_intermediate)

    # Save after processing each prompt
    save_results(final_eval_results, prompt_results, results_list)
    print(f"Processed prompt {i + 1}/{len(mbpp_df)} and saved results.")

print("All prompts processed.")


Resuming from index 0...
Response:  To remove the first and last occurrence of a given character from a string in Python, you can use list slicing. Here's how you can implement this:

```python
def remove_Occ(s, char):
    # Check if the character is present in the string
    if char in s:
        # Remove the first occurrence
        s = s[1:]
        # Remove the last occurrence
        s = s[:-1]
    return s

# Test cases to verify the correctness of the function
assert remove_Occ("hello", "l") == "heo"
assert remove_Occ("abcda", "a") == "bcd"
assert remove_Occ("PHP", "P") == "H"
```

In this code:
- The function `remove_Occ` takes two arguments: `s`, which is the string from which you want to remove the first and last occurrences of the character, and `char`, which is the character you want to remove.
- It first checks if the character is present in the string using the `in` keyword.
- If the character is found, it slices the string to remove both the first and last occurrences.
-

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
            total += num
    return total

# Test cases to verify the correctness of the function
assert sum_of_digits([10, 2, 56]) == 14
assert sum_of_digits([[10, 20, 4, 5, 'b', 70, 'a']]) == 19
assert sum_of_digits([10, 20, -4, 5, -70]) == 19

['assert sum_of_digits([10,2,56])==14'
 "assert sum_of_digits([[10,20,4,5,'b',70,'a']])==19"
 'assert sum_of_digits([10,20,-4,5,-70])==19']

# Entrypoint
if __name__ == "__main__":
    print("All tests passed!")

Assertion failed: 
Processed prompt 184/257 and saved results.
Response:  To perform the bitwise XOR operation across two tuples, you can define a function that iterates over the corresponding elements of the tuples and applies the XOR operation. Here's how you can implement this:

```python
def bitwise_xor(tuple1, tuple2):
    # Ensure both tuples have the same length
    if len(tuple1)!= len(tuple2):
        raise ValueError("Tuples must be of the same length.")
    
 

In [None]:
# Tabulate the checkpoint data loaded earlier (data_fe, data_pr, data_rl).
# Each outer list entry becomes a row and each inner element a column.

# Evaluation outcomes per attempt.
pd_fe_df = pd.DataFrame(data_fe)
print(pd_fe_df)

# Revision prompts that were built.
pd_pr_df = pd.DataFrame(data_pr)
print(pd_pr_df)
# Extracted code for each attempt.
pd_rl_df = pd.DataFrame(data_rl)
print(pd_rl_df)

        0       1       2
0  Passed  Passed  Failed
1  Passed  Passed  Passed
                                                   0  \
0  This was the function you were previously aske...   
1  This was the function you were previously aske...   

                                                   1  
0  This was the function you were previously aske...  
1  This was the function you were previously aske...  
                                                   0  \
0  def remove_Occ(s, char):\n    # Replace the fi...   
1  def sort_matrix(matrix):\n    # Sort each row ...   

                                                   1  \
0  def remove_Occ(s, char):\n    # Replace the fi...   
1  def sort_matrix(matrix):\n    """\n    Sorts a...   

                                                   2  
0  def remove_Occ(str1, char):\n    # Check if th...  
1  def sort_matrix(matrix):\n    # Sort the matri...  


In [None]:
# import os
# import json
# from langchain_ollama.llms import OllamaLLM
# from datasets import load_dataset
# import re
# from transformers import pipeline

# # # Load Dataset
# # humaneval = load_dataset("openai/openai_humaneval")["test"]
# import pandas as pd

# splits = {'train': 'sanitized/train-00000-of-00001.parquet', 'test': 'sanitized/test-00000-of-00001.parquet', 'validation': 'sanitized/validation-00000-of-00001.parquet', 'prompt': 'sanitized/prompt-00000-of-00001.parquet'}
# mbpp_df = pd.read_parquet("hf://datasets/google-research-datasets/mbpp/" + splits["test"])


# # Use a pipeline as a high-level helper
# pipe = pipeline("text-generation", model="Qwen/Qwen2.5-Coder-1.5B-Instruct", device="cuda")
# llm = pipe

# # File paths for saving results
# final_results_file = "final_eval_results_mbpp.json"
# prompt_results_file = "prompt_results_mbpp.json"
# results_list_file = "results_list_mbpp.json"

# # Function to load saved results
# def load_previous_results():
#     if os.path.exists(final_results_file):
#         with open(final_results_file, "r") as f:
#             final_eval_results = json.load(f)
#     else:
#         final_eval_results = []

#     if os.path.exists(prompt_results_file):
#         with open(prompt_results_file, "r") as f:
#             prompt_results = json.load(f)
#     else:
#         prompt_results = []

#     if os.path.exists(results_list_file):
#         with open(results_list_file, "r") as f:
#             results_list = json.load(f)
#     else:
#         results_list = []

#     return final_eval_results, prompt_results, results_list

# # Function to save results
# def save_results(final_eval_results, prompt_results, results_list):
#     with open(final_results_file, "w") as f:
#         json.dump(final_eval_results, f, indent=4)
#     with open(prompt_results_file, "w") as f:
#         json.dump(prompt_results, f, indent=4)
#     with open(results_list_file, "w") as f:
#         json.dump(results_list, f, indent=4)

# # Modified evaluate_function
# def evaluate_function(llm, iterations: int, prompt, test_list):
#     results = []
#     prompt2_results = []

#     prompt_plus_test = prompt + " Your Code should pass the following tests: " + " ".join(test_list)
#     # for test in test_list:
#     #     prompt_plus_test = prompt_plus_test + " " + test
#     messages = [{"role": "user", "content": prompt_plus_test}]
#     response = llm(messages, max_length=2000)[0]['generated_text'][-1]['content']
#     print("Response: ", response)

#     pattern = r"```python\s+(.*?)\s+```"
#     python_blocks = re.findall(pattern, response, re.DOTALL)
#     if python_blocks:
#         results.append(python_blocks[0])
#     else:
#         print("No valid Python code block found in the initial response.")
#         return results, prompt2_results

#     for _ in range(iterations):
#         prompt2 = (
#             f"This was the function you were previously asked to implement: \n{prompt_plus_test}"
#             f" and this was your solution: \n\n\n{results[-1]}"
#             f"\n There may or may not be issues with your previous solution. Analyze it and generate a new solution. If you think your previous solution is wrong, it is ok to try a new approach."
#         )
#         prompt2_results.append(prompt2)

#         message = [{"role": "user", "content": prompt2}]
#         response = llm(messages, max_length=2000)[0]['generated_text'][-1]['content']
#         python_blocks = re.findall(pattern, response, re.DOTALL)

#         if python_blocks:
#             results.append(python_blocks[0])
#             print("Python Code (latest iteration): ", python_blocks[0])
#         else:
#             print("No valid Python code block found in the response during iteration.")
#             break

#     return results, prompt2_results

# def evaluate_with_humaneval(function_code, test_code):
#     full_code = f"""
# {function_code}

# {test_code}

# # Entrypoint
# if __name__ == "__main__":
#     print("All tests passed!")
# """
#     print(full_code)
#     try:
#         exec(full_code)
#         return "Passed"
#     except AssertionError as e:
#         print("Assertion failed:", e)
#         return "Failed"
#     except Exception as e:
#         print("Execution error:", e)
#         return "ExecutionFailed"

# # Main loop with resume capability
# final_eval_results, prompt_results, results_list = load_previous_results()
# start_index = len(results_list)

# print(f"Resuming from index {start_index}...")

# # Ensure we iterate over rows as dictionaries
# for i in range(start_index, len(mbpp_df)):
#     # Access each row as a dictionary
#     prompt = mbpp_df.iloc[i]

#     # Pass the prompt dictionary to the evaluation function
#     results, prompt2_values = evaluate_function(llm=llm, iterations=2, prompt=prompt["prompt"], test_list=prompt["test_list"])
#     results_list.append(results)  # Save the list of results for this prompt
#     prompt_results.append(prompt2_values)

#     eval_intermediate = []
#     for res in results:
#         eval_intermediate.append(
#             evaluate_with_humaneval(
#                 function_code=res,
#                 test_code=prompt["test_list"]
#             )
#         )

#     final_eval_results.append(eval_intermediate)

#     # Save after processing each prompt
#     save_results(final_eval_results, prompt_results, results_list)
#     print(f"Processed prompt {i + 1}/{len(mbpp_df)} and saved results.")

# print("All prompts processed.")
