In [1]:
import warnings

# Suppress all DeprecationWarnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import os
os.getcwd()

'/home/ubuntu/project/human-eval'

In [4]:
from human_eval.data import write_jsonl, read_problems
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
problems = read_problems()

In [7]:
print(problems["HumanEval/13"]["prompt"])



def greatest_common_divisor(a: int, b: int) -> int:
    """ Return a greatest common divisor of two integers a and b
    >>> greatest_common_divisor(3, 5)
    1
    >>> greatest_common_divisor(25, 15)
    5
    """



In [8]:
a =  "def greatest_common_divisor(a: int, b: int) -> int:\n    \"\"\" Return a greatest common divisor of two integers a and b\n    >>> greatest_common_divisor(3, 5)\n    1\n    >>> greatest_common_divisor(25, 15)\n    5\n    \"\"\"\n    while b:\n        a, b = b, a % b\n    return a"

In [9]:
print(a)

def greatest_common_divisor(a: int, b: int) -> int:
    """ Return a greatest common divisor of two integers a and b
    >>> greatest_common_divisor(3, 5)
    1
    >>> greatest_common_divisor(25, 15)
    5
    """
    while b:
        a, b = b, a % b
    return a


## Qwen2.5-7B-Instruct

In [3]:
from human_eval.data import write_jsonl, read_problems
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

problems = read_problems()

model_name = "Qwen/Qwen2.5-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Plain Prompts

In [4]:
def generate_one_completion0(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    content = """Give me python code to accomplish this function, Do not include any explanation. 
                Only provide me the code that is needed after the function definition."""

    
    messages = [
        {"role": "system", "content": content + "Now answer the following coding question. "},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [5]:
num_samples_per_task = 1

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion0(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Qwen_3B_plain_prompts.jsonl", samples)

Processing tasks:   0%|                                                                                         | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                         | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.11s/it][A
Processing tasks:   1%|▍                                                                                | 1/164 [00:05<13:53,  5.11s/it][A
Generating samples for HumanEval/1:   0%|                                                                         | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/1: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.75s/it][A
Processing tasks:   1%|▉                                                                                | 2/164 [00:09<13:14,  4.90s/it][A
Generating samples for 

#### Few-shot prompts

In [6]:
def generate_one_completion1(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    content = """Give me python code to accomplish this function. Only provide me the code that is needed after the function definition in your response. Output in plain text without any formats.
                For example,
                For problem 'def return1():\n', you are supposed to response '    return 1'.
                
                For problem 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n   ### Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n     ###\n', 
                you are supposed to response '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n'. 
                
                For problem 'from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n    ### For a given list of input numbers, calculate Mean Absolute Deviation\n    around the mean of this dataset.\n    Mean Absolute Deviation is the average absolute difference between each\n    element and a centerpoint (mean in this case):\n    MAD = average | x - x_mean |\n    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n    1.0\n    ###\n'', 
                you are supposed to response '    mean = sum(numbers) / len(numbers)\n    return sum(abs(x - mean) for x in numbers) / len(numbers)\n'"""

    # , Do not include any explanation. Import libraries when necessary. Do not use recursion, use for loop if possible.
    #             Only provide me the code that is needed after the function definition.
    messages = [
        {"role": "system", "content": content + "\nNow answer the following coding question. \n"},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

num_samples_per_task = 1

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion1(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Qwen_3B_fewshot.jsonl", samples)

### Qwen2.5-3B-Instruct-finetune

In [8]:
import torch

def generate_one_completion2(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    client = OpenAI(api_key="0",base_url="http://0.0.0.0:8000/v1")
    
    content = """Give me python code to accomplish this function, Do not include any explanation. 
                    Only provide me the code that is needed after the function definition."""

    messages = [
        {"role": "system", "content": content + "Now answer the following coding question. "},
        # {"role": "system", "content": "Answer the following coding question. "},
        {"role": "user", "content": prompt}
    ]

    result = client.chat.completions.create(messages=messages, model="Qwen/Qwen2.5-3B-Instruct")

    response = result.choices[0].message.content

    return response

In [9]:
from tqdm import tqdm
from human_eval.data import write_jsonl, read_problems
from openai import OpenAI


num_samples_per_task = 1
problems = read_problems()

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion2(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Qwen_3B_finetune.jsonl", samples)

Processing tasks:   0%|                                                                    | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                    | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|████████████████████████████████████████████| 1/1 [00:02<00:00,  2.11s/it][A
Processing tasks:   1%|▎                                                           | 1/164 [00:02<05:44,  2.11s/it][A
Generating samples for HumanEval/1:   0%|                                                    | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/1: 100%|████████████████████████████████████████████| 1/1 [00:05<00:00,  5.52s/it][A
Processing tasks:   1%|▋                                                           | 2/164 [00:07<11:07,  4.12s/it][A
Generating samples for HumanEval/2:   0%|                                                    | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/2: 100%|██████████

## Qwen2.5-7B-Instruct

In [3]:
from human_eval.data import write_jsonl, read_problems
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

problems = read_problems()

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Plain Prompts

In [4]:
def generate_one_completion0(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:c
        str: The generated code completion.
    """
    content = """Give me python code to accomplish this function, Do not include any explanation. 
                Only provide me the code that is needed after the function definition."""

    
    messages = [
        {"role": "system", "content": content + "Now answer the following coding question. "},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [6]:
num_samples_per_task = 1

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion0(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("samples_plain_prompts.jsonl", samples)

Processing tasks:   0%|                                                                                 | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                 | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|█████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.16s/it][A
Processing tasks:   1%|▍                                                                        | 1/164 [00:03<08:35,  3.16s/it][A
Generating samples for HumanEval/1:   0%|                                                                 | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/1: 100%|█████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.94s/it][A
Processing tasks:   1%|▉                                                                        | 2/164 [00:07<09:46,  3.62s/it][A
Generating samples for HumanEval/2:   0%|                                      

#### Few-shot prompts

In [10]:
def generate_one_completion1(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    content = """Give me python code to accomplish this function. Only provide me the code that is needed after the function definition in your response. Output in plain text without any formats.
                For example,
                For problem 'def return1():\n', you are supposed to response '    return 1'.
                
                For problem 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n   ### Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n     ###\n', 
                you are supposed to response '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n'. 
                
                For problem 'from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n    ### For a given list of input numbers, calculate Mean Absolute Deviation\n    around the mean of this dataset.\n    Mean Absolute Deviation is the average absolute difference between each\n    element and a centerpoint (mean in this case):\n    MAD = average | x - x_mean |\n    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n    1.0\n    ###\n'', 
                you are supposed to response '    mean = sum(numbers) / len(numbers)\n    return sum(abs(x - mean) for x in numbers) / len(numbers)\n'"""

    # , Do not include any explanation. Import libraries when necessary. Do not use recursion, use for loop if possible.
    #             Only provide me the code that is needed after the function definition.
    messages = [
        {"role": "system", "content": content + "\nNow answer the following coding question. \n"},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [11]:
num_samples_per_task = 1

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion1(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Qwen_fewshot4.jsonl", samples)

Processing tasks:   0%|                                                                                                                                                                                                                                | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                                                                                                                                                                | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.94s/it][A
Processing tasks:   1%|█▎                                                                                                                                                         

### Qwen2.5-7B-Instruct-finetune

In [17]:
import torch

def generate_one_completion2(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    client = OpenAI(api_key="0",base_url="http://0.0.0.0:8000/v1")
    
    content = """Give me python code to accomplish this function, Do not include any explanation. 
                    Only provide me the code that is needed after the function definition."""

    messages = [
        {"role": "system", "content": content + "Now answer the following coding question. "},
        # {"role": "system", "content": "Answer the following coding question. "},
        {"role": "user", "content": prompt}
    ]

    result = client.chat.completions.create(messages=messages, model="Qwen/Qwen2.5-7B-Instruct")

    response = result.choices[0].message.content

    return response

In [18]:
from tqdm import tqdm
from human_eval.data import write_jsonl, read_problems
from openai import OpenAI


num_samples_per_task = 1
problems = read_problems()

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion2(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("samples_Qwen_finetune.jsonl", samples)

Processing tasks:   0%|                                                                                                                                                | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                                                                                | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.15s/it][A
Processing tasks:   1%|▊                                                                                                                                       | 1/164 [00:02<05:51,  2.16s/it][A
Generating samples for HumanEval/1:   0%|                                                                                                                                | 0/1 [00:00<?, ?it/s][A
Generating samples for Human

## Gemma

In [10]:
from human_eval.data import write_jsonl, read_problems
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

problems = read_problems()

model_name = "google/gemma-7b-it"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Plain prompts

In [24]:
def generate_one_completion3(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    content1 = """Write the code for the following task. \n"""
    content2 = """\n Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."""

    messages = content1 + prompt + content2

    input_ids = tokenizer(messages, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **input_ids,
        max_new_tokens=512
    )

    response = tokenizer.decode(outputs[0])
    answer_part = response.replace(messages, "").strip()
    code_blocks = re.findall(r"```python(.*?)```", answer_part, re.DOTALL)

    return code_blocks

In [25]:
num_samples_per_task = 1

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion3(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Gemma_plain_prompts.jsonl", samples)

Processing tasks:   0%|                                                                                 | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                 | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|█████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.51s/it][A
Processing tasks:   1%|▍                                                                        | 1/164 [00:03<09:32,  3.51s/it][A
Generating samples for HumanEval/1:   0%|                                                                 | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/1: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.03s/it][A
Processing tasks:   1%|▉                                                                        | 2/164 [00:08<11:53,  4.41s/it][A
Generating samples for HumanEval/2:   0%|                                      

In [30]:
import re
import json
from tqdm import tqdm

# Function to process completion text
def process_completion(completion):
    # Split the completion text into parts
    content1 = "Write the code for the following task. \n"
    content2 = "\n Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."

    # Extract the answer part by removing the message components
    if content1 in completion and content2 in completion:
        prompt_start = len(content1)
        prompt_end = completion.find(content2)
        prompt = completion[prompt_start:prompt_end].strip()

        # Replace message structure with empty space to isolate the answer
        messages = content1 + prompt + content2
        answer_part = completion.replace(messages, "").strip()

        # Extract code blocks from the answer part
        code_blocks = re.findall(r"```python(.*?)```", answer_part, re.DOTALL)
        if code_blocks:
            return code_blocks[0].strip()  # Return the first code block
    return None

# Function to process JSONL file
def update_completions_jsonl(input_file, output_file):
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for line in tqdm(infile, desc="Processing tasks"):
            task = json.loads(line.strip())  # Load each JSON object
            task_id = task.get("task_id", "unknown")
            completion = task.get("completion", "")

            # Process the completion to extract the main code block
            code_block = process_completion(completion)

            if code_block:
                # Update completion with the code block
                task["completion"] = code_block

            # Write the updated task back to the output file
            outfile.write(json.dumps(task) + "\n")

# File paths
input_file = "Gemma_plain_prompts.jsonl"  # Replace with the path to your input file
output_file = "Gemma_plain_prompts2.jsonl"  # Replace with the path to save the updated file

# Process the file
update_completions_jsonl(input_file, output_file)


Processing tasks: 164it [00:00, 32265.39it/s]


In [28]:
import os
os.getcwd()

'/home/ubuntu/project/human-eval'

#### Few Shot

In [4]:
import re
import json
from tqdm import tqdm
def generate_one_completion4(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    content1 = """You need to do a code generation task. \n"""
    content2 = """\n Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."""
    few_shots = """For example, for problem 'def return1():\n', you are supposed to response '    return 1'.
                For problem 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n   ### Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n     ###\n', 
                you are supposed to response '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n'. 
                For problem 'from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n    ### For a given list of input numbers, calculate Mean Absolute Deviation\n    around the mean of this dataset.\n    Mean Absolute Deviation is the average absolute difference between each\n    element and a centerpoint (mean in this case):\n    MAD = average | x - x_mean |\n    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n    1.0\n    ###\n'', 
                you are supposed to response '    mean = sum(numbers) / len(numbers)\n    return sum(abs(x - mean) for x in numbers) / len(numbers)\n'"""
    content3 = """Write the code for the following task. \n"""

    messages = content1 + content2 + prompt + content3

    input_ids = tokenizer(messages, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **input_ids,
        max_new_tokens=512
    )

    response = tokenizer.decode(outputs[0])
    answer_part = response.replace(messages, "").strip()
    code_blocks = re.findall(r"```python(.*?)```", answer_part, re.DOTALL)

    return code_blocks

In [6]:
num_samples_per_task = 1

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion4(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Gemma_few_shot.jsonl", samples)

Processing tasks:   0%|                                                                                         | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                         | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.34s/it][A
Processing tasks:   1%|▍                                                                                | 1/164 [00:07<19:57,  7.35s/it][A
Generating samples for HumanEval/1:   0%|                                                                         | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/1: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.03s/it][A
Processing tasks:   1%|▉                                                                                | 2/164 [00:10<12:59,  4.81s/it][A
Generating samples for 

In [23]:
prompt = problems["HumanEval/1"]["prompt"]

content1 = """You need to do a code generation task. \n"""
content2 = """\n Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."""
few_shots = """For example, for problem 'def return1():\n', you are supposed to response '    return 1'.
            For problem 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n   ### Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n     ###\n', 
            you are supposed to response '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n'. 
            For problem 'from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n    ### For a given list of input numbers, calculate Mean Absolute Deviation\n    around the mean of this dataset.\n    Mean Absolute Deviation is the average absolute difference between each\n    element and a centerpoint (mean in this case):\n    MAD = average | x - x_mean |\n    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n    1.0\n    ###\n'', 
            you are supposed to response '    mean = sum(numbers) / len(numbers)\n    return sum(abs(x - mean) for x in numbers) / len(numbers)\n'"""
content3 = """Write the code for the following task. \n"""

messages = content1 + content2 + few_shots + content3 + prompt + "\n Now please write the code. \n"

input_ids = tokenizer(messages, return_tensors="pt").to(model.device)

outputs = model.generate(
    **input_ids,
    max_new_tokens=1024
)

response = tokenizer.decode(outputs[0])
answer_part = response.replace(messages, "").strip()
code_blocks = re.findall(r"```python(.*?)```", answer_part, re.DOTALL)


print(response)
print(answer_part)
print(code_blocks)

<bos>You need to do a code generation task. 

 Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts.For example, for problem 'def return1():
', you are supposed to response '    return 1'.
            For problem 'from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
   ### Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
     ###
', 
            you are supposed to response '    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False
'. 
            For problem 'f

In [24]:
print(response)

<bos>You need to do a code generation task. 

 Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts.For example, for problem 'def return1():
', you are supposed to response '    return 1'.
            For problem 'from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
   ### Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
     ###
', 
            you are supposed to response '    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False
'. 
            For problem 'f

In [20]:
print(prompt)

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



## Gemma-finetune

In [9]:
import torch

def generate_one_completion5(prompt: str, max_length: int = 512) -> str:
    """
    Generates a code completion for a given prompt using a T5 model.

    Args:
        prompt (str): The input prompt for code generation.
        model_name (str): The name of the pre-trained T5 model to use.
        max_length (int): The maximum length of the generated sequence.

    Returns:
        str: The generated code completion.
    """
    client = OpenAI(api_key="0",base_url="http://0.0.0.0:8000/v1")
    
    content1 = """Write the code for the following task. \n"""
    content2 = """\n Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."""

    messages = [
        {"role": "system", "content": "You are a helpful assistant specialized in code generation."},
        {"role": "user", "content": content1 + prompt + content2}
    ]

    result = client.chat.completions.create(messages=messages, model="google/gemma-7b-it")

    response = result.choices[0].message.content

    return response

In [10]:
from tqdm import tqdm
from human_eval.data import write_jsonl, read_problems
from openai import OpenAI


num_samples_per_task = 1
problems = read_problems()

samples = []
outer = tqdm(problems, desc="Processing tasks")
for task_id in outer:
    inner = tqdm(range(num_samples_per_task), desc=f"Generating samples for {task_id}", leave=False)
    for _ in inner:
        completion = generate_one_completion5(problems[task_id]["prompt"])
        samples.append(dict(task_id=task_id, completion=completion))
write_jsonl("Gemma_finetune.jsonl", samples)

Processing tasks:   0%|                                                                                                                 | 0/164 [00:00<?, ?it/s]
Generating samples for HumanEval/0:   0%|                                                                                                 | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/0: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.05s/it][A
Processing tasks:   1%|▋                                                                                                        | 1/164 [00:04<11:00,  4.05s/it][A
Generating samples for HumanEval/1:   0%|                                                                                                 | 0/1 [00:00<?, ?it/s][A
Generating samples for HumanEval/1: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.32s/it][A
Processing tasks:  

In [5]:
from openai import OpenAI
from human_eval.data import write_jsonl, read_problems
problems = read_problems()

In [6]:
client = OpenAI(api_key="0",base_url="http://0.0.0.0:8000/v1")

prompt = problems["HumanEval/0"]["prompt"]

content1 = """Write the code for the following task. \n"""
content2 = """\n Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."""

messages = [
    {"role": "system", "content": "You are a helpful assistant specialized in code generation."},
    {"role": "user", "content": content1 + prompt + content2}
]

result = client.chat.completions.create(messages=messages, model="google/gemma-7b-it")

response = result.choices[0].message.content

print(response)

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    """
    for i in range(len(numbers)):
        for j in range(i+1, len(numbers)):
            if abs(numbers[i] - numbers[j]) <= threshold:
                return True
    return False


In [8]:
print(prompt)

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



In [21]:
content1 = """Write the code for the following task. \n"""
content2 = """\n  Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts."""

prompt = "Write a function that returns the sum of the array."
messages = content1 + prompt + content2
input_ids = tokenizer(messages, return_tensors="pt").to(model.device)

outputs = model.generate(
    **input_ids,
    max_new_tokens=512
)

response = tokenizer.decode(outputs[0])
answer_part = response.replace(messages, "").strip()

code_blocks = re.findall(r"```python(.*?)```", answer_part, re.DOTALL)

print(response)
print(answer_part)
print(code_blocks)

<bos>Write the code for the following task. 
Write a function that returns the sum of the array.
  Do not include any explanations or any examples for the code writing. Just write the entire function once. Do not include any python format, just give plain texts.

```
Sum of the array
```

```python
def sum_array(arr):
    total = 0
    for num in arr:
        total += num
    return total
```<eos>
<bos>

```
Sum of the array
```

```python
def sum_array(arr):
    total = 0
    for num in arr:
        total += num
    return total
```<eos>
['\ndef sum_array(arr):\n    total = 0\n    for num in arr:\n        total += num\n    return total\n']
