In [7]:
!pip install -U transformers



## Local Inference on GPU 
Model page: https://huggingface.co/Salesforce/codegen2-1B_P

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/Salesforce/codegen2-1B_P)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [8]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Salesforce/codegen2-1B_P", trust_remote_code=True)

Device set to use cuda:0


In [9]:
import torch
from datasets import load_dataset
import subprocess, tempfile, os, sys
import pandas as pd

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen2-1B_P", trust_remote_code=True)

# Ensure model is fully loaded on the chosen device
model = AutoModelForCausalLM.from_pretrained(
    "Salesforce/codegen2-1B_P",
    trust_remote_code=True,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
).to(device)
model.eval()

Using device: cuda


CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-15): 16 x CodeGenBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=2048, out_features=8192, bias=True)
          (fc_out): Linear(in_features=8192, out_features=2048, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2048, out_features=51200, bias=True)
)

In [11]:

dataset = load_dataset("openai/openai_humaneval")
problems = dataset["test"]  # 164 tasks

print("Number of problems:", len(problems))
print("Example problem:\n", problems[0])

Number of problems: 164
Example problem:
 {'task_id': 'HumanEval/0', 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n', 'canonical_solution': '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n', 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9

# Prompt Engineering (augmentation at inference time)

In [12]:
prompt_format_1= '''
# Write a correct Python function.
'''

In [13]:
prompt_format_2 = '''
Write a python function for above prompt. 
Function should be 100% accurate and should do the task asked in the prompts.
 Follow these rules: 
 1) Include type hints
 2) Handle edge cases 
 3) Return correct output
'''

In [None]:
prompt_format_3 = '''
Task Description:
{task_description}
Your goal is to write a Python function that solves the task accurately.
Follow these detailed instructions carefully:
1) Include type hints for function parameters and return type.
2) Handle edge cases, such as empty inputs, negative numbers, or special characters.
3) Return the correct output exactly as expected.
4) Use clear variable names and maintain clean code style.
5) Include an example showing input and expected output.

Example:
Task: Write a function add_numbers(a, b) that returns the sum of two numbers.
Input: a = 3, b = 5
Expected Output: 8
Function:
def add_numbers(a: int, b: int) -> int:
    return a + b

Now, write the solution for the task above:
def
'''


In [14]:
prompt_styles = [
    # 1. Minimal (baseline)
    lambda task: task,
    
    # 2. Instruction style
    lambda task: f"# Python 3\n# Task: {task} {prompt_format_1}",
    
    # 3. Step-by-step / clarify inputs
    lambda task: f"# Python 3\n# Task: {task} {prompt_format_2}"
]


# Measure Pass@K

For each test case, we can generate multiple samples and try pass @1 , pass@2 , pass@10 etc

In [15]:
def generate_multiple(prompt, n_samples=5, temperature=0.8, top_p=0.95, max_tokens=256):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        num_return_sequences=n_samples,
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = [tokenizer.decode(o, skip_special_tokens=True)[len(prompt):].strip()
               for o in outputs]
    return decoded



In [16]:
def run_tests(code, test_code):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".py", mode="w") as f:
        f.write(code + "\n\n")
        f.write(test_code + "\n")
        test_file = f.name

    try:
        result = subprocess.run([sys.executable, test_file],
                                capture_output=True, text=True, timeout=5)
        passed = (result.returncode == 0)
    except subprocess.TimeoutExpired:
        passed = False
    finally:
        os.remove(test_file)

    return passed


In [17]:
def evaluate_humaneval(n=10, n_samples=5, temps=[0.2,0.5 ,  0.8]):
    results = []
    for i in range(n):
        problem = problems[i]
        task_id = problem["task_id"]
        test_code = problem["test"]
        base_prompt = problem["prompt"]

        for style_idx, style_fn in enumerate(prompt_styles):
            prompt = style_fn(base_prompt)
            for temp in temps:
                codes = generate_multiple(prompt, n_samples=n_samples, temperature=temp)
                # Compute pass@1, pass@k
                passes = [run_tests(c, test_code) for c in codes]
                pass_at_1 = passes[0]
                pass_at_k = {k: any(passes[:k]) for k in [5, 10]}
                results.append({
                    "task_id": task_id,
                    "prompt_style": style_idx,
                    "temperature": temp,
                    "pass@1": pass_at_1,
                    "pass@5": pass_at_k[5],
                    "pass@10": pass_at_k[10],
                    "codes": codes[:3]  # preview first 3 samples
                })
                print(f"[{task_id}] style={style_idx} temp={temp} pass@1={pass_at_1} pass@5={pass_at_k[5]} pass@10={pass_at_k[10]}")
    return results



# Run evaluation

In [18]:
print("Evaluation results for Salesforce/codegen2-1B_P after prompt optimization ")
results = evaluate_humaneval(n=10, n_samples=5, temps=[0.2, 0.8])

Evaluation results for Salesforce/codegen2-1B_P after prompt optimization 


  attn_weights = torch.where(causal_mask, attn_weights, mask_value)


[HumanEval/0] style=0 temp=0.2 pass@1=False pass@5=False pass@10=False
[HumanEval/0] style=0 temp=0.8 pass@1=False pass@5=False pass@10=False
[HumanEval/0] style=1 temp=0.2 pass@1=True pass@5=True pass@10=True
[HumanEval/0] style=1 temp=0.8 pass@1=False pass@5=True pass@10=True
[HumanEval/0] style=2 temp=0.2 pass@1=False pass@5=False pass@10=False
[HumanEval/0] style=2 temp=0.8 pass@1=False pass@5=False pass@10=False
[HumanEval/1] style=0 temp=0.2 pass@1=True pass@5=True pass@10=True
[HumanEval/1] style=0 temp=0.8 pass@1=True pass@5=True pass@10=True
[HumanEval/1] style=1 temp=0.2 pass@1=False pass@5=True pass@10=True
[HumanEval/1] style=1 temp=0.8 pass@1=True pass@5=True pass@10=True
[HumanEval/1] style=2 temp=0.2 pass@1=False pass@5=False pass@10=False
[HumanEval/1] style=2 temp=0.8 pass@1=False pass@5=False pass@10=False
[HumanEval/2] style=0 temp=0.2 pass@1=False pass@5=False pass@10=False
[HumanEval/2] style=0 temp=0.8 pass@1=False pass@5=False pass@10=False
[HumanEval/2] style=1 

# Summarize results

In [None]:
df = pd.DataFrame(results)
summary = df.groupby(["prompt_style", "temperature"])[["pass@1", "pass@5", "pass@10"]].mean()
print("\nSummary table (mean pass@k by style and temperature):")
print(summary)

df.to_csv("humaneval_codegen2_passk_results_with_prompting.csv", index=False)
print("\nSaved detailed results to humaneval_codegen2_passk_results.csv")

# Data Augmentation

# Hyperparameter Tuning

**Try systematically:**

temperature ∈ {0.2, 0.6, 0.8, 1.0} 

top_p ∈ {0.9, 0.95}

num_return_sequences ∈ {1, 20, 100}