In [11]:
!pip install -U transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# **Salesforce/codegen2-1B_P**

## Local Inference on GPU 
Model page: https://huggingface.co/Salesforce/codegen2-1B_P

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/Salesforce/codegen2-1B_P)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [12]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Salesforce/codegen2-1B_P", trust_remote_code=True)

Device set to use cuda:0


In [13]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print("Using device:", device)

Using device: cuda


Load Tokenizer and model


In [14]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen2-1B_P", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen2-1B_P", trust_remote_code=True)

In [21]:
import torch
from datasets import load_dataset
import subprocess, tempfile, os, sys
import pandas as pd

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen2-1B_P", trust_remote_code=True)

# Ensure model is fully loaded on the chosen device
model = AutoModelForCausalLM.from_pretrained(
    "Salesforce/codegen2-1B_P",
    trust_remote_code=True,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
).to(device)
model.eval()

Using device: cuda


`torch_dtype` is deprecated! Use `dtype` instead!


CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-15): 16 x CodeGenBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=2048, out_features=8192, bias=True)
          (fc_out): Linear(in_features=8192, out_features=2048, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2048, out_features=51200, bias=True)
)

# Load Dataset

**here I am using HumanEval dataset from openai , accessing it via huggingface** 

In [22]:

dataset = load_dataset("openai/openai_humaneval")
problems = dataset["test"]  # 164 tasks

print("Number of problems:", len(problems))
print("Example problem:\n", problems[0])

Number of problems: 164
Example problem:
 {'task_id': 'HumanEval/0', 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n', 'canonical_solution': '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n', 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9

# **Code generation**

In [23]:
def generate_code(prompt, temperature=0.8, top_p=0.95, max_tokens=256):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    output = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()


# **Run tests**

In [24]:
def run_tests(code, test_code):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".py", mode="w") as f:
        f.write(code + "\n\n")
        f.write(test_code + "\n")
        test_file = f.name

    try:
        result = subprocess.run([sys.executable, test_file],
                                capture_output=True, text=True, timeout=5)
        passed = (result.returncode == 0)
    except subprocess.TimeoutExpired:
        passed = False
    finally:
        os.remove(test_file)

    return passed


# **Evaluate using a subset , here 10 tests**

In [25]:
def evaluate_subset(n=10, temps=[0.2, 0.8]):
    results = []
    for i in range(n):
        problem = problems[i]   # <-- access dict row
        prompt = problem["prompt"]
        test_code = problem["test"]

        for temp in temps:
            code = generate_code(prompt, temperature=temp)
            passed = run_tests(code, test_code)
            results.append({
                "task_id": problem["task_id"],
                "temp": temp,
                "passed": passed,
                "code_snippet": code[:200]
            })
            print(f"[{problem['task_id']}] temp={temp}, passed={passed}")
    return results

print("Baseline performance - Salesforce/codegen2-1B_P")
results = evaluate_subset(n=10, temps=[0.2, 0.8])

  attn_weights = torch.where(causal_mask, attn_weights, mask_value)


[HumanEval/0] temp=0.2, passed=False
[HumanEval/0] temp=0.8, passed=False
[HumanEval/1] temp=0.2, passed=False
[HumanEval/1] temp=0.8, passed=False
[HumanEval/2] temp=0.2, passed=False
[HumanEval/2] temp=0.8, passed=False
[HumanEval/3] temp=0.2, passed=False
[HumanEval/3] temp=0.8, passed=False
[HumanEval/4] temp=0.2, passed=False
[HumanEval/4] temp=0.8, passed=False
[HumanEval/5] temp=0.2, passed=False
[HumanEval/5] temp=0.8, passed=False
[HumanEval/6] temp=0.2, passed=False
[HumanEval/6] temp=0.8, passed=False
[HumanEval/7] temp=0.2, passed=False
[HumanEval/7] temp=0.8, passed=False
[HumanEval/8] temp=0.2, passed=False
[HumanEval/8] temp=0.8, passed=False
[HumanEval/9] temp=0.2, passed=False
[HumanEval/9] temp=0.8, passed=False


In [None]:
df = pd.DataFrame(results)
print("\nSummary (pass rate by temperature):")
print(df.groupby("temp")["passed"].mean())

df.to_csv("humaneval_codegen2_results.csv", index=False)
print("\nSaved results to humaneval_codegen2_results.csv")