## Categorizing entries into simple, medium, and complex categories

In [11]:
import re

import yaml

# Load the already sampled 100 entries
with open("sampled_100_entries.yml", "r", encoding="utf-8") as f:
    # An empty YAML document loads as None; treat it as "no entries".
    sampled_data = yaml.safe_load(f) or []


# Prepare YAML block scalar formatting
class LiteralStr(str):
    """Marker subclass: values of this type are dumped in YAML literal block style."""


def str_presenter(dumper, data):
    """Represent `data` as a YAML string scalar using literal block style (`|`)."""
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')


yaml.add_representer(LiteralStr, str_presenter)

# Matches the two escape sequences that are undone below:
# backslash + "n" and backslash + backslash.
_ESCAPE_RE = re.compile(r"\\([n\\])")


def unescape_code(raw):
    """Undo the backslash escaping of a stored code string in a single pass.

    A backslash followed by "n" becomes a real newline and a doubled backslash
    becomes a single backslash. Scanning once (instead of chaining two
    `str.replace` calls) keeps an escaped backslash that happens to be followed
    by "n" as a literal backslash + "n" rather than turning it into a
    backslash followed by a newline.

    Args:
        raw: The escaped code text.

    Returns:
        The unescaped code text.
    """
    return _ESCAPE_RE.sub(lambda m: "\n" if m.group(1) == "n" else "\\", raw)


# Line-count thresholds (inclusive upper bounds) for each category
SIMPLE_MAX_LINES = 9
MEDIUM_MAX_LINES = 13

# Create buckets. A dict is used instead of three module-level lists so that
# the builtin `complex` is not shadowed for the rest of the kernel session.
buckets = {"simple": [], "medium": [], "complex": []}

# Classify by code line count
for entry in sampled_data:
    # `or ""` also covers entries whose "code" key is present but null.
    formatted_code = unescape_code(entry.get("code") or "")
    entry["code"] = LiteralStr(formatted_code)

    line_count = formatted_code.count("\n") + 1
    if line_count <= SIMPLE_MAX_LINES:
        category = "simple"
    elif line_count <= MEDIUM_MAX_LINES:
        category = "medium"
    else:
        category = "complex"
    buckets[category].append(entry)

# Save categorized datasets (simple.yml, medium.yml, complex.yml)
for category, entries in buckets.items():
    with open(f"{category}.yml", "w", encoding="utf-8") as f:
        yaml.dump(entries, f, sort_keys=False)

print("Categorized into simple.yml, medium.yml, and complex.yml.")

Categorized into simple.yml, medium.yml, and complex.yml.


### Number of entries in each file

In [13]:
import yaml

# Map each complexity label to the YAML file that stores its entries
files = {"Simple": "simple.yml", "Medium": "medium.yml", "Complex": "complex.yml"}

# Report how many entries each file contains
for label in files:
    with open(files[label], "r") as handle:
        data = yaml.safe_load(handle)
    print(f"{label} complexity: {len(data)} code blocks")

Simple complexity: 35 code blocks
Medium complexity: 35 code blocks
Complex complexity: 30 code blocks


In [20]:
import yaml

# Variation files to inspect, keyed by complexity label
files = dict([
    ("Simple", "simple@1/simple@1_variations.yml"),
    ("Medium", "medium@1/medium@1_variations.yml"),
    ("Complex", "complex@1/complex@1_variations.yml"),
])

# Load every file in turn and print the number of entries it holds
for label, path in files.items():
    with open(path, "r") as stream:
        data = yaml.safe_load(stream)
    entry_count = len(data)
    print(f"{label} codes: {entry_count} code blocks")

Simple codes: 35 code blocks
Medium codes: 35 code blocks
Complex codes: 30 code blocks


In [18]:
import yaml

# GPT variation files, one per complexity label
files = {
    "Simple": "simple@1_variations_gpt.yml",
    "Medium": "medium@1_variations_gpt.yml",
    "Complex": "complex@1_variations_gpt.yml",
}

# Print the entry count of each file
for label in files:
    source_path = files[label]
    with open(source_path, "r") as yaml_stream:
        data = yaml.safe_load(yaml_stream)
    print(f"{label} codes: {len(data)} code blocks")

Simple codes: 35 code blocks
Medium codes: 35 code blocks
Complex codes: 30 code blocks


In [7]:

import yaml

# Files to inspect; the medium/complex entries are currently disabled
files = {
    "Simple@1": "simple@1/simple@1_variations.yml",
    # "Medium@1": "start-coder-3b/medium@1/medium@1_variations.yml",
    # "Complex@1": "start-coder-3b/complex@1/complex@1_variations.yml"
}

# Show the number of entries stored in every listed file
for label, variations_path in files.items():
    with open(variations_path, "r") as reader:
        data = yaml.safe_load(reader)
    print(f"{label} codes: {len(data)} code blocks")

Simple@1 codes: 35 code blocks


### Validate a single code snippet

In [251]:
import yaml
import subprocess
import tempfile
from pathlib import Path

# === CONFIGURATION ===
yaml_file = "complex_variations_gpt.yml"  # Change to your file path


def validate_single_snippet(yaml_path):
    """Lint one interactively chosen snippet from a YAML file with Verilator.

    The YAML file must contain a list of mappings that each have a "code" key.
    The user is prompted for an index, the selected code is written to a
    temporary `.sv` file, and `verilator --lint-only --timing` is run on it.
    Progress, the verdict, and Verilator's stdout/stderr are printed.

    Early `return`s replace the former `exit(1)` calls: in an IPython/Jupyter
    kernel `exit` asks the kernel to shut down instead of raising, so the rest
    of the cell would keep executing after the error message.

    Args:
        yaml_path: Path of the YAML file holding the code snippets.

    Returns:
        True if Verilator exited with status 0, False if linting failed, timed
        out, or Verilator could not be started, and None if validation never
        started (missing file, bad format, or bad index input).
    """
    # === LOAD YAML FILE ===
    try:
        with open(yaml_path, "r", encoding="utf-8") as f:
            codes = yaml.safe_load(f)
    except FileNotFoundError:
        print(f"❌ YAML file not found: {yaml_path}")
        return None

    # The isinstance check keeps non-mapping items (e.g. plain strings, where
    # `"code" in item` would be a substring test) from passing validation.
    if not isinstance(codes, list) or not all(
        isinstance(item, dict) and "code" in item for item in codes
    ):
        print("❌ Invalid YAML format. Expected a list of items with 'code' fields.")
        return None

    # === PROMPT FOR INDEX ===
    max_index = len(codes) - 1
    print(f"📄 Loaded {len(codes)} code snippets from {yaml_path}")

    try:
        index = int(input(f"🔢 Enter index to validate (0–{max_index}): "))
    except ValueError:
        print("❌ Invalid input. Please enter an integer.")
        return None

    if not (0 <= index <= max_index):
        print(f"❌ Invalid index. Must be between 0 and {max_index}.")
        return None

    code = codes[index]["code"].strip()

    tmp_file_path = None
    try:
        # === WRITE TO TEMP FILE ===
        # delete=False so the file survives the `with` block and Verilator can
        # open it by name; it is removed in the `finally` clause below.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".sv", mode="w", encoding="utf-8"
        ) as tmp_file:
            tmp_file_path = Path(tmp_file.name)
            tmp_file.write(code)

        # === RUN VERILATOR ===
        print(f"\n🎯 Validating index {index} from {yaml_path}")
        try:
            result = subprocess.run(
                ["verilator", "--lint-only", "--timing", str(tmp_file_path)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=10
            )
        except subprocess.TimeoutExpired:
            print("❌ FAIL — Timeout while linting")
            return False
        except FileNotFoundError:
            print("❌ FAIL — Verilator not found. Is it installed and in PATH?")
            return False

        passed = result.returncode == 0
        print("✅ PASS" if passed else "❌ FAIL")

        # errors="replace" keeps undecodable bytes in tool output from raising.
        if result.stdout:
            print("\nSTDOUT:")
            print(result.stdout.decode(errors="replace").strip())
        if result.stderr:
            print("\nSTDERR:")
            print(result.stderr.decode(errors="replace").strip())
        return passed
    finally:
        # Clean up temp file, including when writing or linting raised.
        if tmp_file_path is not None:
            tmp_file_path.unlink(missing_ok=True)


passed = validate_single_snippet(yaml_file)

📄 Loaded 30 code snippets from complex_variations_gpt.yml

🎯 Validating index 29 from complex_variations_gpt.yml
✅ PASS

STDOUT:
- V e r i l a t i o n   R e p o r t: Verilator 5.036 2025-04-27 rev UNKNOWN.REV
- Verilator: Built from 0.027 MB sources in 2 modules, into 0.008 MB in 4 C++ files needing 0.000 MB
- Verilator: Walltime 0.013 s (elab=0.002, cvt=0.003, bld=0.000); cpu 0.006 s on 1 threads


In [252]:
import yaml
import subprocess
import tempfile
from pathlib import Path

# === CONFIGURATION ===
yaml_file = "complex_variations_gpt.yml"  # Change to your file path


def validate_single_snippet(yaml_path):
    """Lint one interactively chosen snippet from a YAML file with Verilator.

    The YAML file must contain a list of mappings that each have a "code" key.
    The user is prompted for an index, the selected code is written to a
    temporary `.sv` file, and `verilator --lint-only --timing` is run on it.
    Progress, the verdict, and Verilator's stdout/stderr are printed.

    Early `return`s replace the former `exit(1)` calls: in an IPython/Jupyter
    kernel `exit` asks the kernel to shut down instead of raising, so the rest
    of the cell would keep executing after the error message.

    Args:
        yaml_path: Path of the YAML file holding the code snippets.

    Returns:
        True if Verilator exited with status 0, False if linting failed, timed
        out, or Verilator could not be started, and None if validation never
        started (missing file, bad format, or bad index input).
    """
    # === LOAD YAML FILE ===
    try:
        with open(yaml_path, "r", encoding="utf-8") as f:
            codes = yaml.safe_load(f)
    except FileNotFoundError:
        print(f"❌ YAML file not found: {yaml_path}")
        return None

    # The isinstance check keeps non-mapping items (e.g. plain strings, where
    # `"code" in item` would be a substring test) from passing validation.
    if not isinstance(codes, list) or not all(
        isinstance(item, dict) and "code" in item for item in codes
    ):
        print("❌ Invalid YAML format. Expected a list of items with 'code' fields.")
        return None

    # === PROMPT FOR INDEX ===
    max_index = len(codes) - 1
    print(f"📄 Loaded {len(codes)} code snippets from {yaml_path}")

    try:
        index = int(input(f"🔢 Enter index to validate (0–{max_index}): "))
    except ValueError:
        print("❌ Invalid input. Please enter an integer.")
        return None

    if not (0 <= index <= max_index):
        print(f"❌ Invalid index. Must be between 0 and {max_index}.")
        return None

    code = codes[index]["code"].strip()

    tmp_file_path = None
    try:
        # === WRITE TO TEMP FILE ===
        # delete=False so the file survives the `with` block and Verilator can
        # open it by name; it is removed in the `finally` clause below.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".sv", mode="w", encoding="utf-8"
        ) as tmp_file:
            tmp_file_path = Path(tmp_file.name)
            tmp_file.write(code)

        # === RUN VERILATOR ===
        print(f"\n🎯 Validating index {index} from {yaml_path}")
        try:
            result = subprocess.run(
                ["verilator", "--lint-only", "--timing", str(tmp_file_path)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=10
            )
        except subprocess.TimeoutExpired:
            print("❌ FAIL — Timeout while linting")
            return False
        except FileNotFoundError:
            print("❌ FAIL — Verilator not found. Is it installed and in PATH?")
            return False

        passed = result.returncode == 0
        print("✅ PASS" if passed else "❌ FAIL")

        # errors="replace" keeps undecodable bytes in tool output from raising.
        if result.stdout:
            print("\nSTDOUT:")
            print(result.stdout.decode(errors="replace").strip())
        if result.stderr:
            print("\nSTDERR:")
            print(result.stderr.decode(errors="replace").strip())
        return passed
    finally:
        # Clean up temp file, including when writing or linting raised.
        if tmp_file_path is not None:
            tmp_file_path.unlink(missing_ok=True)


passed = validate_single_snippet(yaml_file)

📄 Loaded 30 code snippets from complex_variations_gpt.yml

🎯 Validating index 0 from complex_variations_gpt.yml
✅ PASS

STDOUT:
- V e r i l a t i o n   R e p o r t: Verilator 5.036 2025-04-27 rev UNKNOWN.REV
- Verilator: Built from 0.028 MB sources in 3 modules, into 0.018 MB in 6 C++ files needing 0.000 MB
- Verilator: Walltime 0.005 s (elab=0.000, cvt=0.003, bld=0.000); cpu 0.005 s on 1 threads


## Validating GPT codes

In [253]:
import yaml
import subprocess
import tempfile
from pathlib import Path

# === CONFIG ===
input_files = [
    "complex_variations_gpt.yml",
    # "medium@1_variations_gpt.yml",
    # "complex@1_variations_gpt.yml"
]

# One dict per linted snippet with keys: file, index, passed, stdout, stderr
results = []

for file in input_files:
    with open(file, "r", encoding="utf-8") as f:
        # An empty YAML document loads as None; treat it as "no snippets".
        codes = yaml.safe_load(f) or []

    for i, item in enumerate(codes):
        code = item["code"]
        tmp_file_path = None
        try:
            # Create a temporary .sv file for each code block. delete=False so
            # Verilator can open it by name after the `with` block closes it.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=".sv", mode="w", encoding="utf-8"
            ) as tmp_file:
                tmp_file_path = Path(tmp_file.name)
                tmp_file.write(code)

            # Run verilator lint check
            try:
                result = subprocess.run(
                    ["verilator", "--lint-only", "--timing", str(tmp_file_path)],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    timeout=10
                )
            except subprocess.TimeoutExpired:
                outcome = {
                    "passed": False,
                    "stdout": "",
                    "stderr": "❌ Timeout while linting"
                }
            except FileNotFoundError:
                # Raised when the `verilator` executable cannot be started;
                # recorded as a failure instead of aborting the whole run.
                outcome = {
                    "passed": False,
                    "stdout": "",
                    "stderr": "❌ Verilator not found. Is it installed and in PATH?"
                }
            else:
                outcome = {
                    "passed": result.returncode == 0,
                    # errors="replace" keeps undecodable output from raising.
                    "stdout": result.stdout.decode(errors="replace"),
                    "stderr": result.stderr.decode(errors="replace")
                }
        finally:
            # Clean up temp file even if writing or linting raised.
            if tmp_file_path is not None:
                tmp_file_path.unlink(missing_ok=True)

        results.append({"file": file, "index": i, **outcome})

# === REPORT ===
total = len(results)
passed = sum(1 for r in results if r["passed"])
failed = total - passed

print("\n✅ Verilator Validation Summary:")
print(f"Total Files Checked: {total}")
print(f"Passed: {passed}")
print(f"Failed: {failed}\n")

for r in results:
    status = "✅ PASS" if r["passed"] else "❌ FAIL"
    print(f"{status} — {r['file']} [index {r['index']}]")
    if not r["passed"]:
        # Only failing snippets get their captured stderr printed.
        print(r["stderr"].strip())
        print()


✅ Verilator Validation Summary:
Total Files Checked: 30
Passed: 26
Failed: 4

✅ PASS — complex_variations_gpt.yml [index 0]
✅ PASS — complex_variations_gpt.yml [index 1]
✅ PASS — complex_variations_gpt.yml [index 2]
✅ PASS — complex_variations_gpt.yml [index 3]
✅ PASS — complex_variations_gpt.yml [index 4]
❌ FAIL — complex_variations_gpt.yml [index 5]
                                                                                           : ... note: In instance 'mealy'
   15 |         if (w inside {wlist[0], wlist[1]})
      |               ^~~~~~
                      ... Use "/* verilator lint_off WIDTHEXPAND */" and lint_on around source to disable this message.
%Error-UNSUPPORTED: /var/folders/v6/jvlsn67j71g102qybf_hsvxr0000gn/T/tmp69t4l4qm.sv:15:28: Unsupported: RHS of ==? or !=? is fourstate but not a constant
                                                                                         : ... note: In instance 'mealy'
   15 |         if (w inside {wlist[0], wlist[1

## Batch inference (simple@5, starcoder2-3b)

In [None]:
import yaml
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch

# === CONFIG ===
code_file = "/tmp/simple_codes.yml"
prompt_file = "/tmp/simple_prompts.yml"
output_file = "/tmp/simple@5_variations.yml"
num_variations_per_prompt = 5

# === LOAD MODEL ===
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-3b")
model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder2-3b", torch_dtype=torch.float16)
print("✅ Using StarCoder")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

# === LOAD INPUT DATA ===
with open(code_file, "r", encoding="utf-8") as f:
    codes = yaml.safe_load(f)

with open(prompt_file, "r", encoding="utf-8") as f:
    prompts = yaml.safe_load(f)

assert len(codes) == len(prompts), "❌ Code and prompt counts do not match!"

# === BUILD AND EXPAND PROMPT BATCH ===
# Each prompt is repeated `num_variations_per_prompt` times; `source_indices`
# remembers which code/prompt pair every repeated prompt came from.
batch_prompts = []
source_indices = []

for i, (c, p) in enumerate(zip(codes, prompts)):
    prompt_text = f"### Base Verilog Code:\n{c['code'].strip()}\n### Variation Instruction:\n{p['prompt'].strip()}\n### Modified Code:\n"
    for _ in range(num_variations_per_prompt):
        batch_prompts.append(prompt_text)
        source_indices.append(i)

# === RUN BATCH INFERENCE ===
responses = generator(
    batch_prompts,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.8
)

# === EXTRACT VARIATIONS ===
outputs = []
counts = [0] * len(codes)

for i, resp in enumerate(responses):
    # When the pipeline is called with a list of prompts, each element of the
    # result is itself a list of candidate dicts (one per returned sequence),
    # so indexing it with 'generated_text' directly raises TypeError.
    candidate = resp[0] if isinstance(resp, list) else resp
    response_text = candidate['generated_text']
    # The generated text still contains the prompt; keep only what follows the
    # last "### Modified Code:" marker.
    variation = response_text.split("### Modified Code:\n")[-1].strip()
    outputs.append({"code": variation})
    src_idx = source_indices[i]
    counts[src_idx] += 1
    print(f"✅ [{src_idx+1}/{len(codes)}] Variation {counts[src_idx]}/{num_variations_per_prompt} generated")

# === SAVE OUTPUTS WITH BLOCK STYLE ===
def str_presenter(dumper, data):
    """Dump multi-line strings as YAML literal blocks, other strings normally."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)

# NOTE: this registers the presenter for every `str` dumped through yaml.dump
# for the rest of the kernel session, not only for this cell.
yaml.add_representer(str, str_presenter)

with open(output_file, "w", encoding="utf-8") as f:
    yaml.dump(outputs, f, sort_keys=False)

print(f"\n✅ All {len(outputs)} variations saved to {output_file}")

## Batch inference (medium@5, starcoder2-3b)

In [None]:
import yaml
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch

# === CONFIG ===
# This cell is the medium@5 run, so it must not reuse the simple@5 paths
# (doing so would regenerate and overwrite /tmp/simple@5_variations.yml).
# NOTE(review): file names assume the same /tmp/<level>_codes.yml and
# /tmp/<level>_prompts.yml naming used by the other inference cells — confirm.
code_file = "/tmp/medium_codes.yml"
prompt_file = "/tmp/medium_prompts.yml"
output_file = "/tmp/medium@5_variations.yml"
num_variations_per_prompt = 5

# === LOAD MODEL ===
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-3b")
model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder2-3b", torch_dtype=torch.float16)
print("✅ Using StarCoder")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

# === LOAD INPUT DATA ===
with open(code_file, "r", encoding="utf-8") as f:
    codes = yaml.safe_load(f)

with open(prompt_file, "r", encoding="utf-8") as f:
    prompts = yaml.safe_load(f)

assert len(codes) == len(prompts), "❌ Code and prompt counts do not match!"

# === BUILD AND EXPAND PROMPT BATCH ===
# Each prompt is repeated `num_variations_per_prompt` times; `source_indices`
# remembers which code/prompt pair every repeated prompt came from.
batch_prompts = []
source_indices = []

for i, (c, p) in enumerate(zip(codes, prompts)):
    prompt_text = f"### Base Verilog Code:\n{c['code'].strip()}\n### Variation Instruction:\n{p['prompt'].strip()}\n### Modified Code:\n"
    for _ in range(num_variations_per_prompt):
        batch_prompts.append(prompt_text)
        source_indices.append(i)

# === RUN BATCH INFERENCE ===
responses = generator(
    batch_prompts,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.8
)

# === EXTRACT VARIATIONS ===
outputs = []
counts = [0] * len(codes)

for i, resp in enumerate(responses):
    # When the pipeline is called with a list of prompts, each element of the
    # result is itself a list of candidate dicts (one per returned sequence),
    # so indexing it with 'generated_text' directly raises TypeError.
    candidate = resp[0] if isinstance(resp, list) else resp
    response_text = candidate['generated_text']
    # The generated text still contains the prompt; keep only what follows the
    # last "### Modified Code:" marker.
    variation = response_text.split("### Modified Code:\n")[-1].strip()
    outputs.append({"code": variation})
    src_idx = source_indices[i]
    counts[src_idx] += 1
    print(f"✅ [{src_idx+1}/{len(codes)}] Variation {counts[src_idx]}/{num_variations_per_prompt} generated")

# === SAVE OUTPUTS WITH BLOCK STYLE ===
def str_presenter(dumper, data):
    """Dump multi-line strings as YAML literal blocks, other strings normally."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)

# NOTE: this registers the presenter for every `str` dumped through yaml.dump
# for the rest of the kernel session, not only for this cell.
yaml.add_representer(str, str_presenter)

with open(output_file, "w", encoding="utf-8") as f:
    yaml.dump(outputs, f, sort_keys=False)

print(f"\n✅ All {len(outputs)} variations saved to {output_file}")

In [None]:
import yaml
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch

# === CONFIG ===
num_variations_per_prompt = 10
code_file = "/tmp/complex_codes.yml"
prompt_file = "/tmp/complex_prompts.yml"
output_file = "/tmp/complex@10_variations.yml"

# === LOAD MODEL ===
model_name = "deepseek-ai/deepseek-coder-6.7b-base"

# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint
# run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model = model.to("cuda")  # place the half-precision weights on the GPU

print("✅ Using DeepSeek Coder 6.7B Base")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

# === LOAD INPUT DATA ===
with open(code_file, "r") as code_stream:
    codes = yaml.safe_load(code_stream)

with open(prompt_file, "r") as prompt_stream:
    prompts = yaml.safe_load(prompt_stream)

assert len(codes) == len(prompts), "❌ Code and prompt counts do not match!"

# === BUILD PROMPT BATCH ===
# One prompt per (code, instruction) pair, ending with the marker after which
# the model's answer is expected.
batch_prompts = []
for c, p in zip(codes, prompts):
    batch_prompts.append(
        f"### Base Verilog Code:\n{c['code'].strip()}\n### Variation Instruction:\n{p['prompt'].strip()}\n### Modified Code:\n"
    )

# === RUN MULTIPLE VARIATIONS PER PROMPT ===
outputs = []
prompt_total = len(batch_prompts)

for prompt_number, prompt in enumerate(batch_prompts, start=1):
    # A single string prompt with num_return_sequences=N yields N candidates.
    candidates = generator(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.8,
        num_return_sequences=num_variations_per_prompt
    )
    for variation_number, candidate in enumerate(candidates, start=1):
        full_text = candidate['generated_text']
        # Keep only the text that follows the last "### Modified Code:" marker.
        pieces = full_text.split("### Modified Code:\n")
        outputs.append({"code": pieces[-1].strip()})
        print(f"✅ [{prompt_number}/{prompt_total}] Variation {variation_number}/{num_variations_per_prompt} generated")

# === SAVE OUTPUTS WITH BLOCK STYLE ===
def str_presenter(dumper, data):
    """Emit multi-line strings as YAML literal blocks; leave others in default style."""
    block_style = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)

yaml.add_representer(str, str_presenter)

with open(output_file, "w") as out_stream:
    yaml.dump(outputs, out_stream, sort_keys=False)

print(f"\n✅ All {len(outputs)} variations saved to {output_file}")