# Synthetic Data Generator

In [2]:
# Installs

%pip install requests

Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 KB[0m [31m958.7 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting charset_normalizer<4,>=2
  Using cached charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (149 kB)
Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.8/129.8 KB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting idna<4,>=2.5
  Using cached idna-3.10-py3-none-any.whl (70 kB)
Collecting certifi>=2017.4.17
  Downloading certifi-2025.7.9-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.2/159.2 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: urllib3, idna, charset_normalizer, certifi, requests
Successfully insta

In [5]:
# Imports

import requests
import json
import random
import time

In [1]:
# Constants

# Target number of samples the generation loop writes before it stops.
NUM_SAMPLES: int = 10

# Destination file; the generation loop writes one JSON object per line.
OUTPUT_FILE: str = "dataset.jsonl"

# Model tag placed in the "model" field of the request payload.
MODEL_NAME: str = "qwen2.5-coder:7b-instruct"

# NOTE(review): no visible cell references this constant -- qwen_generate posts
# to ".../api/generate" rather than ".../api/chat". Confirm which endpoint is
# intended before wiring this in, since the two may expect different payloads.
OLLAMA_ENDPOINT: str = "http://localhost:11434/api/chat"

In [2]:
# Metric names the model may use as keys of the "feedback" object, in the
# order they are listed in the prompt.
_METRIC_NAMES = (
    "Correctness_SyntaxError",
    "Correctness_UndefinedVariable",
    "Correctness_TypeMismatch",
    "Correctness_LogicBug",
    "Correctness_UnreachableCode",
    "Correctness_IncorrectReturnType",
    "Correctness_NullDereference",
    "Correctness_IncorrectLoopBounds",
    "Correctness_VariableShadowing",
    "Correctness_IntegerOverflow",
    "Correctness_ThreadSafety",
    "Correctness_NonDeterminism",
    "Quality_MagicNumbers",
    "Quality_DeadCode",
    "Quality_UnusedVariable",
    "Quality_MethodLength",
    "Quality_LineLength",
    "Quality_RedundantOperations",
    "Quality_UnnecessaryObjectCreation",
    "Quality_InefficientDataStructure",
    "Quality_BoxingUnboxing",
    "Quality_StringConcatenationInLoop",
    "Quality_CodeDuplication",
    "Quality_CommentsQuality",
    "Quality_JavadocMissing",
    "Quality_OrderingInClass",
    "Quality_NamingConventions",
    "Quality_PoorSpacing",
    "Quality_Indentation",
    "Quality_StaticMethodOveruse",
    "Quality_SingletonMisuse",
    "Quality_AntiPatternDetected",
    "OOP_SingleResponsibility",
    "OOP_OpenClosedPrinciple",
    "OOP_LiskovViolation",
    "OOP_InterfaceSegregation",
    "OOP_CompositionPreferred",
    "OOP_InheritanceOveruse",
    "OOP_AbstractionLeak",
    "OOP_OverriddenEqualsHashcode",
    "OOP_Encapsulation",
    "OOP_PolymorphismMissing",
    "OOP_TightCoupling",
    "OOP_LowCohesion",
    "OOP_CircularDependency",
    "OOP_MissingFactoryPattern",
    "OOP_ControllerLogicLeak",
    "Security_HardcodedCredentials",
    "Security_SensitiveLogging",
    "Security_ExceptionSwallowing",
    "Security_ExceptionHandling",
    "Security_InjectionRisk",
    "Security_MemoryLeakRisk",
    "Security_ResourceManagement",
    "Security_NonFinalStatic",
    "Testability_NoTestsDetected",
    "Testability_HardToMock",
    "Testability_NoDependencyInjection",
    "Performance_InefficientDataStructure",
    "Performance_RedundantOperations",
    "Performance_BoxingUnboxing",
    "Performance_StringConcatenationInLoop",
    "Performance_MemoryLeakRisk",
    "Performance_LoopToStream",
)


def build_prompt(n_issues: int) -> str:
    """Build the instruction prompt asking for one synthetic Java sample.

    The prompt asks for a Java snippet containing exactly ``n_issues`` issues
    and a JSON object with a ``code`` string and a ``feedback`` mapping whose
    keys are metric names and whose values are lists of ``{line, feedback}``
    entries.

    Args:
        n_issues: Number of issues the generated snippet should contain.

    Returns:
        The full prompt text, ending with the bulleted list of metric names.
    """
    # Keep the sentence grammatical for the single-issue case.
    issue_word = "issue" if n_issues == 1 else "issues"
    metric_lines = "\n".join(f"- {name}" for name in _METRIC_NAMES)

    # The example key uses a name from the metric list so the model is not
    # steered towards keys outside that list, and the code fence is closed so
    # the instructions that follow are not read as part of the example.
    return f"""You are a Java static code analyzer.

Generate a synthetic Java code snippet that contains exactly {n_issues} code {issue_word}, randomly sampled from the categories below. For each issue, also output a structured JSON feedback object, where each key is a metric and the value is a list of {{line, feedback}} entries.

Only output valid JSON in this format:

```json
{{
  "code": "public class Example {{ ... }}",
  "feedback": {{
    "Quality_MagicNumbers": [{{ "line": 3, "feedback": "Avoid magic numbers." }}]
  }}
}}
```

Only include the feedback for issues actually present in the code.
Do not include any explanation or extra text. Just the JSON object.

Available metrics include:

{metric_lines}
"""

In [7]:
def _extract_json_object(text):
    """Decode the span of ``text`` from its first '{' to its last '}'.

    Returns the decoded value, or ``None`` when ``text`` has no opening brace
    followed by a closing brace. Raises ``json.JSONDecodeError`` when the span
    exists but is not valid JSON.
    """
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end < start:
        return None
    return json.loads(text[start:end + 1])


def qwen_generate(prompt, endpoint="http://localhost:11434/api/generate", timeout=(10, 300)):
    """Send ``prompt`` to the model and return the JSON object found in its reply.

    The request is streamed: every non-empty response line is decoded as JSON
    and the ``response`` fields are concatenated into the full reply. The
    reply is then searched for the outermost ``{...}`` span, which is parsed
    and returned.

    Args:
        prompt: Prompt text placed in the ``prompt`` field of the payload.
        endpoint: URL the payload is POSTed to.
        timeout: Passed through to ``requests.post``; a ``(connect, read)``
            tuple in seconds. Without a timeout the call can block
            indefinitely when the server never answers.

    Returns:
        The parsed JSON object, or ``None`` on a non-200 status, when no JSON
        object can be located in the reply, or when any exception is raised
        (the failure is printed rather than propagated).
    """
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": True
    }

    try:
        print("querying")
        # The context manager releases the connection even on early return.
        with requests.post(endpoint, json=payload, stream=True, timeout=timeout) as response:
            print("done")
            if response.status_code != 200:
                print(f"[!] API error {response.status_code}")
                return None

            pieces = []
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except json.JSONDecodeError:
                    # Skip lines that are not JSON instead of aborting the stream.
                    continue
                if isinstance(chunk, dict) and 'response' in chunk:
                    pieces.append(chunk['response'])

        parsed = _extract_json_object(''.join(pieces))
        if parsed is None:
            print("[!] JSON boundaries not found.")
        return parsed

    except Exception as e:
        # Deliberate best-effort: callers treat None as "try again".
        # KeyboardInterrupt is not an Exception subclass and still propagates.
        print(f"[!] Exception: {e}")
        return None

In [8]:
# Upper bound on model calls so an unreachable or misbehaving server cannot
# keep the loop spinning forever.
MAX_ATTEMPTS = NUM_SAMPLES * 5
# Report roughly ten times per run; never less often than once per sample
# batch, and never with a zero interval.
PROGRESS_EVERY = max(1, NUM_SAMPLES // 10)
# Pause between model calls, in seconds.
REQUEST_DELAY_S = 0.5

written = 0
attempts = 0
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    while written < NUM_SAMPLES and attempts < MAX_ATTEMPTS:
        attempts += 1
        n_issues = random.randint(0, 5)
        prompt = build_prompt(n_issues)
        result = qwen_generate(prompt)

        if not result or "code" not in result or "feedback" not in result:
            print("[!] Invalid response. Skipping.")
            # Back off on failures too, otherwise a dead server is hammered
            # in a tight loop.
            time.sleep(REQUEST_DELAY_S)
            continue

        f.write(json.dumps(result) + '\n')
        # Flush per sample so an interrupted run keeps what was generated.
        f.flush()
        written += 1

        if written % PROGRESS_EVERY == 0 or written == NUM_SAMPLES:
            print(f"[+] {written}/{NUM_SAMPLES} samples saved.")

        time.sleep(REQUEST_DELAY_S)  # delay to avoid overload

if written == NUM_SAMPLES:
    print("[✅] Dataset generation complete!")
else:
    print(f"[!] Gave up after {attempts} attempts: {written}/{NUM_SAMPLES} samples saved.")

uerying


KeyboardInterrupt: 