# Synthetic Data Generator

In [1]:
# Installs

!pip install -q requests

In [2]:
# Imports

import time
import random
import requests
import json
from prompt_context import PromptContext

In [3]:
# Constants

# Number of accepted samples write_dataset() writes per call.
NUM_SAMPLES = 10_000

# JSONL file that accepted samples are appended to (one JSON object per line).
OUTPUT_FILE = "dataset.jsonl"

# Model identifier sent in the "model" field of every generation request.
MODEL_NAME = "qwen2.5-coder:7b-instruct"

# NOTE(review): nothing visible in this notebook reads OLLAMA_ENDPOINT, and
# qwen_generate() targets the /api/generate path rather than /api/chat.
# Confirm whether this constant is still needed.
OLLAMA_ENDPOINT = "http://localhost:11434/api/chat"

In [4]:
def _extract_json_object(text):
    """Parse the outermost ``{...}`` span of ``text`` as JSON.

    Returns the parsed object, or ``None`` when ``text`` has no opening brace
    or no closing brace after it. Raises ``json.JSONDecodeError`` when the
    span exists but is not valid JSON.
    """
    start = text.find('{')
    end = text.rfind('}')
    # Both find() and rfind() return -1 when nothing matches. Compare the raw
    # indices (before adding 1 for the slice) so a missing '}' is detected,
    # and also reject a '}' that only appears before the first '{'.
    if start == -1 or end < start:
        return None
    return json.loads(text[start:end + 1])


def qwen_generate(prompt, endpoint="http://localhost:11434/api/generate", timeout=300):
    """Send ``prompt`` to the generation endpoint and return the JSON object it produced.

    The request is streamed: every non-empty response line is parsed as JSON
    and the values of its ``'response'`` key are concatenated into the model's
    full text. Lines that are not valid JSON are ignored. The text between the
    first ``{`` and the last ``}`` of that full text is then parsed as JSON;
    anything the model wrote outside that span is discarded.

    Args:
        prompt: Prompt text sent in the ``"prompt"`` field of the request.
        endpoint: URL the request is POSTed to.
        timeout: Passed to ``requests.post`` as ``timeout`` (seconds) so a
            stalled server cannot block the caller forever.

    Returns:
        The parsed JSON object, or ``None`` on any failure (non-200 status,
        no ``{...}`` span in the output, invalid JSON, or a request error).
        Failures are reported with a ``[!]`` message on stdout and are never
        raised to the caller.
    """
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": True
    }

    try:
        # The context manager releases the streamed connection on every exit path.
        with requests.post(endpoint, json=payload, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"[!] API error {response.status_code}")
                return None

            chunks = []
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    # Skip stream lines that are not JSON instead of failing the sample.
                    continue
                if 'response' in obj:
                    chunks.append(obj['response'])

        result = _extract_json_object(''.join(chunks))
        if result is None:
            print("[!] JSON boundaries not found.")
        return result

    except Exception as e:
        # Best-effort contract: callers treat None as "skip this sample".
        print(f"[!] Exception: {e}")
        return None

In [5]:
# Module-level prompt builder; write_dataset() calls its build_prompt() once per attempt.
prompt_ctx = PromptContext()

def write_dataset(output_file: str, num_samples=None, delay=0.5, max_consecutive_failures=50):
    """Generate samples with the model and append them to ``output_file`` as JSON lines.

    Each attempt builds a prompt for a random issue count between 0 and 10
    (inclusive), asks ``qwen_generate`` for a result, and keeps it only if it
    contains both a ``"code"`` and a ``"feedback"`` key. The file is opened in
    append mode and the sample counter starts at zero on every call, so
    existing lines in the file are kept and do not count toward the target.

    Args:
        output_file: Path of the JSONL file to append to.
        num_samples: Number of accepted samples to write; defaults to the
            module-level ``NUM_SAMPLES``.
        delay: Seconds to sleep after every attempt, successful or not.
        max_consecutive_failures: Stop after this many invalid responses in a
            row, so an unreachable or broken backend cannot keep the loop
            running forever. ``None`` disables the limit.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    target = NUM_SAMPLES if num_samples is None else num_samples
    written = 0
    failures_in_a_row = 0

    # Line buffering (buffering=1) flushes each sample as soon as it is
    # written, so an interrupted run keeps everything produced so far.
    with open(output_file, 'a', encoding='utf-8', buffering=1) as f:
        while written < target:
            n_issues = random.randint(0, 10)
            prompt = prompt_ctx.build_prompt(n_issues)
            result = qwen_generate(prompt)

            if not result or "code" not in result or "feedback" not in result:
                failures_in_a_row += 1
                print("[!] Invalid response. Skipping.")
                if (max_consecutive_failures is not None
                        and failures_in_a_row >= max_consecutive_failures):
                    print(f"[!] Giving up after {failures_in_a_row} consecutive "
                          f"failures; {written}/{target} samples saved.")
                    return
                # Throttle retries too; otherwise a dead backend is hit in a tight loop.
                time.sleep(delay)
                continue

            failures_in_a_row = 0
            f.write(json.dumps(result) + '\n')
            written += 1

            if written % 50 == 0:
                print(f"[+] {written}/{target} samples saved.")

            time.sleep(delay)

    print("[✅] Dataset generation complete!")

# Start generation; accepted samples are appended to OUTPUT_FILE.
write_dataset(OUTPUT_FILE)

[!] Exception: Expecting ',' delimiter: line 2 column 41 (char 42)
[!] Invalid response. Skipping.
[!] Exception: Expecting ',' delimiter: line 2 column 38 (char 39)
[!] Invalid response. Skipping.
[!] Exception: Expecting ',' delimiter: line 2 column 43 (char 44)
[!] Invalid response. Skipping.


KeyboardInterrupt: 