In [None]:
import requests
import json
import time
import uuid
import math
import itertools
from datetime import timedelta
from pathlib import Path
from typing import Dict, Generator, List, Tuple, Union

In [None]:
class ModelClient:
    """Small HTTP client for a chat-completions style endpoint.

    The request URL is assembled from ``base_url`` and ``provider``. Every call
    to :meth:`start` sends the fixed system prompt stored on the instance
    followed by the caller's prompt as the user message.
    """

    def __init__(
        self,
        # NOTE(review): a credential is committed here as the default value.
        # It is kept so existing ``ModelClient()`` calls keep working, but it
        # should be rotated and supplied by the caller instead.
        api_key: str = "tpsg-MNvTQUAqUL84o4THLV1395IqTBIZHJJ",
        model: str = "gpt-4o-mini-2024-07-18",
        base_url: str = "https://api.tapsage.com",
        provider: str = "openai_chat_completion",
        max_tokens: int = 1024,
        temperature: float = 0.7,
    ):
        """Store request settings and build the endpoint URL and headers.

        Args:
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            model: Model identifier placed in the request payload.
            base_url: Service root; a trailing ``/`` is tolerated.
            provider: Path segment inserted into the wrapper URL.
            max_tokens: ``max_tokens`` value of the request payload.
            temperature: ``temperature`` value of the request payload.
        """
        # Strip trailing slashes so "https://host/" does not yield "//api/...".
        root = base_url.rstrip("/")
        self.endpoint = f"{root}/api/v1/wrapper/{provider}/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.system_prompt = (
            "ÿ¥ŸÖÿß €å⁄© ŸÖÿ™ÿÆÿµÿµ ŸÖÿ¨ÿ±ÿ® ÿØÿ± ÿ≠Ÿàÿ≤Ÿá‚ÄåŸáÿß€å ⁄Øÿ±ÿØÿ¥⁄Øÿ±€åÿå ÿßŸÇŸÑ€åŸÖ‚Äåÿ¥ŸÜÿßÿ≥€å Ÿà ÿ¨ÿ∫ÿ±ÿßŸÅ€åÿß€å ÿß€åÿ±ÿßŸÜ Ÿáÿ≥ÿ™€åÿØ. "
            "ÿ™ŸÖÿßŸÖ Ÿæÿßÿ≥ÿÆ‚ÄåŸáÿß€å ÿ¥ŸÖÿß ÿ®ÿß€åÿØ ŸÅŸÇÿ∑ ÿ®Ÿá ÿ≤ÿ®ÿßŸÜ ŸÅÿßÿ±ÿ≥€å ŸÖÿπ€åÿßÿ±ÿå ÿØŸÇ€åŸÇÿå ÿ±ŸàÿßŸÜ Ÿà ŸÖÿ∑ÿßÿ®ŸÇ ÿ®ÿß ÿßÿ≥ÿ™ÿßŸÜÿØÿßÿ±ÿØŸáÿß€å ŸÜ⁄Øÿßÿ±ÿ¥ ÿπŸÑŸÖ€å Ÿà ÿßÿ∑ŸÑÿßÿπ‚Äåÿ±ÿ≥ÿßŸÜ€å ÿßÿ±ÿßÿ¶Ÿá ÿ¥ŸàŸÜÿØ. "
            "ÿ®Ÿá Ÿá€å⁄Ü Ÿàÿ¨Ÿá ÿØÿ± ŸÖÿ™ŸÜ ÿ™ŸàŸÑ€åÿØ€å ÿÆŸàÿØ ÿßÿ≤ ÿπÿ®ÿßÿ±ÿßÿ™ Ÿà ⁄©ŸÑŸÖÿßÿ™ ÿ≤ÿ®ÿßŸÜ ÿØ€å⁄Øÿ±€å ÿ®Ÿá ÿ∫€åÿ± ÿßÿ≤ ŸÅÿßÿ±ÿ≥€å ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÜ⁄©ŸÜ€åÿØ."
        )

    @staticmethod
    def _extract_text(data: Dict) -> str:
        """Return the stripped text of the first choice in a decoded response.

        Raises:
            ValueError: If ``data`` lacks ``choices[0].message.content`` or the
                content is not a string (for example ``None``).
        """
        try:
            content = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as err:
            raise ValueError(f"unexpected response shape: {err!r}") from err
        if not isinstance(content, str):
            raise ValueError("response contained no text content")
        return content.strip()

    def start(self, prompt: str) -> str:
        """Send ``prompt`` as the user message and return the reply text.

        Raises:
            requests.HTTPError: On a non-success HTTP status
                (via ``raise_for_status``).
            ValueError: If the response body has no usable text content.
        """
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": prompt},
        ]
        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }
        resp = requests.post(self.endpoint, json=payload, headers=self.headers, timeout=60)
        resp.raise_for_status()
        return self._extract_text(resp.json())

In [None]:
# Client built with all default settings; passed to the generators below.
assistant = ModelClient()

In [None]:
class DataGenerator:
    """Generate one paragraph of text per structured record of each province.

    Every ``*.json`` file in ``data_dir`` is loaded and flattened into records.
    A prompt is built for each record and sent to ``assistant.start``; the
    replies are written to ``output_dir/<file stem>.json``.
    """

    PROMPT_TPL = (
        "\nÿ¥ŸÖÿß ŸÜŸÇÿ¥ €å⁄© ⁄©ÿßÿ±ÿ¥ŸÜÿßÿ≥ ÿÆÿ®ÿ±Ÿá ÿØÿ± ÿ≠Ÿàÿ≤ŸáŸî ⁄Øÿ±ÿØÿ¥⁄Øÿ±€åÿå ÿßŸÇŸÑ€åŸÖ‚Äåÿ¥ŸÜÿßÿ≥€å Ÿà ÿ¨ÿ∫ÿ±ÿßŸÅ€åÿß€å ÿß€åÿ±ÿßŸÜ ÿ±ÿß ÿØÿßÿ±€åÿØ. ÿßÿ∑ŸÑÿßÿπÿßÿ™ ÿ≤€åÿ± ŸÖÿ±ÿ®Ÿàÿ∑ ÿ®Ÿá €å⁄© ¬´{d_fa}¬ª ŸàÿßŸÇÿπ ÿØÿ± ÿßÿ≥ÿ™ÿßŸÜ {prov} ÿßÿ≥ÿ™. "
        "ÿ®ÿ± Ÿæÿß€åŸáŸî ŸáŸÖŸáŸî ÿ¨ÿ≤ÿ¶€åÿßÿ™ÿå ÿ≠ÿØÿß⁄©ÿ´ÿ± ÿØÿ± €±€∞€∞ Ÿàÿß⁄òŸáÿå €å⁄© Ÿæÿßÿ±ÿß⁄Øÿ±ÿßŸÅ ÿØŸÇ€åŸÇÿå ÿ±ŸàÿßŸÜ Ÿà ⁄©ÿßŸÖŸÑÿßŸã ÿ®Ÿá ÿ≤ÿ®ÿßŸÜ ŸÅÿßÿ±ÿ≥€å ŸÖÿπ€åÿßÿ± ÿ®ŸÜŸà€åÿ≥ ⁄©Ÿá ÿ™ŸÖÿßŸÖ ÿØÿßÿØŸá‚ÄåŸáÿß€å ŸÖŸàÿ¨ŸàÿØ (ÿßÿ≤ ÿ¨ŸÖŸÑŸá ŸÜÿßŸÖÿå ŸÖŸàŸÇÿπ€åÿ™ ÿ¨ÿ∫ÿ±ÿßŸÅ€åÿß€å€å €åÿß ÿ∑ÿ®ŸÇŸá‚Äåÿ®ŸÜÿØ€åÿå "
        "Ÿà€å⁄ò⁄Ø€å‚ÄåŸáÿß€å ÿ¥ÿßÿÆÿµÿå ÿßÿπÿØÿßÿØÿå ÿ≥ÿßŸÑ‚ÄåŸáÿß Ÿà Ÿáÿ± ŸÜ⁄©ÿ™ŸáŸî ŸÇÿßÿ®ŸÑ ÿ™Ÿàÿ¨Ÿá) ÿ±ÿß ÿ®Ÿá ÿµŸàÿ±ÿ™ €å⁄©Ÿæÿßÿ±⁄ÜŸá Ÿà ŸÖŸÜÿ≥ÿ¨ŸÖ ÿØÿ± ÿ®ÿ± ⁄Ø€åÿ±ÿØ. "
        "**ÿß⁄Øÿ± ÿßÿ∑ŸÑÿßÿπÿßÿ™ ŸÖÿπÿ™ÿ®ÿ± Ÿà ŸÖÿ±ÿ™ÿ®ÿ∑€å ÿØÿ±ÿ®ÿßÿ±ŸáŸî ÿß€åŸÜ ŸÖ⁄©ÿßŸÜ ŸÖ€å‚ÄåÿØÿßŸÜ€åÿå ŸÖ€å‚Äåÿ™ŸàÿßŸÜ€å ÿ¢ŸÜ ÿ±ÿß ŸáŸÖ ÿßÿ∂ÿßŸÅŸá ⁄©ŸÜ€åÿå ÿ®Ÿá ÿ¥ÿ±ÿ∑€å ⁄©Ÿá ŸÖÿ™ŸÜ ÿßÿ≤ €±€∞€∞ Ÿàÿß⁄òŸá ÿ®€åÿ¥ÿ™ÿ± ŸÜÿ¥ŸàÿØ.** "
        "ÿÆÿ±Ÿàÿ¨€å ŸÅŸÇÿ∑ ÿ®ÿß€åÿØ ŸáŸÖÿßŸÜ €å⁄© Ÿæÿßÿ±ÿß⁄Øÿ±ÿßŸÅ ÿ®ÿßÿ¥ÿØ Ÿà Ÿá€å⁄Ü ŸÖÿ™ŸÜ ÿßÿ∂ÿßŸÅ€åÿå ÿπŸÜŸàÿßŸÜ €åÿß ÿ™Ÿàÿ∂€åÿ≠€å ÿÆÿßÿ±ÿ¨ ÿßÿ≤ ÿ¢ŸÜ ÿ™ŸàŸÑ€åÿØ ŸÜÿ¥ŸàÿØ.\n\n"
        "ÿØÿßÿØŸáŸî ÿ≥ÿßÿÆÿ™‚Äå€åÿßŸÅÿ™Ÿá:\n{struct}\n"
    )

    # Display label substituted into the prompt for each record domain.
    DOM_FA = {
        "geographical_feature": "Ÿà€å⁄ò⁄Ø€å ÿ¨ÿ∫ÿ±ÿßŸÅ€åÿß€å€å",
        "natural_resources": "ŸÖŸÜÿ®ÿπ ÿ∑ÿ®€åÿπ€å",
        "topography": "Ÿà€å⁄ò⁄Ø€å ÿ™ŸàŸæŸà⁄Øÿ±ÿßŸÅ€å",
        "tourist_attraction": "ÿ¨ÿßÿ∞ÿ®ŸáŸî ⁄Øÿ±ÿØÿ¥⁄Øÿ±€å",
    }

    # Characters cycled through to animate the progress line.
    _SPINNER_FRAMES = "‚†ã‚†ô‚†π‚†∏‚†º‚†¥‚†¶‚†ß‚†á‚†è"

    def __init__(
        self,
        assistant,
        data_dir: Union[str, Path] = "data",
        output_dir: Union[str, Path] = "province_texts",
        num_output: int = 5,
        progress_step: int = 1,
    ):
        """Remember the settings and create ``output_dir`` if it is missing.

        Args:
            assistant: Object exposing ``start(prompt) -> str``.
            data_dir: Directory scanned for ``*.json`` input files.
            output_dir: Directory receiving result and checkpoint files.
            num_output: Write a checkpoint every this many records; ``0``
                disables checkpoints.
            progress_step: Redraw the progress line every this many records.
        """
        self.assistant = assistant
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.num_output = num_output
        self.progress_step = progress_step
        self._spinner = itertools.cycle(self._SPINNER_FRAMES)
        self._start_time = None

    def _load(self, p: Path) -> Dict:
        """Decode the UTF-8 JSON file at ``p``."""
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)

    def _iter_records_old(self, d: Dict) -> Generator[Dict, None, None]:
        """Yield records assuming the flat layout.

        Items under each ``geographical_features[*].description`` get the
        block's ``category`` as ``subcategory``; items of the other sections
        are yielded with only ``domain`` added.
        """
        for b in d.get("geographical_features", []):
            for it in b.get("description", []):
                yield {"domain": "geographical_feature", "subcategory": b.get("category"), **it}
        for k in ("natural_resources", "topography"):
            for it in d.get(k, []):
                yield {"domain": k, **it}
        for it in d.get("tourist_attractions", []):
            yield {"domain": "tourist_attraction", **it}

    def _flatten(self, node, domain, parent_sub=None):
        """Recursively yield leaf dicts below ``node`` tagged with ``domain``.

        A dict whose ``description`` is a list is treated as a container: its
        ``category`` (or ``name``, or the inherited ``parent_sub``) becomes the
        subcategory of the items inside. Any other dict is a leaf and is copied
        before being tagged. Lists are walked element by element; other values
        yield nothing.
        """
        if isinstance(node, dict):
            desc = node.get("description")
            if isinstance(desc, list):
                sub = node.get("category") or node.get("name") or parent_sub
                for item in desc:
                    yield from self._flatten(item, domain, sub)
            else:
                rec = dict(node)
                rec["domain"] = domain
                # Never overwrite a subcategory the record already carries.
                if parent_sub and "subcategory" not in rec:
                    rec["subcategory"] = parent_sub
                yield rec
        elif isinstance(node, list):
            for item in node:
                yield from self._flatten(item, domain, parent_sub)

    def _iter_records_new(self, d: Dict) -> Generator[Dict, None, None]:
        """Yield records by recursively flattening each known section of ``d``."""
        for blk in d.get("geographical_features", []):
            yield from self._flatten(blk, "geographical_feature")
        for blk in d.get("natural_resources", []):
            yield from self._flatten(blk, "natural_resources")
        for blk in d.get("topography", []):
            yield from self._flatten(blk, "topography")
        yield from self._flatten(d.get("tourist_attractions", []), "tourist_attraction")

    def _iter_records(self, d: Dict) -> Generator[Dict, None, None]:
        """Yield records via the flat reader, falling back to the recursive one.

        The flat reader is fully materialised first so that a failure part-way
        through cannot leave a mix of records from both readers.
        """
        try:
            records = list(self._iter_records_old(d))
        except Exception:
            # Deliberate best-effort: any failure of the flat reader means the
            # file is re-read with the recursive reader instead.
            yield from self._iter_records_new(d)
        else:
            yield from records

    def _prompt(self, rec: Dict, prov: str) -> str:
        """Render ``PROMPT_TPL`` for ``rec``, leaving out ``images`` and ``vote``."""
        clean = {k: v for k, v in rec.items() if k not in ("images", "vote")}
        struct = json.dumps(clean, ensure_ascii=False, indent=2)
        return self.PROMPT_TPL.format(
            d_fa=self.DOM_FA.get(rec["domain"], rec["domain"]), prov=prov, struct=struct
        )

    def _format_time(self, seconds: float) -> str:
        """Format ``seconds`` as ``H:MM:SS``; non-finite values give a placeholder."""
        # int(math.inf) raises OverflowError, and the ETA is infinite whenever
        # the measured speed is zero.
        if not math.isfinite(seconds):
            return "--:--:--"
        return str(timedelta(seconds=int(seconds)))

    def _progress(self, idx: int, total: int):
        """Redraw the single-line progress bar for record ``idx`` of ``total``."""
        if idx % self.progress_step != 0 and idx != total:
            return
        elapsed = time.time() - self._start_time
        speed = idx / elapsed if elapsed else 0
        eta = (total - idx) / speed if speed else math.inf
        bar_len = 30
        filled = int(bar_len * idx / total)
        bar = "‚ñà" * filled + "‚ñë" * (bar_len - filled)
        pct = idx * 100 / total
        msg = (
            f"\r{next(self._spinner)} |{bar}| "
            f"{idx}/{total} {pct:5.1f}% "
            f"| elapsed {self._format_time(elapsed)} "
            f"| speed {speed:5.2f} rec/s "
            f"| eta {self._format_time(eta)} "
        )
        print(msg, end="", flush=True)

    def _dump(self, path: Path, results: List[Dict]) -> None:
        """Write ``results`` to ``path`` as indented UTF-8 JSON."""
        with path.open("w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

    def run(self):
        """Process every input file and write one result file per province.

        A record whose model call raises is reported and skipped. Checkpoints
        are written to ``<prov>__checkpoint.json`` and removed once the final
        ``<prov>.json`` exists, so a later ``*.json`` scan of ``output_dir``
        does not pick them up as a province.
        """
        for fp in sorted(self.data_dir.glob("*.json")):
            prov = fp.stem
            records = list(self._iter_records(self._load(fp)))
            total = len(records)
            results = []
            tmp_path = self.output_dir / f"{prov}__checkpoint.json"
            self._start_time = time.time()
            self._spinner = itertools.cycle(self._SPINNER_FRAMES)

            for idx, rec in enumerate(records, 1):
                self._progress(idx, total)
                prompt = self._prompt(rec, prov)
                try:
                    txt = self.assistant.start(prompt)
                except Exception as e:
                    print(f"\n‚úò {prov} record {idx} failed: {e}")
                    continue
                results.append(
                    {
                        "id": str(uuid.uuid4()),
                        "province": prov,
                        "record_meta": {
                            "domain": rec["domain"],
                            "name": rec.get("name") or rec.get("subcategory"),
                        },
                        "prompt": prompt,
                        "text": txt,
                    }
                )
                # A num_output of 0 means "no checkpoints", not a modulo by zero.
                if self.num_output and idx % self.num_output == 0:
                    self._dump(tmp_path, results)
                    print(f"\nüíæ checkpoint {idx}/{total} ‚Üí {tmp_path}")

            out_path = self.output_dir / f"{prov}.json"
            self._dump(out_path, results)
            # The final file supersedes the checkpoint.
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
            print(f"\nüíæ {prov}: saved {len(results)} records ‚Üí {out_path}")

        print("\n‚úÖ all provinces processed")

In [None]:
# Text-generation stage with default directories: reads "data", writes "province_texts".
DataGenerator(assistant).run()

‚†º |‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 5/114   4.4% | elapsed 0:00:18 | speed  0.27 rec/s | eta 0:06:39 
üíæ checkpoint 5/114 ‚Üí province_texts/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†è |‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 10/114   8.8% | elapsed 0:00:37 | speed  0.26 rec/s | eta 0:06:34 
üíæ checkpoint 10/114 ‚Üí province_texts/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†º |‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 15/114  13.2% | elapsed 0:00:55 | speed  0.27 rec/s | eta 0:06:05 
üíæ checkpoint 15/114 ‚Üí province_texts/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†è |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 20/114  17.5% | elapsed 0:01:12 | speed  0.28 rec/s | eta 0:05:40 
üíæ checkpoint 20/114 ‚Üí province_texts/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†º |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñ

In [None]:
class QuestionGenerator:
    """Attach five question/answer pairs to every generated text record.

    Each ``*.json`` file in ``input_dir`` is read as a list of records. The
    record's ``text`` is sent to ``assistant.start`` inside ``PROMPT_TPL`` and
    the reply is parsed into ``qa_pairs``. Results are written to
    ``output_dir/<file stem>.json``.
    """

    PROMPT_TPL = (
        "\nÿ¥ŸÖÿß €å⁄© ÿ∑ÿ±ÿßÿ≠ ÿ≠ÿ±ŸÅŸá‚Äåÿß€å Ÿæÿ±ÿ≥ÿ¥‚ÄåŸáÿß€å ŸÖÿ∑ÿßŸÑÿπÿßÿ™ ÿßÿ¨ÿ™ŸÖÿßÿπ€å Ÿáÿ≥ÿ™€åÿØ. ÿ®ÿß ÿ™Ÿàÿ¨Ÿá ÿ®Ÿá ŸÖÿ™ŸÜ ÿ≤€åÿ± ÿØÿ±ÿ®ÿßÿ±ŸáŸî ÿßÿ≥ÿ™ÿßŸÜ {prov}ÿå "
        "ÿØŸÇ€åŸÇÿßŸã ŸæŸÜÿ¨ ¬´Ÿæÿ±ÿ≥ÿ¥ Ÿà Ÿæÿßÿ≥ÿÆ¬ª ÿ®ŸÜŸà€åÿ≥. ŸÖŸáŸÖ ÿßÿ≥ÿ™ ⁄©Ÿá **ÿ™ÿπÿØÿßÿØ ÿ¢ŸÜ‚ÄåŸáÿß ÿØŸÇ€åŸÇÿßŸã ŸæŸÜÿ¨ ÿ®ÿßÿ¥ÿØ (ŸÜŸá ⁄©ŸÖÿ™ÿ± Ÿà ŸÜŸá ÿ®€åÿ¥ÿ™ÿ±)**. "
        "Ÿáÿ± Ÿæÿ±ÿ≥ÿ¥ ÿ®ÿß€åÿØ ÿ®Ÿá‚Äåÿ∑Ÿàÿ± ŸÖÿ≥ÿ™ŸÇ€åŸÖ ÿ®Ÿá ÿ®ÿÆÿ¥€å ÿßÿ≤ ŸÖÿ™ŸÜ ŸÖÿ±ÿ®Ÿàÿ∑ ÿ®ÿßÿ¥ÿØ Ÿà Ÿæÿßÿ≥ÿÆ ÿ¢ŸÜ ŸÜ€åÿ≤ **ÿπ€åŸÜ ÿπÿ®ÿßÿ±ÿ™ €åÿß ÿ¨ŸÖŸÑŸáŸî ŸÖŸàÿ¨ŸàÿØ ÿØÿ± ŸÖÿ™ŸÜ** ÿ®ÿßÿ¥ÿØ. "
        "**ÿØÿ± ŸÖÿ™ŸÜ Ÿæÿ±ÿ≥ÿ¥ ÿ®ÿß€åÿØ ÿ®Ÿá‚Äåÿ±Ÿàÿ¥ŸÜ€å ŸÖÿ¥ÿÆÿµ ÿ®ÿßÿ¥ÿØ ⁄©Ÿá Ÿæÿ±ÿ≥ÿ¥ ÿØÿ±ÿ®ÿßÿ±ŸáŸî ⁄ÜŸá ŸÖŸàÿ∂Ÿàÿπ €åÿß ÿ®ÿÆÿ¥ ŸÖÿ¥ÿÆÿµ€å ÿßÿ≤ ÿßÿ≥ÿ™ÿßŸÜ ÿßÿ≥ÿ™.** "
        "ÿ™ÿ≠ÿ™ Ÿá€å⁄Ü ÿ¥ÿ±ÿß€åÿ∑€å ⁄©ŸÖÿ™ÿ± ÿßÿ≤ ŸæŸÜÿ¨ ¬´Ÿæÿ±ÿ≥ÿ¥ Ÿà Ÿæÿßÿ≥ÿÆ¬ª ÿ™ŸàŸÑ€åÿØ ŸÜ⁄©ŸÜ.\n"
        "ÿÆÿ±Ÿàÿ¨€å ÿ±ÿß ÿØŸÇ€åŸÇÿßŸã ÿØÿ± ŸÇÿßŸÑÿ® ÿ≤€åÿ± ÿ™ŸàŸÑ€åÿØ ⁄©ŸÜ (ÿ®ÿØŸàŸÜ Ÿá€å⁄Ü ŸÖÿ™ŸÜ €åÿß ÿ™Ÿàÿ∂€åÿ≠ ÿßÿ∂ÿßŸÅ€å):\n"
        "- Ÿæÿ±ÿ≥ÿ¥: ...\n  Ÿæÿßÿ≥ÿÆ: ...\n"
        "- Ÿæÿ±ÿ≥ÿ¥: ...\n  Ÿæÿßÿ≥ÿÆ: ...\n"
        "- Ÿæÿ±ÿ≥ÿ¥: ...\n  Ÿæÿßÿ≥ÿÆ: ...\n"
        "- Ÿæÿ±ÿ≥ÿ¥: ...\n  Ÿæÿßÿ≥ÿÆ: ...\n"
        "- Ÿæÿ±ÿ≥ÿ¥: ...\n  Ÿæÿßÿ≥ÿÆ: ...\n\n"
        "ŸÖÿ™ŸÜ:\n{text}\n"
    )

    # Characters cycled through to animate the progress line.
    _SPINNER_FRAMES = "‚†ã‚†ô‚†π‚†∏‚†º‚†¥‚†¶‚†ß‚†á‚†è"
    # Stem suffix of the checkpoint files this class writes; inputs carrying it
    # are treated as partial snapshots, not provinces.
    _CHECKPOINT_SUFFIX = "__checkpoint"
    # Number of pairs stored per record (the reply is truncated or padded).
    _NUM_PAIRS = 5
    # Lower-cased prefixes that mark a line as an answer.
    _ANSWER_PREFIXES = ("Ÿæÿßÿ≥ÿÆ", "answer", "a:", "Ÿæÿßÿ≥ÿÆ:")
    # Label removed from the question text.
    _QUESTION_LABEL = "Ÿæÿ±ÿ≥ÿ¥:"

    def __init__(
        self,
        assistant: "ModelClient",
        input_dir: Union[str, Path] = "province_texts",
        output_dir: Union[str, Path] = "province_questions",
        num_output: int = 5,
        progress_step: int = 1,
    ):
        """Remember the settings and create ``output_dir`` if it is missing.

        Args:
            assistant: Object exposing ``start(prompt) -> str``.
            input_dir: Directory scanned for ``*.json`` record lists.
            output_dir: Directory receiving result and checkpoint files.
            num_output: Write a checkpoint every this many records; ``0``
                disables checkpoints.
            progress_step: Redraw the progress line every this many records.
        """
        self.assistant = assistant
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.num_output = num_output
        self.progress_step = progress_step
        self._spinner = itertools.cycle(self._SPINNER_FRAMES)
        self._start_time = None

    def _format_time(self, sec: float) -> str:
        """Format ``sec`` as ``H:MM:SS``; non-finite values give a placeholder."""
        # int(math.inf) raises OverflowError, and the ETA is infinite whenever
        # the measured speed is zero.
        if not math.isfinite(sec):
            return "--:--:--"
        return str(timedelta(seconds=int(sec)))

    def _progress(self, idx: int, total: int):
        """Redraw the single-line progress bar for record ``idx`` of ``total``."""
        if idx % self.progress_step != 0 and idx != total:
            return
        elapsed = time.time() - self._start_time
        speed = idx / elapsed if elapsed else 0
        eta = (total - idx) / speed if speed else math.inf
        bar_len = 30
        filled = int(bar_len * idx / total)
        bar = "‚ñà" * filled + "‚ñë" * (bar_len - filled)
        pct = idx * 100 / total
        print(
            f"\r{next(self._spinner)} |{bar}| {idx}/{total} {pct:5.1f}% "
            f"| elapsed {self._format_time(elapsed)} "
            f"| speed {speed:5.2f} rec/s "
            f"| eta {self._format_time(eta)} ",
            end="",
            flush=True,
        )

    def _prompt(self, text: str, prov: str) -> str:
        """Render ``PROMPT_TPL`` for ``text`` and province ``prov``."""
        return self.PROMPT_TPL.format(text=text, prov=prov)

    def _parse_qa(self, raw: str) -> List[Dict]:
        """Parse a model reply into exactly ``_NUM_PAIRS`` question/answer dicts.

        A line starting with ``-`` opens a question. The following line becomes
        its answer when it carries an answer prefix (with or without a leading
        ``-``) or when it is plain text. A following ``-`` line without an
        answer prefix is the next question, so the current answer stays empty.
        Missing pairs are padded with empty strings.
        """
        lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
        pairs: List[Dict] = []
        i = 0
        while i < len(lines) and len(pairs) < self._NUM_PAIRS:
            line = lines[i]
            i += 1
            if not line.startswith("-"):
                continue
            question = line.lstrip("-").strip().replace(self._QUESTION_LABEL, "").strip()
            answer = ""
            if i < len(lines):
                nxt = lines[i]
                nxt_body = nxt.lstrip("-").strip()
                if nxt_body.lower().startswith(self._ANSWER_PREFIXES):
                    answer = nxt_body.split(":", 1)[-1].strip()
                    i += 1
                elif not nxt.startswith("-"):
                    answer = nxt
                    i += 1
                # Otherwise the next line is another question: leave it for
                # the next iteration.
            pairs.append({"question": question, "answer": answer})
        pairs.extend(
            {"question": "", "answer": ""} for _ in range(self._NUM_PAIRS - len(pairs))
        )
        return pairs

    def run(self):
        """Generate Q/A pairs for every record of every input file.

        Input files whose stem ends with ``__checkpoint`` are skipped. A record
        whose model call raises is reported and left out of the output.
        """
        for fp in sorted(self.input_dir.glob("*.json")):
            prov = fp.stem
            if prov.endswith(self._CHECKPOINT_SUFFIX):
                # A partial snapshot would be processed as a province named
                # "<prov>__checkpoint" and duplicate the real file's records.
                continue
            records = json.loads(fp.read_text(encoding="utf-8"))
            total = len(records)
            results: List[Dict] = []
            self._start_time = time.time()
            self._spinner = itertools.cycle(self._SPINNER_FRAMES)

            for idx, rec in enumerate(records, 1):
                self._progress(idx, total)
                prompt = self._prompt(rec["text"], prov)
                try:
                    raw = self.assistant.start(prompt)
                except Exception as e:
                    print(f"\n‚úò {prov} record {idx} failed: {e}")
                    continue

                results.append({**rec, "qa_pairs": self._parse_qa(raw)})

                # A num_output of 0 means "no checkpoints", not a modulo by zero.
                if self.num_output and idx % self.num_output == 0:
                    tmp = self.output_dir / f"{prov}__checkpoint.json"
                    tmp.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
                    print(f"\nüíæ checkpoint {idx}/{total} ‚Üí {tmp}")

            out_path = self.output_dir / f"{prov}.json"
            out_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nüíæ {prov}: saved {len(results)} records ‚Üí {out_path}")

        print("\n‚úÖ question generation complete")

In [None]:
# Question-generation stage with default directories: reads "province_texts", writes "province_questions".
QuestionGenerator(assistant).run()

‚†º |‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 5/114   4.4% | elapsed 0:00:19 | speed  0.26 rec/s | eta 0:06:57 
üíæ checkpoint 5/114 ‚Üí province_questions/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†è |‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 10/114   8.8% | elapsed 0:00:44 | speed  0.23 rec/s | eta 0:07:38 
üíæ checkpoint 10/114 ‚Üí province_questions/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†º |‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 15/114  13.2% | elapsed 0:01:14 | speed  0.20 rec/s | eta 0:08:14 
üíæ checkpoint 15/114 ‚Üí province_questions/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†è |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë| 20/114  17.5% | elapsed 0:01:41 | speed  0.20 rec/s | eta 0:07:57 
üíæ checkpoint 20/114 ‚Üí province_questions/ÿßÿµŸÅŸáÿßŸÜ__checkpoint.json
‚†º |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚