In [None]:
import os
import json
import time
import asyncio
import hashlib
import datetime
import re
import random
import subprocess
import sys
from urllib.parse import urljoin
from difflib import SequenceMatcher

try:
    import nest_asyncio
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nest_asyncio"])
    import nest_asyncio

nest_asyncio.apply()

try:
    import httpx
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "httpx"])
    import httpx

try:
    from bs4 import BeautifulSoup
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4"])
    from bs4 import BeautifulSoup

try:
    from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
    from rich.console import Console
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rich"])
    from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
    from rich.console import Console

try:
    import requests
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests

console = Console()

In [None]:
class Config:
    MIN_ITEMS_PER_CATEGORY = 50
    MIN_TENSE_LEN = 40
    MIN_FACTS = 10
    MAX_REFINE_ROUNDS = 4
    MODEL_MAX_TOKENS = 10000
    MODEL_TEMPERATURE = 0.5
    PROVINCE_RESOURCES = {
        "فارس": [
            "https://fa.wikipedia.org/wiki/استان_فارس",
            "https://fa.wikipedia.org/wiki/فهرست_شهرستان‌های_استان_فارس",
            "https://www.kojaro.com/fars/",
            "https://www.kojaro.com/shiraz/",
            "https://lastsecond.ir/attractions/fars-province",
            "https://lastsecond.ir/fars-province",
            "https://www.karnaval.ir/things-to-do/list-fars-province",
            "https://www.karnaval.ir/things-to-do/list-fars-province/the-best-nature-tourism-destinations-in-fars",
            "https://ghasedak24.com/mag/experience/fars-attractions/",
            "https://www.alibaba.ir/mag/fars/shiraz/shiraz-attractions/",
            "https://arandtour.com/news.cfm?id=2237",
            "https://hotelato.ir/gds/fa/mag/shiraz-waterfalls",
            "https://seeiran.ir/10-%D8%AC%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%B4%DB%8C%D8%B1%D8%A7%D8%B2-%D8%AF%D8%B1-%D8%AA%D8%A7%D8%A8%D8%B3%D8%AA/",
            "https://seeiran.ir/tag/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D9%81%D8%A7%D8%B1%D8%B3/",
            "https://seeiran.ir/category/states/fars/shiraz/",
            "https://fafarschto.ir/",
            "https://farschto.ir/",
            "https://radiogardesh.com/sights-of-fars/",
            "https://www.flytoday.ir/blog/fars-province-attractions/",
            "https://safarmarket.com/blog/shiraz-travel-guide",
            "https://blog.safartop.com/travel-guide/places-to-visit-in-shiraz/",
            "https://blog.flysepehran.com/fa/attractions/shiraz-tourist-attractions/",
            "https://kojaplus.ir/%D8%B3%D9%81%D8%B1-3-%D8%B1%D9%88%D8%B2%D9%87-%D8%A8%D9%87-%D8%B4%DB%8C%D8%B1%D8%A7%D8%B2/",
            "https://aminmana.com/tracks/shiraz1/"
        ],
        "اصفهان": [
            "https://fa.wikipedia.org/wiki/استان_اصفهان",
            "https://fa.wikipedia.org/wiki/اصفهان",
            "https://fa.wikipedia.org/wiki/فهرست_شهرستان‌های_استان_اصفهان",
            "https://www.kojaro.com/isfahan/",
            "https://www.kojaro.com/esfahan/",
            "https://www.kojaro.com/thingstodo/191519-attractions-around-isfahan/",
            "https://lastsecond.ir/attractions/isfahan",
            "https://lastsecond.ir/attractions/isfahan-province",
            "https://lastsecond.ir/blog/9668-best-attractions-of-isfahan",
            "https://jainjas.com/Blog/192/20-%D8%AA%D8%A7-%D8%A7%D8%B2-%D8%A8%D9%87%D8%AA%D8%B1%DB%8C%D9%86-%D8%AC%D8%A7%D8%B0%D9%87-%D9%87%D8%A7%DB%8C-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86-%D8%A8%D8%B1%D8%A7%DB%8C-%DA%AF%D8%B1%D8%AF%D8%B4%D8%8C-%D8%AE%D8%B1%DB%8C%D8%AF-%D9%88-%D8%B4%DA%A9%D9%85-%DA%AF%D8%B1%D8%AF%DB%8C",
            "https://jainjas.com/Blog/458/%D8%B1%D8%A7%D9%87%D9%86%D9%85%D8%A7%DB%8C-%D8%B3%D9%81%D8%B1-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86",
            "https://www.alibaba.ir/mag/isfahan/isfahan-ci/isfahan-sights/",
            "https://safarmarket.com/blog/attractions/iran/isfahan/isfahan-attractions",
            "https://safarmarket.com/blog/attractions/iran/isfahan/attractions-near-isfahan",
            "https://blog.shab.ir/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86/",
            "https://fa.wikipedia.org/wiki/%D8%AC%D8%A7%D8%B0%D8%A8%D9%87%E2%80%8C%D9%87%D8%A7%DB%8C_%DA%AF%D8%B1%D8%AF%D8%B4%DA%AF%D8%B1%DB%8C_%D8%A7%D8%B3%D8%AA%D8%A7%D9%86_%D8%A7%D8%B5%D9%81%D8%A7%D9%86",
            "https://seeiran.ir/10-%D8%AC%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86-%D8%AF%D8%B1-%D8%AA%D8%A7%D8%A8%D8%B3%D8%AA/",
            "https://seeiran.ir/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86-%D8%AF%D8%B1-%D8%A8%D9%87%D8%A7%D8%B1/",
            "https://anajournal.ir/gardeshgari/makanhaye-didani-zarrinshahr/",
            "https://wanderlog.com/place/details/10042872/simorgh-amusement-park",
            "https://en.wikipedia.org/wiki/Dream_Land_Isfahan",
            "https://lastsecond.ir/attractions/tiran-and-karvan",
            "https://www.isfahanvisit.ir/fa/articles/105-esfahan-springs-list",
            "https://www.flytoday.ir/blog/isfahan-historical-attractions/",
            "https://www.eghamat24.com/blog/8765/list-of-isfahan-amusement-centers",
            "https://lastsecond.ir/blog/11150-isfahan-entertainments",
            "https://www.otaghak.com/blog/isfahan-entertainment-centers/",
            "https://3click.com/blog/free-entertainment-in-isfahan/",
            "https://lastsecond.ir/blog/11087-sightseeing-places-around-isfahan",
            "https://www.pateh.com/blog/sightseeing-places-in-isfahan/"
        ],
        "بوشهر": [
            "https://fa.wikipedia.org/wiki/استان_بوشهر",
            "https://fa.wikipedia.org/wiki/گردشگری_در_بوشهر",
            "https://ostb.ir/",
            "https://www.chtn.ir/news/1402120800367/%D8%AC%D8%A7%D9%87%D8%A8%D9%87-%D9%87%D8%A7%DB%8C-%DA%AF%D8%B1%D8%AF%D8%B4%DA%AF%D8%B1%DB%8C-%D8%AA%D8%A7%D8%B1%DB%8C%D8%AE%DB%8C-%D9%88-%D8%B7%D8%A8%DB%8C%D8%B9%DB%8C-%D8%A7%D8%B3%D8%AA%D8%A7%D9%86-%D8%A8%D9%88%D8%B4%D9%87%D8%B1",
            "https://www.kojaro.com/bandar-bushehr/",
            "https://www.kojaro.com/bushehr/",
            "https://www.alibaba.ir/mag/bushehr/bushehr-ci/travel-to-bushehr-province/",
            "https://www.karnaval.ir/things-to-do/list-bushehr-province",
            "https://www.eghamat24.com/blog/31336/bushehr-attractions",
            "https://safarmarket.com/blog/attractions/iran/bushehr/bushehr-attractions",
            "https://seeiran.ir/tag/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%A8%D9%88%D8%B4%D9%87%D8%B1/",
            "https://seeiran.ir/category/states/boshehr/",
            "https://seeiran.ir/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%A8%D9%88%D8%B4%D9%87%D8%B1-%D8%AF%D8%B1-%D8%B2%D9%85%D8%B3%D8%AA%D8%A7%D9%86/",
            "https://seeiran.ir/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D9%87%D8%A7%DB%8C-%D8%AA%D8%A7%D8%B1%DB%8C%D8%AE%DB%8C-%D8%A8%D9%88%D8%B4%D9%87%D8%B1/",
            "https://seeiran.ir/tag/%D8%AC%D8%A7%D9%87%DB%8C-%D8%AA%D8%A7%D8%B1%DB%8C%D8%AE%DB%8C-%D8%A8%D9%88%D8%B4%D9%87%D8%B1/",
            "https://mirasbushehr.ir/",
            "https://www.eghamat24.com/blog/31336/bushehr-attractions",
            "https://www.eligasht.com/Blog/tourism/%D8%AC%D8%A7%D8%B0%D8%A8%D9%87-%D9%87%D8%A7%DB%8C-%D8%A8%D9%88%D8%B4%D9%87%D8%B1/",
            "https://www.iranhotelonline.com/blog/post-1564/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%A8%D9%88%D8%B4%D9%87%D8%B1-%D8%A8%D8%A7-%D8%B9%DA%A9%D8%B3-%D9%88-%D8%A2%D8%AF%D8%B1%D8%B3/"
        ],
        "چهارمحال و بختیاری": [
            "https://fa.wikipedia.org/wiki/استان_%DA%86%D9%87%D8%A7%D8%B1%D9%85%D8%AD%D8%A7%D9%84_%D9%88_%D8%A8%D8%AE%D8%AA%DB%8C%D8%A7%D8%B1%DB%8C",
            "https://www.ostan-chb.ir/",
            "https://emroozkojaberim.com/news.cfm?id=107",
            "https://www.karnaval.ir/things-to-do/list-chaharmahal-and-bakhtiari-province",
            "https://emroozkojaberim.com/destinationlist.cfm",
            "https://youtopin.com/mag/chaharmahal-and-bakhtiari-attractions/",
            "https://fa.wikipedia.org/wiki/%D9%81%D9%87%D8%B1%D8%A7%D8%B3%D8%AA_%D8%B4%D9%87%D8%B1%D9%87%D8%A7%DB%8C_%D8%A7%D8%B3%D8%AA%D8%A7%D9%86_%DA%86%D9%87%D8%A7%D8%B1%D9%85%D8%AD%D8%A7%D9%84_%D9%88_%D8%A8%D8%AE%D8%AA%DB%8C%D8%A7%D8%B1%DB%8C",
            "https://seeiran.ir/10-%D8%AC%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%DA%86%D9%87%D8%A7%D8%B1-%D9%85%D8%AD%D8%A7%D9%84-%D9%88-%D8%A8%D8%AE%D8%AA%DB%8C%D8%A7%D8%B1%DB%8C-%D8%AF%D8%B1-%D8%AA%D8%A7%D8%A8%D8%B3%D8%AA/",
            "https://seeiran.ir/tag/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%B4%D9%87%D8%B1%DA%A9%D8%B1%D8%AF/",
            "https://seeiran.ir/%D8%AC%D8%A7%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%D8%B4%D9%87%D8%B1%DA%A9%D8%B1%D8%AF-%D8%AF%D8%B1-%D8%A8%D9%87%D8%A7%D8%B1/",
            "https://seeiran.ir/category/states/chaharmahal-vbakhtiyari/",
            "https://seeiran.ir/category/states/chaharmahal-vbakhtiyari/shahrekord/",
            "https://seeiran.ir/tag/%D8%AF%DB%8C%D8%AF%D9%86%DB%8C%D9%87%D8%A7%DB%8C-%D8%A7%D8%B1%D8%AF%DB%8C/",
            "https://seeiran.ir/%D8%B1%D9%88%D8%B3%D8%AA%D8%A7%DB%8C-%D8%AA%D9%88%D8%B1%DB%8C%D8%B3%D8%AA%DB%8C-%D8%AF%D9%87-%DA%86%D8%B4%D9%85%D9%87-%D9%81%D8%A7%D8%B1%D8%B3%D8%A7%D9%86/",
            "https://seeiran.ir/tag/%DA%AF%D8%B1%D8%AF%D8%B4%DA%AF%D8%B1%DB%8C-%D8%A7%D8%B3%D8%AA%D8%A7%D9%86-%DA%86%D9%87%D8%A7%D8%B1%D9%85%D8%AD%D8%A7%D9%84-%D9%88-%D8%A8%D8%AE%D8%AA%DB%8C%D8%A7%D8%B1%DB%8C/",
            "https://www.flytoday.ir/blog/chaharmahal-and-bakhtiari-attractions/",
            "https://www.kojaro.com/chaharmahal-and-bakhtiari/",
            "https://www.eligasht.com/Blog/travelguide/%D8%AC%D8%A7%D8%B0%D8%A8%D9%87-%D9%87%D8%A7%DB%8C-%D8%AF%DB%8C%D8%AF%D9%86%DB%8C-%DA%86%D9%87%D8%A7%D8%B1%D9%85%D8%AD%D8%A7%D9%84-%D9%88-%D8%A8%D8%AE%D8%AA%DB%8C%D8%A7%D8%B1%DB%8C-%D8%AA%D8%B5%D9%88/",
            "https://blog.shab.ir/sights-of-chaharmahal-and-bakhtiari/"
        ],
        "هرمزگان": [
            "https://fa.wikipedia.org/wiki/%D8%A7%D8%B3%D8%A7%D9%86_%D9%87%D8%B1%D9%85%D8%B2%DA%AF%D8%A7%D9%86",
            "https://hchto.ir/",
            "https://www.istta.ir/tourism/fa/58/8",
            "https://safarmarket.com/blog/attractions/iran/hormozgan-attractions",
            "https://www.flytoday.ir/blog/hormozgan-attractions/",
            "https://youtopin.com/mag/attractions-in-hormozgan/",
            "https://www.otaghak.com/blog/attractions-of-hormozgan/",
            "https://www.alibaba.ir/mag/hormozgan/hormozgan-attractions/",
            "https://www.hoteldebitcard.com/fa/blog/Iran-tourism/hormozgan/",
            "https://www.hormozgantoday.ir/tag/%D8%A7%D8%AF%D8%A7%D9%87-%DA%A9%D9%84-%D9%85%DB%8C%D8%B1%D8%A7%D8%AB-%D9%81%D8%B1%D9%87%D9%86%DA%AF%DB%8C-%D9%87%D8%B1%D9%85%D8%B2%DA%AF%D8%A7%D9%86/",
            "https://amagestate.com/%D9%87%D8%B1%D9%85%D8%B2%DA%AF%D8%A7%D9%86%D8%8C%D8%AC%D9%88%D8%A7%D9%87%D8%B1%DB%8C-%D8%AF%D8%B1-%D8%AF%D9%84-%D8%AC%D9%86%D9%88%D8%A8-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86/",
            "https://www.flytoday.ir/blog/hormozgan-attractions/",
            "https://safarmarket.com/blog/attractions/iran/hormozgan-attractions",
            "https://www.pateh.com/blog/sightseeing-places-in-hormozgan/"
        ],
        "کهگیلویه و بویراحمد": [
            "https://fa.wikipedia.org/wiki/استان_کهگیلویه_و_بویراحمد",
            "https://fa.wikipedia.org/wiki/فهرست_شهرستان‌های_استان_کهگیلویه_و_بویراحمد",
            "https://www.kojaro.com/iran-visit/165719-%DA%A9%D9%87%DA%AF%DB%8C%D9%84%D9%88%DB%8C%D9%87-%D8%A8%D9%88%DB%8C%D8%B1%D8%A7%D8%AD%D9%85%D8%AF/",
            "https://www.kojaro.com/kohgiluyeh-and-boyer-ahmad/",
            "https://www.kojaro.com/travel-guide-tips-tricks/304396-Kohgiluyeh-and-Boyer-Ahmad/",
            "https://lastsecond.ir/attractions/kohgiluyeh-and-boyer",
            "https://www.karnaval.ir/things-to-do/list-kohgiluyeh-and-boyer-ahmad-province",
            "https://radiogardesh.com/sightseeing-places-of-kohgiluyeh-and-boyerahmad/",
            "https://seeiran.ir/سفر-به-جاذبه-های-دیدنی-کهگیلویه-و-بویراحمد-در-تابستان/",
            "https://seeiran.ir/کهگیلویه،-رنگین-کمانی-از-مناطق-گردشگری/",
            "https://safarmarket.com/blog/attractions/iran/yasuj",
            "https://www.flytoday.ir/blog/kohgiluyeh-and-boyer-ahmad-attractions/",
            "https://roomtoor.com/blog/getting-to-know-the-gorges-and-waterfalls-of-kohgiluyeh-and-boyer-ahmad",
            "https://www.iranhotelonline.com/blog/post-2715/شهرهای-دیدنی-استان-کهگیلویه-و-بویراحمد/",
            "https://www.visitiran.ir/fa/province/استان-کهگیلویه-و-بویراحمد",
            "https://ittic.com/hotels/travel_info/?qj=fHQeZn5kX2cldnUydCI6InVwvVRDIrwrMaMaLTF5MHI6ImQlBn11Tl9sZXReHUBrvw2",
            "https://www.alibaba.ir/mag/kohgiluyeh-boyer-ahmad/yasuj/yasuj-attractions/",
            "https://www.alibaba.ir/mag/kohgiluyeh-boyer-ahmad/belqeis-lake/",
        ]
    }

    SUBCATEGORY_SCHEMA = {
        "geographical_features": ["رودخانه‌ها", "کوه‌ها", "دریاچه‌ها", "پوشش گیاهی"],
        "topography": ["اقلیم منطقه", "دشت‌ها و فلات‌ها", "ویژگی‌های زمین‌شناسی"],
        "natural_resources": ["منابع آبی", "منابع معدنی"]
    }

In [None]:
class CacheManager:
    def __init__(self, base_dir="cache", ttl_hours=24):
        self.base_dir = base_dir
        self.ttl = datetime.timedelta(hours=ttl_hours)
        os.makedirs(self.base_dir, exist_ok=True)

    def _path(self, url):
        h = hashlib.sha256(url.encode("utf-8")).hexdigest()
        return os.path.join(self.base_dir, h)

    def get(self, url):
        p = self._path(url)
        html_path = p + ".html"
        meta_path = p + ".json"
        if os.path.exists(html_path) and os.path.exists(meta_path):
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
            ts = datetime.datetime.fromisoformat(meta.get("fetched_at"))
            if datetime.datetime.utcnow() - ts < self.ttl:
                with open(html_path, "r", encoding="utf-8", errors="ignore") as f:
                    return f.read(), meta.get("status_code"), meta.get("headers", {})
        return None, None, None

    def set(self, url, html, status_code, headers):
        p = self._path(url)
        html_path = p + ".html"
        meta_path = p + ".json"
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html)
        meta = {
            "fetched_at": datetime.datetime.utcnow().isoformat(),
            "status_code": status_code,
            "headers": dict(headers),
        }
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(meta, f, ensure_ascii=False, indent=2)

In [None]:
class ResourceFetcher:
    def __init__(self, cache_manager):
        self.cache = cache_manager

    async def fetch_single(self, client, url):
        cached_html, status, headers = self.cache.get(url)
        if cached_html is not None:
            return url, cached_html, status, headers, True
        try:
            resp = await client.get(url, follow_redirects=True, timeout=20.0)
            html = resp.text
            self.cache.set(url, html, resp.status_code, resp.headers)
            return url, html, resp.status_code, dict(resp.headers), False
        except Exception:
            return url, None, None, None, False

    async def scrape(self, province, urls, progress, task_id):
        async with httpx.AsyncClient(headers={"User-Agent": "Mozilla/5.0"}) as client:
            fetch_tasks = [self.fetch_single(client, u) for u in urls]
            results = []
            for f in asyncio.as_completed(fetch_tasks):
                url, html, status_code, headers, from_cache = await f
                results.append((url, html, status_code, headers, from_cache))
                progress.update(task_id, advance=1)
            processed = []
            for url, html, status_code, headers, from_cache in results:
                if not html:
                    continue
                base = url
                soup = BeautifulSoup(html, "html.parser")
                for tag in soup(["script", "style"]):
                    tag.decompose()
                texts = []
                for tag in soup.find_all(["p", "li", "h1", "h2", "h3", "h4"]):
                    txt = tag.get_text(separator=" ", strip=True)
                    if txt:
                        texts.append(txt)
                full_text = "\n".join(texts)
                images = []
                for img in soup.find_all("img"):
                    src = img.get("src") or img.get("data-src") or ""
                    if not src:
                        continue
                    abs_url = urljoin(base, src)
                    images.append(abs_url)
                valid_images = await self.validate_images(client, images, limit=4)
                lower_text = full_text.lower()
                matched = {}
                for cat in Config.SUBCATEGORY_SCHEMA.keys():
                    for keyword in Config.SUBCATEGORY_SCHEMA.get(cat, []):
                        if keyword.lower() in lower_text:
                            matched.setdefault(cat, []).append(keyword)
                sentences = self.extract_sentences(full_text)
                summary = {}
                for cat in Config.SUBCATEGORY_SCHEMA.keys():
                    keywords = [kw.lower() for kw in Config.SUBCATEGORY_SCHEMA.get(cat, [])]
                    scored = self.score_sentences(sentences, keywords)
                    summary[cat] = scored[:5]
                title_tag = soup.title.string.strip() if soup.title and soup.title.string else ""
                processed.append(
                    {
                        "url": url,
                        "title": title_tag,
                        "status_code": status_code,
                        "from_cache": from_cache,
                        "matched_categories": matched,
                        "summary_sentences": summary,
                        "images": valid_images,
                        "full_text": full_text,
                    }
                )
            return processed

    @staticmethod
    def extract_sentences(text):
        parts = re.split(r"(?<=[\.!\?؟])\s+", text)
        return [p.strip() for p in parts if p.strip()]

    @staticmethod
    def score_sentences(sentences, keywords):
        scores = []
        for s in sentences:
            sc = 0
            low = s.lower()
            for kw in keywords:
                if kw in low:
                    sc += low.count(kw)
            scores.append((sc, s))
        scores.sort(reverse=True, key=lambda x: x[0])
        return [s for sc, s in scores if sc > 0]

    @staticmethod
    async def validate_images(client, image_urls, limit=3):
        validated = []
        tasks = []
        for u in image_urls[:limit]:
            tasks.append(client.head(u, follow_redirects=True, timeout=5.0))
        for f in asyncio.as_completed(tasks):
            try:
                resp = await f
                ctype = resp.headers.get("content-type", "")
                if resp.status_code == 200 and "image" in ctype:
                    if hasattr(resp, "url"):
                        validated.append(str(resp.url))
                    else:
                        validated.append(resp.request.url)
            except:
                continue
        return validated

In [None]:
class PromptBuilder:
    @staticmethod
    def normalize_name(name):
        import unicodedata
        s = unicodedata.normalize("NFKD", name)
        s = "".join(c for c in s if not unicodedata.combining(c))
        s = re.sub(r"\s+", "", s)
        s = re.sub(r"[^\w]", "", s, flags=re.UNICODE)
        return s.lower()

    @staticmethod
    def build_category_prompt(province, category_key, resources, existing_names=None, remaining_needed=None, subcategory=None):
        header = f"استان: {province}\nدسته‌بندی: {category_key}\n"
        if subcategory:
            header += f"تمرکز روی زیردسته: {subcategory}\n"
        src_section = "Extracted sources and relevant snippets:\n"
        for r in resources:
            src_section += f"- URL: {r['url']}\n"
            if r.get("title"):
                src_section += f"  Title: {r['title']}\n"
            if category_key in r.get("matched_categories", {}):
                kws = r["matched_categories"][category_key]
                src_section += f"  Detected keywords: {', '.join(kws)}\n"
            summary = r.get("summary_sentences", {}).get(category_key, [])
            if summary:
                src_section += "  Summary sentences:\n"
                for s in summary[:3]:
                    src_section += f"    * {s}\n"
            if r.get("images"):
                for img in r["images"][:3]:
                    src_section += f"  Image: {img}\n"
            src_section += "\n"
        schema_desc = ""
        if category_key in ["geographical_features", "topography", "natural_resources"]:
            subs = Config.SUBCATEGORY_SCHEMA.get(category_key, [])
            schema_desc += "Expected subcategories (if available):\n"
            for s in subs:
                schema_desc += f"- {s}\n"
            schema_desc += (
                "Output must be a JSON array of objects. Each object should have:\n"
                '{"name": "<subcategory name>", "description": [ {"name": "<item name>", "images": ["<image urls>"], '
                '"description": "<short scientific description>"} , ... ] }\n'
                "If no clear subcategories, you can wrap flat items under one object named after the category.\n"
            )
        elif category_key == "tourist_attractions":
            schema_desc += (
                "Output must be a JSON array of tourist attraction objects with fields:\n"
                '{"name":"<name>","images":["<image urls>"],"year_built":"<if known>","architect":"<if known>",'
                '"constructor":"<if known>","description":"<short description>"}\n'
            )

        instruction = (
            f"Include all the information present in the attached text and add your own only if you are certain, writing it in the description with at least {Config.MIN_TENSE_LEN} words. "
            "In the description, avoid general or broad explanations; include only specific, precise, and relevant information about that item. "
            f"Each entry must include at least {Config.MIN_FACTS} documented informational attributes; if those attributes have explicit fields in the schema, populate those fields (e.g., creator in the 'سازنده' field), otherwise include them in the 'description'. "
            "Structure each item so that one could pose at least five detailed questions about it—such as creator, date of creation, location, natural or architectural features, and cultural or historical significance. "
            f"You must provide at least {Config.MIN_ITEMS_PER_CATEGORY} distinct, non-repetitive entries for this category. "
            "Use only the sources above and internal knowledge you are certain about. Do not hallucinate. "
            "If this is a refinement round, do not repeat existing items listed below."
            "Under no circumstances should an image be assigned to an item unless there is indisputable, explicit, and independently verifiable evidence proving that the image depicts the exact item in question. Vague indicators, partial matches, or assumptions based on visual similarity are categorically unacceptable. The image URL itself must contain the full, unambiguous name of the item or an officially recognized abbreviation that is directly tied to the item—generic terms or loosely associated references are insufficient. If even the slightest doubt remains about the image’s authenticity or relevance, it must not be used. Err strictly on the side of omission rather than risk incorrect attribution."
        )

        extras = ""
        if existing_names:
            extras += "Already provided item names (normalized, avoid duplicates):\n"
            for nm in sorted(existing_names):
                extras += f"- {nm}\n"
            if remaining_needed:
                extras += f"Need at least {remaining_needed} additional unique items to reach the target.\n"
        prompt = "\n".join([header, src_section, schema_desc, instruction, extras, "Output only the JSON array in the described shape."])
        return prompt

In [None]:
class ModelClient:
    def __init__(self, api_key=None, model="gpt-4o-mini-2024-07-18", base_url="https://api.tapsage.com", provider="openai_chat_completion", max_tokens=None, temperature=None):
        key = api_key or os.environ.get("TAPSAGE_API_KEY", "tpsg-MNvTQUAqUL84o4THLV1395IqTBIZHJJ")
        self.endpoint = f"{base_url}/api/v1/wrapper/{provider}/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {key}",
            "Content-Type": "application/json",
        }
        self.model = model
        self.max_tokens = max_tokens or Config.MODEL_MAX_TOKENS
        self.temperature = temperature if temperature is not None else Config.MODEL_TEMPERATURE
        self.system_prompt = (
            "شما یک متخصص مجرب در حوزه‌های گردشگری، اقلیم‌شناسی و جغرافیای ایران هستید. "
            "تمام پاسخ‌های شما باید فقط به زبان فارسی معیار، دقیق، روان و مطابق با استانداردهای نگارش علمی و اطلاع‌رسانی ارائه شوند. "
            "اطلاعاتی را بیاورید که مطمئن هستید **درست و حتما مربوط به استان هدف** هستند؛ اگر در مورد چیزی مطمئن نیستید از آوردن آن خودداری کنید. "
            "می‌توانید از دانش درونی خود استفاده کنید ولی فقط مواردی را بیاورید که کاملاً مطمئن هستید. "
            f"شما باید حداقل {Config.MIN_ITEMS_PER_CATEGORY} دادهٔ کاملاً متمایز، غیرتکراری، و مطمئن درباره هر دسته‌بندی ارائه دهید. اگر در اولین پاسخ کمتر از {Config.MIN_ITEMS_PER_CATEGORY} مورد داده شد، با اصلاح پرسش و اضافه کردن موارد موجود به عنوان موارد تکراری، تا رسیدن به حداقل ادامه دهید. "
            "توضیحاتی که در قسمت description می‌نویسید باید بسیار دقیق، با جزئیات و کامل باشند، علمی و شفاف باشند و ارتباط هر مورد را با استان هدف به‌روشنی توضیح دهند؛ "
            "به‌گونه‌ای که نشان دهند چرا آن مورد به‌طور خاص در آن استان اهمیت یا ویژگی دارد. "
            "محتوای فیلد description باید غنی از اطلاعات اختصاصی و شامل ویژگی‌ها و جزئیات دقیق، ریزبینانه و منحصربه‌فرد همان مورد باشد؛ "
            "از نوشتن توضیحات کلی یا قابل تعمیم به سایر موارد خودداری کن. "
            f"بگو همه اطلاعاتی که در متن ضمیمه شده آمده و اطلاعات خودت را در صورتی که مطمئن هستی، در قالب حداقل {Config.MIN_TENSE_LEN} کلمه در description بنویس. "
            "در بخش description از نوشتن توضیحات کلی و عمومی پرهیز کن؛ فقط اطلاعات خاص، دقیق و مرتبط با همان مورد را بنویس. "
            f"هر مورد باید شامل **حداقل {Config.MIN_FACTS} ویژگی اطلاعاتی مستند** باشد؛ اگر این ویژگی‌ها فیلد مشخصی در ساختار دارند، در همان فیلد درج شوند (مثلاً سازنده در فیلد \"سازنده\")، "
            "در غیر این‌صورت در بخش \"توصیف\" (description) آورده شوند؛ به‌گونه‌ای که بتوان درباره هر مورد دست‌کم پنج سؤال دقیق مانند سازنده، تاریخ ساخت، موقعیت، "
            "ویژگی‌های طبیعی یا معماری، و اهمیت فرهنگی یا تاریخی طرح کرد. "
            "در فیلد name، حتماً از زبان فارسی معیار و دقیقاً مطابق با فرمت داده‌شده استفاده کنید. "
            "فیلدهایی که مقدار مشخص و قابل اطمینانی ندارند یا اطلاعاتشان موجود نیست، باید با مقدار null پر شوند و نباید حذف شوند. "
            "فرمت خروجی باید دقیقاً مطابق قالب داده‌شده باشد و فقط JSON بدهید. "
            "تحت هیچ شرایطی نباید به هیچ موردی تصویری نسبت داده شود، مگر آنکه با شواهد کاملاً روشن، قطعی، و قابل‌راستی‌آزمایی اثبات شود که تصویر دقیقاً متعلق به همان مورد است. صرف شباهت ظاهری، حدس، یا برداشت شخصی به‌هیچ‌وجه قابل‌قبول نیست. لینک تصویر باید حتماً شامل نام کامل یا مخفف رسمی و شناخته‌شده مورد باشد؛ هرگونه نام مبهم، اصطلاح عمومی یا اشاره غیرمستقیم مردود است. در صورت وجود حتی کمترین تردید، تصویر نباید استفاده شود. همواره با سخت‌گیری کامل عمل کن و اولویت را به عدم تخصیص تصویر بده، نه به تخصیص مشکوک یا نادرست."
            "اگر مطمئن نبودی، تصویر را نیاور."
        )

    def generate(self, prompt, max_attempts=4):
        attempt = 0
        backoff_base = 1
        while attempt < max_attempts:
            try:
                messages = [
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": prompt},
                ]
                payload = {
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": self.max_tokens,
                    "temperature": self.temperature,
                }
                resp = requests.post(self.endpoint, json=payload, headers=self.headers, timeout=500)
                if resp.status_code == 524 or 500 <= resp.status_code < 600:
                    raise requests.HTTPError(f"Server error {resp.status_code}")
                resp.raise_for_status()
                data = resp.json()
                return data["choices"][0]["message"]["content"].strip()
            except Exception as e:
                wait = backoff_base * (2 ** attempt) + random.uniform(0, 0.5)
                console.print(f"[yellow]Model call failed (attempt {attempt+1}/{max_attempts}): {e}. Retrying in {wait:.1f}s[/yellow]")
                time.sleep(wait)
                attempt += 1
        raise RuntimeError("Failed to get a valid response from model after retries.")

In [None]:
class DataMerger:
    @staticmethod
    def normalize_name(name):
        import unicodedata
        s = unicodedata.normalize("NFKD", name)
        s = "".join(c for c in s if not unicodedata.combining(c))
        s = re.sub(r"\s+", "", s)
        s = re.sub(r"[^\w]", "", s, flags=re.UNICODE)
        return s.lower()

    @staticmethod
    def is_duplicate_name(name, existing_names):
        n = DataMerger.normalize_name(name)
        for e in existing_names:
            if SequenceMatcher(None, n, DataMerger.normalize_name(e)).ratio() > 0.85:
                return True
        return False

    @staticmethod
    def merge_category_blocks(existing_blocks, new_blocks):
        for new in new_blocks:
            new_name = new.get("name", "").strip()
            if not new_name:
                continue
            matched = None
            for ex in existing_blocks:
                if DataMerger.normalize_name(ex.get("name", "")) == DataMerger.normalize_name(new_name):
                    matched = ex
                    break
            if matched:
                existing_items = matched.get("description", [])
                new_items = new.get("description", [])
                existing_item_names = [i.get("name", "") for i in existing_items if i.get("name")]
                for ni in new_items:
                    name = ni.get("name", "").strip()
                    if not name:
                        continue
                    if DataMerger.is_duplicate_name(name, existing_item_names):
                        continue
                    existing_items.append(ni)
                    existing_item_names.append(name)
                matched["description"] = existing_items
            else:
                existing_blocks.append(new)
        return existing_blocks

    @staticmethod
    def merge_attraction_items(existing, new_items):
        existing_names = [item.get("name", "") for item in existing if item.get("name")]
        for item in new_items:
            name = item.get("name", "").strip()
            if not name:
                continue
            if DataMerger.is_duplicate_name(name, existing_names):
                continue
            existing.append(item)
            existing_names.append(name)
        return existing

    @staticmethod
    def flatten_count(category_data, category_key):
        if category_key in ["geographical_features", "topography", "natural_resources"]:
            names = set()
            for block in category_data:
                for itm in block.get("description", []):
                    n = itm.get("name", "").strip()
                    if n:
                        names.add(DataMerger.normalize_name(n))
            return len(names), names
        else:
            names = set()
            for itm in category_data:
                n = itm.get("name", "").strip()
                if n:
                    names.add(DataMerger.normalize_name(n))
            return len(names), names

In [None]:
class ProvinceProcessor:
    def __init__(self, province, assistant, fetcher, output_dir, progress, overall_task):
        self.province = province
        self.assistant = assistant
        self.fetcher = fetcher
        self.output_dir = output_dir
        self.progress = progress
        self.overall_task = overall_task
        self.resources = []
        self.relevant = []

    async def run(self):
        scrape_task = self.progress.add_task(f"Fetching resources for {self.province}", total=len(Config.PROVINCE_RESOURCES.get(self.province, [])))
        self.resources = await self.fetcher.scrape(self.province, Config.PROVINCE_RESOURCES.get(self.province, []), self.progress, scrape_task)
        self.relevant = self.resources
        result = {
            "title": f"ویژگی‌های جغرافیایی {self.province}",
            "location": {"province": self.province, "city": self.province},
            "geographical_features": [],
            "topography": [],
            "natural_resources": [],
            "tourist_attractions": [],
            "additional_info": {
                "books_source": "http://chap.sch.ir/category/دوره-آموزش-متوسطه/جغرافیای-استان‌ها",
                "online_resources": [r["url"] for r in self.resources],
            },
        }

        for cat in ["geographical_features", "topography", "natural_resources", "tourist_attractions"]:
            cat_task = self.progress.add_task(f"{self.province} - {cat}", total=1)
            total_collected = 0
            existing_names = set()
            rounds = 0

            while rounds < Config.MAX_REFINE_ROUNDS:
                remaining = max(0, Config.MIN_ITEMS_PER_CATEGORY - total_collected)
                prompt = PromptBuilder.build_category_prompt(
                    self.province,
                    cat,
                    self.relevant,
                    existing_names=existing_names if rounds > 0 else None,
                    remaining_needed=remaining if rounds > 0 else None,
                )
                try:
                    response = await asyncio.to_thread(self.assistant.generate, prompt)
                except Exception as e:
                    console.print(f"[red]Error contacting model for {self.province}/{cat}: {e}[/red]")
                    break

                parsed = self._extract_json(response)
                if not parsed:
                    rounds += 1
                    continue

                if cat in ["geographical_features", "topography", "natural_resources"]:
                    normalized_blocks = []
                    for block in parsed:
                        if isinstance(block, dict):
                            if "category" in block and "items" in block:
                                subcat_name = block["category"]
                                description_items = []
                                for item in block.get("items", []):
                                    description_items.append(
                                        {
                                            "name": item.get("name", ""),
                                            "images": item.get("images", []),
                                            "description": item.get("description", ""),
                                        }
                                    )
                                normalized_blocks.append({"name": subcat_name, "description": description_items})
                            elif "name" in block and isinstance(block.get("description"), list):
                                normalized_blocks.append(block)
                    if not normalized_blocks:
                        flat_items = []
                        for item in parsed:
                            if isinstance(item, dict) and "name" in item:
                                name = item.get("name", "")
                                images = item.get("images", []) if isinstance(item.get("images", []), list) else []
                                desc = item.get("description", "") if isinstance(item.get("description", ""), str) else ""
                                flat_items.append({"name": name, "images": images, "description": desc})
                        if flat_items:
                            wrapper = {"name": cat, "description": flat_items}
                            normalized_blocks = [wrapper]
                    result[cat] = DataMerger.merge_category_blocks(result[cat], normalized_blocks)
                else:
                    attractions = []
                    for block in parsed:
                        if isinstance(block, dict):
                            if "category" in block and "items" in block:
                                for item in block.get("items", []):
                                    attractions.append(item)
                            elif "name" in block:
                                attractions.append(block)
                    result[cat] = DataMerger.merge_attraction_items(result[cat], attractions)

                count, names = DataMerger.flatten_count(result[cat], cat)
                total_collected = count
                existing_names = names
                rounds += 1
                if total_collected >= Config.MIN_ITEMS_PER_CATEGORY:
                    break

            if total_collected < Config.MIN_ITEMS_PER_CATEGORY and cat in Config.SUBCATEGORY_SCHEMA:
                for sub in Config.SUBCATEGORY_SCHEMA.get(cat, []):
                    if total_collected >= Config.MIN_ITEMS_PER_CATEGORY:
                        break
                    prompt = PromptBuilder.build_category_prompt(
                        self.province,
                        cat,
                        self.relevant,
                        existing_names=existing_names,
                        remaining_needed=max(0, Config.MIN_ITEMS_PER_CATEGORY - total_collected),
                        subcategory=sub,
                    )
                    try:
                        response = await asyncio.to_thread(self.assistant.generate, prompt)
                    except Exception as e:
                        console.print(f"[red]Error contacting model for subcategory {sub} of {self.province}/{cat}: {e}[/red]")
                        continue
                    parsed = self._extract_json(response)
                    if not parsed:
                        continue
                    if cat in ["geographical_features", "topography", "natural_resources"]:
                        normalized_blocks = []
                        for block in parsed:
                            if isinstance(block, dict):
                                if "category" in block and "items" in block:
                                    subcat_name = block["category"]
                                    description_items = []
                                    for item in block.get("items", []):
                                        description_items.append(
                                            {
                                                "name": item.get("name", ""),
                                                "images": item.get("images", []),
                                                "description": item.get("description", ""),
                                            }
                                        )
                                    normalized_blocks.append({"name": subcat_name, "description": description_items})
                                elif "name" in block and isinstance(block.get("description"), list):
                                    normalized_blocks.append(block)
                        if not normalized_blocks:
                            flat_items = []
                            for item in parsed:
                                if isinstance(item, dict) and "name" in item:
                                    name = item.get("name", "")
                                    images = item.get("images", []) if isinstance(item.get("images", []), list) else []
                                    desc = item.get("description", "") if isinstance(item.get("description", ""), str) else ""
                                    flat_items.append({"name": name, "images": images, "description": desc})
                            if flat_items:
                                wrapper = {"name": cat, "description": flat_items}
                                normalized_blocks = [wrapper]
                        result[cat] = DataMerger.merge_category_blocks(result[cat], normalized_blocks)
                    else:
                        attractions = []
                        for block in parsed:
                            if isinstance(block, dict):
                                if "category" in block and "items" in block:
                                    for item in block.get("items", []):
                                        attractions.append(item)
                                elif "name" in block:
                                    attractions.append(block)
                        result[cat] = DataMerger.merge_attraction_items(result[cat], attractions)

                    count, names = DataMerger.flatten_count(result[cat], cat)
                    total_collected = count
                    existing_names = names

            console.print(f"[bold cyan]{self.province} - {cat}: collected {total_collected} unique items[/bold cyan]")
            self.progress.update(cat_task, advance=1)

        self._prune_english_name_entries(result)

        filename = os.path.join(self.output_dir, f"{self.province.replace(' ', '_')}.json")
        os.makedirs(self.output_dir, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        self.progress.update(self.overall_task, advance=1)
        console.print(f"[bold green]Saved:[/bold green] {filename}")

    @staticmethod
    def _extract_json(response_text):
        idx = response_text.find("[")
        if idx == -1:
            return []
        try:
            arr = json.loads(response_text[idx: response_text.rfind("]") + 1])
            return arr
        except:
            try:
                cleaned = re.sub(r",\s*}", "}", response_text[idx: response_text.rfind("]") + 1])
                return json.loads(cleaned)
            except:
                return []

    @staticmethod
    def _prune_english_name_entries(result):
        def has_latin(s):
            return bool(re.search(r"[A-Za-z]", s))

        def conditional_replace(cat, new_list):
            if new_list:
                result[cat] = new_list

        for cat in ["geographical_features", "topography", "natural_resources"]:
            original_blocks = result.get(cat, [])
            new_blocks = []
            for block in original_blocks:
                desc = block.get("description", [])
                filtered_items = []
                for item in desc:
                    name = item.get("name", "")
                    if isinstance(name, str) and has_latin(name):
                        continue
                    filtered_items.append(item)
                if filtered_items:
                    block_copy = block.copy()
                    block_copy["description"] = filtered_items
                    new_blocks.append(block_copy)
            conditional_replace(cat, new_blocks)

        original_attractions = result.get("tourist_attractions", [])
        filtered_attractions = []
        for att in original_attractions:
            name = att.get("name", "")
            if isinstance(name, str) and has_latin(name):
                continue
            filtered_attractions.append(att)
        if filtered_attractions:
            result["tourist_attractions"] = filtered_attractions

        for cat in ["geographical_features", "topography", "natural_resources"]:
            current_blocks = result.get(cat, [])
            pruned = []
            for block in current_blocks:
                name = block.get("name", "")
                if isinstance(name, str) and name.strip().lower() == cat.lower():
                    continue
                pruned.append(block)
            if pruned:
                result[cat] = pruned

In [None]:
async def main():
    assistant = ModelClient()
    output_dir = "province_datasets"
    os.makedirs(output_dir, exist_ok=True)
    cache_manager = CacheManager()
    fetcher = ResourceFetcher(cache_manager)
    provinces = list(Config.PROVINCE_RESOURCES.keys())

    with Progress(
        SpinnerColumn(),
        TextColumn("{task.description}"),
        BarColumn(bar_width=None),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
    ) as progress:
        overall_task = progress.add_task("Processing all provinces", total=len(provinces))
        for province in provinces:
            processor = ProvinceProcessor(province, assistant, fetcher, output_dir, progress, overall_task)
            await processor.run()

In [None]:
asyncio.run(main())

Output()

In [15]:
import os
import re
import json
import zipfile
import logging
from collections import defaultdict
from typing import Optional, Dict, List, Tuple
import matplotlib.pyplot as plt
import csv

In [19]:
class ProvinceDataAnalyzer:
    def __init__(
        self,
        input_dir: str,
        output_dir: str,
        zip_filename: str,
        exclude_fields: Optional[List[str]] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.zip_filename = zip_filename
        self.exclude_fields = exclude_fields or [
            "additional_info",
            "title",
            "location.province",
            "location.city",
        ]
        self.logger = logger or self._default_logger()
        os.makedirs(self.output_dir, exist_ok=True)

    def _default_logger(self) -> logging.Logger:
        logger = logging.getLogger(__name__)
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            logger.setLevel(logging.INFO)
        return logger

    @staticmethod
    def _count_words(text: str) -> int:
        if not isinstance(text, str):
            return 0
        return len(re.findall(r"\S+", text))

    def _is_excluded(self, field_path: str) -> bool:
        return any(
            field_path == excl or field_path.startswith(f"{excl}.")
            for excl in self.exclude_fields
        )

    def _process_data(self, data: dict) -> Tuple[int, int, Dict[str, List[int]]]:
        total_records = 0
        total_words = 0
        field_lengths: Dict[str, List[int]] = defaultdict(list)

        def walk(value, field_path=""):
            nonlocal total_records, total_words
            if self._is_excluded(field_path):
                return
            if isinstance(value, str):
                words = self._count_words(value)
                total_words += words
                field_lengths[field_path].append(words)
            elif isinstance(value, dict):
                for key, val in value.items():
                    new_path = f"{field_path}.{key}" if field_path else key
                    walk(val, new_path)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        total_records += 1
                    walk(item, field_path)

        walk(data)
        return total_records, total_words, field_lengths

    def _slug(self, s: str) -> str:
        return re.sub(r"[^A-Za-z0-9_.-]+", "_", s).strip("_") or "plot"

    def _plot_field_averages(self, province: str, field_lengths: Dict[str, List[int]]) -> str:
        averages = {field: (sum(lengths) / len(lengths)) for field, lengths in field_lengths.items() if lengths}
        sorted_items = sorted(averages.items(), key=lambda kv: kv[1])
        if not sorted_items:
            return ""
        fields, avg_counts = zip(*sorted_items)
        plt.figure(figsize=(12, 6))
        plt.barh(fields, avg_counts)
        plt.xlabel('Average Word Count')
        plt.title(f'Average Word Count per Field - {province}')
        plt.tight_layout()
        plt.grid(axis='x')
        filename = f"{self._slug(province)}_field_word_counts.png"
        output_path = os.path.join(self.output_dir, filename)
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        self.logger.info(f"Plot saved: {output_path}")
        return output_path

    def _zip_plots(self) -> str:
        zip_path = os.path.join(self.output_dir, self.zip_filename)
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file in os.listdir(self.output_dir):
                if file.endswith('.png'):
                    zipf.write(os.path.join(self.output_dir, file), file)
        self.logger.info(f"All plots zipped into: {zip_path}")
        return zip_path

    def analyze(self):
        self.logger.info("STARTING ANALYSIS")
        summary_rows = []
        field_avg_rows = []
        for filename in os.listdir(self.input_dir):
            if not filename.lower().endswith('.json'):
                continue
            province = os.path.splitext(filename)[0]
            file_path = os.path.join(self.input_dir, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            rec_count, word_count, field_lengths = self._process_data(data)
            summary_rows.append({
                "province": province,
                "total_records": rec_count,
                "total_words": word_count,
            })
            for field, lengths in field_lengths.items():
                if not lengths:
                    continue
                avg = sum(lengths) / len(lengths)
                field_avg_rows.append({
                    "province": province,
                    "field": field,
                    "avg_word_count": round(avg, 4),
                    "samples": len(lengths),
                })
            self._plot_field_averages(province, field_lengths)
        summary_path = os.path.join(self.output_dir, "summary_by_province.csv")
        with open(summary_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["province", "total_records", "total_words"])
            writer.writeheader()
            writer.writerows(summary_rows)
        field_avg_path = os.path.join(self.output_dir, "field_averages_long.csv")
        with open(field_avg_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["province", "field", "avg_word_count", "samples"])
            writer.writeheader()
            writer.writerows(field_avg_rows)
        self.logger.info(f"Wrote tables: {summary_path}, {field_avg_path}")
        self.logger.info("CREATING ZIP ARCHIVE OF PLOTS")
        self._zip_plots()
        self.logger.info("ANALYSIS COMPLETE")

In [20]:
analyzer = ProvinceDataAnalyzer(input_dir='province_datasets', output_dir='Stats', zip_filename='Stats.zip')
analyzer.analyze()

2025-08-08 20:40:17,905 - INFO - STARTING ANALYSIS
INFO:__main__:STARTING ANALYSIS
2025-08-08 20:40:18,583 - INFO - Plot saved: results/Chahar_Mahal_Bakhtiari_field_word_counts.png
INFO:__main__:Plot saved: results/Chahar_Mahal_Bakhtiari_field_word_counts.png
2025-08-08 20:40:19,266 - INFO - Plot saved: results/Kohgiluyeh_and_Boyerahmad_field_word_counts.png
INFO:__main__:Plot saved: results/Kohgiluyeh_and_Boyerahmad_field_word_counts.png
2025-08-08 20:40:19,936 - INFO - Plot saved: results/Boushehr_field_word_counts.png
INFO:__main__:Plot saved: results/Boushehr_field_word_counts.png
2025-08-08 20:40:20,629 - INFO - Plot saved: results/Fars_field_word_counts.png
INFO:__main__:Plot saved: results/Fars_field_word_counts.png
2025-08-08 20:40:21,329 - INFO - Plot saved: results/Isfahan_field_word_counts.png
INFO:__main__:Plot saved: results/Isfahan_field_word_counts.png
2025-08-08 20:40:22,288 - INFO - Plot saved: results/Hormozgan_field_word_counts.png
INFO:__main__:Plot saved: results/H

In [18]:
!rm -rf results