In [None]:
import os
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from blocksnet.analysis.land_use.prediction import SpatialClassifier
from blocksnet.machine_learning.strategy.sklearn.ensemble.voting.classification_strategy import SKLearnVotingClassificationStrategy

import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for
# the rest of the kernel session.
# NOTE(review): this also hides warnings emitted by the ML libraries used
# below; consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

In [None]:
import pickle
import pandas as pd
import geopandas as gpd
from pathlib import Path

def load_gdfs(root: str | Path, pattern: str = "*.pkl", target_crs: str | None = None) -> list[gpd.GeoDataFrame]:
    """
    Load GeoDataFrames from pickle (``.pkl``) files.

    ``root`` may be:
      - a single ``.pkl`` file: only that file is read;
      - a directory: it is searched recursively with ``pattern`` and every
        match with a ``.pkl`` suffix is read, in sorted path order.

    Each pickle may contain either a GeoDataFrame (or a plain DataFrame with a
    'geometry' column, which is converted), or a dict whose values are such
    frames. Dict values that are not frames are ignored.

    The 'city' and 'country' columns are filled from the file location,
    treating the path as ``.../<country>/<city>.pkl``:
      - 'city' is set when the column is missing or entirely NaN, to the dict
        key (for dict pickles) or to the file stem (otherwise);
      - 'country' is set to the parent directory name when the column is
        missing and the path has at least two parent components.

    Files that cannot be unpickled, or that contain no usable frame (including
    dicts without any GeoDataFrame value), are reported with ``print`` and
    skipped.

    Security: ``pickle.load`` can execute arbitrary code. Only point this
    function at files from a trusted source.

    Args:
        root: path to a ``.pkl`` file or to a directory.
        pattern: glob mask for files (default ``"*.pkl"``); only used when
            ``root`` is a directory.
        target_crs: if given, every frame is reprojected to this CRS; frames
            without a CRS get it assigned as-is (no reprojection).

    Returns:
        list[GeoDataFrame]; empty when ``root`` is neither a ``.pkl`` file nor
        a directory, or when nothing could be loaded.
    """
    p = Path(root)

    def _ensure_gdf(obj):
        """Return ``obj`` as a GeoDataFrame, or None if it is not frame-like."""
        if isinstance(obj, gpd.GeoDataFrame):
            return obj
        if isinstance(obj, pd.DataFrame) and "geometry" in obj.columns:
            return gpd.GeoDataFrame(obj, geometry="geometry", crs=getattr(obj, "crs", None))
        return None

    def _extract_frames(obj):
        """Return ``(frame, city_hint)`` pairs found in an unpickled object.

        ``city_hint`` is the stringified dict key for dict pickles and None for
        a pickle that is itself a frame.
        """
        frame = _ensure_gdf(obj)
        if frame is not None:
            return [(frame, None)]
        found = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                candidate = _ensure_gdf(value)
                if candidate is not None:
                    found.append((candidate, str(key)))
        return found

    def _finalize(frame, city, country):
        """Copy ``frame`` and apply the city/country/CRS post-processing."""
        frame = frame.copy()  # never mutate the unpickled object
        if "city" not in frame.columns or frame["city"].isna().all():
            frame["city"] = city
        if ("country" not in frame.columns) and country:
            frame["country"] = country
        if target_crs:
            # Reproject when a CRS is known; otherwise just label the data.
            frame = frame.to_crs(target_crs) if frame.crs else frame.set_crs(target_crs)
        return frame

    # Collect the list of files to read.
    if p.is_file() and p.suffix.lower() == ".pkl":
        files = [p]
    elif p.is_dir():
        files = [f for f in sorted(p.rglob(pattern)) if f.suffix.lower() == ".pkl"]
    else:
        return []

    gdfs: list[gpd.GeoDataFrame] = []

    for fp in files:
        try:
            with open(fp, "rb") as f:
                # SECURITY: unpickling runs arbitrary code; trusted files only.
                obj = pickle.load(f)
        except Exception as e:
            # Deliberate best-effort: one broken file must not abort the load.
            print(f"–ü—Ä–æ–ø—É—Å–∫–∞—é {fp}: {e}")
            continue

        frames = _extract_frames(obj)
        if not frames:
            # Covers both non-frame objects and dicts holding no frames.
            print(f"–ü—Ä–æ–ø—É—Å–∫–∞—é {fp}: –æ–±—ä–µ–∫—Ç –Ω–µ GeoDataFrame –∏ –Ω–µ dict —Å GeoDataFrame.")
            continue

        # Path is treated as .../<country>/<city>.pkl.
        country = fp.parent.name if len(fp.parents) >= 2 else None
        for frame, city_hint in frames:
            city = fp.stem if city_hint is None else city_hint
            gdfs.append(_finalize(frame, city, country))

    return gdfs

# Maps detailed land-use labels (strings of the form 'LandUse.<NAME>') to
# coarser merged classes. 'LandUse.TRANSPORT' maps to None.
# NOTE(review): None presumably marks a class to be dropped - confirm against
# the code that consumes this mapping (not visible in this notebook section).
MERGE_DICT = {
    **dict.fromkeys(
        ('LandUse.RECREATION', 'LandUse.SPECIAL', 'LandUse.AGRICULTURE'),
        'non_urban',
    ),
    **dict.fromkeys(('LandUse.BUSINESS', 'LandUse.RESIDENTIAL'), 'urban'),
    'LandUse.INDUSTRIAL': 'industrial',
    'LandUse.TRANSPORT': None,
}


In [None]:
# Load the pickled frames under data/blocks/Russia/ into a list. The path is
# relative to the kernel's working directory; load_gdfs returns an empty list
# if it is neither a .pkl file nor a directory.
russia = load_gdfs('data/blocks/Russia/')

In [None]:
# 1. Initialization and training
BASE_PARAMS = dict(random_state=42, n_jobs=-1)
# Thread budget for the boosted models: clamped to the range [1, 8].
CPU = min(8, max(1, os.cpu_count() or 1))

MODEL_PARAMS = {
    "rf": dict(
        n_estimators=120,           # was 200
        max_depth=7,
        class_weight="balanced",
        max_samples=0.25,           # bagging on a subsample
        min_samples_leaf=10,        # stabilisation and fewer nodes
        **BASE_PARAMS,
    ),
    "xgb": dict(
        n_estimators=150,           # fewer
        max_depth=7,
        learning_rate=0.05,
        subsample=0.8,              # stochasticity
        colsample_bytree=0.8,
        tree_method="hist",         # memory / speed
        n_jobs=CPU,                 # set explicitly: BASE_PARAMS is not merged in here
    ),
    "lgb": dict(
        n_estimators=200,
        max_depth=7,
        learning_rate=0.05,
        class_weight="balanced",
        num_threads=CPU,            # LGB uses a different name for this parameter
    ),
    "hgb": dict(
        max_iter=200,
        max_depth=7,
        learning_rate=0.05,
        random_state=42,
    ),
}

# One (name, fitted-later estimator) pair per entry of MODEL_PARAMS, in the
# order rf, xgb, lgb, hgb.
estimators = [
    (model_name, model_cls(**MODEL_PARAMS[model_name]))
    for model_name, model_cls in (
        ("rf", RandomForestClassifier),
        ("xgb", XGBClassifier),
        ("lgb", LGBMClassifier),
        ("hgb", HistGradientBoostingClassifier),
    )
]

# NOTE(review): the ensemble requests n_jobs=-1 while rf also uses n_jobs=-1
# and xgb/lgb use CPU threads each - possible CPU oversubscription; confirm.
strategy = SKLearnVotingClassificationStrategy(estimators, dict(voting="soft", n_jobs=-1))
# NOTE(review): the meaning of the positional arguments 1000 and 5 is not
# visible here - confirm against the SpatialClassifier signature.
classifier = SpatialClassifier(strategy, 1000, 5)
score = classifier.train(russia)

In [None]:
# Rebinds `classifier` to whatever SpatialClassifier.default() returns.
# NOTE(review): this replaces the classifier trained in the previous cell, so
# that trained model is not the one used below - confirm this is intended.
classifier = SpatialClassifier.default()
# Run the classifier on the loaded frames and keep the output in `result`.
result = classifier.run(russia)