In [1]:
# === Baseline OLS (GroupKFold by City) — MAE only ===
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")  # adjust if needed
OUT  = Path("artifacts"); OUT.mkdir(parents=True, exist_ok=True)

# Load data
cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

# Features
TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

# Preprocess + model
pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
])
pipe = Pipeline([("pre", pre), ("ols", LinearRegression())])

# Grouped CV by City
gkf = GroupKFold(n_splits=5)
groups = df["City"].astype(str).values
X = df[CAT+NUM]
y = df[TARGET].astype(float).values

fold_mae = []
for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
    pipe.fit(X.iloc[tr], y[tr])
    pred = pipe.predict(X.iloc[te])
    mae = mean_absolute_error(y[te], pred)
    fold_mae.append(mae)
    print(f"Fold {i}: MAE = {mae:.3f}")

print(f"\nOLS (city-grouped 5-fold) — MAE mean = {np.mean(fold_mae):.3f} ± {np.std(fold_mae):.3f}")

# Save summary
pd.DataFrame({"fold": range(1,6), "MAE": fold_mae}).to_csv(OUT/"ols_citygkf_mae.csv", index=False)
print("Saved:", (OUT/"ols_citygkf_mae.csv").as_posix())


Fold 1: MAE = 24.063
Fold 2: MAE = 29.624
Fold 3: MAE = 16.351
Fold 4: MAE = 26.603
Fold 5: MAE = 20.446

OLS (city-grouped 5-fold) — MAE mean = 23.417 ± 4.644
Saved: artifacts/ols_citygkf_mae.csv


In [2]:
# === Baseline Huber (GroupKFold by City) — MAE only ===
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")
OUT  = Path("artifacts"); OUT.mkdir(parents=True, exist_ok=True)

cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
])

# Huber: quadratic near 0, linear in tails (robust to outliers)
huber = HuberRegressor(epsilon=1.35, alpha=0.0001, max_iter=1000)

pipe = Pipeline([("pre", pre), ("huber", huber)])

gkf = GroupKFold(n_splits=5)
groups = df["City"].astype(str).values
X = df[CAT+NUM]
y = df[TARGET].astype(float).values

fold_mae = []
for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
    pipe.fit(X.iloc[tr], y[tr])
    pred = pipe.predict(X.iloc[te])
    mae = mean_absolute_error(y[te], pred)
    fold_mae.append(mae)
    print(f"Fold {i}: MAE = {mae:.3f}")

print(f"\nHuber (city-grouped 5-fold) — MAE mean = {np.mean(fold_mae):.3f} ± {np.std(fold_mae):.3f}")
pd.DataFrame({"fold": range(1,6), "MAE": fold_mae}).to_csv(OUT/"huber_citygkf_mae.csv", index=False)
print("Saved:", (OUT/"huber_citygkf_mae.csv").as_posix())


Fold 1: MAE = 22.701
Fold 2: MAE = 29.862
Fold 3: MAE = 16.353
Fold 4: MAE = 27.873
Fold 5: MAE = 20.657

Huber (city-grouped 5-fold) — MAE mean = 23.489 ± 4.887
Saved: artifacts/huber_citygkf_mae.csv


In [4]:
# === Baseline LightGBM (GroupKFold by City) — MAE with callbacks ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")
OUT  = Path("artifacts"); OUT.mkdir(parents=True, exist_ok=True)

cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),  # harmless for trees
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
])

gkf = GroupKFold(n_splits=5)
groups = df["City"].astype(str).values
X = df[CAT+NUM]
y = df[TARGET].astype(float).values

fold_mae = []
fold_iters = []
for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
    # transform once per fold
    Xt_tr = pre.fit_transform(X.iloc[tr])
    Xt_te = pre.transform(X.iloc[te])

    model = LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=63,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        # objective left default (L2); we evaluate on MAE via metric below
    )
    model.fit(
        Xt_tr, y[tr],
        eval_set=[(Xt_te, y[te])],
        eval_metric="l1",
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(period=0)  # silence logs
        ],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    mae = mean_absolute_error(y[te], pred)
    fold_mae.append(mae)
    fold_iters.append(model.best_iteration_)
    print(f"Fold {i}: MAE = {mae:.3f}  (iters={model.best_iteration_})")

print(f"\nLightGBM (city-grouped 5-fold) — MAE mean = {np.mean(fold_mae):.3f} ± {np.std(fold_mae):.3f}")
pd.DataFrame({"fold": range(1,6), "MAE": fold_mae, "iters": fold_iters}).to_csv(OUT/"lgbm_citygkf_mae.csv", index=False)
print("Saved:", (OUT/"lgbm_citygkf_mae.csv").as_posix())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 272.165832
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[313]	valid_0's l1: 17.2303	valid_0's l2: 445.788




Fold 1: MAE = 17.230  (iters=313)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 273.010929
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[73]	valid_0's l1: 17.5076	valid_0's l2: 403.12
Fold 2: MAE = 17.508  (iters=73)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 271.877599
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[64]	valid_0's l1: 11.7685	valid_0's l2: 223.666
Fold 3: MAE = 11.768  (iters=64)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004800 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 277.434894
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[234]	valid_0's l1: 18.2755	valid_0's l2: 616.441




Fold 4: MAE = 18.275  (iters=234)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 265.384856
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[360]	valid_0's l1: 21.6748	valid_0's l2: 730.554




Fold 5: MAE = 21.675  (iters=360)

LightGBM (city-grouped 5-fold) — MAE mean = 17.291 ± 3.185
Saved: artifacts/lgbm_citygkf_mae.csv


In [5]:
# === Quality anchors: Skill vs naive + NMAE% (LightGBM, GroupKFold by City) ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")

cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
])

gkf = GroupKFold(n_splits=5)
groups = df["City"].astype(str).values
X = df[CAT+NUM]
y = df[TARGET].astype(float).values

rows = []
for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
    # transform once per fold
    Xt_tr = pre.fit_transform(X.iloc[tr])
    Xt_te = pre.transform(X.iloc[te])

    # naive baseline: predict training median
    y_tr = y[tr]
    y_te = y[te]
    baseline_pred = np.full_like(y_te, fill_value=np.median(y_tr), dtype=float)
    mae_base = mean_absolute_error(y_te, baseline_pred)

    # model
    model = LGBMRegressor(
        n_estimators=2000, learning_rate=0.03, num_leaves=63,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        random_state=42, n_jobs=-1
    )
    model.fit(
        Xt_tr, y_tr,
        eval_set=[(Xt_te, y_te)],
        eval_metric="l1",
        callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=0)],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    mae = mean_absolute_error(y_te, pred)

    # anchors
    skill = 1.0 - (mae / mae_base)
    nmae = 100.0 * mae / float(np.mean(y_te))

    rows.append({
        "fold": i,
        "MAE": mae,
        "Baseline_MAE": mae_base,
        "Skill": skill,
        "NMAE_%": nmae,
        "iters": model.best_iteration_,
    })

res = pd.DataFrame(rows)
print(res.round(3).to_string(index=False))
print("\nMeans:")
print(res.mean(numeric_only=True).round(3).to_string())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 272.165832
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[313]	valid_0's l1: 17.2303	valid_0's l2: 445.788




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 273.010929
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[73]	valid_0's l1: 17.5076	valid_0's l2: 403.12




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 271.877599
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[64]	valid_0's l1: 11.7685	valid_0's l2: 223.666




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 277.434894
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[234]	valid_0's l1: 18.2755	valid_0's l2: 616.441




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 265.384856
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[360]	valid_0's l1: 21.6748	valid_0's l2: 730.554




 fold    MAE  Baseline_MAE  Skill  NMAE_%  iters
    1 17.230        26.220  0.343   6.353    313
    2 17.508        23.873  0.267   6.537     73
    3 11.768        21.591  0.455   4.321     64
    4 18.275        32.336  0.435   7.306    234
    5 21.675        34.639  0.374   7.265    360

Means:
fold              3.000
MAE              17.291
Baseline_MAE     27.732
Skill             0.375
NMAE_%            6.356
iters           208.800


In [6]:
# === LightGBM tuned for MAE (regression_l1) + smoother leaves ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")

cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
])

gkf = GroupKFold(n_splits=5)
groups = df["City"].astype(str).values
X = df[CAT+NUM]
y = df[TARGET].astype(float).values

fold_mae, fold_iters = [], []
for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
    Xt_tr = pre.fit_transform(X.iloc[tr])
    Xt_te = pre.transform(X.iloc[te])

    model = LGBMRegressor(
        objective="regression_l1",   # optimize MAE directly
        n_estimators=4000,
        learning_rate=0.02,
        num_leaves=31,               # smaller leaves = smoother
        min_data_in_leaf=3000,       # stronger smoothing
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(
        Xt_tr, y[tr],
        eval_set=[(Xt_te, y[te])],
        eval_metric="l1",
        callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=0)],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    mae = mean_absolute_error(y[te], pred)
    fold_mae.append(mae)
    fold_iters.append(model.best_iteration_)
    print(f"Fold {i}: MAE = {mae:.3f}  (iters={model.best_iteration_})")

print(f"\nLightGBM (MAE objective, smoothed) — MAE mean = {np.mean(fold_mae):.3f} ± {np.std(fold_mae):.3f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 273.022827
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[146]	valid_0's l1: 19.9213
Fold 1: MAE = 19.921  (iters=146)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 273.011658
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[24]	valid_0's l1: 22.7804
Fold 2: MAE = 22.780  (iters=24)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 273.838135
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[104]	valid_0's l1: 11.4055
Fold 3: MAE = 11.406  (iters=104)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 278.404968
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[288]	valid_0's l1: 19.7261




Fold 4: MAE = 19.726  (iters=288)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 265.025757
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[677]	valid_0's l1: 25.2226




Fold 5: MAE = 25.223  (iters=677)

LightGBM (MAE objective, smoothed) — MAE mean = 19.811 ± 4.664


In [7]:
# === LightGBM (L2 objective) + smoother leaves (sanity check) ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")

cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

pre = ColumnTransformer([
    ("num", StandardScaler(), NUM),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
])

gkf = GroupKFold(n_splits=5)
groups = df["City"].astype(str).values
X = df[CAT+NUM]; y = df[TARGET].astype(float).values

fold_mae, fold_iters = [], []
for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
    Xt_tr = pre.fit_transform(X.iloc[tr])
    Xt_te = pre.transform(X.iloc[te])

    model = LGBMRegressor(
        objective="regression",   # L2/MSE (baseline objective)
        n_estimators=3000,
        learning_rate=0.03,
        num_leaves=31,            # smoother than baseline 63
        min_data_in_leaf=3000,    # smoothing
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(
        Xt_tr, y[tr],
        eval_set=[(Xt_te, y[te])],
        eval_metric="l1",
        callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=0)],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    mae = mean_absolute_error(y[te], pred)
    fold_mae.append(mae); fold_iters.append(model.best_iteration_)
    print(f"Fold {i}: MAE = {mae:.3f}  (iters={model.best_iteration_})")

print(f"\nLightGBM (L2 + smoother leaves) — MAE mean = {np.mean(fold_mae):.3f} ± {np.std(fold_mae):.3f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 272.165832
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[156]	valid_0's l1: 17.024	valid_0's l2: 435.35
Fold 1: MAE = 17.024  (iters=156)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 273.010929
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[53]	valid_0's l1: 17.8492	valid_0's l2: 414.285
Fold 2: MAE = 17.849  (iters=53)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 271.877599
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[66]	valid_0's l1: 11.544	valid_0's l2: 220.876
Fold 3: MAE = 11.544  (iters=66)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 277.434894
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[247]	valid_0's l1: 17.8915	valid_0's l2: 582.785




Fold 4: MAE = 17.891  (iters=247)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 766864, number of used features: 14
[LightGBM] [Info] Start training from score 265.384856
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[166]	valid_0's l1: 20.6986	valid_0's l2: 726.41
Fold 5: MAE = 20.699  (iters=166)

LightGBM (L2 + smoother leaves) — MAE mean = 17.001 ± 2.999




In [8]:
# === LOCO (Leave-One-City-Out) with best L2 LightGBM config ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")
OUT  = Path("artifacts"); OUT.mkdir(parents=True, exist_ok=True)

# Load
cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

cities = df["City"].astype(str).unique().tolist()
results = []

for city in sorted(cities):
    # Split LOCO
    test = df[df["City"] == city].copy()
    train = df[df["City"] != city].copy()

    # Preprocess (fit on train only)
    pre = ColumnTransformer([
        ("num", StandardScaler(), NUM),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
    ])
    Xtr_full = train[CAT+NUM]
    ytr_full = train[TARGET].values.astype(float)
    Xte = test[CAT+NUM]
    yte = test[TARGET].values.astype(float)

    # small validation from train cities only (no leakage)
    Xtr, Xval, ytr, yval = train_test_split(
        Xtr_full, ytr_full, test_size=0.10, random_state=42
    )

    Xt_tr  = pre.fit_transform(Xtr)
    Xt_val = pre.transform(Xval)
    Xt_te  = pre.transform(Xte)

    # Best L2 config from your CV sweep (smoother leaves)
    model = LGBMRegressor(
        objective="regression",       # L2/MSE
        n_estimators=3000,
        learning_rate=0.03,
        num_leaves=31,
        min_data_in_leaf=3000,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(
        Xt_tr, ytr,
        eval_set=[(Xt_val, yval)],
        eval_metric="l1",
        callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=0)],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    mae  = mean_absolute_error(yte, pred)

    results.append({"City": city, "MAE": mae, "iters": model.best_iteration_})

# Summaries
res = pd.DataFrame(results).sort_values("MAE").reset_index(drop=True)
macro_mae = res["MAE"].mean()
# micro-MAE weights by city size:
sizes = df.groupby("City").size().rename("n").reset_index()
res = res.merge(sizes, on="City", how="left")
micro_mae = (res["MAE"] * res["n"]).sum() / res["n"].sum()

print("=== LOCO MAE by held-out city (sorted best→worst) ===")
print(res[["City","n","MAE"]].to_string(index=False,
      formatters={"MAE": lambda x: f"{x:.3f}"}))
print(f"\nLOCO Macro-MAE (mean across 20 cities): {macro_mae:.3f}")
print(f"LOCO Micro-MAE (overall): {micro_mae:.3f}")

# Save
out_csv = OUT / "loco_lgbm_l2_smooth_mae.csv"
res.to_csv(out_csv, index=False)
print("\nSaved:", out_csv.as_posix())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.169051
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1888]	valid_0's l1: 7.83268	valid_0's l2: 147.955




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.898541
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1800]	valid_0's l1: 7.51307	valid_0's l2: 133.402




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.805649
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1856]	valid_0's l1: 7.60404	valid_0's l2: 141.653




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.747613
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1219]	valid_0's l1: 7.73658	valid_0's l2: 144.996




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.562076
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1311]	valid_0's l1: 7.95923	valid_0's l2: 148.998




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.437339
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1157]	valid_0's l1: 7.95203	valid_0's l2: 148.838




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.720206
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[977]	valid_0's l1: 7.72157	valid_0's l2: 143.836




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.347611
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1024]	valid_0's l1: 7.35028	valid_0's l2: 134.236




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.631985
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1856]	valid_0's l1: 7.57967	valid_0's l2: 142.605




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.323663
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1346]	valid_0's l1: 7.5361	valid_0's l2: 136.799




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.174022
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1962]	valid_0's l1: 8.00328	valid_0's l2: 151.007




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.619955
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1310]	valid_0's l1: 7.78116	valid_0's l2: 148.099




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.448833
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[995]	valid_0's l1: 7.77975	valid_0's l2: 147.678




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.788151
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1625]	valid_0's l1: 7.97665	valid_0's l2: 150.571




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.727424
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1120]	valid_0's l1: 7.82365	valid_0's l2: 148.497




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 268.549354
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1118]	valid_0's l1: 7.8867	valid_0's l2: 147.321




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005906 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.246760
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[2131]	valid_0's l1: 7.9687	valid_0's l2: 150.9




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.202964
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[935]	valid_0's l1: 7.61077	valid_0's l2: 143.382




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004940 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.609537
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[2305]	valid_0's l1: 7.44865	valid_0's l2: 134.123




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.436684
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1363]	valid_0's l1: 7.1427	valid_0's l2: 115.852




=== LOCO MAE by held-out city (sorted best→worst) ===
            City     n    MAE
     DarEsSalaam 47929  6.007
           Accra 47929  7.341
      LagosState 47929  9.063
           Lagos 47929  9.247
GreatDhakaRegion 47929  9.729
         Colombo 47929 11.159
          Beirut 47929 12.325
          Panama 47929 12.760
         Antigua 47929 13.443
         Nairobi 47929 13.891
          Manila 47929 15.607
         Grenada 47929 16.982
        Maldives 47929 17.376
       Samarkand 47929 20.167
           Izmir 47929 20.705
     SouthAfrica 47929 23.397
        Honduras 47929 24.445
     Mexico City 47929 34.833
         Karachi 47929 37.114
          Almaty 47929 39.992

LOCO Macro-MAE (mean across 20 cities): 17.779
LOCO Micro-MAE (overall): 17.779

Saved: artifacts/loco_lgbm_l2_smooth_mae.csv


In [9]:
# === LOCO with SuperType back-off added (same best L2 config) ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")
OUT  = Path("artifacts"); OUT.mkdir(parents=True, exist_ok=True)

# Load
cols = ["City","BuildingType_5","kWh_per_m2","tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day","SuperType"]
df = pd.read_parquet(DATA, columns=[c for c in cols if c in pd.read_parquet(DATA).columns]).copy()

# If SuperType is missing in this file, rebuild it quickly from BuildingType (fallback)
if "SuperType" not in df.columns:
    # lightweight map (same as we agreed)
    sup = {
        "single family residential":"Residential",
        "multifamily residential":"Residential",
        "peri-urban settlement":"Residential",
        "commercial":"Commercial",
        "small commercial":"Commercial",
        "hotels":"Commercial",
        "industrial":"Industrial",
        "public sector":"Public",
        "schools":"Public",
        "public health facilities":"Public",
    }
    # you'll have original BuildingType only if present; otherwise skip this block
    if "BuildingType" in df.columns:
        df["SuperType"] = df["BuildingType"].map(sup).astype("string")

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5", "SuperType"] if "SuperType" in df.columns else ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

cities = df["City"].astype(str).unique().tolist()
results = []

for city in sorted(cities):
    test  = df[df["City"] == city].copy()
    train = df[df["City"] != city].copy()

    pre = ColumnTransformer([
        ("num", StandardScaler(), NUM),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
    ])
    Xtr_full = train[CAT+NUM]; ytr_full = train[TARGET].values.astype(float)
    Xte = test[CAT+NUM];        yte = test[TARGET].values.astype(float)

    # small val split from training cities only
    Xtr, Xval, ytr, yval = train_test_split(Xtr_full, ytr_full, test_size=0.10, random_state=42)

    Xt_tr  = pre.fit_transform(Xtr)
    Xt_val = pre.transform(Xval)
    Xt_te  = pre.transform(Xte)

    model = LGBMRegressor(
        objective="regression",
        n_estimators=3000,
        learning_rate=0.03,
        num_leaves=31,
        min_data_in_leaf=3000,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(
        Xt_tr, ytr,
        eval_set=[(Xt_val, yval)],
        eval_metric="l1",
        callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=0)],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    mae  = mean_absolute_error(yte, pred)
    results.append({"City": city, "MAE": mae, "iters": model.best_iteration_})

res = pd.DataFrame(results).sort_values("MAE").reset_index(drop=True)
macro_mae = res["MAE"].mean()
sizes = df.groupby("City").size().rename("n").reset_index()
res = res.merge(sizes, on="City", how="left")
micro_mae = (res["MAE"] * res["n"]).sum() / res["n"].sum()

print("=== LOCO with SuperType — MAE by held-out city (best→worst) ===")
print(res[["City","n","MAE"]].to_string(index=False, formatters={"MAE": lambda x: f"{x:.3f}"}))
print(f"\nLOCO Macro-MAE: {macro_mae:.3f}   Micro-MAE: {micro_mae:.3f}")

out_csv = OUT / "loco_lgbm_l2_smooth_with_supertype_mae.csv"
res.to_csv(out_csv, index=False)
print("\nSaved:", out_csv.as_posix())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.169051
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1888]	valid_0's l1: 7.83268	valid_0's l2: 147.955




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.898541
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1800]	valid_0's l1: 7.51307	valid_0's l2: 133.402




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.805649
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1856]	valid_0's l1: 7.60404	valid_0's l2: 141.653




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.747613
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1219]	valid_0's l1: 7.73658	valid_0's l2: 144.996




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.562076
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1311]	valid_0's l1: 7.95923	valid_0's l2: 148.998




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.437339
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1157]	valid_0's l1: 7.95203	valid_0's l2: 148.838




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.720206
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[977]	valid_0's l1: 7.72157	valid_0's l2: 143.836




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004749 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.347611
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1024]	valid_0's l1: 7.35028	valid_0's l2: 134.236




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.631985
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1856]	valid_0's l1: 7.57967	valid_0's l2: 142.605




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.323663
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1346]	valid_0's l1: 7.5361	valid_0's l2: 136.799




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.174022
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1962]	valid_0's l1: 8.00328	valid_0's l2: 151.007




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.619955
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1310]	valid_0's l1: 7.78116	valid_0's l2: 148.099




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.448833
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[995]	valid_0's l1: 7.77975	valid_0's l2: 147.678




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.788151
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1625]	valid_0's l1: 7.97665	valid_0's l2: 150.571




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.727424
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1120]	valid_0's l1: 7.82365	valid_0's l2: 148.497




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 268.549354
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1118]	valid_0's l1: 7.8867	valid_0's l2: 147.321




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.246760
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[2131]	valid_0's l1: 7.9687	valid_0's l2: 150.9




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005921 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.202964
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[935]	valid_0's l1: 7.61077	valid_0's l2: 143.382




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005647 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.609537
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[2305]	valid_0's l1: 7.44865	valid_0's l2: 134.123




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.436684
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1363]	valid_0's l1: 7.1427	valid_0's l2: 115.852




=== LOCO with SuperType — MAE by held-out city (best→worst) ===
            City     n    MAE
     DarEsSalaam 47929  6.007
           Accra 47929  7.341
      LagosState 47929  9.063
           Lagos 47929  9.247
GreatDhakaRegion 47929  9.729
         Colombo 47929 11.159
          Beirut 47929 12.325
          Panama 47929 12.760
         Antigua 47929 13.443
         Nairobi 47929 13.891
          Manila 47929 15.607
         Grenada 47929 16.982
        Maldives 47929 17.376
       Samarkand 47929 20.167
           Izmir 47929 20.705
     SouthAfrica 47929 23.397
        Honduras 47929 24.445
     Mexico City 47929 34.833
         Karachi 47929 37.114
          Almaty 47929 39.992

LOCO Macro-MAE: 17.779   Micro-MAE: 17.779

Saved: artifacts/loco_lgbm_l2_smooth_with_supertype_mae.csv


In [10]:
# === LOCO diagnostics: MAE by BuildingType_5 for the hardest cities ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet")
OUT  = Path("artifacts"); OUT.mkdir(parents=True, exist_ok=True)

cols = ["City","BuildingType_5","kWh_per_m2",
        "tilt","tilt2","tilt_sin","tilt_cos",
        "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]
df = pd.read_parquet(DATA, columns=cols).copy()

TARGET = "kWh_per_m2"
CAT = ["BuildingType_5"]
NUM = ["tilt","tilt2","tilt_sin","tilt_cos","GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

cities = sorted(df["City"].astype(str).unique())
rows = []
per_city_type = []

for city in cities:
    test  = df[df["City"] == city].copy()
    train = df[df["City"] != city].copy()

    pre = ColumnTransformer([
        ("num", StandardScaler(), NUM),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT),
    ])

    Xtr_full = train[CAT+NUM]; ytr_full = train[TARGET].astype(float).values
    Xte = test[CAT+NUM];        yte = test[TARGET].astype(float).values
    Xtr, Xval, ytr, yval = train_test_split(Xtr_full, ytr_full, test_size=0.10, random_state=42)

    Xt_tr  = pre.fit_transform(Xtr)
    Xt_val = pre.transform(Xval)
    Xt_te  = pre.transform(Xte)

    model = LGBMRegressor(
        objective="regression",
        n_estimators=3000,
        learning_rate=0.03,
        num_leaves=31,
        min_data_in_leaf=3000,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(
        Xt_tr, ytr,
        eval_set=[(Xt_val, yval)],
        eval_metric="l1",
        callbacks=[early_stopping(stopping_rounds=150), log_evaluation(period=0)],
    )
    pred = model.predict(Xt_te, num_iteration=model.best_iteration_)
    overall = mean_absolute_error(yte, pred)
    rows.append({"City": city, "MAE": overall, "iters": model.best_iteration_})

    # per-type MAE in this city
    tmp = test[["BuildingType_5"]].copy()
    tmp["y"] = yte
    tmp["yhat"] = pred
    g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")
    for t, v in g.items():
        per_city_type.append({"City": city, "BuildingType_5": t, "MAE_type": v, "support": (tmp["BuildingType_5"]==t).sum()})

res = pd.DataFrame(rows).sort_values("MAE", ascending=False).reset_index(drop=True)
by_type = pd.DataFrame(per_city_type)

# Show top-5 hardest cities with their per-type MAE (sorted by MAE_type desc)
hard5 = res.head(5)["City"].tolist()
print("=== Hardest 5 cities (overall LOCO MAE) ===")
print(res.head(5).to_string(index=False, formatters={"MAE": lambda x: f"{x:.3f}"}))

print("\n=== Per-type MAE in each of the hardest cities ===")
for c in hard5:
    sub = by_type[by_type["City"]==c].sort_values("MAE_type", ascending=False)
    print(f"\n[{c}]")
    print(sub.to_string(index=False, formatters={"MAE_type": lambda x: f"{x:.3f}"}))

# Save artifacts
res.to_csv(OUT/"loco_city_overall_mae.csv", index=False)
by_type.to_csv(OUT/"loco_city_type_mae.csv", index=False)
print("\nSaved:",
      (OUT/"loco_city_overall_mae.csv").as_posix(),
      (OUT/"loco_city_type_mae.csv").as_posix())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.169051
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1888]	valid_0's l1: 7.83268	valid_0's l2: 147.955


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.898541
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1800]	valid_0's l1: 7.51307	valid_0's l2: 133.402


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.805649
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1856]	valid_0's l1: 7.60404	valid_0's l2: 141.653


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002960 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.747613
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1219]	valid_0's l1: 7.73658	valid_0's l2: 144.996


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.562076
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1311]	valid_0's l1: 7.95923	valid_0's l2: 148.998


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.437339
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1157]	valid_0's l1: 7.95203	valid_0's l2: 148.838


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.720206
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[977]	valid_0's l1: 7.72157	valid_0's l2: 143.836


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.347611
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1024]	valid_0's l1: 7.35028	valid_0's l2: 134.236


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.631985
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1856]	valid_0's l1: 7.57967	valid_0's l2: 142.605


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.323663
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1346]	valid_0's l1: 7.5361	valid_0's l2: 136.799


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.174022
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1962]	valid_0's l1: 8.00328	valid_0's l2: 151.007


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002945 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.619955
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1310]	valid_0's l1: 7.78116	valid_0's l2: 148.099


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.448833
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[995]	valid_0's l1: 7.77975	valid_0's l2: 147.678


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.788151
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1625]	valid_0's l1: 7.97665	valid_0's l2: 150.571


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005338 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 273.727424
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1120]	valid_0's l1: 7.82365	valid_0's l2: 148.497


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 268.549354
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1118]	valid_0's l1: 7.8867	valid_0's l2: 147.321


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 271.246760
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[2131]	valid_0's l1: 7.9687	valid_0's l2: 150.9


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.202964
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[935]	valid_0's l1: 7.61077	valid_0's l2: 143.382


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 272.609537
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[2305]	valid_0's l1: 7.44865	valid_0's l2: 134.123


  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 819585, number of used features: 14
[LightGBM] [Info] Start training from score 270.436684
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1363]	valid_0's l1: 7.1427	valid_0's l2: 115.852




=== Hardest 5 cities (overall LOCO MAE) ===
       City    MAE  iters
     Almaty 39.992   1800
    Karachi 37.114   1962
Mexico City 34.833   1118
   Honduras 24.445   1856
SouthAfrica 23.397   1363

=== Per-type MAE in each of the hardest cities ===

[Almaty]
  City            BuildingType_5 MAE_type  support
Almaty   multifamily residential   43.631     8111
Almaty single family residential   42.792    23965
Almaty             public sector   40.909     1619
Almaty                commercial   38.359     8500
Almaty                industrial   25.306     5734

[Karachi]
   City          BuildingType_5 MAE_type  support
Karachi           public sector   37.700     2974
Karachi multifamily residential   37.320    23965
Karachi                   Other   37.228    13662
Karachi              commercial   36.814     4262
Karachi              industrial   34.840     3066

[Mexico City]
       City            BuildingType_5 MAE_type  support
Mexico City             public sector   35.845    

  g = tmp.groupby("BuildingType_5").apply(lambda d: mean_absolute_error(d["y"], d["yhat"])).rename("MAE_type")


In [11]:
# === Slide visuals pack ===
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

ART = Path("artifacts")
OUT = ART  # save alongside other artifacts
OUT.mkdir(parents=True, exist_ok=True)

# ---------- 1) Model comparison (CV MAE ± std) ----------
# Using your run summaries
models = ["OLS", "Huber", "LightGBM (L2 baseline)", "LightGBM (L2 smoothed)"]
means  = [23.417, 23.489, 17.291, 17.001]
stds   = [ 4.644,  4.887,  3.185,  2.999]

plt.figure(figsize=(10,6))
x = np.arange(len(models))
plt.bar(x, means, yerr=stds, capsize=6)
plt.xticks(x, models, rotation=15, ha="right")
plt.ylabel("MAE (kWh/m²)")
plt.title("Cross-validated MAE by Model (GroupKFold by City)")
plt.tight_layout()
plt.savefig(OUT/"viz_model_comparison_cv.png", dpi=220)
plt.close()

# ---------- 2) LOCO MAE by city (sorted) ----------
loco_path = ART / "loco_lgbm_l2_smooth_mae.csv"
loco = pd.read_csv(loco_path)  # expects columns: City, MAE, n (if present)
loco = loco.sort_values("MAE", ascending=True)

plt.figure(figsize=(11,7))
y = np.arange(len(loco))
plt.barh(y, loco["MAE"].values)
plt.yticks(y, loco["City"].tolist())
plt.xlabel("MAE (kWh/m²)")
plt.title("LOCO — MAE by Held-out City (LightGBM L2 smoothed)")
plt.tight_layout()
plt.savefig(OUT/"viz_loco_mae_by_city.png", dpi=220)
plt.close()

# ---------- 3) Per-type MAE in the 3 hardest cities ----------
by_type = pd.read_csv(ART / "loco_city_type_mae.csv")  # City, BuildingType_5, MAE_type, support
hard3 = pd.read_csv(loco_path).sort_values("MAE", ascending=False)["City"].head(3).tolist()

order_types = [
    "single family residential",
    "multifamily residential",
    "commercial",
    "industrial",
    "public sector",
    "Other",
]

fig, axes = plt.subplots(1, 3, figsize=(14,4.5), sharey=True)
for ax, city in zip(axes, hard3):
    sub = by_type[by_type["City"]==city].copy()
    # ensure consistent type order (drop missing gracefully)
    sub["BuildingType_5"] = pd.Categorical(sub["BuildingType_5"], categories=order_types, ordered=True)
    sub = sub.sort_values("BuildingType_5")
    ax.bar(sub["BuildingType_5"].astype(str), sub["MAE_type"].values)
    ax.set_title(city)
    ax.set_xticklabels(sub["BuildingType_5"].astype(str), rotation=30, ha="right")
    ax.set_ylim(0, max(40, sub["MAE_type"].max()*1.15))
axes[0].set_ylabel("MAE (kWh/m²)")
fig.suptitle("LOCO — MAE by BuildingType in Hardest 3 Cities", y=1.02)
plt.tight_layout()
plt.savefig(OUT/"viz_hardest3_per_type.png", dpi=220, bbox_inches="tight")
plt.close()

print("Saved slide visuals:")
print(" - artifacts/viz_model_comparison_cv.png")
print(" - artifacts/viz_loco_mae_by_city.png")
print(" - artifacts/viz_hardest3_per_type.png")


  ax.set_xticklabels(sub["BuildingType_5"].astype(str), rotation=30, ha="right")
  ax.set_xticklabels(sub["BuildingType_5"].astype(str), rotation=30, ha="right")
  ax.set_xticklabels(sub["BuildingType_5"].astype(str), rotation=30, ha="right")


Saved slide visuals:
 - artifacts/viz_model_comparison_cv.png
 - artifacts/viz_loco_mae_by_city.png
 - artifacts/viz_hardest3_per_type.png


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958580 entries, 0 to 958579
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   City                958580 non-null  object 
 1   BuildingType_5      958580 non-null  object 
 2   kWh_per_m2          958580 non-null  float64
 3   tilt                958580 non-null  float64
 4   tilt2               958580 non-null  float64
 5   tilt_sin            958580 non-null  float64
 6   tilt_cos            958580 non-null  float64
 7   GHI_kWh_per_m2_day  958580 non-null  float64
 8   AvgTemp_C           958580 non-null  float64
 9   ClearnessIndex      958580 non-null  float64
 10  Precip_mm_per_day   958580 non-null  float64
dtypes: float64(9), object(2)
memory usage: 80.4+ MB


In [13]:
df.shape

(958580, 11)