In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# === Load balanced-by-type dataset ===
DATA = Path("/Users/thetsusann/Documents/ML/Energy404---Rooftop-Solar-Potential/New_approach/dataset/cleaned_datasets/top20_balanced_by_type.parquet")
df = pd.read_parquet(DATA)

print("âœ… Loaded dataset:", df.shape)
print(df["BuildingType"].value_counts())


âœ… Loaded dataset: (249428, 12)
BuildingType
commercial                   40000
industrial                   40000
multifamily residential      40000
public sector                40000
single family residential    40000
peri-urban settlement        16960
schools                      14596
public health facilities      8009
hotels                        7493
small commercial              2370
Name: count, dtype: int64


In [2]:
# === Feature sets ===
TARGET = "kWh_per_m2"
CAT = ["BuildingType"]
NUM = [
    "tilt","tilt2","tilt_sin","tilt_cos",
    "GHI_kWh_per_m2_day","AvgTemp_C",
    "ClearnessIndex","Precip_mm_per_day"
]

X = df[NUM + CAT].copy()
y = df[TARGET].copy()

for c in CAT:
    X[c] = X[c].astype("category")

# Encode categorical for XGB & et
X_encoded = X.copy()
X_encoded["BuildingType"] = X_encoded["BuildingType"].cat.codes

# Log-transform target
y_log = np.log1p(y)


In [3]:
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# Base model parameters
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgb_params = dict(
    objective='mae',
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=15,
    min_child_samples=100,
    lambda_l1=1.0,
    lambda_l2=1.0,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

et_params = dict(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)


In [4]:
cv = GroupKFold(n_splits=3)
mae_scores = []
oof = []

for fold, (tr, va) in enumerate(cv.split(X, y_log, groups=df["City"]), 1):
    X_tr_lgb, X_va_lgb = X.iloc[tr], X.iloc[va]
    X_tr_enc, X_va_enc = X_encoded.iloc[tr], X_encoded.iloc[va]
    y_tr, y_va = y_log.iloc[tr], y_log.iloc[va]

    # --- Base models ---
    lgb = LGBMRegressor(**lgb_params)
    xgb = XGBRegressor(**xgb_params)
    et  = ExtraTreesRegressor(**et_params)

    # --- Train ---
    lgb.fit(X_tr_lgb, y_tr, eval_set=[(X_va_lgb, y_va)],
            callbacks=[early_stopping(stopping_rounds=150, verbose=False)])
    xgb.fit(X_tr_enc, y_tr, eval_set=[(X_va_enc, y_va)], verbose=False)
    et.fit(X_tr_enc, y_tr)

    # --- Predict ---
    pred_lgb = np.expm1(lgb.predict(X_va_lgb))
    pred_xgb = np.expm1(xgb.predict(X_va_enc))
    pred_et  = np.expm1(et.predict(X_va_enc))

    # --- Meta-learner (Ridge) ---
    meta = Ridge(alpha=1.0)
    meta.fit(np.column_stack([pred_lgb, pred_xgb, pred_et]), np.expm1(y_va))
    stacked = meta.predict(np.column_stack([pred_lgb, pred_xgb, pred_et]))

    # --- Evaluate ---
    y_true = np.expm1(y_va)
    mae = mean_absolute_error(y_true, stacked)
    mae_scores.append(mae)
    print(f"Fold {fold} MAE = {mae:.3f}")

    # --- Store OOF ---
    oof.append(pd.DataFrame({
        "City": df.loc[va, "City"],
        "BuildingType": df.loc[va, "BuildingType"].values,
        "y_true": y_true,
        "y_pred": stacked
    }))

print(f"\nðŸŽ¯ Mean Stacked Ensemble (LGBM + XGB + et) MAE: {np.mean(mae_scores):.3f} Â± {np.std(mae_scores):.3f}")

# Combine folds
oof_df = pd.concat(oof, ignore_index=True)
oof_df.to_parquet("oof_balanced_by_type_et.parquet", index=False)
print("âœ… Saved OOF predictions (with ExtraTrees).")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 168309, number of used features: 9
[LightGBM] [Info] Start training from score 5.610894
Fold 1 MAE = 16.141
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 165387, number of used features: 9
[LightGBM] [Info] Start training from score 5.606667
Fold 2 MAE = 12.221
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000738 seconds.
You can set `force_row_wise=true` to remove 

In [5]:
print("Overall MAE:", mean_absolute_error(oof_df["y_true"], oof_df["y_pred"]))

print("\nMAE by BuildingType:")
print(oof_df.groupby("BuildingType", group_keys=False)
             .apply(lambda d: mean_absolute_error(d["y_true"], d["y_pred"]))
             .round(2))

Overall MAE: 17.08628839618206

MAE by BuildingType:
BuildingType
commercial                   17.71
hotels                       13.99
industrial                   14.56
multifamily residential      20.54
peri-urban settlement        17.84
public health facilities     19.75
public sector                15.73
schools                      18.79
single family residential    16.59
small commercial              7.08
dtype: float64


  .apply(lambda d: mean_absolute_error(d["y_true"], d["y_pred"]))
