In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# === Load balanced-by-type dataset ===
DATA = Path("/Users/thetsusann/Documents/ML/Energy404---Rooftop-Solar-Potential/New_approach/dataset/cleaned_datasets/top20_balanced_by_type.parquet")
df = pd.read_parquet(DATA)

print("âœ… Loaded dataset:", df.shape)
print(df["BuildingType"].value_counts())


âœ… Loaded dataset: (249428, 12)
BuildingType
commercial                   40000
industrial                   40000
multifamily residential      40000
public sector                40000
single family residential    40000
peri-urban settlement        16960
schools                      14596
public health facilities      8009
hotels                        7493
small commercial              2370
Name: count, dtype: int64


In [2]:
# === Feature sets ===
TARGET = "kWh_per_m2"
CAT = ["BuildingType"]
NUM = [
    "tilt","tilt2","tilt_sin","tilt_cos",
    "GHI_kWh_per_m2_day","AvgTemp_C",
    "ClearnessIndex","Precip_mm_per_day"
]

X = df[NUM + CAT].copy()
y = df[TARGET].copy()

# Categorical conversion for LightGBM
for c in CAT:
    X[c] = X[c].astype("category")

# Encode categorical for XGB
X_xgb = X.copy()
X_xgb["BuildingType"] = X_xgb["BuildingType"].cat.codes

# Log-transform the target
y_log = np.log1p(y)


In [3]:
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    enable_categorical=False
)

lgb_params = dict(
    objective='mae',
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=15,
    min_child_samples=100,
    lambda_l1=1.0,
    lambda_l2=1.0,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


In [None]:
cv = GroupKFold(n_splits=3)
mae_scores = []
oof = []

for fold, (tr, va) in enumerate(cv.split(X, y_log, groups=df["City"]), 1):
    X_tr_xgb, X_va_xgb = X_xgb.iloc[tr], X_xgb.iloc[va]
    X_tr_lgb, X_va_lgb = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y_log.iloc[tr], y_log.iloc[va]

    # --- Train base models ---
    xgb = XGBRegressor(**xgb_params)
    lgb = LGBMRegressor(**lgb_params)

    xgb.fit(X_tr_xgb, y_tr, eval_set=[(X_va_xgb, y_va)], verbose=False)
    lgb.fit(X_tr_lgb, y_tr, eval_set=[(X_va_lgb, y_va)],
            callbacks=[early_stopping(stopping_rounds=150, verbose=False)])

    # --- Predict & inverse-transform ---
    pred_xgb = np.expm1(xgb.predict(X_va_xgb))
    pred_lgb = np.expm1(lgb.predict(X_va_lgb))

    # --- Meta-learner (Ridge stacking) ---
    meta = Ridge(alpha=1.0)
    meta.fit(np.column_stack([pred_xgb, pred_lgb]), np.expm1(y_va))
    stacked = meta.predict(np.column_stack([pred_xgb, pred_lgb]))

    # --- Evaluate ---
    y_true = np.expm1(y_va)
    mae = mean_absolute_error(y_true, stacked)
    mae_scores.append(mae)
    print(f"Fold {fold} MAE = {mae:.3f}")

    # --- Store OOF predictions for analysis ---
    oof.append(pd.DataFrame({
        "City": df.loc[va, "City"],
        "BuildingType": df.loc[va, "BuildingType"].values,
        "y_true": y_true,
        "y_pred": stacked
    }))

print(f"\nðŸŽ¯ Mean Stacked Ensemble MAE: {np.mean(mae_scores):.3f} Â± {np.std(mae_scores):.3f}")

# Combine all folds
oof_df = pd.concat(oof, ignore_index=True)
oof_df.to_parquet("oof_balanced_by_type.parquet", index=False)
print("âœ… Saved OOF predictions for error analysis.")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 168309, number of used features: 9
[LightGBM] [Info] Start training from score 5.610894
Fold 1 MAE = 18.887


In [None]:
print(oof_df.head())
print("\nMean absolute error overall:", mean_absolute_error(oof_df["y_true"], oof_df["y_pred"]))
print("\nMAE by BuildingType:")
print(oof_df.groupby("BuildingType").apply(lambda d: mean_absolute_error(d["y_true"], d["y_pred"])))
