In [1]:
# === Cell 1: Imports & dataset ===
import pandas as pd
import numpy as np
from pathlib import Path

# Path to your balanced dataset
DATA = Path("/Users/thetsusann/Documents/ML/Energy404---Rooftop-Solar-Potential/New_approach/dataset/cleaned_datasets/top20_balanced_by_type.parquet")

df = pd.read_parquet(DATA)
print("Shape:", df.shape)
print("\nCities:", df["City"].nunique())
print("Building types:", df["BuildingType"].nunique())
print("\nCounts per type:\n", df["BuildingType"].value_counts())


Shape: (249428, 12)

Cities: 20
Building types: 10

Counts per type:
 BuildingType
commercial                   40000
industrial                   40000
multifamily residential      40000
public sector                40000
single family residential    40000
peri-urban settlement        16960
schools                      14596
public health facilities      8009
hotels                        7493
small commercial              2370
Name: count, dtype: int64


In [2]:
# === Cell 2: Define features and preprocessing ===

TARGET = "kWh_per_m2"
CAT = ["BuildingType"]
NUM = [
    "tilt","tilt2","tilt_sin","tilt_cos",
    "GHI_kWh_per_m2_day","AvgTemp_C",
    "ClearnessIndex","Precip_mm_per_day"
]

X = df[NUM + CAT].copy()
y = df[TARGET].copy()

# Encode categorical for LightGBM
for c in CAT:
    X[c] = X[c].astype("category")

# Encoded copy for XGB/RF
X_encoded = X.copy()
X_encoded["BuildingType"] = X_encoded["BuildingType"].cat.codes

# Log-transform target (for numerical stability)
y_log = np.log1p(y)

print("Feature matrix:", X.shape)


Feature matrix: (249428, 9)


In [3]:
# === Cell 3: Model parameters ===
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    enable_categorical=False
)

lgb_params = dict(
    objective='mae',
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=15,
    min_child_samples=100,
    lambda_l1=1.0,
    lambda_l2=1.0,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

rf_params = dict(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)


In [4]:
# === Cell 4: Cross-validation + residual bias correction ===
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from lightgbm import early_stopping

cv = GroupKFold(n_splits=3)
mae_scores_corr = []
mae_scores_raw = []
oof = []

for fold, (tr, va) in enumerate(cv.split(X, y_log, groups=df["City"]), 1):
    print(f"\n===== Fold {fold} =====")
    X_tr_lgb, X_va_lgb = X.iloc[tr], X.iloc[va]
    X_tr_enc, X_va_enc = X_encoded.iloc[tr], X_encoded.iloc[va]
    y_tr, y_va = y_log.iloc[tr], y_log.iloc[va]

    # Train base models
    lgb = LGBMRegressor(**lgb_params)
    xgb = XGBRegressor(**xgb_params)
    rf  = RandomForestRegressor(**rf_params)

    lgb.fit(X_tr_lgb, y_tr, eval_set=[(X_va_lgb, y_va)], callbacks=[early_stopping(stopping_rounds=150, verbose=False)])
    xgb.fit(X_tr_enc, y_tr, eval_set=[(X_va_enc, y_va)], verbose=False)
    rf.fit(X_tr_enc, y_tr)

    # Predict validation fold
    pred_lgb = np.expm1(lgb.predict(X_va_lgb))
    pred_xgb = np.expm1(xgb.predict(X_va_enc))
    pred_rf  = np.expm1(rf.predict(X_va_enc))

    meta = Ridge(alpha=1.0)
    meta.fit(np.column_stack([pred_lgb, pred_xgb, pred_rf]), np.expm1(y_va))
    stacked = meta.predict(np.column_stack([pred_lgb, pred_xgb, pred_rf]))

    y_true = np.expm1(y_va)

    # === Stage 2: Bias correction (trained on training fold only) ===
    train_preds_lgb = np.expm1(lgb.predict(X_tr_lgb))
    train_preds_xgb = np.expm1(xgb.predict(X_tr_enc))
    train_preds_rf  = np.expm1(rf.predict(X_tr_enc))
    train_meta_preds = meta.predict(np.column_stack([train_preds_lgb, train_preds_xgb, train_preds_rf]))
    train_true = np.expm1(y_tr)

    train_residuals = train_true - train_meta_preds
    city_bias = pd.Series(train_residuals, index=df.iloc[tr]["City"]).groupby(level=0).mean()

    corrected = stacked + df.loc[va, "City"].map(city_bias).fillna(0)

    mae_raw = mean_absolute_error(y_true, stacked)
    mae_corr = mean_absolute_error(y_true, corrected)
    print(f"MAE raw={mae_raw:.3f} â†’ corrected={mae_corr:.3f}")

    mae_scores_raw.append(mae_raw)
    mae_scores_corr.append(mae_corr)

    oof.append(pd.DataFrame({
        "City": df.loc[va, "City"],
        "BuildingType": df.loc[va, "BuildingType"].values,
        "y_true": y_true,
        "y_pred_raw": stacked,
        "y_pred_corr": corrected
    }))

print("\nðŸŽ¯ Mean MAE (raw):", np.mean(mae_scores_raw))
print("ðŸŽ¯ Mean MAE (bias-corrected):", np.mean(mae_scores_corr))



===== Fold 1 =====
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 168309, number of used features: 9
[LightGBM] [Info] Start training from score 5.610894
MAE raw=18.780 â†’ corrected=18.780

===== Fold 2 =====
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 165387, number of used features: 9
[LightGBM] [Info] Start training from score 5.606667
MAE raw=12.004 â†’ corrected=12.004

===== Fold 3 =====
[LightGBM] [Info] Auto-choosing row-wise multi-threading

In [5]:
# === Cell 5: Type-level MAE comparison ===
oof_df = pd.concat(oof, ignore_index=True)
oof_df.to_parquet("oof_with_city_biascorr.parquet", index=False)
print("âœ… Saved OOF predictions with bias correction")

print("\nMAE by BuildingType (raw vs corrected):")
type_mae = oof_df.groupby("BuildingType").apply(
    lambda d: pd.Series({
        "raw": mean_absolute_error(d["y_true"], d["y_pred_raw"]),
        "corr": mean_absolute_error(d["y_true"], d["y_pred_corr"])
    })
).round(2)

print(type_mae)


âœ… Saved OOF predictions with bias correction

MAE by BuildingType (raw vs corrected):
                             raw   corr
BuildingType                           
commercial                 17.42  17.42
hotels                     12.76  12.76
industrial                 14.93  14.93
multifamily residential    17.26  17.26
peri-urban settlement      22.56  22.56
public health facilities   18.02  18.02
public sector              16.64  16.64
schools                    18.08  18.08
single family residential  17.08  17.08
small commercial           11.48  11.48


  type_mae = oof_df.groupby("BuildingType").apply(
