In [23]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# === Load Dataset ===
DATA = Path("New_approach/dataset/cleaned_datasets/all_cities_weather_ready_train.parquet")
df = pd.read_parquet(DATA, engine="fastparquet")

# === Define Features ===
TARGET = "kWh_per_m2"
CAT = ["BuildingType"]
NUM = [
    "tilt", "tilt2", "tilt_sin", "tilt_cos",
    "GHI_kWh_per_m2_day", "AvgTemp_C",
    "ClearnessIndex", "Precip_mm_per_day"
]

X = df[NUM + CAT]
y = df[TARGET]

# âœ… Convert categorical columns to 'category' dtype
for col in CAT:
    X[col] = X[col].astype("category")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype("category")


Fine-Tuning LightGBM

In [29]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor

# === Transform target ===
y_log = np.log1p(y)

# === Prepare separate data for XGBoost (encode categorical) ===
X_xgb = X.copy()
X_xgb["BuildingType"] = X_xgb["BuildingType"].cat.codes

# === Base models ===
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    enable_categorical=False  # we encoded categories manually
)

lgb = LGBMRegressor(
    objective='mae',
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=15,
    min_child_samples=100,
    lambda_l1=1.0,
    lambda_l2=1.0,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# === Cross-validation and meta stacking ===
cv = GroupKFold(n_splits=3)
mae_scores = []

for fold, (tr, va) in enumerate(cv.split(X, y_log, groups=df["City"]), 1):
    # Use encoded data for XGB and original (categorical) for LGB
    X_tr_xgb, X_va_xgb = X_xgb.iloc[tr], X_xgb.iloc[va]
    X_tr_lgb, X_va_lgb = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y_log.iloc[tr], y_log.iloc[va]

    # Train XGBoost
    xgb.fit(X_tr_xgb, y_tr, eval_set=[(X_va_xgb, y_va)], verbose=False)

    # Train LightGBM with early stopping (new callback style)
    lgb.fit(
        X_tr_lgb, y_tr,
        eval_set=[(X_va_lgb, y_va)],
        callbacks=[early_stopping(stopping_rounds=150, verbose=False)]
    )

    # Predictions (inverse log1p transform)
    pred_xgb = np.expm1(xgb.predict(X_va_xgb))
    pred_lgb = np.expm1(lgb.predict(X_va_lgb))

    # --- Blend weights: test 0.3â€“0.7 LGB fraction ---
    blended = 0.3 * pred_xgb + 0.7 * pred_lgb

    # --- Meta-learner (stacking) ---
    meta = Ridge(alpha=1.0)
    meta.fit(np.column_stack([pred_xgb, pred_lgb]), np.expm1(y_va))
    stacked = meta.predict(np.column_stack([pred_xgb, pred_lgb]))

    mae = mean_absolute_error(np.expm1(y_va), stacked)
    mae_scores.append(mae)
    print(f"Fold {fold} MAE = {mae:.3f}")

print(f"\nðŸŽ¯ Mean Stacked Ensemble MAE: {np.mean(mae_scores):.3f} Â± {np.std(mae_scores):.3f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1103
[LightGBM] [Info] Number of data points in the train set: 4157956, number of used features: 9
[LightGBM] [Info] Start training from score 5.623585
Fold 1 MAE = 10.408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1092
[LightGBM] [Info] Number of data points in the train set: 4163079, number of used features: 9
[LightGBM] [Info] Start training from score 5.554214
Fold 2 MAE = 14.532
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024711 seconds.
You can set `force_row_wise=true` to remov