In [1]:
import numpy as np, pandas as pd
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupKFold

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/all_cities_weather_ready_train.parquet")
df   = pd.read_parquet(DATA)

TARGET = "kWh_per_m2"
CAT   = ["BuildingType"]
NUM   = ["tilt","tilt2","tilt_sin","tilt_cos",
         "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

def make_preprocessor(cat_cols, num_cols):
    return ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols)
    ], remainder="drop")

def evaluate(pipe, Xtr, ytr, Xte, yte, label=""):
    pipe.fit(Xtr, ytr)
    pred = pipe.predict(Xte)
    mae = mean_absolute_error(yte, pred)
    mse = mean_squared_error(yte, pred)
    r2  = r2_score(yte, pred)
    print(f"{label:22s} | MAE={mae:,.2f} | MSE={mse:,.2f} | R2={r2:.3f}")
    return {"label":label, "MAE":mae, "MSE":mse, "R2":r2}

X = df[CAT + NUM].copy()
y = df[TARGET].astype(float).values
groups_city = df["City"]  # used only for grouping splits

In [2]:
gkf = GroupKFold(n_splits=5)

models = {
    "XGB": XGBRegressor(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
    ),
    "LGBM": LGBMRegressor(
        n_estimators=1200, learning_rate=0.05, num_leaves=31,
        subsample=0.8, colsample_bytree=0.8, random_state=42
    )
}

pre = make_preprocessor(["BuildingType"], NUM)

summary = []
for name, mdl in models.items():
    maes = []
    for fold,(tr,te) in enumerate(gkf.split(X, y, groups=groups_city),1):
        pipe = Pipeline([("pre", pre), ("m", mdl)])
        res = evaluate(pipe, X.iloc[tr], y[tr], X.iloc[te], y[te], label=f"{name}-gkf-noCity-f{fold}")
        maes.append(res["MAE"])
    print(f"AVG {name}-gkf-noCity MAE: {np.mean(maes):.2f}")
    summary.append((name, float(np.mean(maes))))
print("\nModel selection summary:", summary)

XGB-gkf-noCity-f1      | MAE=9.87 | MSE=143.08 | R2=-0.540
XGB-gkf-noCity-f2      | MAE=23.02 | MSE=677.46 | R2=-0.025
XGB-gkf-noCity-f3      | MAE=8.94 | MSE=116.89 | R2=0.643
XGB-gkf-noCity-f4      | MAE=21.73 | MSE=682.87 | R2=0.594
XGB-gkf-noCity-f5      | MAE=26.80 | MSE=962.37 | R2=0.271
AVG XGB-gkf-noCity MAE: 18.07
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1137
[LightGBM] [Info] Number of data points in the train set: 4916288, number of used features: 18
[LightGBM] [Info] Start training from score 275.411072




LGBM-gkf-noCity-f1     | MAE=10.07 | MSE=148.10 | R2=-0.594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 5014958, number of used features: 17
[LightGBM] [Info] Start training from score 272.288336




LGBM-gkf-noCity-f2     | MAE=29.77 | MSE=1,133.02 | R2=-0.714
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 5016861, number of used features: 18
[LightGBM] [Info] Start training from score 268.138354




LGBM-gkf-noCity-f3     | MAE=8.87 | MSE=162.95 | R2=0.503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.154538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1114
[LightGBM] [Info] Number of data points in the train set: 5012306, number of used features: 18
[LightGBM] [Info] Start training from score 261.995774




LGBM-gkf-noCity-f4     | MAE=24.16 | MSE=807.66 | R2=0.519
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 5013591, number of used features: 18
[LightGBM] [Info] Start training from score 266.194550




LGBM-gkf-noCity-f5     | MAE=25.11 | MSE=842.18 | R2=0.362
AVG LGBM-gkf-noCity MAE: 19.59

Model selection summary: [('XGB', 18.071071130437197), ('LGBM', 19.594435560780493)]


In [3]:
best = LGBMRegressor(
    n_estimators=1200, learning_rate=0.05, num_leaves=31,
    subsample=0.8, colsample_bytree=0.8, random_state=42
)
pre = make_preprocessor(["BuildingType"], NUM)

rows = []
cities = sorted(df["City"].unique())
for c in cities:
    tr = df["City"] != c; te = ~tr
    Xtr, ytr = df.loc[tr, ["BuildingType"] + NUM], df.loc[tr, TARGET].values
    Xte, yte = df.loc[te, ["BuildingType"] + NUM], df.loc[te, TARGET].values

    pipe = Pipeline([("pre", pre), ("m", best)])
    res = evaluate(pipe, Xtr, ytr, Xte, yte, label=f"LGBM-LOCO-noCity-{c}")
    rows.append({"City": c, **res})

out = pd.DataFrame(rows)
Path("results").mkdir(exist_ok=True, parents=True)
out.to_csv("results/PersonC_LGBM_LOCO_noCity.csv", index=False)
print("\nSaved: results/PersonC_LGBM_LOCO_noCity.csv")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5978388, number of used features: 18
[LightGBM] [Info] Start training from score 268.842619




LGBM-LOCO-noCity-Accra | MAE=10.30 | MSE=149.79 | R2=-0.892
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6115887, number of used features: 18
[LightGBM] [Info] Start training from score 269.564842




LGBM-LOCO-noCity-Almaty | MAE=19.30 | MSE=550.77 | R2=0.377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6195572, number of used features: 18
[LightGBM] [Info] Start training from score 268.584899




LGBM-LOCO-noCity-Antigua | MAE=14.23 | MSE=315.35 | R2=-0.112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1134
[LightGBM] [Info] Number of data points in the train set: 6175815, number of used features: 18
[LightGBM] [Info] Start training from score 269.121794




LGBM-LOCO-noCity-Beirut | MAE=13.11 | MSE=250.49 | R2=-0.554
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5978055, number of used features: 18
[LightGBM] [Info] Start training from score 268.325699




LGBM-LOCO-noCity-Colombo | MAE=6.98 | MSE=64.80 | R2=-0.209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5727832, number of used features: 18
[LightGBM] [Info] Start training from score 267.757846




LGBM-LOCO-noCity-DarEsSalaam | MAE=3.67 | MSE=20.83 | R2=0.613
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6206305, number of used features: 18
[LightGBM] [Info] Start training from score 268.762797




LGBM-LOCO-noCity-Dominica | MAE=32.29 | MSE=1,522.04 | R2=-1.781
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1131
[LightGBM] [Info] Number of data points in the train set: 5619692, number of used features: 17
[LightGBM] [Info] Start training from score 272.192966




LGBM-LOCO-noCity-GreatDhakaRegion | MAE=9.01 | MSE=134.22 | R2=0.406
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1135
[LightGBM] [Info] Number of data points in the train set: 6193025, number of used features: 18
[LightGBM] [Info] Start training from score 268.658167




LGBM-LOCO-noCity-Grenada | MAE=14.94 | MSE=375.10 | R2=0.102
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6020682, number of used features: 18
[LightGBM] [Info] Start training from score 268.434705




LGBM-LOCO-noCity-Honduras | MAE=33.55 | MSE=1,284.41 | R2=-5.226
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6053117, number of used features: 18
[LightGBM] [Info] Start training from score 268.957868




LGBM-LOCO-noCity-Izmir | MAE=19.77 | MSE=528.41 | R2=0.294
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5979797, number of used features: 18
[LightGBM] [Info] Start training from score 267.134370




LGBM-LOCO-noCity-Karachi | MAE=42.44 | MSE=1,813.10 | R2=-94.909
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1137
[LightGBM] [Info] Number of data points in the train set: 6031234, number of used features: 18
[LightGBM] [Info] Start training from score 269.799639




LGBM-LOCO-noCity-Lagos | MAE=9.28 | MSE=117.21 | R2=-0.650
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1137
[LightGBM] [Info] Number of data points in the train set: 4916288, number of used features: 18
[LightGBM] [Info] Start training from score 275.411072




LGBM-LOCO-noCity-LagosState | MAE=10.07 | MSE=148.10 | R2=-0.594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6152077, number of used features: 18
[LightGBM] [Info] Start training from score 268.388829




LGBM-LOCO-noCity-Maldives | MAE=15.47 | MSE=261.05 | R2=-1.460
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5947517, number of used features: 18
[LightGBM] [Info] Start training from score 270.285559




LGBM-LOCO-noCity-Manila | MAE=9.04 | MSE=134.14 | R2=-0.031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.192301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5736870, number of used features: 18
[LightGBM] [Info] Start training from score 262.828859




LGBM-LOCO-noCity-Mexico City | MAE=36.61 | MSE=1,437.38 | R2=-3.513
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5974252, number of used features: 18
[LightGBM] [Info] Start training from score 268.013639




LGBM-LOCO-noCity-Nairobi | MAE=11.13 | MSE=197.60 | R2=-3.262
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6055041, number of used features: 18
[LightGBM] [Info] Start training from score 268.771265




LGBM-LOCO-noCity-Panama | MAE=13.33 | MSE=256.77 | R2=-0.033
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6233617, number of used features: 18
[LightGBM] [Info] Start training from score 268.874496




LGBM-LOCO-noCity-Rustavi | MAE=28.34 | MSE=1,298.49 | R2=-0.163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1135
[LightGBM] [Info] Number of data points in the train set: 6198703, number of used features: 18
[LightGBM] [Info] Start training from score 268.661118




LGBM-LOCO-noCity-SVG   | MAE=16.90 | MSE=431.82 | R2=0.110
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036647 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6054023, number of used features: 18
[LightGBM] [Info] Start training from score 269.085135




LGBM-LOCO-noCity-Samarkand | MAE=18.91 | MSE=502.44 | R2=0.302
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 5831616, number of used features: 18
[LightGBM] [Info] Start training from score 266.646106




LGBM-LOCO-noCity-SouthAfrica | MAE=24.62 | MSE=916.37 | R2=0.211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6240021, number of used features: 18
[LightGBM] [Info] Start training from score 268.775942




LGBM-LOCO-noCity-StLucia | MAE=15.02 | MSE=333.98 | R2=-0.094
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6228598, number of used features: 18
[LightGBM] [Info] Start training from score 268.669692
LGBM-LOCO-noCity-StMaarten | MAE=26.02 | MSE=810.48 | R2=-1.424

Saved: results/PersonC_LGBM_LOCO_noCity.csv


