In [1]:
import numpy as np, pandas as pd
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.linear_model import Ridge, HuberRegressor

DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/all_cities_weather_ready_train.parquet")
df   = pd.read_parquet(DATA)

TARGET = "kWh_per_m2"
CAT   = ["City", "BuildingType"]
NUM   = ["tilt","tilt2","tilt_sin","tilt_cos",
         "GHI_kWh_per_m2_day","AvgTemp_C","ClearnessIndex","Precip_mm_per_day"]

def make_preprocessor(cat_cols, num_cols):
    return ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols)
    ], remainder="drop")

def evaluate(pipe, Xtr, ytr, Xte, yte, label=""):
    pipe.fit(Xtr, ytr)
    pred = pipe.predict(Xte)
    mae = mean_absolute_error(yte, pred)
    mse = mean_squared_error(yte, pred)
    r2  = r2_score(yte, pred)
    print(f"{label:22s} | MAE={mae:,.2f} | MSE={mse:,.2f} | R2={r2:.3f}")
    return {"label":label, "MAE":mae, "MSE":mse, "R2":r2}

X_full = df[CAT + NUM].copy()
y_full = df[TARGET].astype(float).values
groups_city = df["City"]

In [2]:
X = df[["City","BuildingType"] + NUM]
y = y_full
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df["City"]
)
pre = make_preprocessor(["City","BuildingType"], NUM)

for name, model in [("Ridge", Ridge(alpha=1.0, random_state=42)),
                    ("Huber", HuberRegressor())]:
    pipe = Pipeline([("pre", pre), ("m", model)])
    evaluate(pipe, Xtr, ytr, Xte, yte, label=f"{name}-strat-withCity")

Ridge-strat-withCity   | MAE=9.34 | MSE=179.54 | R2=0.844


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber-strat-withCity   | MAE=9.16 | MSE=186.78 | R2=0.837


In [3]:
X = df[["BuildingType"] + NUM]
y = y_full
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df["City"]
)
pre = make_preprocessor(["BuildingType"], NUM)

for name, model in [("Ridge", Ridge(alpha=1.0, random_state=42)),
                    ("Huber", HuberRegressor())]:
    pipe = Pipeline([("pre", pre), ("m", model)])
    evaluate(pipe, Xtr, ytr, Xte, yte, label=f"{name}-strat-noCity")

Ridge-strat-noCity     | MAE=14.23 | MSE=355.72 | R2=0.690


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber-strat-noCity     | MAE=13.95 | MSE=365.69 | R2=0.682


In [4]:
def gkf_eval(model, use_city=True):
    maes, mses, r2s = [], [], []
    gkf = GroupKFold(n_splits=5)
    if use_city:
        X = df[["City","BuildingType"] + NUM]
        pre = make_preprocessor(["City","BuildingType"], NUM)
        tag = "gkf-withCity"
    else:
        X = df[["BuildingType"] + NUM]
        pre = make_preprocessor(["BuildingType"], NUM)
        tag = "gkf-noCity"

    for fold, (tr, te) in enumerate(gkf.split(X, y_full, groups=groups_city), 1):
        pipe = Pipeline([("pre", pre), ("m", model)])
        res = evaluate(pipe, X.iloc[tr], y_full[tr], X.iloc[te], y_full[te],
                       label=f"{type(model).__name__}-{tag}-f{fold}")
        maes.append(res["MAE"]); mses.append(res["MSE"]); r2s.append(res["R2"])
    print(f"AVG {type(model).__name__}-{tag}: MAE={np.mean(maes):.2f} | R2={np.mean(r2s):.3f}")

gkf_eval(Ridge(alpha=1.0, random_state=42), use_city=True)
gkf_eval(Ridge(alpha=1.0, random_state=42), use_city=False)
gkf_eval(HuberRegressor(), use_city=True)
gkf_eval(HuberRegressor(), use_city=False)

Ridge-gkf-withCity-f1  | MAE=9.21 | MSE=133.02 | R2=-0.431
Ridge-gkf-withCity-f2  | MAE=13.12 | MSE=256.32 | R2=0.612
Ridge-gkf-withCity-f3  | MAE=9.80 | MSE=184.39 | R2=0.437
Ridge-gkf-withCity-f4  | MAE=31.50 | MSE=1,394.59 | R2=0.170
Ridge-gkf-withCity-f5  | MAE=34.18 | MSE=1,444.47 | R2=-0.094
AVG Ridge-gkf-withCity: MAE=19.56 | R2=0.139
Ridge-gkf-noCity-f1    | MAE=14.28 | MSE=369.62 | R2=-2.977
Ridge-gkf-noCity-f2    | MAE=16.59 | MSE=388.64 | R2=0.412
Ridge-gkf-noCity-f3    | MAE=9.94 | MSE=188.05 | R2=0.426
Ridge-gkf-noCity-f4    | MAE=28.61 | MSE=1,115.41 | R2=0.336
Ridge-gkf-noCity-f5    | MAE=31.29 | MSE=1,261.07 | R2=0.045
AVG Ridge-gkf-noCity: MAE=20.14 | R2=-0.352


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


HuberRegressor-gkf-withCity-f1 | MAE=9.02 | MSE=129.22 | R2=-0.390


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


HuberRegressor-gkf-withCity-f2 | MAE=30.52 | MSE=1,095.86 | R2=-0.658


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


HuberRegressor-gkf-withCity-f3 | MAE=12.74 | MSE=257.58 | R2=0.214


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


HuberRegressor-gkf-withCity-f4 | MAE=38.58 | MSE=1,995.85 | R2=-0.187


KeyboardInterrupt: 