In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# === Load Dataset ===
DATA = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/top20_balanced_sample.parquet" )
df = pd.read_parquet(DATA)

# === Define Features ===
TARGET = "kWh_per_m2"
CAT = ["BuildingType"]
NUM = [
    "tilt", "tilt2", "tilt_sin", "tilt_cos",
    "GHI_kWh_per_m2_day", "AvgTemp_C",
    "ClearnessIndex", "Precip_mm_per_day"
]

X = df[NUM + CAT]
y = df[TARGET]

# ‚úÖ Convert categorical columns to 'category' dtype
for col in CAT:
    X[col] = X[col].astype("category")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype("category")


In [2]:
from sklearn.model_selection import GroupKFold

# --- Simulate RepeatedGroupKFold for compatibility ---
def repeated_group_kfold(X, y, groups, n_splits=5, n_repeats=2, random_state=42):
    rng = np.random.default_rng(random_state)
    all_splits = []
    for r in range(n_repeats):
        gkf = GroupKFold(n_splits=n_splits)
        shuffled_groups = rng.permutation(np.unique(groups))
        group_map = {g: i for i, g in enumerate(shuffled_groups)}
        sorted_groups = np.array([group_map[g] for g in groups])
        all_splits.extend(list(gkf.split(X, y, sorted_groups)))
    return all_splits

cv_splits = repeated_group_kfold(X, y, df["City"], n_splits=5, n_repeats=2)

# --- Base LGBM Model ---
base_lgbm = LGBMRegressor(
    objective='mae',
    n_estimators=1000,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42
)

# --- Parameter Grid ---
param_grid = {
    'num_leaves': [15, 31, 63],
    'min_child_samples': [5, 20, 50],
    'lambda_l1': [0, 0.1, 1.0],
    'lambda_l2': [0, 0.1, 1.0],
    'max_depth': [-1, 5, 10]
}

grid = GridSearchCV(
    base_lgbm,
    param_grid,
    cv=cv_splits,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
    verbose=1
)

grid.fit(X, y)
print("‚úÖ Best params:", grid.best_params_)
print("‚úÖ Best mean MAE:", -grid.best_score_)


Fitting 10 folds for each of 243 candidates, totalling 2430 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 958580, number of used features: 9
[LightGBM] [Info] Start training from score 273.000366
‚úÖ Best params: {'lambda_l1': 1.0, 'lambda_l2': 1.0, 'max_depth': -1, 'min_child_samples': 5, 'num_leaves': 15}
‚úÖ Best mean MAE: 19.27580144365488


In [3]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import numpy as np

# === Define Models ===
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42
)

lgb = LGBMRegressor(
    objective='mae',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42
)

# === Cross-Validation ===
cv = GroupKFold(n_splits=3)
mae_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=df["City"]), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    
    pred_xgb = xgb.predict(X_val)
    pred_lgb = lgb.predict(X_val)
    
    # Weighted blend ‚Äî tweak if needed
    blended = 0.6 * pred_xgb + 0.4 * pred_lgb
    mae = mean_absolute_error(y_val, blended)
    mae_scores.append(mae)
    print(f"Fold {fold} MAE = {mae:.3f}")

print(f"\n‚úÖ Mean Ensemble MAE: {np.mean(mae_scores):.3f} ¬± {np.std(mae_scores):.3f}")


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:BuildingType: category

In [None]:
best_lgbm_mae = -grid.best_score_
ensemble_mae = np.mean(mae_scores)

print(f"üìä Baseline LGBM (from previous run): 17.23 kWh/m¬≤")
print(f"‚úÖ Tuned LGBM MAE: {best_lgbm_mae:.3f} kWh/m¬≤")
print(f"ü§ù Ensemble MAE: {ensemble_mae:.3f} kWh/m¬≤")
print(f"üìâ Improvement: {17.23 - ensemble_mae:.2f} kWh/m¬≤ (~{(17.23 - ensemble_mae)/17.23*100:.1f}%)")