In [134]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import scipy.stats as stats



In [111]:
df = pd.read_excel("./datasets/datasets.xlsx", sheet_name="Sheet1")

features = [
    'Tempmax_C', 'Tempmin_C', 'windspeedmax', 'windspeedmean',
    'solarradiation', 'uvindex', 'cloudcover', 'humidity', 'precip'
]

X = df[features]
y_wind = df['Wind_GWh']
y_solar = df['(Combined) Solar_GWh']  # 已合并 Rooftop + Utility 的值


X_train_wind, X_test_wind, y_train_wind, y_test_wind = train_test_split(X, y_wind, test_size=0.2, random_state=42)
X_train_solar, X_test_solar, y_train_solar, y_test_solar = train_test_split(X, y_solar, test_size=0.2, random_state=42)
print(np.mean(y_train_wind), np.mean(y_test_wind))
model_wind = xgb.XGBRegressor(    
    n_estimators=200,
    eta=0.31,
    max_depth=4,
    random_state=42,
    gamma=49,
    reg_lambda=1.02,
    )
model_solar = xgb.XGBRegressor(
    n_estimators=200, 
    eta=0.31, 
    max_depth=4, 
    random_state=42
)


model_wind.fit(X_train_wind, y_train_wind)
model_solar.fit(X_train_solar, y_train_solar)

pred_wind = model_wind.predict(X_test_wind)
pred_solar = model_solar.predict(X_test_solar)

pred_combined = pred_wind + pred_solar
y_combined_true = y_test_wind + y_test_solar

rmse_wind = np.sqrt(mean_squared_error(y_test_wind, pred_wind))
r2_wind = r2_score(y_test_wind, pred_wind)
rmse_sloar = np.sqrt(mean_squared_error(y_test_solar, pred_solar))
r2_sloar = r2_score(y_test_solar, pred_solar)
rmse = np.sqrt(mean_squared_error(y_combined_true, pred_combined))
r2 = r2_score(y_combined_true, pred_combined)

print(f"RMSE (wind): {rmse_wind:.3f}")
print(f"R² (wind): {r2_wind:.3f}")

print(f"RMSE (solar): {rmse_sloar:.3f}")
print(f"R² (solar): {r2_sloar:.3f}")

print(f"RMSE (Combined): {rmse:.3f}")
print(f"R² (Combined): {r2:.3f}")


18.601061643835617 18.162297297297297
RMSE (wind): 3.990
R² (wind): 0.798
RMSE (solar): 1.022
R² (solar): 0.945
RMSE (Combined): 4.059
R² (Combined): 0.808


In [109]:
values = [i/100 for i in range(1, 101)]
best_rmse = float('inf')
best_r2 = 0
best_value = None
print(np.mean(y_train_wind), np.mean(y_test_wind))
for value in values:
    print(f"Trying value = {value}")
    model = xgb.XGBRegressor(
        n_estimators=200, 
        eta=value, 
        max_depth=4, 
        random_state=42
    )
    model.fit(X_train_solar, y_train_solar)
    y_pred = model.predict(X_test_solar)

    rmse = np.sqrt(mean_squared_error(y_test_solar, y_pred))
    r2 = r2_score(y_test_solar, y_pred)

    # 如果 rmse 无效则跳过
    if np.isnan(rmse):
        print(f" Skipping eta = {eta} due to NaN RMSE")
        continue

    if rmse < best_rmse:
        best_rmse = rmse
        best_r2 = r2
        best_value = value


    print(f"value: {value:.2f}, RMSE: {rmse:.3f}, R²: {r2:.3f}")

print(f"\nBest value = {best_value:.2f}, RMSE = {best_rmse:.3f}, R² = {best_r2:.3f}")


18.601061643835617 18.162297297297297
Trying value = 0.01
value: 0.01, RMSE: 1.159, R²: 0.929
Trying value = 0.02
value: 0.02, RMSE: 1.010, R²: 0.946
Trying value = 0.03
value: 0.03, RMSE: 1.003, R²: 0.947
Trying value = 0.04
value: 0.04, RMSE: 1.007, R²: 0.946
Trying value = 0.05
value: 0.05, RMSE: 1.021, R²: 0.945
Trying value = 0.06
value: 0.06, RMSE: 1.022, R²: 0.945
Trying value = 0.07
value: 0.07, RMSE: 1.020, R²: 0.945
Trying value = 0.08
value: 0.08, RMSE: 1.037, R²: 0.943
Trying value = 0.09
value: 0.09, RMSE: 1.052, R²: 0.941
Trying value = 0.1
value: 0.10, RMSE: 1.034, R²: 0.943
Trying value = 0.11
value: 0.11, RMSE: 1.006, R²: 0.946
Trying value = 0.12
value: 0.12, RMSE: 1.039, R²: 0.943
Trying value = 0.13
value: 0.13, RMSE: 1.015, R²: 0.945
Trying value = 0.14
value: 0.14, RMSE: 1.058, R²: 0.941
Trying value = 0.15
value: 0.15, RMSE: 1.022, R²: 0.945
Trying value = 0.16
value: 0.16, RMSE: 1.033, R²: 0.943
Trying value = 0.17
value: 0.17, RMSE: 1.040, R²: 0.943
Trying valu

In [136]:
# Features and target
features = [
    'Tempmax_C', 'Tempmin_C', 'windspeedmax', 'windspeedmean',
    'solarradiation', 'uvindex', 'cloudcover', 'humidity', 'precip'
]
X = df[features]
y_wind = df['Wind_GWh']
y_solar = df['(Combined) Solar_GWh'] 

# Split data once to ensure alignment across targets
X_train, X_test, y_train_wind, y_test_wind, y_train_solar, y_test_solar = train_test_split(
    X, y_wind, y_solar, test_size=0.2, random_state=42
)

# Define models for wind and solar
model_wind = xgb.XGBRegressor(
    n_estimators=200,
    eta=0.31,
    max_depth=4,
    gamma=49,
    reg_lambda=1.02,
    random_state=42
)

model_solar = xgb.XGBRegressor(
    n_estimators=200,
    eta=0.31,
    max_depth=4,
    random_state=42
)

# Train the models
model_wind.fit(X_train, y_train_wind)
model_solar.fit(X_train, y_train_solar)

# Make predictions
pred_wind = model_wind.predict(X_test)
pred_solar = model_solar.predict(X_test)

# Combine predictions and true values (aligned by row)
pred_combined = pred_wind + pred_solar
y_combined_true = y_test_wind + y_test_solar

# Evaluate model performance
rmse_wind = np.sqrt(mean_squared_error(y_test_wind, pred_wind))
r2_wind = r2_score(y_test_wind, pred_wind)

rmse_solar = np.sqrt(mean_squared_error(y_test_solar, pred_solar))
r2_solar = r2_score(y_test_solar, pred_solar)

rmse_combined = np.sqrt(mean_squared_error(y_combined_true, pred_combined))
r2_combined = r2_score(y_combined_true, pred_combined)

# Print evaluation results
print(f"RMSE (Wind):     {rmse_wind:.3f}, R²: {r2_wind:.3f}")
print(f"RMSE (Solar):    {rmse_solar:.3f}, R²: {r2_solar:.3f}")
print(f"RMSE (Combined): {rmse_combined:.3f}, R²: {r2_combined:.3f}")


RMSE (Wind):     3.990, R²: 0.798
RMSE (Solar):    1.022, R²: 0.945
RMSE (Combined): 4.059, R²: 0.808


In [137]:
# Define features and targets
features = [
    'Tempmax_C', 'Tempmin_C', 'windspeedmax', 'windspeedmean',
    'solarradiation', 'uvindex', 'cloudcover', 'humidity', 'precip'
]
X = df[features]
y_wind = df['Wind_GWh']
y_solar = df['(Combined) Solar_GWh']
y_combined = y_wind + y_solar

# Split all data in one step to ensure row alignment
X_train, X_test, y_train_wind, y_test_wind, y_train_solar, y_test_solar, y_train_combined, y_test_combined = train_test_split(
    X, y_wind, y_solar, y_combined, test_size=0.2, random_state=42
)

# Train base models (Wind and Solar)
model_wind = xgb.XGBRegressor(
    eta=0.31, max_depth=4, n_estimators=200, gamma=49, reg_lambda=1.02, random_state=42
)
model_solar = xgb.XGBRegressor(
    eta=0.31, max_depth=4, n_estimators=200, random_state=42
)

model_wind.fit(X_train, y_train_wind)
model_solar.fit(X_train, y_train_solar)

# Predict using base models
pred_wind = model_wind.predict(X_test)
pred_solar = model_solar.predict(X_test)

# Construct stacking input features from base model predictions
stacked_X = np.column_stack([pred_wind, pred_solar])

# Train meta-model using combined target
meta_model = xgb.XGBRegressor(
    eta=0.1, max_depth=2, n_estimators=100, random_state=42
)
meta_model.fit(stacked_X, y_test_combined)

# Predict combined output using meta-model
y_pred_combined = meta_model.predict(stacked_X)

# Evaluate the stacked model
rmse_stacked = np.sqrt(mean_squared_error(y_test_combined, y_pred_combined))
r2_stacked = r2_score(y_test_combined, y_pred_combined)

print(f"RMSE (XGB Stacked Combined): {rmse_stacked:.3f}")
print(f"R²   (XGB Stacked Combined): {r2_stacked:.3f}")


RMSE (XGB Stacked Combined): 2.202
R²   (XGB Stacked Combined): 0.944


In [133]:
# Load the dataset
df = pd.read_excel("./datasets/datasets.xlsx", sheet_name="Sheet1")

# Define features and targets
features = [
    'Tempmax_C', 'Tempmin_C', 'windspeedmax', 'windspeedmean',
    'solarradiation', 'uvindex', 'cloudcover', 'humidity', 'precip'
]
X = df[features]
y_wind = df['Wind_GWh']
y_solar = df['(Combined) Solar_GWh']
y_combined = y_wind + y_solar

# Split all data in one step to ensure alignment across targets
X_train, X_test, y_train_wind, y_test_wind, y_train_solar, y_test_solar, y_train_combined, y_test_combined = train_test_split(
    X, y_wind, y_solar, y_combined, test_size=0.2, random_state=42
)

# Train base models for wind and solar generation
model_wind = XGBRegressor(
    eta=0.31, max_depth=4, n_estimators=200, gamma=49, reg_lambda=1.02, random_state=42
)
model_solar = XGBRegressor(
    eta=0.31, max_depth=4, n_estimators=200, random_state=42
)

model_wind.fit(X_train, y_train_wind)
model_solar.fit(X_train, y_train_solar)

# Generate predictions from base models for stacking input
pred_wind = model_wind.predict(X_test)
pred_solar = model_solar.predict(X_test)
stacked_X = np.column_stack([pred_wind, pred_solar])  # shape: (n_samples, 2)

# Define parameter grid for GridSearchCV
param_grid = {
    'eta': [0.01, 0.02, 0.03, 0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200, 250],
    'gamma': [0, 1, 2, 5, 10],
    'reg_lambda': [0, 0.5, 1, 2],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}  # Total combinations: 6×4×5×5×4×4×5 = 96,000

# Initialize GridSearchCV for the meta-model
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

# Fit meta-model using combined generation as the target
grid_search.fit(stacked_X, y_test_combined)

# Evaluate the best meta-model
best_meta_model = grid_search.best_estimator_
y_pred_combined = best_meta_model.predict(stacked_X)

rmse = np.sqrt(mean_squared_error(y_test_combined, y_pred_combined))
r2 = r2_score(y_test_combined, y_pred_combined)

print("\nBest Meta-Model Parameters:")
print(grid_search.best_params_)
print(f"\nFinal RMSE (Stacked): {rmse:.3f}")
print(f"Final R²   (Stacked): {r2:.3f}")


Fitting 5 folds for each of 48000 candidates, totalling 240000 fits

Best Meta-Model Parameters:
{'colsample_bytree': 1.0, 'eta': 0.1, 'gamma': 1, 'max_depth': 2, 'n_estimators': 50, 'reg_lambda': 0, 'subsample': 0.6}

inal RMSE (Stacked): 2.548
Final R²   (Stacked): 0.924


In [135]:
# Define features and targets
features = [
    'Tempmax_C', 'Tempmin_C', 'windspeedmax', 'windspeedmean',
    'solarradiation', 'uvindex', 'cloudcover', 'humidity', 'precip'
]
X = df[features]
y_wind = df['Wind_GWh']
y_solar = df['(Combined) Solar_GWh']
y_combined = y_wind + y_solar

# Split all data at once to ensure row alignment
X_train, X_test, y_train_wind, y_test_wind, y_train_solar, y_test_solar, y_train_combined, y_test_combined = train_test_split(
    X, y_wind, y_solar, y_combined, test_size=0.2, random_state=42
)

# Train base models for wind and solar generation
model_wind = XGBRegressor(
    eta=0.31, max_depth=4, n_estimators=200, gamma=49, reg_lambda=1.02,
    random_state=42
)
model_solar = XGBRegressor(
    eta=0.31, max_depth=4, n_estimators=200, random_state=42
)

model_wind.fit(X_train, y_train_wind)
model_solar.fit(X_train, y_train_solar)

# Use base model predictions as stacking features
pred_wind = model_wind.predict(X_test)
pred_solar = model_solar.predict(X_test)
stacked_X = np.column_stack([pred_wind, pred_solar])

# Define parameter distributions for randomized search
param_dist = {
    'eta': stats.uniform(0.01, 0.3),              # 0.01 to 0.31
    'max_depth': [2, 3, 4, 5, 6],
    'n_estimators': [50, 100, 150, 200, 250],
    'gamma': [0, 1, 2, 5, 10],
    'reg_lambda': stats.uniform(0, 3),            # 0 to 3
    'colsample_bytree': stats.uniform(0.7, 0.3),  # 0.7 to 1.0
    'subsample': stats.uniform(0.6, 0.4)          # 0.6 to 1.0
}

# Set up RandomizedSearchCV for the meta-model
random_search = RandomizedSearchCV(
    estimator=XGBRegressor(tree_method='hist', device='cuda', random_state=42),
    param_distributions=param_dist,
    n_iter=2400,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the randomized search (may take a long time)
random_search.fit(stacked_X, y_test_combined)

# Evaluate the best meta-model
best_model = random_search.best_estimator_
y_pred_combined = best_model.predict(stacked_X)

rmse = np.sqrt(mean_squared_error(y_test_combined, y_pred_combined))
r2 = r2_score(y_test_combined, y_pred_combined)

print("\nBest Parameters (RandomizedSearchCV):")
print(random_search.best_params_)
print(f"Final RMSE: {rmse:.3f}")
print(f"Final R²: {r2:.3f}")


Fitting 5 folds for each of 2400 candidates, totalling 12000 fits

Best Parameters (RandomizedSearchCV):
{'colsample_bytree': 0.8532241907732696, 'eta': 0.1352233009446337, 'gamma': 10, 'max_depth': 2, 'n_estimators': 150, 'reg_lambda': 0.8082370013955644, 'subsample': 0.6976502088991097}
Final RMSE: 2.134
Final R²: 0.947
