In [4]:
# Install catboost (comment this line out if the package is already installed)
!pip install catboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import models from scikit-learn
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              ExtraTreesRegressor, AdaBoostRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor  # CART model
from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, BayesianRidge, Ridge)
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Import additional models from external libraries
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Google Drive is mounted up front; the results CSV is written under it later.
from google.colab import drive
drive.mount('/content/drive')

# Column names used throughout: model inputs and the quantity being predicted.
predictors = ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
target = 'Emissions_Load'

# Calendar months (May through August) that make up the peak season.
peak_season_months = [5, 6, 7, 8]

# Fetch the raw dataset from its online location and parse the date column.
data_url = "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/SO2TONS_dataset.csv"
data = pd.read_csv(data_url)
data['date'] = pd.to_datetime(data['date'])

# Keep only peak-season rows, then discard any row that is missing a
# predictor or the target value.
in_peak_season = data['date'].dt.month.isin(peak_season_months)
data = data.loc[in_peak_season].dropna(subset=[*predictors, target])

# Candidate regressors, keyed by the display name used in printed reports,
# plot titles and output column names. Insertion order is the order in which
# models are fitted and reported, so it is kept stable.
#
# Every estimator that accepts a seed shares the same one so that repeated
# runs give the same results.
# NOTE(review): the models are fitted on the raw predictor columns; k-NN, SVR
# and the neural network are usually sensitive to feature scale -- confirm
# whether leaving the inputs unscaled is intentional.
RANDOM_STATE = 42

models = {
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "k-NN": KNeighborsRegressor(n_neighbors=5),
    "Neural Network": MLPRegressor(
        hidden_layer_sizes=(50, 50),
        max_iter=500,
        random_state=RANDOM_STATE,
    ),
    "Linear Regression": LinearRegression(),
    "CART": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "SVR": SVR(kernel='rbf'),
    "Extra Trees": ExtraTreesRegressor(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostRegressor(random_state=RANDOM_STATE),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(random_state=RANDOM_STATE),
    "XGBoost": XGBRegressor(random_state=RANDOM_STATE),
    "LightGBM": LGBMRegressor(random_state=RANDOM_STATE),
    "CatBoost": CatBoostRegressor(random_state=RANDOM_STATE, verbose=0),
    "Bayesian Ridge": BayesianRidge(),
    "Ridge": Ridge(random_state=RANDOM_STATE),
}

def _relative_error_pct(actual, predicted):
    """Return the per-sample absolute relative error, in percent.

    Samples whose actual value is exactly zero have no defined relative
    error; they are reported as NaN (rather than inf) so that they can be
    left out of averages instead of dominating them.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    rel = np.full(actual.shape, np.nan)
    nonzero = actual != 0
    rel[nonzero] = (
        np.abs(predicted[nonzero] - actual[nonzero]) / np.abs(actual[nonzero]) * 100
    )
    return rel


def _mean_ignoring_nan(values):
    """Return the mean of the non-NaN entries of *values*, or NaN if there are none."""
    values = np.asarray(values, dtype=float)
    kept = values[~np.isnan(values)]
    return kept.mean() if kept.size else np.nan


# Per-powerplant prediction tables; they are concatenated once after the loop
# instead of growing a DataFrame on every iteration.
prediction_tables = []

# Per-model list of mean relative errors (%), one entry per processed source
overall_rel_errors = {model_name: [] for model_name in models}

# Dictionary to hold the summary metrics table of each source (powerplant)
source_summary = {}

# Ranking direction for each summary metric (loop-invariant)
metrics_to_sort = {
    "Avg_RMSE": "min",   # lower is better
    "Avg_MAE": "min",    # lower is better
    "Avg_MAPE": "min",   # lower is better
    "Avg_R2": "max"      # higher is better
}

# Loop over each unique powerplant in the "Source" column
for source in data['Source'].unique():
    print(f"\nProcessing predictions for powerplant: {source}")
    data_source = data[data['Source'] == source].copy()

    # Check if there is enough data for meaningful predictions
    if data_source.shape[0] < 10:
        print(f"Not enough data for {source}, skipping...\n")
        continue

    # Split features and target
    X = data_source[predictors]
    y = data_source[target]

    # Split into training and testing sets (random 80/20 split, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    y_test_array = y_test.values

    # Predictions and performance metrics for this powerplant, keyed by model name
    predictions = {}
    performance_metrics = {}

    # Train each model and compute predictions and metrics
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions[model_name] = y_pred

        # MAPE over the test rows that have a non-zero actual value
        mape = _mean_ignoring_nan(_relative_error_pct(y_test_array, y_pred))

        performance_metrics[model_name] = {
            "RMSE": np.sqrt(mean_squared_error(y_test_array, y_pred)),
            "MAE": mean_absolute_error(y_test_array, y_pred),
            "R2": r2_score(y_test_array, y_pred),
            "MAPE": mape,
        }

        # The cross-source summary uses the same quantity as MAPE; sources
        # where it is undefined (all actuals zero) contribute nothing.
        if not np.isnan(mape):
            overall_rel_errors[model_name].append(mape)

    # Build predictions table for the current powerplant in one go.
    # Column order: Date, Actual, three columns per model, then Source.
    table_columns = {
        "Date": data_source.loc[X_test.index, 'date'].values,
        "Actual": y_test_array,
    }
    for model_name, y_pred in predictions.items():
        table_columns[model_name] = y_pred
        table_columns[model_name + " Residual"] = y_pred - y_test_array
        table_columns[model_name + " Relative Error (%)"] = _relative_error_pct(y_test_array, y_pred)
    table_columns["Source"] = source
    pred_table = pd.DataFrame(table_columns)
    prediction_tables.append(pred_table)

    # Plot predicted vs. actual for every model; the diagonal spans the
    # range of the actual values and is the same for all models.
    actual_min, actual_max = y_test_array.min(), y_test_array.max()
    for model_name, y_pred in predictions.items():
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(y_test_array, y_pred, alpha=0.6, label=model_name)
        ax.plot([actual_min, actual_max], [actual_min, actual_max], 'k--', label="Perfect Fit")
        ax.set_xlabel("Actual Emissions_Load")
        ax.set_ylabel("Predicted Emissions_Load")
        ax.set_title(f"{source} - Predicted vs Actual for {model_name}")
        ax.legend()
        ax.grid(True)
        plt.show()
        # Release the figure explicitly so figures do not pile up across
        # the (models x powerplants) iterations.
        plt.close(fig)

    # Print performance metrics for this powerplant
    print(f"Performance metrics for powerplant {source}:")
    for model_name, metrics in performance_metrics.items():
        print(f"  {model_name}: RMSE={metrics['RMSE']:.2e}, MAE={metrics['MAE']:.2e}, R2={metrics['R2']:.2f}, MAPE={metrics['MAPE']:.2f}%")

    # Create summary table for this powerplant. Each model is evaluated on a
    # single train/test split, so the "Avg_" columns hold that split's values.
    summary_df = pd.DataFrame([
        {
            "Model": model_name,
            "Avg_RMSE": metrics["RMSE"],
            "Avg_MAE": metrics["MAE"],
            "Avg_R2": metrics["R2"],
            "Avg_MAPE": metrics["MAPE"],
        }
        for model_name, metrics in performance_metrics.items()
    ])
    source_summary[source] = summary_df.copy()

    print(f"\nSummary for {source} (average metrics):")
    print(summary_df)

    # Sort and print best and worst for each metric for this source
    for metric, sort_order in metrics_to_sort.items():
        sorted_df = summary_df.sort_values(by=metric, ascending=(sort_order == "min"))
        best_model = sorted_df.iloc[0]["Model"]
        worst_model = sorted_df.iloc[-1]["Model"]
        print(f"\nFor {source} sorted by {metric}:")
        print(sorted_df[['Model', metric]])
        print(f"Best {metric}: {best_model}   |   Worst {metric}: {worst_model}")

# Global predictions table for all processed powerplants (empty if every
# powerplant was skipped)
all_predictions_table = (
    pd.concat(prediction_tables, ignore_index=True)
    if prediction_tables
    else pd.DataFrame()
)

# Destination of the global predictions table on Google Drive
csv_path = '/content/drive/My Drive/final_predictions_by_powerplant_extended.csv'

if all_predictions_table.empty:
    # No powerplant produced predictions (e.g. all were skipped), so the
    # table has no "Source"/"Date" columns to sort by and nothing to export.
    print("\nNo predictions were generated; skipping CSV export.")
else:
    # Sort the global predictions table by Source and Date
    all_predictions_table = all_predictions_table.sort_values(by=["Source", "Date"])

    # Save the global predictions table to CSV on Google Drive
    all_predictions_table.to_csv(csv_path, index=False)
    print(f"\nGlobal predictions table saved to {csv_path}")

# Compute and print overall average relative error for each model across all powerplants
print("\nOverall Average Relative Error (%) for each model across all powerplants:")
for model_name, errors in overall_rel_errors.items():
    if errors:
        avg_rel_error = np.mean(errors)
        print(f"  {model_name}: {avg_rel_error:.2f}%")
    else:
        print(f"  {model_name}: No data available.")

# Optionally, display a sample of the global predictions table
print("\nSample of global predictions table:")
print(all_predictions_table.head())


Output hidden; open in https://colab.research.google.com to view.