In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Raw CSV location for each emission parameter, keyed by parameter name
datasets = {
    "SO2TONS": "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/SO2TONS_dataset.csv",
    "NOXTONS": "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/NOXTONS_dataset.csv",
    "COTONS": "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/COTONS_dataset.csv",
}

# Calendar months treated as peak season: May (5) through August (8)
peak_season_months = list(range(5, 9))

# Emission sources (lakes); one model is fitted per (parameter, source) pair
sources = ["LAKE-1", "LAKE-2", "LAKE-3", "LAKE-4"]

# Single day whose prediction is compared against the recorded value
specific_date = pd.Timestamp("2022-07-15")

# Result containers, both keyed by (parameter, source):
#   models      -> best fitted pipeline
#   predictions -> features / actual / predicted values for specific_date
models = {}
predictions = {}

# Hyper-parameter grid for GridSearchCV. The keys use the
# "<pipeline step>__<parameter>" form, so they address pipeline steps
# named 'poly' and 'lr'.
param_grid = {
    # polynomial degrees 1 through 10
    'poly__degree': list(range(1, 11)),
    # fit the regression with and without an intercept term
    'lr__fit_intercept': [True, False],
}

# Weather predictors and regression target; identical for every
# dataset/source pair, so they are defined once outside the loops.
predictors = ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
target = 'Emissions_Load'

# Minimum number of complete rows required to fit a model. With the 80/20
# split below, 10 rows leave 8 training rows, enough for the 5-fold search.
min_rows = 10

# Fit, tune and evaluate one model per (parameter, source) pair.
# Results are written into the module-level `models` and `predictions` dicts.
for parameter, url in datasets.items():
    # Load the dataset and parse its 'date' column
    data = pd.read_csv(url)
    data['date'] = pd.to_datetime(data['date'])

    # Keep only peak-season rows
    data = data[data['date'].dt.month.isin(peak_season_months)]

    for source in sources:
        # Drop incomplete rows BEFORE the size check: a source with enough raw
        # rows but too few complete ones would otherwise pass the check and
        # then fail inside train_test_split / GridSearchCV.
        source_data = data[data['Source'] == source].dropna(
            subset=predictors + [target]
        )

        if len(source_data) < min_rows:
            print(f"Not enough data for {parameter} at {source}. Skipping...")
            continue

        # Split the data into features (X) and target (y)
        X = source_data[predictors]
        y = source_data[target]

        # Hold out 20% of the rows for the final evaluation
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Pipeline: scaling -> polynomial features -> linear regression.
        # The step names must match the prefixes used in param_grid.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures()),
            ('lr', LinearRegression())
        ])

        # 5-fold cross-validated grid search on the training rows only
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)

        # Best pipeline found by the search
        best_pipeline = grid_search.best_estimator_

        # Evaluate the tuned model on the held-out test rows
        y_pred = best_pipeline.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"Best parameters for {parameter} at {source}: {grid_search.best_params_}")
        print(f"Model for {parameter} at {source}:")
        print(f"  RMSE: {rmse:.4e}")
        print(f"  R²: {r2:.4e}")

        # Save the best model (pipeline) for later use
        models[(parameter, source)] = best_pipeline

        # Look up the verification day in this source's cleaned data.
        # NOTE: the lookup uses the full source data, so this row may be one
        # the model was trained on; treat the comparison as a sanity check,
        # not an out-of-sample test.
        day_data = source_data[source_data['date'] == specific_date]
        if not day_data.empty:
            # Raw features are enough: the pipeline applies its own scaling
            # and polynomial expansion inside predict().
            specific_features = day_data[predictors]
            specific_actual = day_data[target].iloc[0]

            # Predict emissions/load for the specific day (first matching row)
            specific_prediction = best_pipeline.predict(specific_features)[0]

            # Save the prediction and actual value for verification
            predictions[(parameter, source)] = {
                "features": specific_features.iloc[0],
                "actual": specific_actual,
                "predicted": specific_prediction
            }

# Report every stored verification-day prediction next to its recorded value
print("\nFinal Predictions:")
for (parameter, source), result in predictions.items():
    features = result['features'].to_dict()
    actual = result['actual']
    predicted = result['predicted']

    print(f"{parameter} at {source}:")
    print(f"  Features: {features}")
    print(f"  Actual Emissions_Load: {actual:.4e}")
    print(f"  Predicted Emissions_Load: {predicted:.4e}")
    # blank line between entries
    print()


Best parameters for SO2TONS at LAKE-1: {'lr__fit_intercept': True, 'poly__degree': 1}
Model for SO2TONS at LAKE-1:
  RMSE: 1.3175e-05
  R²: 1.3596e-01
Best parameters for SO2TONS at LAKE-2: {'lr__fit_intercept': True, 'poly__degree': 1}
Model for SO2TONS at LAKE-2:
  RMSE: 1.3125e-05
  R²: -2.8320e-02
Best parameters for SO2TONS at LAKE-3: {'lr__fit_intercept': True, 'poly__degree': 1}
Model for SO2TONS at LAKE-3:
  RMSE: 1.1900e-05
  R²: 1.6608e-01
Best parameters for SO2TONS at LAKE-4: {'lr__fit_intercept': True, 'poly__degree': 1}
Model for SO2TONS at LAKE-4:
  RMSE: 1.2439e-05
  R²: 2.6944e-01
Best parameters for NOXTONS at LAKE-1: {'lr__fit_intercept': False, 'poly__degree': 1}
Model for NOXTONS at LAKE-1:
  RMSE: 2.1511e-04
  R²: 3.0701e-01
Best parameters for NOXTONS at LAKE-2: {'lr__fit_intercept': True, 'poly__degree': 1}
Model for NOXTONS at LAKE-2:
  RMSE: 2.3277e-04
  R²: -1.0715e-01
Best parameters for NOXTONS at LAKE-3: {'lr__fit_intercept': True, 'poly__degree': 1}
Model