In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Remote CSV location of each pollutant dataset, keyed by pollutant name
datasets = dict(
    SO2TONS="https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/SO2TONS_dataset.csv",
    NOXTONS="https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/NOXTONS_dataset.csv",
    COTONS="https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/COTONS_dataset.csv",
)

# Peak season runs from May (5) through August (8)
peak_season_months = list(range(5, 9))

# Sources (lakes); each one is modelled separately
sources = [
    "LAKE-1",
    "LAKE-2",
    "LAKE-3",
    "LAKE-4",
]

# The single day for which a prediction is compared with the recorded value
specific_date = pd.Timestamp("2022-07-15")

# Containers for fitted models and single-day predictions,
# both keyed by (parameter, source)
models, predictions = {}, {}

# Hyperparameter grid searched for the Lasso regression:
# `alpha` is the regularization strength, `fit_intercept` toggles the bias term
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1, 10, 100],
    fit_intercept=[True, False],
)

# Train one tuned Lasso model per (dataset, source) pair on peak-season data,
# report hold-out RMSE / R², and record the prediction for `specific_date`.

# Predictors (weather features) and target variable. They are the same for
# every dataset and source, so they are defined once outside the loops.
predictors = ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
target = 'Emissions_Load'

# Minimum number of complete rows required before a model is fitted
min_rows = 10

# Loop through each dataset (SO2TONS, NOXTONS, COTONS)
for parameter, url in datasets.items():
    # Load the dataset and convert the 'date' column to datetime
    data = pd.read_csv(url)
    data['date'] = pd.to_datetime(data['date'])

    # Filter for peak season
    data = data[data['date'].dt.month.isin(peak_season_months)]

    # Separate data by source
    for source in sources:
        # Drop rows with missing values BEFORE checking the row count, so the
        # check reflects the rows that are actually usable for fitting.
        source_data = data[data['Source'] == source].dropna(subset=predictors + [target])

        # Check if the source data has enough usable rows
        if len(source_data) < min_rows:
            print(f"Not enough data for {parameter} at {source}. Skipping...")
            continue

        # Split the data into features (X) and target (y)
        X = source_data[predictors]
        y = source_data[target]

        # Split into training and testing sets first ...
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # ... then fit the scaler on the training rows only, so no statistics
        # from the test set leak into the model that is evaluated on it.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Set up the Lasso regression model and GridSearchCV for hyperparameter tuning
        lasso = Lasso(max_iter=10000, random_state=42)
        grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Retrieve the best estimator (refit on the full training split)
        best_model = grid_search.best_estimator_

        # Evaluate the tuned model on the held-out test set
        y_pred = best_model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"Best parameters for {parameter} at {source}: {grid_search.best_params_}")
        print(f"Model for {parameter} at {source}:")
        print(f"  RMSE: {rmse:.4e}")
        print(f"  R²: {r2:.4e}")

        # Save the best model together with the scaler it was trained with
        models[(parameter, source)] = (best_model, scaler)

        # Check if the specific date exists in the source data.
        # NOTE: the row for this date comes from the same data that was split
        # above, so it may have been part of the training set.
        day_data = source_data[source_data['date'] == specific_date]
        if not day_data.empty:
            # Extract feature values for the specific day and scale them
            specific_features = scaler.transform(day_data[predictors])
            specific_actual = day_data[target].iloc[0]

            # Predict emissions/load for the specific day using the tuned model
            specific_prediction = best_model.predict(specific_features)[0]

            # Save the prediction and actual value for later verification
            predictions[(parameter, source)] = {
                "features": day_data[predictors].iloc[0],
                "actual": specific_actual,
                "predicted": specific_prediction
            }

# Report every stored single-day prediction next to the recorded value
print("\nFinal Predictions:")
for (parameter, source), result in predictions.items():
    # One entry per (parameter, source); a blank line separates entries
    print(
        f"{parameter} at {source}:",
        f"  Features: {result['features'].to_dict()}",
        f"  Actual Emissions_Load: {result['actual']:.4e}",
        f"  Predicted Emissions_Load: {result['predicted']:.4e}",
        sep="\n",
        end="\n\n",
    )


Best parameters for SO2TONS at LAKE-1: {'alpha': 0.001, 'fit_intercept': True}
Model for SO2TONS at LAKE-1:
  RMSE: 1.4181e-05
  R²: -9.7603e-04
Best parameters for SO2TONS at LAKE-2: {'alpha': 0.001, 'fit_intercept': True}
Model for SO2TONS at LAKE-2:
  RMSE: 1.3005e-05
  R²: -9.5952e-03
Best parameters for SO2TONS at LAKE-3: {'alpha': 0.001, 'fit_intercept': True}
Model for SO2TONS at LAKE-3:
  RMSE: 1.3068e-05
  R²: -5.6491e-03
Best parameters for SO2TONS at LAKE-4: {'alpha': 0.001, 'fit_intercept': True}
Model for SO2TONS at LAKE-4:
  RMSE: 1.4882e-05
  R²: -4.5590e-02
Best parameters for NOXTONS at LAKE-1: {'alpha': 0.001, 'fit_intercept': True}
Model for NOXTONS at LAKE-1:
  RMSE: 2.5884e-04
  R²: -3.3963e-03
Best parameters for NOXTONS at LAKE-2: {'alpha': 0.001, 'fit_intercept': True}
Model for NOXTONS at LAKE-2:
  RMSE: 2.2970e-04
  R²: -7.8134e-02
Best parameters for NOXTONS at LAKE-3: {'alpha': 0.001, 'fit_intercept': True}
Model for NOXTONS at LAKE-3:
  RMSE: 2.2971e-04
  R