In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# URLs for datasets
datasets = {
    "SO2TONS": "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/SO2TONS_dataset.csv",
    "NOXTONS": "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/NOXTONS_dataset.csv",
    "COTONS": "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/COTONS_dataset.csv"
}

# Define the peak season months (May through August)
peak_season_months = [5, 6, 7, 8]

# Define lakes (sources)
sources = ["LAKE-1", "LAKE-2", "LAKE-3", "LAKE-4"]

# Define the specific day for prediction
specific_date = pd.Timestamp("2022-07-15")

# Define a PyTorch model
class EmissionPredictor(nn.Module):
    def __init__(self, input_size):
        super(EmissionPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 8)
        self.fc4 = nn.Linear(8, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Initialize a dictionary to store models, predictions, and inputs for verification
models = {}
predictions = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop through each dataset
for parameter, url in datasets.items():
    # Load the dataset
    data = pd.read_csv(url)

    # Convert the 'date' column to datetime
    data['date'] = pd.to_datetime(data['date'])

    # Filter for peak season
    data = data[data['date'].dt.month.isin(peak_season_months)]

    # Separate data by source
    for source in sources:
        source_data = data[data['Source'] == source]

        if source_data.empty or len(source_data) < 10:
            print(f"Not enough data for {parameter} at {source}. Skipping...")
            continue

        # Define predictors and target
        predictors = ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
        target = 'Emissions_Load'

        # Drop rows with missing values
        source_data = source_data.dropna(subset=predictors + [target])

        # Split the data into features (X) and target (y)
        X = source_data[predictors]
        y = source_data[target]

        # Standardize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Split into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

        # Convert to PyTorch tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

        # Initialize the model
        model = EmissionPredictor(X_train.shape[1]).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Train the model
        epochs = 100
        batch_size = 8

        for epoch in range(epochs):
            permutation = torch.randperm(X_train_tensor.size(0))
            for i in range(0, X_train_tensor.size(0), batch_size):
                indices = permutation[i:i+batch_size]
                batch_x, batch_y = X_train_tensor[indices], y_train_tensor[indices]

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluate the model
        with torch.no_grad():
            y_pred_tensor = model(X_test_tensor)
            y_pred = y_pred_tensor.cpu().numpy().flatten()

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"Model for {parameter} at {source}:")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  R²: {r2:.4f}")

        models[(parameter, source)] = (model, scaler)

        # Predict for specific date
        day_data = source_data[source_data['date'] == specific_date]
        if not day_data.empty:
            specific_features = scaler.transform(day_data[predictors])
            specific_features_tensor = torch.tensor(specific_features, dtype=torch.float32).to(device)

            with torch.no_grad():
                specific_prediction = model(specific_features_tensor).cpu().numpy()[0, 0]

            specific_actual = day_data[target].iloc[0]

            predictions[(parameter, source)] = {
                "features": day_data[predictors].iloc[0],
                "actual": specific_actual,
                "predicted": specific_prediction
            }

# Display all predictions at the end
print("\nFinal Predictions:")
for key, value in predictions.items():
    parameter, source = key
    print(f"{parameter} at {source}:")
    print(f"  Features: {value['features'].to_dict()}")
    print(f"  Actual Emissions_Load: {value['actual']:.4f}")
    print(f"  Predicted Emissions_Load: {value['predicted']:.4f}")
    print()


Model for SO2TONS at LAKE-1:
  RMSE: 0.0073
  R²: -262337.1256
Model for SO2TONS at LAKE-2:
  RMSE: 0.0018
  R²: -19107.6562
Model for SO2TONS at LAKE-3:
  RMSE: 0.0021
  R²: -25220.1207
Model for SO2TONS at LAKE-4:
  RMSE: 0.0023
  R²: -25567.7081
Model for NOXTONS at LAKE-1:
  RMSE: 0.0087
  R²: -1140.7929
Model for NOXTONS at LAKE-2:
  RMSE: 0.0068
  R²: -949.5982
Model for NOXTONS at LAKE-3:
  RMSE: 0.0052
  R²: -509.9165
Model for NOXTONS at LAKE-4:
  RMSE: 0.0023
  R²: -110.8641
Model for COTONS at LAKE-1:
  RMSE: 0.0055
  R²: -3106.0307
Model for COTONS at LAKE-2:
  RMSE: 0.0021
  R²: -572.3720
Model for COTONS at LAKE-3:
  RMSE: 0.0009
  R²: -117.9118
Model for COTONS at LAKE-4:
  RMSE: 0.0039
  R²: -1367.3266

Final Predictions:
SO2TONS at LAKE-1:
  Features: {'tavg': 31.7, 'tmin': 23.3, 'tmax': 38.9, 'prcp': 0.0, 'snow': 0.0, 'wdir': 87.0, 'wspd': 11.2, 'pres': 1011.3}
  Actual Emissions_Load: 0.0000
  Predicted Emissions_Load: -0.0000

SO2TONS at LAKE-2:
  Features: {'tavg':