In [1]:
import pandas as pd
from sklearn import __version__ as sklearn_version
from packaging import version
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

# ---------------------------------------------------------------------------
# Load the data, build the emissions-per-load target, and train/evaluate a
# random-forest regressor on weather + categorical features.
# ---------------------------------------------------------------------------

# Load the dataset
file_path = "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/combinedWeatherValues.csv"  # Replace with the actual path
data = pd.read_csv(file_path)

# Convert 'date' column to datetime so rows can later be matched by pd.Timestamp
data['date'] = pd.to_datetime(data['date'])

# Define emissions parameters and load
emissions_params = ['SO2TONS', 'NOXTONS', 'COTONS']
load_param = 'LOADMWBA'

# Separate emissions and load data (both live in the same long-format table,
# distinguished by the 'Parameter' column)
emissions_data = data[data['Parameter'].isin(emissions_params)]
load_data = data[data['Parameter'] == load_param]

# Merge emissions and load data on date and source (use a left join to retain emissions data).
# Columns present on both sides get the "_emission" / "_load" suffixes.
merged_data = pd.merge(
    emissions_data,
    load_data,
    on=["date", "Source"],
    suffixes=("_emission", "_load"),
    how="left"
)

# Stop with an exception instead of exit(): exit() is an interactive-shell
# helper and, inside a notebook, can shut the kernel down rather than simply
# halting the cell.
if merged_data.empty:
    raise ValueError("Merged data is empty. Check the filtering conditions or input data.")

# Emissions per unit of load. Rows without a load value yield NaN; rows whose
# load is exactly zero would yield +/-inf, which dropna() does not remove, so
# those are masked to NaN explicitly before dropping.
emissions_per_load = merged_data["Value_emission"] / merged_data["Value_load"]
merged_data["Emissions_Load"] = emissions_per_load.where(merged_data["Value_load"] != 0)

# Keep only rows with a usable target value
merged_data = merged_data.dropna(subset=["Emissions_Load"])

# Nothing left to train on means the join or the input data is wrong
if merged_data.empty:
    raise ValueError(
        "No rows with both an emission value and a non-zero load value. "
        "Check the filtering conditions or input data."
    )

# Define predictors and target
predictors = ['tavg_emission', 'tmin_emission', 'tmax_emission', 'prcp_emission',
              'snow_emission', 'wdir_emission', 'wspd_emission', 'pres_emission']
categorical_features = ['Source', 'Parameter_emission', 'Units_emission']
target = 'Emissions_Load'

# Dynamic OneHotEncoder: the dense-output flag was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2. Request dense output on both branches
# so the encoded matrix has the same type regardless of the installed version.
if version.parse(sklearn_version) >= version.parse("1.2"):
    one_hot_encoder = OneHotEncoder(drop="first", sparse_output=False)  # For newer versions
else:
    one_hot_encoder = OneHotEncoder(drop="first", sparse=False)  # For older versions (before 1.2)

# Preprocess categorical and numeric features
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features),
        ('num', StandardScaler(), predictors)
    ]
)

# Prepare features and target
X = merged_data[predictors + categorical_features]
y = merged_data[target]

# Split BEFORE fitting any preprocessing so that the scaler statistics and
# encoder categories are learned from the training rows only (no leakage of
# test-set information into the model inputs).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows, then apply it unchanged to the
# held-out rows.
# NOTE: a category that appears only in the held-out rows would make
# transform() raise, because the encoder uses the default handle_unknown.
X_train = column_transformer.fit_transform(X_train_raw)
X_test = column_transformer.transform(X_test_raw)

# Encoded version of the full feature table, retained so that any code that
# refers to X_encoded keeps working.
X_encoded = column_transformer.transform(X)

# Train the Random Forest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on Test Data: {mae:.6f}")

# Predict emissions/load for multiple specific dates
def predict_for_multiple_dates(dates_to_predict):
    """Print predicted and actual emissions/load for each requested date.

    For every entry in ``dates_to_predict`` the first row of the module-level
    ``merged_data`` frame whose ``date`` equals that day is selected. Its
    predictor and categorical columns are run through the already-fitted
    ``column_transformer`` and ``model``, and the prediction is reported
    alongside the row's stored ``Emissions_Load`` value. A date with no
    matching row produces a message and is skipped.

    Args:
        dates_to_predict: Iterable of values accepted by ``pd.Timestamp``
            (for example ``"2022-06-02"``).

    Returns:
        None. All output is printed.
    """
    feature_columns = predictors + categorical_features
    collected = []

    for requested_date in dates_to_predict:
        # All rows recorded for the requested day
        day_rows = merged_data.loc[merged_data['date'] == pd.Timestamp(requested_date)]

        if not day_rows.empty:
            # Only the first matching row is used for the prediction
            first_row = day_rows.iloc[0]

            # One-row frame holding the columns the transformer expects
            feature_frame = pd.DataFrame(
                [{name: first_row[name] for name in feature_columns}]
            )

            # Apply the fitted preprocessing, then the fitted model
            encoded_row = column_transformer.transform(feature_frame)
            prediction = model.predict(encoded_row)

            collected.append({
                "Date": requested_date,
                "Predicted Emissions/Load (tons/MW)": prediction[0],
                "Actual Emissions/Load (tons/MW)": first_row["Emissions_Load"],
                "Parameters": first_row[feature_columns].to_dict()
            })
        else:
            print(f"No data found for the specified date: {requested_date}")

    # Report every collected prediction once all dates have been processed
    for entry in collected:
        predicted_value = entry['Predicted Emissions/Load (tons/MW)']
        actual_value = entry['Actual Emissions/Load (tons/MW)']
        print(f"\nPrediction for the specified date: {entry['Date']}")
        print(f"Predicted Emissions/Load: {predicted_value:.6f} tons/MW")
        print(f"Actual Emissions/Load: {actual_value:.6f} tons/MW")
        print(f"Parameters for the Day: {entry['Parameters']}")

# Demo run: report predictions for three sample days
dates_to_test = [
    "2022-06-02",
    "2022-05-15",
    "2022-07-10",
]
predict_for_multiple_dates(dates_to_test)


Mean Absolute Error on Test Data: 0.000176

Prediction for the specified date: 2022-06-02
Predicted Emissions/Load: 0.000026 tons/MW
Actual Emissions/Load: 0.000025 tons/MW
Parameters for the Day: {'tavg_emission': 26.8, 'tmin_emission': 20.0, 'tmax_emission': 33.9, 'prcp_emission': 3.0, 'snow_emission': 0.0, 'wdir_emission': 81, 'wspd_emission': 12.2, 'pres_emission': 1007.1, 'Source': 'LAKE-1', 'Parameter_emission': 'SO2TONS', 'Units_emission': 'TONS'}

Prediction for the specified date: 2022-05-15
Predicted Emissions/Load: 0.000026 tons/MW
Actual Emissions/Load: 0.000022 tons/MW
Parameters for the Day: {'tavg_emission': 25.3, 'tmin_emission': 10.0, 'tmax_emission': 37.2, 'prcp_emission': 0.0, 'snow_emission': 0.0, 'wdir_emission': 17, 'wspd_emission': 5.0, 'pres_emission': 1008.8, 'Source': 'LAKE-2', 'Parameter_emission': 'SO2TONS', 'Units_emission': 'TONS'}

Prediction for the specified date: 2022-07-10
Predicted Emissions/Load: 0.000035 tons/MW
Actual Emissions/Load: 0.000038 tons