In [1]:
import pandas as pd
from sklearn import __version__ as sklearn_version
from packaging import version
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

# Load the dataset: rows are selected below by their 'date', 'Parameter'
# and 'Source' columns, and the measurements are read from 'Value'
file_path = "https://raw.githubusercontent.com/apownukepcc/ForecastingDailyEmissions/refs/heads/main/combinedWeatherValues.csv"  # Replace with the actual path
data = pd.read_csv(file_path)

# Convert 'date' column to datetime so the .dt accessor can be used below
data['date'] = pd.to_datetime(data['date'])

# Emission parameters to model and the parameter that carries the load
emissions_params = ['SO2TONS', 'NOXTONS', 'COTONS']
load_param = 'LOADMWBA'

# Filter for Peak Season (May through August)
peak_season = data[data['date'].dt.month.isin([5, 6, 7, 8])]

# Separate emissions and load data
emissions_data = peak_season[peak_season['Parameter'].isin(emissions_params)]
load_data = peak_season[peak_season['Parameter'] == load_param]

# Merge emissions and load data on date and source; columns present in both
# frames receive the "_emission" / "_load" suffix
merged_data = pd.merge(
    emissions_data,
    load_data,
    on=["date", "Source"],
    suffixes=("_emission", "_load")
)

# A zero or missing load would turn the ratio below into inf/NaN, which
# cannot serve as a regression target, so those rows are removed up front.
load_values = merged_data["Value_load"]
usable_load = load_values.notna() & (load_values != 0)
if not usable_load.all():
    print(f"Skipping {int((~usable_load).sum())} rows with a zero or missing load value.")
    merged_data = merged_data[usable_load].copy()

# Stop when there is nothing left to model. SystemExit is raised directly
# because exit() is a convenience injected by the `site` module for
# interactive sessions and is not meant to be called from program code;
# the non-zero status marks the run as failed.
if merged_data.empty:
    print("Merged data is empty. Check the filtering conditions or input data.")
    raise SystemExit(1)

# Calculate emissions/load
merged_data["Emissions_Load"] = merged_data["Value_emission"] / merged_data["Value_load"]

# Define predictors and target. The "_emission" suffix on these names is the
# one the merge gives to columns taken from the emissions side.
predictors = ['tavg_emission', 'tmin_emission', 'tmax_emission', 'prcp_emission',
              'snow_emission', 'wdir_emission', 'wspd_emission', 'pres_emission']
categorical_features = ['Source', 'Parameter_emission', 'Units_emission']
target = 'Emissions_Load'

# Dynamic OneHotEncoder: scikit-learn 1.2 renamed the dense-output switch
# from `sparse` to `sparse_output`. Choose the keyword by version so the
# encoder returns a dense array on both old and new releases (previously the
# pre-1.2 branch silently stayed on the sparse default).
if version.parse(sklearn_version) >= version.parse("1.2"):
    dense_keyword = "sparse_output"  # For newer versions
else:
    dense_keyword = "sparse"  # For older versions (before 1.2)
one_hot_encoder = OneHotEncoder(drop="first", **{dense_keyword: False})

# Preprocess categorical and numeric features: one-hot encode the
# categorical columns and standardise the numeric predictors
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features),
        ('num', StandardScaler(), predictors)
    ]
)

# Prepare features and target
X = merged_data[predictors + categorical_features]
y = merged_data[target]

# Split the raw rows into training and testing sets BEFORE fitting the
# preprocessing, so the scaler statistics and encoder categories are learned
# from the training rows only. Fitting the transformer on every row first
# would leak information about the test rows into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode features: fit on the training split, then apply the fitted
# transformer to the test split.
# NOTE(review): a category that occurs only outside the training rows would
# make transform() raise unless the encoder is configured to ignore unknown
# categories -- confirm this cannot happen for the data in use.
X_train = column_transformer.fit_transform(X_train_raw)
X_test = column_transformer.transform(X_test_raw)

# Encoded version of all rows, kept under its original name for any later
# code that still refers to it
X_encoded = column_transformer.transform(X)

# Train the Random Forest model (fixed seed for reproducible results)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on Test Data: {mae:.6f}")

# Function to predict and compare emissions/load for a specific date
def predict_for_date(date_to_predict, *, source=None, parameter=None):
    """Print the predicted and the actual emissions/load ratio for one day.

    Several rows of ``merged_data`` can share the same date, so the optional
    filters choose which of them is reported. When more than one row still
    matches (or no filter is given) the first matching row is used.

    Args:
        date_to_predict: Value accepted by ``pd.Timestamp``, e.g. "2022-06-02".
        source: Optional value the row's 'Source' column must equal.
        parameter: Optional value the row's 'Parameter_emission' column
            must equal.

    Returns:
        None. The prediction, the actual ratio and the feature values of the
        selected row are printed; a notice is printed when no row matches.
    """
    feature_columns = predictors + categorical_features

    # Build the row selection: the date always applies, the rest on request
    row_mask = merged_data['date'] == pd.Timestamp(date_to_predict)
    if source is not None:
        row_mask &= merged_data['Source'] == source
    if parameter is not None:
        row_mask &= merged_data['Parameter_emission'] == parameter
    matches = merged_data[row_mask]

    if matches.empty:
        # Mention the active filters so an over-restrictive filter is easy
        # to tell apart from a date that is missing altogether
        active_filters = [
            f"{label}={value}"
            for label, value in (("source", source), ("parameter", parameter))
            if value is not None
        ]
        filter_note = f" ({', '.join(active_filters)})" if active_filters else ""
        print(f"No data found for the specified date: {date_to_predict}{filter_note}")
        return

    # Use the first matching row for prediction. The one-row frame is what
    # the fitted transformer consumes; the Series is used for printing.
    selected_day_features = matches.iloc[[0]][feature_columns]
    selected_day = matches.iloc[0]

    # Transform features with the already fitted preprocessing and predict
    selected_day_encoded = column_transformer.transform(selected_day_features)
    predicted_emissions_load = model.predict(selected_day_encoded)

    # Output prediction and compare with actual value
    print(f"Prediction for the specified date: {date_to_predict}")
    print(f"Predicted Emissions/Load: {predicted_emissions_load[0]:.6f} tons/MW")
    print("\nActual Data from the Dataset:")
    print(f"Actual Emissions/Load: {selected_day['Emissions_Load']:.6f} tons/MW")
    print(f"Parameters for the Day: {selected_day[feature_columns].to_dict()}")

# Demo call: compare prediction and actual value for a date that is present
# in the dataset
predict_for_date(date_to_predict="2022-06-02")


Mean Absolute Error on Test Data: 0.000089
Prediction for the specified date: 2022-06-02
Predicted Emissions/Load: 0.000023 tons/MW

Actual Data from the Dataset:
Actual Emissions/Load: 0.000025 tons/MW
Parameters for the Day: {'tavg_emission': 26.8, 'tmin_emission': 20.0, 'tmax_emission': 33.9, 'prcp_emission': 3.0, 'snow_emission': 0.0, 'wdir_emission': 81, 'wspd_emission': 12.2, 'pres_emission': 1007.1, 'Source': 'LAKE-1', 'Parameter_emission': 'SO2TONS', 'Units_emission': 'TONS'}
