# In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import matplotlib.dates as mdates

# Generic input file location
input_file = './data/input/revised_synthetic_data.csv'

# Generic output folder locations.
# `output_folder` was referenced below (and when saving the metrics CSV) but
# never defined, which raised NameError. It is set to the parent of the
# existing `plot_folder` so that `plots_folder` resolves to the same directory.
output_folder = './output/input_data_characterization/'
plot_folder = './output/input_data_characterization/plots/'  # kept for compatibility; not used in the visible script

plots_folder = os.path.join(output_folder, 'plots')  # Plots folder path

# Creating the plots folder also creates `output_folder`, its parent.
os.makedirs(plots_folder, exist_ok=True)

# Load data (the original read from `input_data_path`, which is never defined;
# the configured location is `input_file`).
data = pd.read_csv(input_file)

# Preprocess data: parse 'Month' as datetimes and use it as the index
data['Month'] = pd.to_datetime(data['Month'])
data.set_index('Month', inplace=True)

# Conform the index to month-start frequency. NOTE: asfreq does not extend the
# covered period; months missing between the first and last row are inserted
# as NaN rows.
data = data.asfreq('MS')

# Features and target variables
feature_columns = [
    'Waste_Generation',
    'Pricing',
    'Economic_Growth_Rate',
    'Resource_Availability',
    'Energy_Prices',
]
X = data[feature_columns]
y_demand = data['Demand']
y_supply = data['Supply']

# Train-test split. A single call splits the features and both targets with
# the same shuffled row selection, so the demand and supply targets are
# guaranteed to line up with X_train / X_test.
(
    X_train,
    X_test,
    y_train_demand,
    y_test_demand,
    y_train_supply,
    y_test_supply,
) = train_test_split(X, y_demand, y_supply, test_size=0.2, random_state=42)

# Gradient Boosting Regressor model for Demand
gbr_demand = GradientBoostingRegressor(random_state=42)
gbr_demand.fit(X_train, y_train_demand)

# Generate full-range predictions (on the entire dataset, i.e. including the
# rows the model was trained on)
y_pred_demand = gbr_demand.predict(X)

# Gradient Boosting Regressor model for Supply
gbr_supply = GradientBoostingRegressor(random_state=42)
gbr_supply.fit(X_train, y_train_supply)



# Calculate performance metrics
def calculate_metrics(y_true, y_pred):
    """Return the tuple ``(MAE, MSE, RMSE, R²)`` for predictions vs. truth.

    RMSE is derived as the square root of the MSE value.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )



# Supply predictions over the entire dataset (not only the held-out rows)
y_pred_supply = gbr_supply.predict(X)

# Score both models against the full series of actual values
demand_metrics = calculate_metrics(y_demand, y_pred_demand)
supply_metrics = calculate_metrics(y_supply, y_pred_supply)

# One row per metric, one column per target; the row order matches the tuple
# order returned by calculate_metrics
metric_names = ['MAE', 'MSE', 'RMSE', 'R²']
performance_df = pd.DataFrame(
    {'Metric': metric_names, 'Demand': demand_metrics, 'Supply': supply_metrics}
)

# Write the metrics table to the output folder
metrics_csv_path = os.path.join(output_folder, 'GBR_performance_metrics.csv')
performance_df.to_csv(metrics_csv_path, index=False)

print("Performance metrics saved to GBR_performance_metrics.csv")




# Shared x-axis inputs; `date_range` is also read by plot_residuals below.
time_index = data.index  # Use the entire index for full prediction range
# Year-start ticks: 1 January of each year from 2024 through 2029
date_range = pd.date_range(start='2024-01-01', end='2029-12-01', freq='YS')


def _plot_actual_vs_predicted(x_values, actual, predicted, variable, tick_dates,
                              out_dir, file_name):
    """Plot actual and predicted series on one figure and save it as a PDF.

    Args:
        x_values: x-axis values shared by both series.
        actual: observed values (drawn as a solid blue line).
        predicted: model predictions (drawn as a dashed green line).
        variable: name used in the title, legend labels and y-axis label.
        tick_dates: dates at which x ticks are placed; each is labelled with
            its year.
        out_dir: directory the PDF is written to.
        file_name: name of the PDF file inside ``out_dir``.
    """
    plt.figure(figsize=(11, 8))
    plt.plot(x_values, actual, label=f'Actual {variable}', color='blue')
    plt.plot(x_values, predicted, label=f'Predicted {variable}', color='green', linestyle='--')
    plt.title(f'GBR: Actual vs Predicted {variable}', fontsize=25)
    plt.xlabel('Date', fontsize=32)
    plt.ylabel(variable, fontsize=32)
    plt.legend(fontsize=20)
    plt.grid(True)

    # Explicit tick positions and labels. The earlier version also set a
    # YearLocator/DateFormatter and a 20pt tick font first, but these calls
    # replaced all of that, so only the effective settings are kept.
    plt.xticks(tick_dates, tick_dates.year, fontsize=12)
    plt.yticks(fontsize=12)

    # Save the plot to the specified folder
    plt.savefig(os.path.join(out_dir, file_name), format='pdf')
    plt.close()


# Plot Actual vs Predicted for Supply (2024-2029)
_plot_actual_vs_predicted(time_index, y_supply, y_pred_supply, 'Supply', date_range,
                          plots_folder, 'GBR_Supply_Actual_vs_Predicted_Full.pdf')

# Plot Actual vs Predicted for Demand (2024-2029)
_plot_actual_vs_predicted(time_index, y_demand, y_pred_demand, 'Demand', date_range,
                          plots_folder, 'GBR_Demand_Actual_vs_Predicted_Full.pdf')

# Plot residuals for Demand and Supply
def plot_residuals(time_index, actual, predicted, title, y_label, file_name):
    """Plot the residuals ``actual - predicted`` and save the figure as a PDF.

    Args:
        time_index: x-axis values for the residual series.
        actual: observed values.
        predicted: model predictions, subtracted element-wise from ``actual``.
        title: variable name inserted into the plot title.
        y_label: label for the y axis.
        file_name: name of the PDF file written inside ``plots_folder``.

    Reads the module-level ``date_range`` (x tick positions, labelled by year)
    and ``plots_folder`` (output directory).
    """
    residuals = actual - predicted
    plt.figure(figsize=(11, 8))
    plt.plot(time_index, residuals, label='Residuals', color='purple')
    plt.title(f'GBR: Residual Plot for {title}', fontsize=25)
    plt.xlabel('Date', fontsize=32)
    plt.ylabel(y_label, fontsize=32)
    plt.grid(True)

    # Explicit tick positions and labels. The locator/formatter and 20pt tick
    # font that used to be set before this point were replaced by these calls,
    # so only the effective settings are kept.
    plt.xticks(date_range, date_range.year, fontsize=12)
    plt.yticks(fontsize=12)

    # Save the plot to the specified folder
    plt.savefig(os.path.join(plots_folder, file_name), format='pdf')
    plt.close()

# Plot residuals for Demand
plot_residuals(time_index, y_demand, y_pred_demand, 'Demand', 'Residuals', 'GBR_Demand_Residuals.pdf')

# Plot residuals for Supply
plot_residuals(time_index, y_supply, y_pred_supply, 'Supply', 'Residuals', 'GBR_Supply_Residuals.pdf')

# Only the metrics CSV and the plot PDFs are written to disk; the fitted
# models and the raw predictions are not saved, so the message no longer
# claims they are.
print("GBR performance metrics and plots saved successfully!")
