# In [12]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.dates as mdates

# Directory layout: all inputs and outputs live beneath a single base folder.
# The trailing slashes are kept on purpose so that appending a file name to
# any of these strings by plain concatenation still yields a valid path.
base_folder = './output/input_data_characterization/plots/'
input_file = os.path.join(base_folder, 'inputdata/revised_realistic_synthetic_data_with_improvements.csv')
output_folder = os.path.join(base_folder, 'outputdata/Random_Forest/')
model_files_folder, predictions_folder, plots_folder = (
    os.path.join(output_folder, subfolder)
    for subfolder in ('model_files/', 'predictions/', 'plots/')
)

# Create every output directory up front; directories that already exist are
# left untouched.
for _folder in (model_files_folder, predictions_folder, plots_folder):
    os.makedirs(_folder, exist_ok=True)

# Read the CSV, parse the 'Month' column into timestamps and use it as index.
data = pd.read_csv(input_file)
data = data.assign(Month=pd.to_datetime(data['Month'])).set_index('Month')

# Conform the index to a month-start ('MS') frequency. This spans the first to
# the last timestamp present in the file; it does not by itself enforce a
# 2024-2029 range, and any month missing from the file becomes a NaN row.
data = data.asfreq('MS')

# Predictor columns shared by both models, and the two regression targets.
feature_columns = ['Waste_Generation', 'Pricing', 'Economic_Growth_Rate', 'Resource_Availability', 'Energy_Prices']
X = data[feature_columns]
y_demand, y_supply = data['Demand'], data['Supply']

# Hold out 20% of the rows. Passing both targets to a single call applies one
# shuffled partition (seeded with random_state=42) to X, Demand and Supply.
# NOTE(review): the split is shuffled, so it does not respect time order.
(
    X_train, X_test,
    y_train_demand, y_test_demand,
    y_train_supply, y_test_supply,
) = train_test_split(X, y_demand, y_supply, test_size=0.2, random_state=42)

# One 100-tree forest per target, both fitted on the same training rows.
rf_demand = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_demand)
rf_supply = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_supply)

# Full-range predictions: every row of X, training and held-out alike.
y_pred_demand = rf_demand.predict(X)
y_pred_supply = rf_supply.predict(X)

# x-axis values for the plots: the complete data index, plus one tick at the
# start of each year from 2024 through 2029.
time_index = data.index
date_range = pd.date_range(start='2024-01-01', end='2029-12-01', freq='YS')

def _plot_actual_vs_predicted(dates, actual, predicted, quantity, year_ticks, file_name):
    """Save a PDF line chart of actual versus predicted values for one target.

    Args:
        dates: x-axis values shared by both series.
        actual: observed values, drawn as a solid blue line.
        predicted: model output, drawn as a dashed green line.
        quantity: target name used in the title, legend and y-axis label.
        year_ticks: datetime index whose entries become the x-tick positions
            and whose ``.year`` values become the tick labels.
        file_name: name of the PDF written inside ``plots_folder``.
    """
    fig = plt.figure(figsize=(11, 8))
    try:
        plt.plot(dates, actual, label=f'Actual {quantity}', color='blue')
        plt.plot(dates, predicted, label=f'Predicted {quantity}', color='green', linestyle='--')
        plt.title(f'Random Forest: Actual vs Predicted {quantity}', fontsize=25)
        plt.xlabel('Date', fontsize=32)
        plt.ylabel(quantity, fontsize=32)
        plt.legend(fontsize=20)
        plt.grid(True)

        # Explicit tick positions and labels install a fixed locator and
        # formatter, so no separate YearLocator/DateFormatter is needed here.
        plt.xticks(year_ticks, year_ticks.year, fontsize=20)
        plt.yticks(fontsize=20)

        plt.savefig(os.path.join(plots_folder, file_name), format='pdf')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)


# Plot Actual vs Predicted for Supply (2024-2029)
_plot_actual_vs_predicted(
    time_index, y_supply, y_pred_supply, 'Supply', date_range,
    'Random_Forest_supply_actual_vs_predicted_full.pdf',
)

# Plot Actual vs Predicted for Demand (2024-2029)
_plot_actual_vs_predicted(
    time_index, y_demand, y_pred_demand, 'Demand', date_range,
    'Random_Forest_demand_actual_vs_predicted_full.pdf',
)

# Calculate performance metrics
def calculate_metrics(y_true, y_pred):
    """Score predictions against the observed values.

    Args:
        y_true: observed target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``, where ``rmse`` is the square root of
        ``mse``.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        abs_error,
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )

# Calculate and save performance metrics.
# The full-range scores cover every row, including the 80% the forests were
# fitted on, so they describe the plotted curves rather than how well the
# models generalise.
demand_metrics = calculate_metrics(y_demand, y_pred_demand)
supply_metrics = calculate_metrics(y_supply, y_pred_supply)

# Held-out scores use only the 20% of rows the models never saw during
# fitting; these are the figures to quote as predictive accuracy.
demand_test_metrics = calculate_metrics(y_test_demand, rf_demand.predict(X_test))
supply_test_metrics = calculate_metrics(y_test_supply, rf_supply.predict(X_test))

# The original 'Demand' and 'Supply' columns are unchanged; the held-out
# columns are appended after them.
performance_df = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R²'],
    'Demand': demand_metrics,
    'Supply': supply_metrics,
    'Demand_Test': demand_test_metrics,
    'Supply_Test': supply_test_metrics,
})
performance_df.to_csv(os.path.join(output_folder, 'Random_Forest_performance_metrics.csv'), index=False)




# Plot residuals for Demand and Supply; the x-axis defaults to 2024 through 2029
def plot_residuals(time_index, actual, predicted, title, y_label, file_name,
                   x_start='2024-01-01', x_end='2029-12-31'):
    """Save a PDF line chart of the residuals (actual minus predicted).

    Args:
        time_index: x-axis values for the residual series.
        actual: observed target values.
        predicted: model predictions aligned with ``actual``.
        title: target name inserted into the plot title.
        y_label: label for the y-axis.
        file_name: name of the PDF written inside ``plots_folder``.
        x_start: left limit of the x-axis, anything ``pd.Timestamp`` accepts.
        x_end: right limit of the x-axis, anything ``pd.Timestamp`` accepts.
    """
    residuals = actual - predicted
    fig = plt.figure(figsize=(11, 8))
    try:
        plt.plot(time_index, residuals, label='Residuals', color='purple')

        # Title names the target being plotted
        plt.title(f'Random Forest: Residual Plot for {title}', fontsize=25)

        # Axis labels
        plt.xlabel('Date', fontsize=32)
        plt.ylabel(y_label, fontsize=32)
        plt.grid(True)

        # Fix the visible window; data outside [x_start, x_end] is clipped
        plt.xlim([pd.Timestamp(x_start), pd.Timestamp(x_end)])

        # One major tick per year, labelled with the four-digit year
        axis = plt.gca().xaxis
        axis.set_major_locator(mdates.YearLocator())
        axis.set_major_formatter(mdates.DateFormatter('%Y'))

        # Tick label font sizes
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)

        # Save the plot
        plt.savefig(os.path.join(plots_folder, file_name), format='pdf')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

# Draw one residual plot per target: (title, actual, predicted, output file).
_residual_jobs = (
    ('Demand', y_demand, y_pred_demand, 'Random_Forest_Demand_Residuals.pdf'),
    ('Supply', y_supply, y_pred_supply, 'Random_Forest_Supply_Residuals.pdf'),
)
for _target, _observed, _forecast, _pdf_name in _residual_jobs:
    plot_residuals(time_index, _observed, _forecast, _target, 'Residuals', _pdf_name)
