# In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.dates as mdates

# CSV file that holds the monthly input series.
input_file = './data/input/revised_synthetic_data.csv'

# Root output folder; predictions and figures each go to a sub-folder of it.
# NOTE(review): the root already ends in 'plots/', so figures are written to
# '.../plots/plots/' -- confirm that this nesting is intended.
output_folder = './output/input_data_characterization/plots/'
predictions_folder, plots_folder = (
    os.path.join(output_folder, sub_folder)
    for sub_folder in ('predictions/', 'plots/')
)

# Make sure both destinations exist before anything is written to them.
for _folder in (predictions_folder, plots_folder):
    os.makedirs(_folder, exist_ok=True)

# Read the CSV, parse the 'Month' column as datetimes, index the frame by it
# and conform the index to month-start ('MS') frequency.
# NOTE(review): the data is expected to span 2024-2029 per the original notes;
# asfreq only regularises the index, it does not check that coverage.
data = (
    pd.read_csv(input_file)
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .set_index('Month')
    .asfreq('MS')
)

# Model inputs and the two series that are compared against predictions.
X = data[['Supply', 'Pricing']]
y_demand, y_supply = data['Demand'], data['Supply']

# Cut-off dates used for the label-based (inclusive) slices below.
train_end_date = '2027-12-01'   # last month of the training window
full_range_end = '2029-12-01'   # last month of the prediction window

# Training window vs. the full window (the latter includes the training
# months, it is not a pure test set).
X_train, X_full_range = X.loc[:train_end_date], X.loc[:full_range_end]
y_train_demand, y_full_range_demand = (
    y_demand.loc[:train_end_date],
    y_demand.loc[:full_range_end],
)
y_train_supply, y_full_range_supply = (
    y_supply.loc[:train_end_date],
    y_supply.loc[:full_range_end],
)

# Fit a linear regression of demand on the feature columns, using only the
# training window.
model = LinearRegression().fit(X_train, y_train_demand)

# Predict over the whole window (training months included).
predicted_demand = model.predict(X_full_range)
# NOTE(review): placeholder -- the supply "prediction" is produced by the
# demand model, so it is numerically identical to predicted_demand and every
# supply output derived from it (CSV, metrics, plots) compares demand
# predictions with actual supply. Replace with a dedicated supply model.
predicted_supply = model.predict(X_full_range)

# Wrap the raw arrays in date-indexed frames and write them out as CSV.
predictions_demand = pd.DataFrame(
    {'Predicted_Demand': predicted_demand}, index=X_full_range.index
)
predictions_supply = pd.DataFrame(
    {'Predicted_Supply': predicted_supply}, index=X_full_range.index
)

predictions_demand.to_csv(
    os.path.join(predictions_folder, 'Linear_Regression_Demand_Predictions.csv')
)
predictions_supply.to_csv(
    os.path.join(predictions_folder, 'Linear_Regression_Supply_Predictions.csv')
)

def _regression_metrics(y_true, y_pred):
    """Return [MAE, MSE, RMSE, R²] for one actual/predicted pair.

    Args:
        y_true: observed values for the evaluation period.
        y_pred: predictions for the same rows, in the same order.

    Returns:
        A four-element list ordered like the 'Metric' column written below.
    """
    mse = mean_squared_error(y_true, y_pred)
    return [
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    ]


# Hold-out period: every month of the prediction window that lies after the
# training cut-off (2028-01 .. 2029-12 with the current cut-off dates).
# Deriving it from the split keeps the evaluation window from drifting away
# from train_end_date / full_range_end.
test_index = X_full_range.index[X_full_range.index > pd.Timestamp(train_end_date)]
if test_index.empty:
    raise ValueError(
        f'No rows after {train_end_date} within the prediction range; '
        'cannot compute test-period metrics.'
    )

y_test_demand = y_demand.loc[test_index]
y_test_supply = y_supply.loc[test_index]

# Pick the matching predictions by date label rather than by tail position:
# a positional `[-len(y_test):]` slice silently returns the whole array when
# the test window is empty and misaligns if the two ranges ever differ.
predicted_demand_test = (
    pd.Series(predicted_demand, index=X_full_range.index).loc[test_index].to_numpy()
)
predicted_supply_test = (
    pd.Series(predicted_supply, index=X_full_range.index).loc[test_index].to_numpy()
)

demand_mae, demand_mse, demand_rmse, demand_r2 = _regression_metrics(
    y_test_demand, predicted_demand_test
)
supply_mae, supply_mse, supply_rmse, supply_r2 = _regression_metrics(
    y_test_supply, predicted_supply_test
)

# Save performance metrics to CSV (one row per metric, one column per target)
performance_metrics = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R²'],
    'Demand': [demand_mae, demand_mse, demand_rmse, demand_r2],
    'Supply': [supply_mae, supply_mse, supply_rmse, supply_r2],
})

performance_metrics.to_csv(
    os.path.join(predictions_folder, 'Linear_Regression_Performance_Metrics.csv'),
    index=False,
)

def _save_time_series_plot(lines, title, ylabel, filename, show_legend=True):
    """Draw date-indexed line(s) on one figure and save it as a PDF.

    Args:
        lines: iterable of ``(x, y, plot_kwargs)`` tuples; each tuple becomes
            one ``Axes.plot`` call, drawn in the given order.
        title: figure title.
        ylabel: label of the y-axis (the x-axis is always 'Date').
        filename: file name of the PDF, created inside ``plots_folder``.
        show_legend: draw a legend from the ``label`` entries of the lines.
    """
    fig, ax = plt.subplots(figsize=(11, 8))
    try:
        for x_values, y_values, line_kwargs in lines:
            ax.plot(x_values, y_values, **line_kwargs)
        ax.set_title(title, fontsize=25)
        ax.set_xlabel('Date', fontsize=32)
        ax.set_ylabel(ylabel, fontsize=32)
        if show_legend:
            ax.legend(fontsize=20)
        ax.grid(True)

        # One major tick per year, labelled with the four-digit year.
        ax.xaxis.set_major_locator(mdates.YearLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
        # Set the tick-label size once, after the locator is in place.
        ax.tick_params(axis='both', labelsize=20)

        fig.savefig(os.path.join(plots_folder, filename), format='pdf')
    finally:
        # Release the figure even when saving fails.
        plt.close(fig)


# Residuals over the full range (actual minus predicted).
demand_residuals = y_full_range_demand - predicted_demand
supply_residuals = y_full_range_supply - predicted_supply

_plot_targets = (
    ('Demand', y_full_range_demand, predicted_demand, demand_residuals),
    ('Supply', y_full_range_supply, predicted_supply, supply_residuals),
)

# Actual vs predicted, one figure per target (full range).
for _name, _actual, _predicted, _ in _plot_targets:
    _save_time_series_plot(
        [
            (_actual.index, _actual,
             {'label': f'Actual {_name}', 'color': 'blue'}),
            (_actual.index, _predicted,
             {'label': f'Predicted {_name}', 'color': 'orange', 'linestyle': '--'}),
        ],
        title=f'Linear Regression: Actual vs Predicted {_name}',
        ylabel=_name,
        filename=f'LR_{_name.lower()}_actual_vs_predicted_full.pdf',
    )

# Residuals over time, one figure per target (full range, no legend).
for _name, _actual, _, _residuals in _plot_targets:
    _save_time_series_plot(
        [(_actual.index, _residuals, {})],
        title=f'Linear Regression: Residuals Plot for {_name}',
        ylabel='Residuals',
        filename=f'LR_{_name.lower()}_residuals_full.pdf',
        show_legend=False,
    )

print("Full date range (2024-2029) plots and metrics generated successfully!")
