In [None]:
# =====================================================
# 01 - Data Preprocessing for Industrial Energy Forecasting
# =====================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# -----------------------------------------------------
# Load MoSPI Annual Industrial Consumption (cleaned)
# -----------------------------------------------------
# Relative path: it is resolved against the kernel's current working
# directory, so the notebook must be run from a folder where
# ../data/processed/ exists.
path = "../data/processed/industry_consumption_annual_clean.csv"
annual_df = pd.read_csv(path)

# Bare last expression so Jupyter renders the loaded table as the cell output.
# The cells below read the "year" and "industry_gwh" columns of this frame.
annual_df


### Generate daily synthetic series 

In [None]:
# =====================================================
# Create synthetic daily data that sums to MoSPI annual
# =====================================================

# Seed NumPy's global RNG so the noise term below is identical on every run.
np.random.seed(42)

# Use the last row of the annual table as the target year.
# NOTE(review): this assumes annual_df is ordered oldest -> newest; confirm
# against the cleaned CSV.
latest_row = annual_df.iloc[-1]
latest_year = latest_row["year"]
mospi_gwh = latest_row["industry_gwh"]
mospi_mwh = mospi_gwh * 1000  # 1 GWh = 1000 MWh

print("MoSPI Total (MWh):", mospi_mwh)
# The date range below is hard-coded rather than derived from latest_year,
# so surface the year to make any mismatch visible to the reader.
print("Target year (last row of annual data):", latest_year)

# Generate 365 daily values around a realistic pattern
# NOTE(review): start date and length are fixed here; confirm they correspond
# to the year printed above.
days = pd.date_range("2024-04-01", periods=365, freq="D")
n_days = len(days)

base = 1.8e6  # base daily consumption (rescaled below, so only the shape matters)
daily_variation = np.random.normal(0, 0.1e6, n_days)         # day-to-day noise
weekly_pattern = np.sin(np.arange(n_days) * 2*np.pi/7) * 0.1e6  # 7-day sine cycle

daily_mwh = base + daily_variation + weekly_pattern

daily_df = pd.DataFrame({
    "date": days,
    "industry_consumption_mwh": daily_mwh
})

# Rescale so the synthetic series adds up to the MoSPI annual figure.
synthetic_total = daily_df["industry_consumption_mwh"].sum()
scaling_factor = mospi_mwh / synthetic_total

daily_df["industry_consumption_mwh"] *= scaling_factor

# Guard the invariant this cell exists to establish before persisting the data.
assert np.isclose(daily_df["industry_consumption_mwh"].sum(), mospi_mwh), \
    "scaled daily series does not sum to the MoSPI annual total"

daily_df.to_csv("../data/processed/industry_daily_synthetic.csv", index=False)
daily_df.head()


### Generate hourly synthetic data 

In [None]:
# =====================================================
# Create hourly profile from daily totals
# =====================================================

# Relative load shape across the 24 hours of a day (position 0 = hour 0).
# Only the ratios matter: the weights are normalised to sum to 1 so that the
# 24 hourly values of a day add back up to that day's total.
hours = list(range(24))
hour_weights = np.array([0.9,0.8,0.7,0.6,0.5,0.6,0.8,1.2,1.5,1.7,1.8,2.0,
                         2.2,2.1,2.0,1.8,1.6,1.4,1.3,1.1,1.0,0.9,0.8,0.7])

hour_weights = hour_weights / hour_weights.sum()
assert len(hour_weights) == len(hours), "need exactly one weight per hour of the day"

# Vectorised expansion (replaces a row-by-row iterrows() loop):
#   * every date is repeated 24 times and paired with the offsets 0h..23h
#   * np.outer yields a (n_days, 24) grid of daily_total * weight, which is
#     flattened row by row so the values line up with the timestamps
#     (day-major, hour-minor order).
daily_dates = pd.DatetimeIndex(daily_df["date"])
daily_totals = daily_df["industry_consumption_mwh"].to_numpy()
hour_offsets = pd.to_timedelta(hours, unit="h")

hourly_df = pd.DataFrame({
    "datetime": daily_dates.repeat(len(hours)) + np.tile(hour_offsets, len(daily_dates)),
    "industry_consumption_mwh": np.outer(daily_totals, hour_weights).ravel(),
})

# Each block of 24 hourly values must reproduce its daily total.
assert np.allclose(
    hourly_df["industry_consumption_mwh"].to_numpy().reshape(-1, len(hours)).sum(axis=1),
    daily_totals,
    equal_nan=True,
), "hourly values do not add back up to the daily totals"

hourly_df.to_csv("../data/processed/industry_hourly_synthetic.csv", index=False)
hourly_df.head()
