## Prediction Generation Pipeline

This notebook defines the pipeline for generating predictions from the saved models. It is run by the papermill package in Apache Airflow, and predictions are generated accordingly.

In [1]:
# ====================
# 0. ENV SETUP
# ====================
import os
import json
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from tensorflow import keras

##### Set the working directories

In [23]:
import logging
import os
import pandas as pd
from pathlib import Path

# Root logging at INFO so messages surface in the Airflow task log
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Reuse a `parameters` dict if one is already defined in the namespace
# (e.g. injected by Papermill); otherwise fall back to local defaults.
parameters = globals().get("parameters", {"cwd": "."})

# Base directory that all data/model paths below are resolved against
cwd = parameters.get("cwd", ".")
logger.info(f"📁 Working directory: {cwd}")

INFO:__main__:📁 Working directory: .


### Load Data

In [None]:
# ====================
# 1. LOAD INPUT DATA
# ====================
# The CSV is resolved relative to the working directory taken from `parameters`.
data_path = os.path.join(cwd, "data", "raw_climate_ddd_merged_data.csv")
if not os.path.exists(data_path):
    logger.error(f"❌ Data not found at: {data_path}")
    # Carry the resolved path in the exception so the traceback alone is diagnosable
    raise FileNotFoundError(f"raw_climate_ddd_merged_data.csv not found at: {data_path}")

raw_data = pd.read_csv(data_path)

# Columns handed to the models, in the order used downstream.
# 'ddd_demand' (the target) must stay last: later cells slice it off with [:, :-1].
feature_cols = [
    'avg_temp_max', 'avg_temp_min', 'avg_humidity',
    'total_precipitation', 'total_sunshine_hours', 'ddd_demand'
]

# Fail early with a precise message instead of a bare pandas KeyError here or,
# for 'year'/'month' (read later to label the forecast months), several cells on.
required_cols = feature_cols + ['year', 'month']
missing_cols = [col for col in required_cols if col not in raw_data.columns]
if missing_cols:
    logger.error(f"❌ Missing required columns in {data_path}: {missing_cols}")
    raise KeyError(f"Missing required columns in {data_path}: {missing_cols}")

# float32 copy of just the model columns
selected_data = raw_data[feature_cols].astype('float32')

In [26]:
# Sanity check: the printed column order should match feature_cols
print("Column order in selected_data:", selected_data.columns, sep="\n")

Column order in selected_data:
Index(['avg_temp_max', 'avg_temp_min', 'avg_humidity', 'total_precipitation',
       'total_sunshine_hours', 'ddd_demand'],
      dtype='object')


In [None]:
# ====================
# 2. LOAD NORMALIZATION PARAMS
# ====================
norm_path = os.path.join(cwd, "models", "normalization_params.json")
if not os.path.exists(norm_path):
    logger.error(f"❌ Normalization file not found at: {norm_path}")
    # Carry the resolved path in the exception so the traceback alone is diagnosable
    raise FileNotFoundError(f"normalization_params.json not found at: {norm_path}")

with open(norm_path, "r", encoding="utf-8") as f:
    norm_params = json.load(f)

# Per-column statistics; presumably ordered like feature_cols — verify against
# the training code that wrote this file.
mean = np.array(norm_params["mean"], dtype=float)
std = np.array(norm_params["std"], dtype=float)

# Reject malformed parameter files up front rather than letting them broadcast
# or propagate NaN silently during normalization.
if mean.ndim != 1 or mean.shape != std.shape:
    logger.error(f"❌ Malformed normalization parameters in {norm_path}")
    raise ValueError(
        f"'mean' and 'std' must be 1-D and of equal length, "
        f"got shapes {mean.shape} and {std.shape}"
    )
if not (np.isfinite(mean).all() and np.isfinite(std).all()):
    logger.error(f"❌ Non-finite normalization parameters in {norm_path}")
    raise ValueError("Normalization parameters contain NaN or infinite values")

std[std < 1e-10] = 1.0  # Avoid divide by zero

logger.info("✅ Loaded normalization parameters.")
print("Std values (after clipping):", std)
# Only std is clipped above; the mean is reported exactly as loaded.
print("Mean values:", mean)

INFO:__main__:✅ Loaded normalization parameters.


Std values (after clipping): [1.62568307e+00 5.85483909e-01 8.69638824e+00 6.30080176e+03
 1.92188159e+03 2.59629726e-01]
Mean values (after clipping): [2.74350853e+01 1.78718662e+01 7.20621948e+01 1.27884971e+04
 3.68978633e+04 1.39756978e+00]


In [29]:
# ====================
# 3. EXTRACT INPUT WINDOW (Last 12 months)
# ====================
# Number of trailing rows fed to the models (assumes one row per month — TODO confirm)
WINDOW_SIZE = 12

if len(selected_data) < WINDOW_SIZE:
    logger.error(f"❌ Only {len(selected_data)} rows available; {WINDOW_SIZE} required")
    raise ValueError(
        f"Need at least {WINDOW_SIZE} rows to build the input window, "
        f"got {len(selected_data)}"
    )

input_window = selected_data.iloc[-WINDOW_SIZE:].copy()
input_raw = input_window.values  # shape: (WINDOW_SIZE, n_columns)

# The statistics must line up one-to-one with the columns; a length mismatch
# could otherwise broadcast silently and produce wrongly scaled inputs.
n_columns = input_raw.shape[1]
if mean.shape != (n_columns,) or std.shape != (n_columns,):
    logger.error("❌ Normalization parameters do not match the input columns")
    raise ValueError(
        f"Expected {n_columns} mean/std values, "
        f"got shapes {mean.shape} and {std.shape}"
    )

# Normalize using precomputed training mean/std
input_normalized = (input_raw - mean) / std

# Keep only input features (exclude 'ddd_demand', the last column)
input_features = input_normalized[:, :-1]  # shape: (WINDOW_SIZE, n_columns - 1)

# Only the features are checked: the target column is dropped above, so gaps
# in 'ddd_demand' do not affect the model input.
if np.isnan(input_features).any():
    logger.error("❌ Input window contains missing feature values")
    raise ValueError("Input window contains missing feature values; cannot generate predictions")

input_keras = input_features.reshape(1, WINDOW_SIZE, n_columns - 1)  # (1, 12, 5) for the current columns

In [30]:
# ====================
# 4. DETERMINE TARGET MONTHS
# ====================
# The final row of the full dataset supplies the most recent year/month
# (assumes rows are in chronological order — TODO confirm)
last_year = int(raw_data["year"].iat[-1])
last_month = int(raw_data["month"].iat[-1])

# Build "Month YYYY" labels for the three months that follow it
prediction_months = []
for i in (1, 2, 3):
    next_month = last_month + i
    # Work zero-based so December rolls over into January of the following year
    year_offset, month_index = divmod(next_month - 1, 12)
    pred_year = last_year + year_offset
    pred_month = month_index + 1
    label = datetime(pred_year, pred_month, 1).strftime("%B %Y")
    prediction_months.append(label)

logger.info(f"📆 Predicting for: {', '.join(prediction_months)}")

INFO:__main__:📆 Predicting for: April 2025, May 2025, June 2025


In [None]:
# ====================
# 5. LOAD MODELS
# ====================
# All saved models live side by side under <cwd>/models
models_dir = os.path.join(cwd, "models")
model_files = [
    ('Dense', "Dense_model.keras"),
    ('GRU', "GRU_model.keras"),
    ('LSTM', "LSTM_model.keras"),
    ('transformer', "transformer_model.keras"),
]
model_paths = {
    label: os.path.join(models_dir, filename)
    for label, filename in model_files
}

# Best-effort loading: a model that fails to load is reported and skipped,
# so the remaining models can still be used.
models = {}
for name, path in model_paths.items():
    try:
        loaded = keras.models.load_model(path)
    except Exception as e:
        logger.warning(f"⚠️ Could not load model {name}: {e}")
    else:
        models[name] = loaded
        logger.info(f"✅ Loaded model: {name}")

INFO:__main__:✅ Loaded model: Dense
INFO:__main__:✅ Loaded model: GRU
INFO:__main__:✅ Loaded model: LSTM
INFO:__main__:✅ Loaded model: transformer


In [None]:
### Make predictions

### Cell 3: Make Predictions and Denormalize

In [32]:
# ====================
# 6. MAKE PREDICTIONS
# ====================
# Target statistics are taken from the last slot, matching 'ddd_demand'
# being the last entry of feature_cols.
ddd_mean, ddd_std = mean[-1], std[-1]

for name, model in models.items():
    try:
        raw_output = model.predict(input_keras, verbose=0)
        y_pred = raw_output.flatten()
        # Undo the normalization to express the forecast in original units
        y_pred_orig = y_pred * ddd_std + ddd_mean

        logger.info(f"\n📈 {name} Predictions (ddd_demand):")
        # One line per forecast month, paired by position with the model output
        for i, month in enumerate(prediction_months):
            print(f"{month}: {y_pred_orig[i]:.3f}")
    except Exception as e:
        # Report the failure and continue with the next model
        logger.error(f"❌ Error predicting with {name}: {e}")

INFO:__main__:
📈 Dense Predictions (ddd_demand):


April 2025: 1.616
May 2025: 1.582
June 2025: 1.504


INFO:__main__:
📈 GRU Predictions (ddd_demand):


April 2025: 1.866
May 2025: 1.671
June 2025: 1.245


INFO:__main__:
📈 LSTM Predictions (ddd_demand):


April 2025: 1.532
May 2025: 1.498
June 2025: 1.413


INFO:__main__:
📈 transformer Predictions (ddd_demand):


April 2025: 1.483
May 2025: 1.610
June 2025: 1.391


In [None]:
### Save Predictions

### Save

In [None]:
# ====================
# 7. SAVE PREDICTIONS
# ====================
from datetime import date
import os
import pandas as pd
import numpy as np

# Define output directory using cwd
output_dir = os.path.join(cwd, "data")
os.makedirs(output_dir, exist_ok=True)

# The file name is keyed on the last month of available data, not on the forecast months
month_str = f"{last_year}_{last_month:02d}"
output_path = os.path.join(output_dir, f"predicted_demand_{month_str}.csv")

# One record per (model, forecast month)
prediction_records = []

for name, model in models.items():
    try:
        y_pred = model.predict(input_keras, verbose=0).flatten()
        # Undo the normalization to express the forecast in original units
        y_pred_orig = y_pred * ddd_std + ddd_mean

        for i, y in enumerate(y_pred_orig):
            # Step i (0-based) lands on month last_month + i + 1; divmod on the
            # zero-based month handles the December -> January rollover.
            year_offset, month_index = divmod(last_month + i, 12)
            pred_year = last_year + year_offset
            pred_month = month_index + 1
            pred_date = date(pred_year, pred_month, 1).isoformat()

            prediction_records.append({
                "model_name": name,
                "date": pred_date,
                "predicted_demand": round(float(y), 4)
            })

    except Exception as e:
        # Best effort per model: report and continue with the remaining models
        logger.error(f"❌ Error predicting with {name}: {e}")

# If no model loaded or every prediction failed, stop here instead of writing
# an empty, header-less CSV and reporting success to the scheduler.
if not prediction_records:
    logger.error("❌ No predictions were generated; nothing to save")
    raise RuntimeError("No predictions were generated by any model")

# Convert to DataFrame (explicit column order) and save
pred_df = pd.DataFrame(
    prediction_records,
    columns=["model_name", "date", "predicted_demand"],
)
pred_df.to_csv(output_path, index=False)

logger.info(f"💾 Saved predictions to {output_path}")

# Preview unless the run is flagged as an Airflow run via parameters
if not parameters.get("airflow", False):
    display(pred_df)

INFO:__main__:💾 Saved predictions to .\../..\data\predicted_demand_2025_03.csv


Unnamed: 0,model_name,date,predicted_demand
0,Dense,2025-04-01,1.6158
1,Dense,2025-05-01,1.5825
2,Dense,2025-06-01,1.5041
3,GRU,2025-04-01,1.8661
4,GRU,2025-05-01,1.6706
5,GRU,2025-06-01,1.2447
6,LSTM,2025-04-01,1.5321
7,LSTM,2025-05-01,1.4979
8,LSTM,2025-06-01,1.4128
9,transformer,2025-04-01,1.4832
