## Prediction Generation Pipeline

This Notebook defines the pipline for generatin prediction from the saved models. It is run by papermill package in Apache Airflow and predictions are generated accordingly. 

In [1]:
# ====================
# 0. ENV SETUP
# ====================
import os
import json
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from tensorflow import keras

2025-07-24 06:15:08.926792: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-24 06:15:08.931626: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-24 06:15:09.092467: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-24 06:15:09.689038: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


##### Set the working directories

In [12]:
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Auto-detect parameters injected by Papermill (Airflow) vs local Jupyter
if running_in_airflow:
    cwd = Path(parameters.get("cwd", ".")).resolve()
else:
    # Jupyter or local interactive: force to /opt/data (not the current notebook folder)
    cwd = Path("/opt/data").resolve()

# Set working directory
if running_in_airflow:
    cwd = Path(parameters.get("cwd", ".")).resolve()
else:
    # Jupyter or local interactive: force to /opt/data
    cwd = Path("/opt/data").resolve()

logger.info(f"📁 Working directory set to: {cwd}")


INFO:__main__:📁 Working directory set to: /home/notebooks


### Load Data

In [20]:
# ====================
# 1. LOAD INPUT DATA
# ====================
# Adjust path to account for notebook running in docker/notebooks/
data_path = os.path.join(cwd, "malaria_historical.csv")
if not os.path.exists(data_path):
    logger.error(f"❌ Data not found at: {data_path}")
    raise FileNotFoundError("malaria_historical.csv not found")

raw_data = pd.read_csv(data_path)
raw_data.head()

Unnamed: 0,year,month,district,mal_cases,avg_temp_max,avg_temp_min,avg_humidity,sum_precipitation,sum_sunshine_hours
0,2020,1,Abim,5945.0,30.0,18.6,51.77,55.8,320.49
1,2020,1,Adjumani,25321.0,33.31,20.08,42.1,13.7,327.06
2,2020,1,Agago,19090.0,32.09,19.14,48.42,42.8,322.06
3,2020,1,Alebtong,1450.0,32.11,19.72,47.16,43.9,305.7
4,2020,1,Amolatar,3373.0,29.64,20.06,64.35,68.7,314.29


In [21]:
import pandas as pd
print(type(raw_data))  # Should output: <class 'pandas.core.frame.DataFrame'>
print(raw_data.columns)  # Check if all columns exist

<class 'pandas.core.frame.DataFrame'>
Index(['year', 'month', 'district', 'mal_cases', 'avg_temp_max',
       'avg_temp_min', 'avg_humidity', 'sum_precipitation',
       'sum_sunshine_hours'],
      dtype='object')


In [22]:
print("Column order in selected_data:")
print(selected_data.columns)  # Should match feature_cols

Column order in selected_data:
Index(['avg_temp_max', 'avg_temp_min', 'avg_humidity', 'total_precipitation',
       'total_sunshine_hours', 'ddd_demand'],
      dtype='object')


In [23]:
# Filter rows for Kamuli district
kamuli_data = raw_data[raw_data['district'] == 'Kamuli']

# Display the number of rows and preview the data
print(f"Total samples for Kamuli: {len(kamuli_data)}")
print(kamuli_data.head(10))  # or kamuli_data.tail(), or just kamuli_data to see all

Total samples for Kamuli: 1
      year  month district  mal_cases  avg_temp_max  avg_temp_min  \
7479  2025      4   Kamuli    11911.0         26.57         18.87   

      avg_humidity  sum_precipitation  sum_sunshine_hours  
7479         83.87              178.5              285.57  


In [24]:
import numpy as np
import pandas as pd

# ====================
# 1. FEATURE SELECTION
# ====================
feature_cols = [
    'district', 'year','month', # Add district column
    'avg_temp_max', 'avg_temp_min', 'avg_humidity',
    'sum_precipitation', 'sum_sunshine_hours', 'mal_cases'
]

# Extract selected columns
selected_data = raw_data[feature_cols]

# Convert to NumPy array (keeping district as string)
data_values = selected_data.drop(columns=['district']).values.astype('float32')
district_labels = selected_data['district'].values

In [27]:
# Initialize stats storage
stats_per_district = {}

# Normalize per district using training split
for district, df_district in raw_data.groupby('district'):
    df_district = df_district.sort_values(['year', 'month']).reset_index(drop=True)
    
    # Select only feature columns for normalization (excluding 'district', 'year', 'month')
    selected_features = df_district[['avg_temp_max', 'avg_temp_min', 'avg_humidity',
                                     'sum_precipitation', 'sum_sunshine_hours', 'mal_cases']]
    data_values = selected_features.values.astype('float32')

    # Split for normalization (first 60%)
    num_samples = len(data_values)
    num_train = int(0.60 * num_samples)

    if num_train < 2:
        print(f"⚠️ Not enough training data for district {district}, skipping...")
        continue

    train_values = data_values[:num_train]

    mean = train_values.mean(axis=0)
    std = train_values.std(axis=0)
    std[std < 1e-10] = 1.0  # avoid divide by zero

    stats_per_district[district] = {'mean': mean, 'std': std}

# Print preview
for district in list(stats_per_district.keys())[:3]:
    print(f"📍 {district} → mean: {stats_per_district[district]['mean']}, std: {stats_per_district[district]['std']}")


⚠️ Not enough training data for district Kamuli, skipping...
📍 Abim → mean: [  28.68842    17.887894   62.01184    98.63947   318.86868  6338.5527  ], std: [2.3697593e+00 9.7348303e-01 1.5422560e+01 8.7819649e+01 1.4696355e+01
 2.0782686e+03]
📍 Adjumani → mean: [3.1614996e+01 2.0099737e+01 6.2325531e+01 7.1605270e+01 3.1614636e+02
 2.6186447e+04], std: [2.4268689e+00 8.7700599e-01 1.6375797e+01 5.8883087e+01 1.4303403e+01
 9.1575957e+03]
📍 Agago → mean: [   30.909472    18.830788    60.056313    77.70264    316.6992
 17938.395   ], std: [2.4519389e+00 9.0371865e-01 1.5902541e+01 7.1074310e+01 1.5716954e+01
 8.2685703e+03]


In [29]:
# ====================
# 3. EXTRACT INPUT WINDOW (Last 12 months)
# ====================
input_window = selected_data.iloc[-12:].copy()
input_raw = input_window.values  # shape: (12, 6)

# Normalize using precomputed training mean/std
input_normalized = (input_raw - mean) / std

# Keep only input features (exclude 'ddd_demand')
input_features = input_normalized[:, :-1]  # shape: (12, 5)
input_keras = input_features.reshape(1, 12, 5)  # shape: (1, 12, 5)

In [29]:
# ====================
# 5. LOAD MODELS
# ====================
cwd = Path(parameters.get("cwd", ".")).resolve()
model_paths = {
    'Dense': os.path.join(cwd, "models", "Dense_model.keras"),
    'GRU': os.path.join(cwd, "models", "GRU_model.keras"),
    'LSTM': os.path.join(cwd, "models", "LSTM_model.keras"),
    'transformer': os.path.join(cwd, "models", "transformer_model.keras"),
}

models = {}
for name, path in model_paths.items():
    try:
        models[name] = keras.models.load_model(path)
        logger.info(f"✅ Loaded model: {name}")
    except Exception as e:
        logger.warning(f"⚠️ Could not load model {name}: {e}")



### Make predictions

### Cell 3: Make Predictions and Demornalize

In [32]:
# ====================
# 6. MAKE PREDICTIONS
# ====================
ddd_mean = mean[-1]
ddd_std = std[-1]

for name, model in models.items():
    try:
        y_pred = model.predict(input_keras, verbose=0).flatten()
        y_pred_orig = y_pred * ddd_std + ddd_mean

        logger.info(f"\n📈 {name} Predictions (ddd_demand):")
        for i, month in enumerate(prediction_months):
            print(f"{month}: {y_pred_orig[i]:.3f}")
    except Exception as e:
        logger.error(f"❌ Error predicting with {name}: {e}")

INFO:__main__:
📈 Dense Predictions (ddd_demand):


April 2025: 1.616
May 2025: 1.582
June 2025: 1.504


INFO:__main__:
📈 GRU Predictions (ddd_demand):


April 2025: 1.866
May 2025: 1.671
June 2025: 1.245


INFO:__main__:
📈 LSTM Predictions (ddd_demand):


April 2025: 1.532
May 2025: 1.498
June 2025: 1.413


INFO:__main__:
📈 transformer Predictions (ddd_demand):


April 2025: 1.483
May 2025: 1.610
June 2025: 1.391


In [None]:
### Save Predictions

### Save

In [None]:
# ====================
# 8. SAVE PREDICTIONS
# ====================
from datetime import date
import os
import pandas as pd
import numpy as np

# Define output directory using cwd
output_dir = os.path.join(cwd, "data")
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Output directory created: {output_dir}")

# Determine base month string from last available data
month_str = f"{last_year}_{last_month:02d}"
output_path = os.path.join(output_dir, f"predictions_{month_str}.csv")
logger.info(f"Output path: {output_path}")

# Collect predictions
prediction_records = []

for name, model in models.items():
    try:
        y_pred = model.predict(input_keras, verbose=0).flatten()
        y_pred_orig = y_pred * ddd_std + ddd_mean

        for i, y in enumerate(y_pred_orig):
            pred_year = last_year + ((last_month + i) // 12)
            pred_month = ((last_month + i) % 12) + 1
            pred_date = date(pred_year, pred_month, 1).isoformat()

            prediction_records.append({
                "model_name": name,
                "date": pred_date,
                "predicted_demand": round(float(y), 4)
            })

    except Exception as e:
        logger.error(f"❌ Error predicting with {name}: {e}")

logger.info(f"Number of prediction records: {len(prediction_records)}")

# Convert to DataFrame and save
pred_df = pd.DataFrame(prediction_records)
pred_df.to_csv(output_path, index=False)
logger.info(f"💾 Saved predictions to {output_path}")

# Verify file exists
if os.path.exists(output_path):
    logger.info(f"File confirmed at {output_path}")
else:
    logger.error(f"File not found at {output_path}")

# Preview if running interactively
if not parameters.get("airflow", False):
    display(pred_df)

INFO:__main__:💾 Saved predictions to .\../..\data\predicted_demand_2025_03.csv


Unnamed: 0,model_name,date,predicted_demand
0,Dense,2025-04-01,1.6158
1,Dense,2025-05-01,1.5825
2,Dense,2025-06-01,1.5041
3,GRU,2025-04-01,1.8661
4,GRU,2025-05-01,1.6706
5,GRU,2025-06-01,1.2447
6,LSTM,2025-04-01,1.5321
7,LSTM,2025-05-01,1.4979
8,LSTM,2025-06-01,1.4128
9,transformer,2025-04-01,1.4832
