## Prediction Generation Pipeline

This Notebook defines the pipline for generatin prediction from the saved models. It is run by papermill package in Apache Airflow and predictions are generated accordingly. 

In [1]:
# ====================
# 0. ENV SETUP
# ====================
import os
import json
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from tensorflow import keras

##### Set the working directories

In [2]:
import os
from pathlib import Path

# Set data directory to ../data relative to current notebook
data_dir = Path("../..", "data").resolve()

# Change working directory
os.chdir(data_dir)

print(f"Current working directory set to: {os.getcwd()}")


Current working directory set to: C:\Users\alber\anti-malarial-demand-forecast\data


### Load Data

In [3]:
# ====================
# 1. LOAD INPUT DATA
# ====================
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Since you've already changed the current working directory to /opt/data,
# you can now simply refer to the file by its name.
data_filename = "malaria_historical.csv"
data_path = Path(data_filename) # Path object automatically resolves relative to cwd

if not data_path.exists():
    logger.error(f"❌ Data not found at: {data_path.absolute()}") # Use .absolute() for full path in error message
    raise FileNotFoundError(f"{data_filename} not found in the current working directory ({Path.cwd()})")

try:
    raw_data = pd.read_csv(data_path)
    logger.info(f"✅ Successfully loaded {data_filename} from {data_path.absolute()}")
    logger.info(f"First 5 rows of data:\n{raw_data.head()}")
except Exception as e:
    logger.error(f"Error loading data from {data_path.absolute()}: {e}")
    raise

INFO:__main__:✅ Successfully loaded malaria_historical.csv from C:\Users\alber\anti-malarial-demand-forecast\data\malaria_historical.csv
INFO:__main__:First 5 rows of data:
   year  month  district  mal_cases  avg_temp_max  avg_temp_min  avg_humidity  \
0  2020      1      Abim     5945.0         30.00         18.60         51.77   
1  2020      1  Adjumani    25321.0         33.31         20.08         42.10   
2  2020      1     Agago    19090.0         32.09         19.14         48.42   
3  2020      1  Alebtong     1450.0         32.11         19.72         47.16   
4  2020      1  Amolatar     3373.0         29.64         20.06         64.35   

   sum_precipitation  sum_sunshine_hours  
0               55.8              320.49  
1               13.7              327.06  
2               42.8              322.06  
3               43.9              305.70  
4               68.7              314.29  


In [4]:
import pandas as pd
print(type(raw_data))  # Should output: <class 'pandas.core.frame.DataFrame'>
print(raw_data.columns)  # Check if all columns exist

<class 'pandas.core.frame.DataFrame'>
Index(['year', 'month', 'district', 'mal_cases', 'avg_temp_max',
       'avg_temp_min', 'avg_humidity', 'sum_precipitation',
       'sum_sunshine_hours'],
      dtype='object')


In [5]:
import os
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Correct model directory relative to this notebook's cwd
cwd = Path.cwd()
model_dir = cwd.parent / "malaria_models"

model_paths = {
    'Dense': model_dir / "dense.keras",
    'LSTM': model_dir / "LSTM.keras",
    'GRU': model_dir / "GRU.keras",
    'Transformer': model_dir / "transformer.keras",
    'XGBoost': model_dir / "XGBoost.json",  # if you also load XGB separately
}

logger.info(f"📁 Current working directory: {cwd}")
logger.info(f"📦 Looking for models at: {model_dir}")


INFO:__main__:📁 Current working directory: C:\Users\alber\anti-malarial-demand-forecast\data
INFO:__main__:📦 Looking for models at: C:\Users\alber\anti-malarial-demand-forecast\malaria_models


In [6]:
from keras.layers import Layer
from keras.saving import register_keras_serializable
import tensorflow as tf

@register_keras_serializable()
class CastAndClipLayer(Layer):
    def __init__(self, num_districts=118, **kwargs):
        super().__init__(**kwargs)
        self.num_districts = num_districts

    def call(self, inputs):
        clipped = tf.clip_by_value(inputs, 0, self.num_districts - 1)
        return tf.cast(clipped, tf.int32)

    def get_config(self):
        config = super().get_config()
        config.update({
            "num_districts": self.num_districts,
        })
        return config


In [7]:
from keras.models import load_model
import xgboost as xgb

models = {}
custom_objects = {'CastAndClipLayer': CastAndClipLayer}

for name, path in model_paths.items():
    if not path.exists():
        logger.warning(f"⚠️ Model file not found for '{name}' at: {path}")
        continue

    try:
        if name == 'XGBoost':
            booster = xgb.Booster()
            booster.load_model(str(path))
            models[name] = booster
            logger.info(f"✅ Loaded XGBoost model from: {path}")
        else:
            models[name] = load_model(path, custom_objects=custom_objects)
            logger.info(f"✅ Loaded Keras model '{name}' from {path}")
    except Exception as e:
        logger.warning(f"❌ Failed to load model '{name}' from {path}: {e}")






INFO:__main__:✅ Loaded Keras model 'Dense' from C:\Users\alber\anti-malarial-demand-forecast\malaria_models\dense.keras
INFO:__main__:✅ Loaded Keras model 'LSTM' from C:\Users\alber\anti-malarial-demand-forecast\malaria_models\LSTM.keras
INFO:__main__:✅ Loaded Keras model 'GRU' from C:\Users\alber\anti-malarial-demand-forecast\malaria_models\GRU.keras
INFO:__main__:✅ Loaded Keras model 'Transformer' from C:\Users\alber\anti-malarial-demand-forecast\malaria_models\transformer.keras
INFO:__main__:✅ Loaded XGBoost model from: C:\Users\alber\anti-malarial-demand-forecast\malaria_models\XGBoost.json


In [8]:
# Normalization at district level
import pandas as pd
import numpy as np

feature_cols = [
    'avg_temp_max', 'avg_temp_min', 'avg_humidity',
    'sum_precipitation', 'sum_sunshine_hours', 'mal_cases'
]

# Ensure required columns
assert 'district' in raw_data.columns and 'year' in raw_data.columns and 'month' in raw_data.columns

normalized_dfs = []
stats_per_district = {}

# Normalize per district
for district, df_district in raw_data.groupby('district'):
    df_district = df_district.sort_values(['year', 'month']).reset_index(drop=True)
    selected_data = df_district[feature_cols]
    data_values = selected_data.values.astype('float32')

    # Split for normalization
    num_samples = len(data_values)
    num_train = int(0.60 * num_samples)

    mean = data_values[:num_train].mean(axis=0)
    std = data_values[:num_train].std(axis=0)
    std[std < 1e-10] = 1.0  # avoid division by 0

    stats_per_district[district] = {'mean': mean, 'std': std}
    normalized_values = (data_values - mean) / std

    normalized_df = pd.DataFrame(normalized_values, columns=feature_cols, index=selected_data.index)
    normalized_df['district'] = district
    normalized_df['year'] = df_district['year'].values
    normalized_df['month'] = df_district['month'].values

    normalized_dfs.append(normalized_df)

  mean = data_values[:num_train].mean(axis=0)
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = um.true_divide(


In [10]:
#  Combine and Sort Data Globally
normalized_all_data = pd.concat(normalized_dfs).reset_index(drop=True)

# Sort by district, year, month
normalized_all_data = normalized_all_data.sort_values(['district', 'year', 'month']).reset_index(drop=True)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Step 3: Encode Districts and Rearrange Columns
le = LabelEncoder()
normalized_all_data['district_id'] = le.fit_transform(normalized_all_data['district'])

# Move 'mal_cases' to last column and drop original 'district'
cols = [c for c in normalized_all_data.columns if c not in ['mal_cases', 'district']] + ['mal_cases']
normalized_all_data = normalized_all_data[cols]
normalized_all_data.head()

Unnamed: 0,avg_temp_max,avg_temp_min,avg_humidity,sum_precipitation,sum_sunshine_hours,year,month,district_id,mal_cases
0,0.553466,0.731504,-0.664082,-0.487812,0.11032,2020,1,0,-0.189366
1,0.734919,1.080765,-0.430009,-0.788428,0.087186,2020,2,0,-0.875033
2,0.566126,1.337575,0.250164,0.258035,-0.149606,2020,3,0,-0.831246
3,0.085064,0.978041,0.455058,0.748813,-2.350154,2020,4,0,-0.664761
4,-0.86018,-0.131377,1.202016,0.902537,0.811176,2020,5,0,0.576175


In [12]:
# 1. First, let's reconstruct the original LabelEncoder mapping
# (Assuming you don't have the original 'le' object anymore)
unique_district_ids = normalized_all_data['district_id'].unique()
district_names = sorted(normalized_all_data['district'].unique()) if 'district' in normalized_all_data.columns else None

# If you have the original district names column:
if district_names is not None:
    le = LabelEncoder()
    le.fit(district_names)
    label_to_name = {i: name for i, name in enumerate(le.classes_)}
else:
    # If you only have IDs, create a dummy mapping
    print("Warning: Original district names not found, using generated names")
    label_to_name = {i: f"District_{i}" for i in unique_district_ids}

# 2. Verify stats_per_district keys match our label_to_name values
print("Sample district names in mapping:", list(label_to_name.values())[:5])
print("Sample stats_per_district keys:", list(stats_per_district.keys())[:5])

Sample district names in mapping: ['District_0', 'District_1', 'District_2', 'District_3', 'District_4']
Sample stats_per_district keys: ['Abim', 'Adjumani', 'Agago', 'Alebtong', 'Amolatar']


In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
import xgboost as xgb  # <-- Import XGBoost
from datetime import datetime

# Step 1: Ensure Kamuli is removed and create district-to-ID mapping
# This assumes 'raw_data' is already loaded and available.
if 'Kamuli' in raw_data['district'].unique():
    raw_data = raw_data[raw_data['district'] != 'Kamuli'].copy()
unique_districts = sorted(raw_data['district'].unique())
district_to_id = {district: idx for idx, district in enumerate(unique_districts)}
num_districts = len(unique_districts)
print(f"Number of districts being processed: {num_districts}")

# Step 2: Prepare input data with district-level normalization
def prepare_input_data(df, stats_per_district, start_date='2024-11', end_date='2025-04'):
    """
    Prepare input data for prediction, normalizing features per district and skipping incomplete data.
    """
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
    start = pd.to_datetime(start_date + '-01')
    end = pd.to_datetime(end_date + '-01')
    
    input_df = df[(df['date'] >= start) & (df['date'] <= end)].copy()
    
    input_data = []
    valid_districts = []
    valid_indices = []
    
    feature_cols = ['avg_temp_max', 'avg_temp_min', 'avg_humidity', 'sum_precipitation', 'sum_sunshine_hours']
    
    for district_idx, district in enumerate(unique_districts):
        district_data = input_df[input_df['district'] == district].sort_values('date')
        if len(district_data) != 6:
            # print(f"Warning: District {district} has {len(district_data)} months, expected 6. Skipping.")
            continue
            
        district_features = district_data[feature_cols].to_numpy()
        
        if district not in stats_per_district:
            # print(f"Warning: No normalization stats for {district}. Skipping.")
            continue
            
        mean = stats_per_district[district]['mean'][:-1]
        std = stats_per_district[district]['std'][:-1]
        std[std < 1e-10] = 1.0
        normalized_features = (district_features - mean) / std
        
        district_id = np.full((6, 1), district_to_id[district])
        district_input = np.concatenate([normalized_features, district_id], axis=-1)
        
        input_data.append(district_input)
        valid_districts.append(district)
        valid_indices.append(district_idx)
    
    if not input_data:
        raise ValueError("No districts have complete data for the specified window.")
    
    return np.array(input_data), valid_districts, valid_indices

# Step 3: Prepare data for XGBoost
def prepare_xgboost_data(batch_features, num_districts):
    """
    Convert a batch of features to NumPy arrays for XGBoost.
    """
    if isinstance(batch_features, tf.Tensor):
        batch_features = batch_features.numpy()
    
    regular_features = batch_features[:, :, :-1]
    district_ids = batch_features[:, :, -1]
    district_ids_np = np.clip(district_ids[:, -1], 0, num_districts - 1).astype(np.int32)
    
    one_hot_encoder = tf.keras.layers.CategoryEncoding(num_tokens=num_districts, output_mode="one_hot")
    district_ids_encoded = one_hot_encoder(district_ids_np).numpy()
    
    regular_features_flat = regular_features.reshape(batch_features.shape[0], -1)
    combined_features = np.concatenate([regular_features_flat, district_ids_encoded], axis=-1)
    return combined_features, None

# Step 4: Make predictions and denormalize
def make_predictions(models_dict, input_data, valid_districts, stats_per_district):
    """
    Generate and denormalize predictions for May, June, July 2025 for each district.
    """
    predictions = {name: [] for name in models_dict}
    
    for model_name, model in models_dict.items():
        if model_name != 'XGBoost':
            y_pred = model.predict(input_data, verbose=0)
        else:
            # Prepare data specifically for XGBoost
            X_flat, _ = prepare_xgboost_data(input_data, num_districts)
            
            # FIX: Convert NumPy array to DMatrix for XGBoost prediction
            dmatrix_pred = xgb.DMatrix(X_flat)
            y_pred = model.predict(dmatrix_pred)
        
        # Denormalize predictions
        denorm_preds = np.zeros_like(y_pred)
        for i, district in enumerate(valid_districts):
            mean = stats_per_district[district]['mean'][-1]
            std = stats_per_district[district]['std'][-1]
            denorm_preds[i] = y_pred[i] * std + mean
        
        predictions[model_name] = denorm_preds
    
    return predictions

# Step 5: Export predictions to CSV
def export_predictions(predictions, districts, filename='predictions_may_june_july_2025.csv'):
    """
    Export predictions to a CSV file.
    """
    output_data = []
    months = ['May_2025', 'June_2025', 'July_2025']
    
    for district_idx, district in enumerate(districts):
        for model_name in predictions:
            preds = predictions[model_name][district_idx]
            for month_idx, month in enumerate(months):
                output_data.append({
                    'district': district,
                    'model': model_name,
                    'month': month,
                    'predicted_mal_cases': max(0, preds[month_idx]) # Ensure cases are non-negative
                })

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(filename, index=False)
    print(f"✅ Predictions successfully saved to {filename}")

# --- Execution ---
try:
    # This assumes 'models', 'raw_data', and 'stats_per_district' are loaded in the environment
    input_data, valid_districts, valid_indices = prepare_input_data(raw_data, stats_per_district, start_date='2024-11', end_date='2025-04')
    print(f"Found complete data for {len(valid_districts)} out of {num_districts} districts.")
    
    predictions = make_predictions(models, input_data, valid_districts, stats_per_district)
    export_predictions(predictions, valid_districts)
    
    # Optional: Print sample predictions for the first valid district
    if valid_districts:
        first_district = valid_districts[0]
        print(f"\n--- Sample Predictions for {first_district} ---")
        for model_name in predictions:
            preds = predictions[model_name][0]
            print(f"  Model: {model_name}")
            print(f"    - May 2025: {preds[0]:.2f}")
            print(f"    - June 2025: {preds[1]:.2f}")
            print(f"    - July 2025: {preds[2]:.2f}")
            
except (ValueError, NameError) as e:
    print(f"❌ An error occurred: {e}")
    print("Please ensure 'raw_data', 'stats_per_district', and 'models' are loaded correctly.")

Number of districts being processed: 118
Found complete data for 118 out of 118 districts.








✅ Predictions successfully saved to predictions_may_june_july_2025.csv

--- Sample Predictions for Abim ---
  Model: Dense
    - May 2025: 5259.26
    - June 2025: 4791.98
    - July 2025: 4210.12
  Model: LSTM
    - May 2025: 5422.97
    - June 2025: 5053.59
    - July 2025: 5015.16
  Model: GRU
    - May 2025: 5029.68
    - June 2025: 4161.72
    - July 2025: 4774.34
  Model: Transformer
    - May 2025: 5387.88
    - June 2025: 5464.46
    - July 2025: 5531.08
  Model: XGBoost
    - May 2025: 6293.69
    - June 2025: 6281.08
    - July 2025: 6207.02
