## Prediction Generation Pipeline

This notebook defines the pipeline for generating predictions from the saved models. It is executed by the papermill package in Apache Airflow, and the predictions are generated accordingly.

In [7]:
# ====================
# 0. ENV SETUP
# ====================
import os
import json
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from tensorflow import keras

##### Set the working directories

In [10]:
import logging
from pathlib import Path
import os

# Logging: INFO-level root configuration plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Execution environment ---
# A non-empty AIRFLOW_HOME or AIRFLOW_CTX_DAG_ID environment variable is taken
# as the signal that the notebook is being executed by Airflow.
running_in_airflow = any(
    os.environ.get(env_var) for env_var in ('AIRFLOW_HOME', 'AIRFLOW_CTX_DAG_ID')
)
if running_in_airflow:
    logger.info("Notebook detected as running in Airflow.")
else:
    logger.info("Notebook detected as running in a non-Airflow (likely Jupyter) environment.")

# --- Target data directory ---
# The same absolute location is used in both environments. Outside Airflow the
# starting directory is only recorded and logged; adjust `target_data_dir` here
# if the data volume is mounted somewhere other than /opt/data.
target_data_dir = Path("/opt/data")

if not running_in_airflow:
    current_notebook_dir = Path(os.getcwd())
    logger.info(f"Current working directory in Jupyter: {current_notebook_dir}")

# --- Make sure the directory is there (best effort) ---
# A failure to create it is logged, not raised.
if not target_data_dir.exists():
    logger.warning(f"Target data directory '{target_data_dir}' does not exist. Attempting to create it.")
    try:
        target_data_dir.mkdir(parents=True, exist_ok=True)
    except OSError as mkdir_error:
        logger.error(f"Failed to create directory {target_data_dir}: {mkdir_error}")

# --- Switch the working directory (best effort) ---
# After this, relative paths used by later cells resolve against target_data_dir.
# A failure to switch is logged, not raised.
if Path.cwd() != target_data_dir:
    try:
        os.chdir(target_data_dir)
        logger.info(f"Changed current working directory to: {os.getcwd()}")
    except OSError as chdir_error:
        logger.error(f"Failed to change working directory to {target_data_dir}: {chdir_error}")

# Report the directory that is actually in effect (it may differ from the target
# if the chdir above failed).
logger.info(f"Final effective data directory for operations: {Path.cwd()}")

INFO:__main__:Notebook detected as running in a non-Airflow (likely Jupyter) environment.
INFO:__main__:Current working directory in Jupyter: /home/notebooks
INFO:__main__:Changed current working directory to: /opt/data
INFO:__main__:Final effective data directory for operations: /opt/data


### Load Data

In [12]:
# ====================
# 1. LOAD INPUT DATA
# ====================

# The file is addressed by bare name, so it is looked up relative to the
# current working directory.
data_filename = "malaria_historical.csv"
data_path = Path(data_filename)

if data_path.exists():
    try:
        raw_data = pd.read_csv(data_path)
        logger.info(f"✅ Successfully loaded {data_filename} from {data_path.absolute()}")
        logger.info(f"First 5 rows of data:\n{raw_data.head()}")
    except Exception as load_error:
        # Log with the full path for easier debugging, then let the failure propagate.
        logger.error(f"Error loading data from {data_path.absolute()}: {load_error}")
        raise
else:
    # Missing input is fatal: report the absolute location and abort.
    logger.error(f"❌ Data not found at: {data_path.absolute()}")
    raise FileNotFoundError(f"{data_filename} not found in the current working directory ({Path.cwd()})")

INFO:__main__:✅ Successfully loaded malaria_historical.csv from /opt/data/malaria_historical.csv
INFO:__main__:First 5 rows of data:
   year  month  district  mal_cases  avg_temp_max  avg_temp_min  avg_humidity  \
0  2020      1      Abim     5945.0         30.00         18.60         51.77   
1  2020      1  Adjumani    25321.0         33.31         20.08         42.10   
2  2020      1     Agago    19090.0         32.09         19.14         48.42   
3  2020      1  Alebtong     1450.0         32.11         19.72         47.16   
4  2020      1  Amolatar     3373.0         29.64         20.06         64.35   

   sum_precipitation  sum_sunshine_hours  
0               55.8              320.49  
1               13.7              327.06  
2               42.8              322.06  
3               43.9              305.70  
4               68.7              314.29  


In [13]:
import pandas as pd

# Sanity check: show the container type (expected to be a pandas DataFrame)
# followed by the column index, one per line.
print(type(raw_data), raw_data.columns, sep="\n")

<class 'pandas.core.frame.DataFrame'>
Index(['year', 'month', 'district', 'mal_cases', 'avg_temp_max',
       'avg_temp_min', 'avg_humidity', 'sum_precipitation',
       'sum_sunshine_hours'],
      dtype='object')


In [15]:
# Keep only the rows whose district is exactly 'Kamuli'.
is_kamuli = raw_data['district'] == 'Kamuli'
kamuli_data = raw_data[is_kamuli]

# Report how many rows matched, then preview up to the first ten of them.
kamuli_row_count = len(kamuli_data)
print(f"Total samples for Kamuli: {kamuli_row_count}")
print(kamuli_data.head(10))

Total samples for Kamuli: 1
      year  month district  mal_cases  avg_temp_max  avg_temp_min  \
7479  2025      4   Kamuli    11911.0         26.57         18.87   

      avg_humidity  sum_precipitation  sum_sunshine_hours  
7479         83.87              178.5              285.57  


In [16]:
import numpy as np
import pandas as pd

# ====================
# 1. FEATURE SELECTION
# ====================
# Columns carried forward: the district identifier, the time index, the weather
# variables and the malaria case count.
feature_cols = [
    'district', 'year', 'month',
    'avg_temp_max', 'avg_temp_min', 'avg_humidity',
    'sum_precipitation', 'sum_sunshine_hours', 'mal_cases'
]

# Restrict the raw frame to those columns.
selected_data = raw_data[feature_cols]

# Numeric part as a float32 array (every selected column except 'district');
# the district names are kept separately, untouched.
numeric_cols = [col for col in feature_cols if col != 'district']
data_values = selected_data[numeric_cols].values.astype('float32')
district_labels = selected_data['district'].values

In [17]:
# Per-district normalization statistics, keyed by district name.
stats_per_district = {}

# Columns that get normalized ('district', 'year' and 'month' are excluded).
normalization_cols = ['avg_temp_max', 'avg_temp_min', 'avg_humidity',
                      'sum_precipitation', 'sum_sunshine_hours', 'mal_cases']

for district, df_district in raw_data.groupby('district'):
    # Chronological order so that the leading slice is the earliest data.
    df_district = df_district.sort_values(['year', 'month']).reset_index(drop=True)

    selected_features = df_district[normalization_cols]
    data_values = selected_features.values.astype('float32')

    # The first 60% of the rows form the training split used for the statistics.
    num_samples = len(data_values)
    num_train = int(0.60 * num_samples)

    if num_train < 2:
        # Fewer than two training rows: no statistics are stored for this district.
        print(f"⚠️ Not enough training data for district {district}, skipping...")
    else:
        train_values = data_values[:num_train]

        # Column-wise statistics; near-zero deviations are replaced by 1.0 so a
        # later division by std cannot blow up.
        mean = train_values.mean(axis=0)
        std = train_values.std(axis=0)
        std[std < 1e-10] = 1.0

        stats_per_district[district] = dict(mean=mean, std=std)

# Preview the statistics of the first three districts that were stored.
for district in list(stats_per_district)[:3]:
    district_stats = stats_per_district[district]
    print(f"📍 {district} → mean: {district_stats['mean']}, std: {district_stats['std']}")


⚠️ Not enough training data for district Kamuli, skipping...
📍 Abim → mean: [  28.68842    17.887894   62.01184    98.63947   318.86868  6338.5527  ], std: [2.3697593e+00 9.7348303e-01 1.5422560e+01 8.7819649e+01 1.4696355e+01
 2.0782686e+03]
📍 Adjumani → mean: [3.1614996e+01 2.0099737e+01 6.2325531e+01 7.1605270e+01 3.1614636e+02
 2.6186447e+04], std: [2.4268689e+00 8.7700599e-01 1.6375797e+01 5.8883087e+01 1.4303403e+01
 9.1575957e+03]
📍 Agago → mean: [   30.909472    18.830788    60.056313    77.70264    316.6992
 17938.395   ], std: [2.4519389e+00 9.0371865e-01 1.5902541e+01 7.1074310e+01 1.5716954e+01
 8.2685703e+03]


In [18]:
# ====================
# 5. LOAD MODELS
# ====================

# Absolute location of the folder holding the saved models.
models_base_dir = Path("/opt/malaria_models/")

# The folder must exist and must be a directory; anything else is fatal.
if not models_base_dir.exists():
    logger.error(f"❌ Model directory not found at: {models_base_dir.absolute()}")
    raise FileNotFoundError(f"The specified model directory {models_base_dir.absolute()} does not exist.")
if not models_base_dir.is_dir():
    logger.error(f"❌ Path exists but is not a directory: {models_base_dir.absolute()}")
    raise NotADirectoryError(f"The specified model path {models_base_dir.absolute()} is not a directory.")

# Model name -> file name inside the models folder.
model_filenames = {
    'Dense': "dense.keras",
    'GRU': "GRU.keras",
    'LSTM': "LSTM.keras",
    'transformer': "transformer.keras",
}
model_paths = {
    model_name: models_base_dir / filename
    for model_name, filename in model_filenames.items()
}

# Load whatever is available: a missing file or a failed load only produces a
# warning, and the remaining models are still attempted.
models = {}
for name, path in model_paths.items():
    if path.exists():
        try:
            models[name] = keras.models.load_model(path)
            logger.info(f"✅ Loaded model: {name} from {path.absolute()}")
        except Exception as load_error:
            logger.warning(f"⚠️ Could not load model {name} from {path.absolute()}: {load_error}")
    else:
        logger.warning(f"⚠️ Model file not found for '{name}' at: {path.absolute()}")

# Summarize the outcome; loaded models are available as e.g. models['Dense'].
if models:
    logger.info(f"Successfully loaded {len(models)} models: {', '.join(models.keys())}")
else:
    logger.error("❌ No models were loaded successfully.")

ERROR:__main__:❌ Model directory not found at: /opt/malaria_models


FileNotFoundError: The specified model directory /opt/malaria_models does not exist.

### Make predictions

### Cell 3: Make Predictions and Denormalize

In [32]:
# ====================
# 6. MAKE PREDICTIONS
# ====================
# Runs every loaded model on `input_keras`, maps the outputs back to the original
# scale (prediction * std + mean) and prints one line per forecast month.
#
# NOTE(review): `mean`, `std`, `input_keras` and `prediction_months` are not defined
# in this cell. If `mean` / `std` are the variables left over from the per-district
# normalization loop, they hold the statistics of the LAST district processed there,
# and index -1 is the 'mal_cases' column. Confirm that this is the district
# `input_keras` was built for; otherwise look the statistics up in
# `stats_per_district[<district>]` instead.
ddd_mean = mean[-1]
ddd_std = std[-1]

if not models:
    # Make the no-op explicit instead of silently producing no output.
    logger.warning("⚠️ No models are loaded; no predictions will be generated.")

for name, model in models.items():
    try:
        y_pred = model.predict(input_keras, verbose=0).flatten()
        y_pred_orig = y_pred * ddd_std + ddd_mean

        # A length mismatch used to surface as an IndexError halfway through the
        # printout (too few values) or be silently ignored (too many values).
        if len(y_pred_orig) != len(prediction_months):
            logger.warning(
                f"⚠️ {name} produced {len(y_pred_orig)} values for "
                f"{len(prediction_months)} prediction months; showing the overlapping part only."
            )

        logger.info(f"\n📈 {name} Predictions (ddd_demand):")
        # zip pairs each month label with its value and stops at the shorter sequence.
        for month, predicted_value in zip(prediction_months, y_pred_orig):
            print(f"{month}: {predicted_value:.3f}")
    except Exception as e:
        # Best effort: a failing model is reported and the remaining ones still run.
        logger.error(f"❌ Error predicting with {name}: {e}")

INFO:__main__:
📈 Dense Predictions (ddd_demand):


April 2025: 1.616
May 2025: 1.582
June 2025: 1.504


INFO:__main__:
📈 GRU Predictions (ddd_demand):


April 2025: 1.866
May 2025: 1.671
June 2025: 1.245


INFO:__main__:
📈 LSTM Predictions (ddd_demand):


April 2025: 1.532
May 2025: 1.498
June 2025: 1.413


INFO:__main__:
📈 transformer Predictions (ddd_demand):


April 2025: 1.483
May 2025: 1.610
June 2025: 1.391


In [None]:
### Save Predictions

### Save

In [None]:
# ====================
# 8. SAVE PREDICTIONS
# ====================
# Re-runs every loaded model, denormalizes the outputs and writes one CSV row per
# (model, forecast month) to <cwd>/data/predictions_<year>_<month>.csv.
#
# NOTE(review): `cwd`, `last_year`, `last_month`, `input_keras`, `ddd_mean`, `ddd_std`
# and `parameters` are not defined in this cell and must come from earlier cells
# or from injected parameters; confirm they exist on a fresh kernel run.
from datetime import date
import os
import pandas as pd
import numpy as np

# Output directory lives under `cwd`; it is created if missing.
output_dir = os.path.join(cwd, "data")
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Output directory created: {output_dir}")

# The file name carries the last month of available data (zero-padded month).
month_str = f"{last_year}_{last_month:02d}"
output_path = os.path.join(output_dir, f"predictions_{month_str}.csv")
logger.info(f"Output path: {output_path}")

# Fixed column layout so the CSV always has a header, even with zero records.
prediction_columns = ["model_name", "date", "predicted_demand"]
prediction_records = []

for name, model in models.items():
    try:
        y_pred = model.predict(input_keras, verbose=0).flatten()
        y_pred_orig = y_pred * ddd_std + ddd_mean

        for i, y in enumerate(y_pred_orig):
            # Step i lands on the (i + 1)-th month after `last_month` (1-based),
            # rolling over into the following year(s) as needed.
            year_offset, month_index = divmod(last_month + i, 12)
            pred_year = last_year + year_offset
            pred_month = month_index + 1
            pred_date = date(pred_year, pred_month, 1).isoformat()

            prediction_records.append({
                "model_name": name,
                "date": pred_date,
                "predicted_demand": round(float(y), 4)
            })

    except Exception as e:
        # Best effort: a failing model is reported and the remaining ones still run.
        logger.error(f"❌ Error predicting with {name}: {e}")

logger.info(f"Number of prediction records: {len(prediction_records)}")
if not prediction_records:
    # Previously this case wrote a header-less, empty file without any notice.
    logger.warning("⚠️ No predictions were generated; the output file will contain only the header row.")

# Convert to DataFrame and save. Passing `columns` keeps the schema stable when
# the record list is empty and does not change the layout when it is not.
pred_df = pd.DataFrame(prediction_records, columns=prediction_columns)
pred_df.to_csv(output_path, index=False)
logger.info(f"💾 Saved predictions to {output_path}")

# Verify file exists
if os.path.exists(output_path):
    logger.info(f"File confirmed at {output_path}")
else:
    logger.error(f"File not found at {output_path}")

# Preview if running interactively
# NOTE(review): expects a dict-like `parameters` with an optional "airflow" flag — confirm how it is injected.
if not parameters.get("airflow", False):
    display(pred_df)

INFO:__main__:💾 Saved predictions to .\../..\data\predicted_demand_2025_03.csv


Unnamed: 0,model_name,date,predicted_demand
0,Dense,2025-04-01,1.6158
1,Dense,2025-05-01,1.5825
2,Dense,2025-06-01,1.5041
3,GRU,2025-04-01,1.8661
4,GRU,2025-05-01,1.6706
5,GRU,2025-06-01,1.2447
6,LSTM,2025-04-01,1.5321
7,LSTM,2025-05-01,1.4979
8,LSTM,2025-06-01,1.4128
9,transformer,2025-04-01,1.4832
