In [None]:
# Parameters injected by Papermill or default fallback
import pandas as pd
import numpy as np

# Use an externally supplied `parameters` object when one is already bound in
# the notebook namespace; otherwise fall back to local defaults.
if "parameters" not in globals():
    # "airflow": True keeps the preview cell (guarded by
    # `not parameters.get("airflow", False)`) switched off by default.
    parameters = {"cwd": ".", "airflow": True}


In [None]:
import dask.dataframe as dd
import os
import logging
from datetime import datetime, timedelta

# Configure logging once; every later cell reports through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# `cwd` is expected to be supplied from outside this cell (presumably injected
# by Papermill as a top-level variable — confirm against the runner).
# When it is not, fall back to the "cwd" entry of the `parameters` dict and
# finally to the current directory, so this cell and every later
# os.path.join(cwd, ...) does not fail with NameError on a fresh kernel.
try:
    cwd
except NameError:
    cwd = globals().get("parameters", {}).get("cwd", ".")

logger.info(f"📥 Working directory set to: {cwd}")


In [None]:
try:
    month_str
except NameError:
    # No month was supplied: default to the calendar month before the current one.
    # Stepping one day back from the 1st of this month always lands in the previous month.
    now = datetime.today()
    end_of_previous_month = now.replace(day=1) - timedelta(days=1)
    month_str = end_of_previous_month.strftime("%Y_%m")
    logger.info(f"📅 Using month_str: {month_str}")


In [None]:
# Path of the national monthly weather file for the selected month (produced by transform).
# NOTE(review): the following cell derives the same two paths under the
# *_national names; `input_path` / `output_path` do not appear to be used
# again in the visible cells — confirm before relying on them.
input_path = os.path.join(cwd, f"weather_national_monthly_{month_str}.csv")

# Stop the run early when the expected input is missing.
if os.path.exists(input_path):
    logger.info(f"✅ Input file found: {input_path}")
else:
    logger.error(f"❌ Input file not found: {input_path}")
    raise FileNotFoundError(f"Input file not found: {input_path}")

# Destination file for the combined/loading data
output_path = os.path.join(cwd, "selected_data.csv")


In [None]:
# Where the national monthly input is read from and where the accumulated
# national dataset is written.
input_path_national = os.path.join(cwd, f"weather_national_monthly_{month_str}.csv")
output_path_national = os.path.join(cwd, "selected_data.csv")

# Fail fast when the monthly national file is missing.
if os.path.exists(input_path_national):
    logger.info(f"✅ Found national input file: {input_path_national}")
else:
    logger.error(f"❌ National input file not found: {input_path_national}")
    raise FileNotFoundError(f"National input file not found: {input_path_national}")


✅ Appended new month at bottom of file.


## Append/Load National Monthly Data to final destination

In [None]:
# Load the national monthly weather file located in the previous cell.
national_df = pd.read_csv(input_path_national)

# Add the target column; it is written out empty (NaN) by this notebook.
national_df["ddd_demand"] = np.nan

# Keep only the output columns, in the order used by the output file.
cols_national = [
    "avg_temp_max", "avg_temp_min", "avg_humidity",
    "total_precipitation", "total_sunshine_hours", "ddd_demand"
]
national_df = national_df[cols_national]

logger.info(f"✅ Loaded national data with shape {national_df.shape}")

# Append to the existing output file, or create it on the first run.
if os.path.exists(output_path_national):
    existing_df = pd.read_csv(output_path_national)
    match_cols = cols_national[:-1]  # exclude 'ddd_demand'

    # Only the first input row is compared below, so an empty input cannot be
    # checked; report that clearly instead of failing with an opaque KeyError.
    if national_df.empty:
        raise ValueError(f"National input file has no data rows: {input_path_national}")

    # The output file carries no year/month column, so "already loaded" is
    # detected by comparing the new row's weather values with every stored row.
    # np.isclose(..., equal_nan=True) is used instead of `==` because:
    #   * NaN == NaN is False, so a month with any missing value would never
    #     match itself and would be appended again on every re-run;
    #   * values that went through an extra CSV write/read cycle may differ in
    #     the last bits from freshly read ones.
    # The tolerances are deliberately tight so distinct months are not merged.
    new_values = national_df[match_cols].iloc[0].to_numpy(dtype=float)
    existing_values = existing_df[match_cols].to_numpy(dtype=float)
    mask = pd.Series(
        np.isclose(
            existing_values, new_values, rtol=1e-9, atol=1e-12, equal_nan=True
        ).all(axis=1),
        index=existing_df.index,
    )

    if mask.any():
        logger.info(f"⚠️ Data for {month_str} already exists in national dataset. Skipping append.")
    else:
        combined_df = pd.concat([existing_df, national_df], ignore_index=True)
        combined_df.to_csv(output_path_national, index=False)
        logger.info(f"✅ Appended new month data to {output_path_national}")
else:
    national_df.to_csv(output_path_national, index=False)
    logger.info(f"📦 Created new national data file: {output_path_national}")


In [None]:
# Where the district monthly input is read from and where the accumulated
# district history is written.
input_path_district = os.path.join(cwd, f"weather_district_monthly_{month_str}.csv")
output_path_district = os.path.join(cwd, "malaria_historical.csv")

# Fail fast when the monthly district file is missing.
if os.path.exists(input_path_district):
    logger.info(f"✅ Found district input file: {input_path_district}")
else:
    logger.error(f"❌ District input file not found: {input_path_district}")
    raise FileNotFoundError(f"District input file not found: {input_path_district}")


## Append/Load District Monthly Data to final destination

In [None]:
# Load the new district monthly data located in the previous cell.
district_df = pd.read_csv(input_path_district)

# Add the mal_cases column; it is written out empty by this notebook.
district_df["mal_cases"] = pd.NA

# Fixed column order shared by the new rows and the historical file.
cols_district = [
    "year", "month", "district", "mal_cases",
    "avg_temp_max", "avg_temp_min", "avg_humidity",
    "total_precipitation", "total_sunshine_hours"
]
district_df = district_df[cols_district]

# Merge into the historical district file when it exists, otherwise create it.
if os.path.exists(output_path_district):
    historical_df = pd.read_csv(output_path_district)

    # Accept the older "sum_*" column names used by some historical files.
    historical_df = historical_df.rename(columns={
        "sum_precipitation": "total_precipitation",
        "sum_sunshine_hours": "total_sunshine_hours"
    })

    if "mal_cases" not in historical_df.columns:
        historical_df["mal_cases"] = pd.NA

    # Reorder to match district_df columns
    historical_df = historical_df[cols_district]

    # A row is already loaded when its (district, year, month) key is present
    # in the historical file. The left merge against de-duplicated keys keeps
    # one output row per input row, in input order, so the flag array lines up
    # positionally with district_df.
    key_cols = ["district", "year", "month"]
    flagged = district_df.merge(
        historical_df[key_cols].drop_duplicates(),
        on=key_cols,
        how="left",
        indicator=True,
    )
    is_new = (flagged["_merge"] == "left_only").to_numpy()
    duplicates = district_df.loc[~is_new, key_cols]
    new_rows = district_df.loc[is_new]

    if not duplicates.empty:
        logger.info(f"⚠️ Data for {month_str} already exists for some districts. Skipping append for duplicates.")
        logger.info(f"Duplicate entries:\n{duplicates.to_string(index=False)}")

    # Previously a single duplicate key skipped the whole append, so districts
    # missing after a partial load could never be added. Only the rows that
    # are genuinely new are appended now, as the log message above states.
    if new_rows.empty:
        logger.info(f"ℹ️ No new district rows to append for {month_str}; {output_path_district} left unchanged.")
    else:
        combined_df = pd.concat([historical_df, new_rows], ignore_index=True)
        combined_df = combined_df.sort_values(["district", "year", "month"]).reset_index(drop=True)
        combined_df.to_csv(output_path_district, index=False)
        logger.info(f"✅ Appended district data and saved to {output_path_district}")
else:
    # No historical data yet — save new data
    district_df = district_df.sort_values(["district", "year", "month"]).reset_index(drop=True)
    district_df.to_csv(output_path_district, index=False)
    logger.info(f"📦 Created new district data file: {output_path_district}")



🚫 Data already exists for the following district-month combinations:

     district  year  month
         Abim  2025      6
     Adjumani  2025      6
        Agago  2025      6
     Alebtong  2025      6
     Amolatar  2025      6
       Amudat  2025      6
       Amuria  2025      6
        Amuru  2025      6
         Apac  2025      6
         Arua  2025      6
       Budaka  2025      6
       Bududa  2025      6
       Bugiri  2025      6
      Bugweri  2025      6
      Buhweju  2025      6
       Buikwe  2025      6
      Bukedea  2025      6
 Bukomansimbi  2025      6
        Bukwo  2025      6
    Bulambuli  2025      6
      Buliisa  2025      6
   Bundibugyo  2025      6
   Bunyangabu  2025      6
        Busia  2025      6
     Butaleja  2025      6
    Butambala  2025      6
       Butebo  2025      6
       Buvuma  2025      6
      Buyende  2025      6
       Dokolo  2025      6
         Gulu  2025      6
        Hoima  2025      6
       Ibanda  2025      6
       Igan

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Show a short preview of both frames unless the "airflow" flag is set in
# `parameters`; the flag is treated as off when the key is absent.
if not parameters.get("airflow", False):
    previews = (
        ("📊 Preview of national data:", national_df),
        ("📊 Preview of district data:", district_df),
    )
    for heading, frame in previews:
        logger.info(heading)
        print(frame.head())
