In [6]:
import subprocess
import sys

# Bootstrap: ensure Dask's DataFrame API is importable before later cells use it.
# The probe targets `dask.dataframe` rather than bare `dask`, because that is the
# module this notebook actually imports; a minimal Dask install can satisfy
# `import dask` while still lacking the DataFrame extras (see Dask install docs).
try:
    import dask.dataframe  # noqa: F401 - imported only to verify availability
except ImportError:
    import importlib

    # Install into the interpreter that is running this kernel.
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "dask[complete]"])
    # Packages installed while the interpreter is running may not be visible to
    # the import system until its finder caches are refreshed.
    importlib.invalidate_caches()

In [None]:
# Parameters injected by Papermill, with defaults for a standalone run.
# The bare-name probes below raise NameError when nothing was injected, which
# is what triggers each fallback.
try:
    parameters
except NameError:
    # "airflow": True keeps the preview cell at the end of the notebook silent by default.
    parameters = {"cwd": ".", "airflow": True}

# Later cells read `cwd` as a top-level variable (log message, input path,
# output paths). Keep an injected value if there is one; otherwise derive it
# from `parameters` so a fresh-kernel "Run All" does not fail with NameError.
try:
    cwd
except NameError:
    cwd = parameters.get("cwd", ".")


In [None]:
import dask.dataframe as dd
import os
import logging
from datetime import datetime, timedelta

# Notebook-wide logger; root logging is configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Record the directory that input and output paths are built from.
logger.info(f"📥 Working directory set to: {cwd}")


In [None]:
# Use `month_str` if it is already defined; otherwise default to the previous
# calendar month formatted as "YYYY_MM".
try:
    month_str
except NameError:
    today = datetime.today()
    # Midnight on the 1st of the current month.
    first_of_this_month = today.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
    # One day earlier is always the last day of the previous month.
    last_of_prev_month = first_of_this_month - timedelta(days=1)
    first_of_prev_month = last_of_prev_month.replace(day=1)
    month_str = first_of_prev_month.strftime("%Y_%m")
    logger.info(f"📅 Using month_str: {month_str}")


In [12]:
# Locate the raw daily CSV for the selected month and open it with Dask.
file_path = os.path.join(cwd, f"raw_weather_data_{month_str}.csv")

if os.path.exists(file_path):
    try:
        # The "date" column is parsed as datetimes so the .dt accessor works later.
        daily_df = dd.read_csv(file_path, parse_dates=["date"])
        logger.info(f"✅ Successfully loaded daily data from: {file_path}")
    except Exception as exc:
        logger.error(f"❌ Failed to load daily data CSV: {exc}")
        raise
else:
    # Fail fast with an explicit error when the expected input is missing.
    logger.error(f"❌ Input file not found: {file_path}")
    raise FileNotFoundError(f"Input file not found: {file_path}")


Unnamed: 0,district,date,temp_max,temp_min,precipitation,humidity,sunshine,month
0,Abim,2025-06-01,28.1,16.7,0.1,67,40393.03,2025_06
1,Abim,2025-06-02,30.4,17.5,0.0,55,40393.94,2025_06
2,Abim,2025-06-03,29.4,18.1,5.9,67,38380.78,2025_06
3,Abim,2025-06-04,28.5,17.2,1.0,72,40509.11,2025_06
4,Abim,2025-06-05,27.4,17.5,14.0,76,33168.99,2025_06


In [None]:
# Derive numeric grouping keys from the parsed date.
# Note: this replaces any existing "month" column with the integer month.
daily_df = daily_df.assign(
    year=daily_df["date"].dt.year,
    month=daily_df["date"].dt.month,
)

## District

In [14]:
# Monthly aggregation per district: mean temperatures and humidity, summed
# precipitation and sunshine. The result is materialised as an in-memory frame.
try:
    district_grouped = daily_df.groupby(["district", "year", "month"])

    district_monthly = district_grouped.agg(
        {
            "temp_max": "mean",
            "temp_min": "mean",
            "humidity": "mean",
            "precipitation": "sum",
            "sunshine": "sum",
        }
    )
    district_monthly = district_monthly.rename(
        columns={
            "temp_max": "avg_temp_max",
            "temp_min": "avg_temp_min",
            "humidity": "avg_humidity",
            "precipitation": "total_precipitation",
            "sunshine": "total_sunshine_seconds",
        }
    )

    # Convert the summed sunshine to hours (divide by 3600), then drop the raw total.
    district_sunshine_hours = (district_monthly["total_sunshine_seconds"] / 3600).round(2)
    district_monthly = district_monthly.assign(total_sunshine_hours=district_sunshine_hours)
    district_monthly = district_monthly.drop(columns="total_sunshine_seconds")

    # Round every column to two decimals, partition by partition.
    district_monthly = district_monthly.map_partitions(lambda part: part.round(2))

    # Turn the group keys back into columns and execute the Dask graph.
    district_df = district_monthly.reset_index().compute()

    logger.info("✅ District-level aggregation complete")
except Exception as exc:
    logger.error(f"❌ District aggregation error: {exc}")
    raise


district                 object
date             datetime64[ns]
temp_max                float64
temp_min                float64
precipitation           float64
humidity                  int64
sunshine                float64
month                    object
dtype: object


## National

In [None]:
# Monthly aggregation over all rows (no district key): mean temperatures and
# humidity, summed precipitation and sunshine. The result is materialised in memory.
try:
    national_grouped = daily_df.groupby(["year", "month"])

    national_monthly = national_grouped.agg(
        {
            "temp_max": "mean",
            "temp_min": "mean",
            "humidity": "mean",
            "precipitation": "sum",
            "sunshine": "sum",
        }
    )
    national_monthly = national_monthly.rename(
        columns={
            "temp_max": "avg_temp_max",
            "temp_min": "avg_temp_min",
            "humidity": "avg_humidity",
            "precipitation": "total_precipitation",
            "sunshine": "total_sunshine_seconds",
        }
    )

    # Convert the summed sunshine to hours (divide by 3600), then drop the raw total.
    national_sunshine_hours = (national_monthly["total_sunshine_seconds"] / 3600).round(2)
    national_monthly = national_monthly.assign(total_sunshine_hours=national_sunshine_hours)
    national_monthly = national_monthly.drop(columns="total_sunshine_seconds")

    # Round every column to two decimals, partition by partition.
    national_monthly = national_monthly.map_partitions(lambda part: part.round(2))

    # Turn the group keys back into columns and execute the Dask graph.
    national_df = national_monthly.reset_index().compute()

    logger.info("✅ National-level aggregation complete")
except Exception as exc:
    logger.error(f"❌ National aggregation error: {exc}")
    raise


## Save Outputs

In [21]:
# Write both monthly summaries as CSV files under `cwd`, district first.
# A failed write is logged and re-raised, so the run stops at the first error.

# District-level output
district_path = os.path.join(cwd, f"weather_district_monthly_{month_str}.csv")
try:
    district_df.to_csv(district_path, index=False)
    logger.info(f"✅ Saved district-level data: {district_path}")
except Exception as exc:
    logger.error(f"❌ Failed to save district CSV: {exc}")
    raise

# National-level output
national_path = os.path.join(cwd, f"weather_national_monthly_{month_str}.csv")
try:
    national_df.to_csv(national_path, index=False)
    logger.info(f"✅ Saved national-level data: {national_path}")
except Exception as exc:
    logger.error(f"❌ Failed to save national CSV: {exc}")
    raise


In [None]:
# Print a short preview of each result unless the "airflow" parameter is truthy.
running_under_airflow = parameters.get("airflow", False)

if not running_under_airflow:
    previews = (
        ("📊 District-level preview:", district_df),
        ("\n🌍 National-level preview:", national_df),
    )
    for heading, frame in previews:
        print(heading)
        print(frame.head())
