In [1]:
import subprocess
import sys

try:
    import dask
except ImportError:
    # dask is missing from the kernel's environment: install it with the same
    # interpreter that runs this notebook so the package lands in the right place.
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    # NOTE(review): the install is unpinned; consider pinning a version so a
    # fresh run resolves the same packages.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "dask[complete]"])

    # The import system caches what it has seen on disk; refresh those caches so
    # the freshly installed package is visible without restarting the kernel,
    # then import it here so a broken install fails in this cell, not a later one.
    import importlib

    importlib.invalidate_caches()
    import dask

Collecting dask[complete]
  Downloading dask-2023.5.0-py3-none-any.whl.metadata (3.6 kB)
Collecting cloudpickle>=1.5.0 (from dask[complete])
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting partd>=1.2.0 (from dask[complete])
  Downloading partd-1.4.1-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.10.0 (from dask[complete])
  Downloading toolz-1.0.0-py3-none-any.whl.metadata (5.1 kB)
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting locket (from partd>=1.2.0->dask[complete])
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting distributed==2023.5.0 (from dask[complete])
  Downloading distributed-2023.5.0-py3-none-any.whl.metadata (3.4 kB)
Collecting bokeh>=2.4.2 (from dask[complete])
  Downloading bokeh-3.1.1-py3-none-any.whl.metadata (12 kB)
Collecting msgpack>=1.0.0 (from distributed==2023.5.0->dask[complete])
  Downloadin

[0m

In [2]:
import dask.dataframe as dd
import pandas as pd
from time import sleep
from datetime import datetime
import random
from datetime import datetime, timedelta
# === Reporting period: April 2025 ===
# The period is hard-coded; edit start_date / end_date to process another month.
start_date = "2025-04-01"
end_date = "2025-04-30"
# Derive the file-name suffix ("YYYY_MM") from start_date so the suffix and the
# date range cannot drift apart.
month_str = datetime.strptime(start_date, "%Y-%m-%d").strftime("%Y_%m")

# Load raw daily weather data as a Dask DataFrame, parsing `date` as datetime.
# NOTE(review): in the preview below the file's own `month` column reads
# 2025_05 for April dates; that column is overwritten from `date` in the
# aggregation cells, but the upstream export may need checking.
daily_df = dd.read_csv(f"raw_weather_data_{month_str}.csv", parse_dates=["date"])

daily_df.head()


Unnamed: 0,district,date,temp_max,temp_min,precipitation,humidity,sunshine,month
0,Abim,2025-04-01,30.5,18.6,0.0,54,41060.69,2025_05
1,Abim,2025-04-02,31.8,19.0,0.0,46,41297.85,2025_05
2,Abim,2025-04-03,31.6,18.2,0.5,55,41099.63,2025_05
3,Abim,2025-04-04,32.5,20.2,0.1,49,41119.02,2025_05
4,Abim,2025-04-05,32.1,19.3,0.6,59,40114.68,2025_05


### District-level monthly dataset: transformation and export (for DDD predictions)

In [3]:
import dask.dataframe as dd

# Derive the calendar year and month number from the parsed `date` column.
# This replaces the `month` column that was read from the CSV.
daily_df = daily_df.assign(
    year=daily_df["date"].dt.year,
    month=daily_df["date"].dt.month,
)

# Per-column reductions: temperatures and humidity are averaged, while
# precipitation and sunshine are totalled over the month.
district_aggregations = {
    "temp_max": "mean",
    "temp_min": "mean",
    "humidity": "mean",
    "precipitation": "sum",
    "sunshine": "sum",
}

# Output names that make the applied reduction explicit.
district_column_names = {
    "temp_max": "avg_temp_max",
    "temp_min": "avg_temp_min",
    "humidity": "avg_humidity",
    "precipitation": "total_precipitation",
    "sunshine": "total_sunshine_seconds",
}

# One row per (district, year, month).
agg_df = (
    daily_df.groupby(["district", "year", "month"])
    .agg(district_aggregations)
    .rename(columns=district_column_names)
)

# Convert the sunshine total from seconds to hours (3600 s per hour) and drop
# the seconds column once the hours column exists.
district_sunshine_hours = (agg_df["total_sunshine_seconds"] / 3600).round(2)
agg_df = agg_df.assign(total_sunshine_hours=district_sunshine_hours)
agg_df = agg_df.drop(columns="total_sunshine_seconds")

# Round every numeric column to two decimals, partition by partition.
agg_df = agg_df.map_partitions(lambda part: part.round(2))

# Move the group keys back into ordinary columns and materialise the result
# as a pandas DataFrame.
agg_df = agg_df.reset_index().compute()

# Preview the first rows.
print(agg_df.head())


   district  year  month  avg_temp_max  avg_temp_min  avg_humidity  \
0      Abim  2025      4         28.93         18.46         68.23   
1  Adjumani  2025      4         32.82         20.53         66.73   
2     Agago  2025      4         31.30         19.88         66.53   
3  Alebtong  2025      4         29.58         19.07         75.30   
4  Amolatar  2025      4         27.69         20.33         79.87   

   total_precipitation  total_sunshine_hours  
0                146.7                309.08  
1                 73.8                303.97  
2                113.8                306.12  
3                180.4                299.12  
4                177.5                304.60  


In [4]:
# agg_df is already a pandas DataFrame with district/year/month as ordinary
# columns (the index was reset before compute()), so it is written out as-is.
# Calling reset_index() again here would add a spurious "index" column to the
# CSV and make this cell non-idempotent when re-run.
agg_df.to_csv(f"weather_district_monthly_{month_str}.csv", index=False)

### National-level monthly dataset: transformation and export (for DDD predictions)

In [6]:
import dask.dataframe as dd

# Expects daily_df to be the daily Dask DataFrame loaded earlier, with `date`
# already parsed as a datetime column.

# Derive the calendar year and month number from `date`, so this cell does not
# rely on the district-level cell having run first.
daily_df = daily_df.assign(
    year=daily_df["date"].dt.year,
    month=daily_df["date"].dt.month,
)

# Per-column reductions: temperatures and humidity are averaged, while
# precipitation and sunshine are totalled.
national_aggregations = {
    "temp_max": "mean",
    "temp_min": "mean",
    "humidity": "mean",
    "precipitation": "sum",
    "sunshine": "sum",
}

# Output names that make the applied reduction explicit.
national_column_names = {
    "temp_max": "avg_temp_max",
    "temp_min": "avg_temp_min",
    "humidity": "avg_humidity",
    "precipitation": "total_precipitation",
    "sunshine": "total_sunshine_seconds",
}

# One row per (year, month), pooling the daily rows of every district.
# NOTE(review): the totals therefore add up all districts' values, and the
# means are taken over all daily rows rather than over district means;
# confirm this is the intended national-level definition.
national_df = (
    daily_df.groupby(["year", "month"])
    .agg(national_aggregations)
    .rename(columns=national_column_names)
)

# Convert the sunshine total from seconds to hours (3600 s per hour) and drop
# the seconds column once the hours column exists.
national_sunshine_hours = (national_df["total_sunshine_seconds"] / 3600).round(2)
national_df = national_df.assign(total_sunshine_hours=national_sunshine_hours)
national_df = national_df.drop(columns="total_sunshine_seconds")

# Round every numeric column to two decimals, partition by partition.
national_df = national_df.map_partitions(lambda part: part.round(2))

# Move the group keys back into ordinary columns and materialise the result
# as a pandas DataFrame.
national_df = national_df.reset_index().compute()

# Write the national monthly table to CSV without the row index.
national_df.to_csv(f"weather_national_monthly_{month_str}.csv", index=False)
