In [6]:
import subprocess
import sys

# Bootstrap: make sure the Dask DataFrame stack used by this notebook is
# importable, installing it into the running kernel's environment if it is not.
try:
    # Probe dask.dataframe (what the notebook actually uses) rather than bare
    # `dask`: a core-only Dask install would pass an `import dask` check and
    # then fail later on `import dask.dataframe`.
    import dask.dataframe
except ImportError:
    # Install with the kernel's own interpreter so the package lands in the
    # environment this notebook is running in.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "dask[complete]"])

    # The package was installed after the interpreter started; refresh the
    # import system's caches so later `import dask...` statements can find it.
    import importlib

    importlib.invalidate_caches()

In [12]:
import dask.dataframe as dd
import pandas as pd
from time import sleep
from datetime import datetime
import random
from datetime import datetime, timedelta
# === Previous calendar month: boundaries and filename tag ===
today = datetime.today()

# Midnight on the 1st of the current month.
first_of_this_month = datetime(year=today.year, month=today.month, day=1)

# Stepping back one day from the 1st lands on the final day of the previous
# month; snapping that back to day 1 gives the previous month's first day.
last_of_prev_month = first_of_this_month - timedelta(days=1)
first_of_prev_month = last_of_prev_month.replace(day=1)

# "YYYY-MM-DD" strings for the range, plus a "YYYY_MM" tag for file names.
start_date = f"{first_of_prev_month:%Y-%m-%d}"
end_date = f"{last_of_prev_month:%Y-%m-%d}"
month_str = f"{first_of_prev_month:%Y_%m}"

# Open last month's raw daily weather CSV as a Dask DataFrame, parsing the
# "date" column into datetimes.
raw_csv_path = f"raw_weather_data_{month_str}.csv"
daily_df = dd.read_csv(raw_csv_path, parse_dates=["date"])

# Show the first rows as a quick sanity check.
daily_df.head()


Unnamed: 0,district,date,temp_max,temp_min,precipitation,humidity,sunshine,month
0,Abim,2025-06-01,28.1,16.7,0.1,67,40393.03,2025_06
1,Abim,2025-06-02,30.4,17.5,0.0,55,40393.94,2025_06
2,Abim,2025-06-03,29.4,18.1,5.9,67,38380.78,2025_06
3,Abim,2025-06-04,28.5,17.2,1.0,72,40509.11,2025_06
4,Abim,2025-06-05,27.4,17.5,14.0,76,33168.99,2025_06


In [14]:
# Print the dtype of each column of the loaded frame.
column_dtypes = daily_df.dtypes
print(column_dtypes)


district                 object
date             datetime64[ns]
temp_max                float64
temp_min                float64
precipitation           float64
humidity                  int64
sunshine                float64
month                    object
dtype: object


In [16]:
import dask.dataframe as dd

# Expects daily_df to be the Dask DataFrame loaded earlier, with "date" as datetime64[ns].

# Derive the calendar grouping keys from the date. Note that this replaces
# the "month" column that came with the raw data.
daily_df["year"] = daily_df["date"].dt.year
daily_df["month"] = daily_df["date"].dt.month

# Per-column reductions: averages for temperature and humidity, totals for
# precipitation and sunshine.
district_reductions = {
    "temp_max": "mean",
    "temp_min": "mean",
    "humidity": "mean",
    "precipitation": "sum",
    "sunshine": "sum",
}

# Output names for the reduced columns.
district_column_names = {
    "temp_max": "avg_temp_max",
    "temp_min": "avg_temp_min",
    "humidity": "avg_humidity",
    "precipitation": "total_precipitation",
    "sunshine": "total_sunshine_seconds",
}

# One row per (district, year, month).
agg_df = (
    daily_df.groupby(["district", "year", "month"])
    .agg(district_reductions)
    .rename(columns=district_column_names)
)

# Convert the sunshine total from seconds to hours, then discard the seconds column.
sunshine_hours = (agg_df["total_sunshine_seconds"] / 3600).round(2)
agg_df = agg_df.assign(total_sunshine_hours=sunshine_hours).drop(
    columns="total_sunshine_seconds"
)


def _round_district_partition(partition):
    """Round every numeric column of one partition to 2 decimal places."""
    return partition.round(2)


# Apply the rounding partition by partition.
agg_df = agg_df.map_partitions(_round_district_partition)

# Turn the group keys back into ordinary columns and materialise as pandas.
agg_df = agg_df.reset_index().compute()

# Preview
print(agg_df.head())


   district  year  month  avg_temp_max  avg_temp_min  avg_humidity  \
0      Abim  2025      6         28.39         17.58         67.70   
1  Adjumani  2025      6         30.94         19.57         69.17   
2     Agago  2025      6         30.82         18.78         66.00   
3  Alebtong  2025      6         29.41         18.54         71.87   
4  Amolatar  2025      6         28.23         19.78         74.17   

   total_precipitation  total_sunshine_hours  
0                 92.1                315.87  
1                 82.7                324.84  
2                 40.3                314.71  
3                106.3                312.32  
4                 76.0                315.23  


In [19]:
# agg_df is already a computed pandas DataFrame whose group keys were turned
# into regular columns by the earlier reset_index(). Resetting the index again
# here would push the 0..n-1 row index into the frame as a spurious "index"
# column (and add yet another column on every re-run of this cell), which
# to_csv(index=False) would then write out. Save the frame as-is instead.
district_csv_path = f"weather_district_monthly_{month_str}.csv"
agg_df.to_csv(district_csv_path, index=False)

print(f"✅ Saved as {district_csv_path}")


✅ Saved as weather_district_monthly_2025_06.csv


In [None]:
import os

# === Setup output folder ===
# Create the target directory if it does not exist yet.
DATA_DIR = "/home/iceberg/data/weather_data"
os.makedirs(DATA_DIR, exist_ok=True)

# Build the output path from month_str. month_str is formatted "YYYY_MM" and
# therefore already carries the year, so the file name must not append a
# hard-coded "2025" as well (that produced "..._2025_06_2025_..." and would
# be wrong for any other year).
output_path = os.path.join(DATA_DIR, f"weather_monthly_{month_str}_all_districts.csv")

# Save the aggregated DataFrame without the row index.
agg_df.to_csv(output_path, index=False)

print(f"✅ Monthly aggregates saved to: {output_path}")


In [21]:
import dask.dataframe as dd

# Expects daily_df to be loaded already, with "date" as datetime64[ns].

# Derive the calendar grouping keys from the date.
daily_df["year"] = daily_df["date"].dt.year
daily_df["month"] = daily_df["date"].dt.month

# Per-column reductions for the national roll-up. Grouping is by year and
# month only, so the "sum" columns add up every row of the month across all
# districts, and the "mean" columns average over all of those rows.
national_reductions = {
    "temp_max": "mean",
    "temp_min": "mean",
    "humidity": "mean",
    "precipitation": "sum",
    "sunshine": "sum",
}

# Output names for the reduced columns.
national_column_names = {
    "temp_max": "avg_temp_max",
    "temp_min": "avg_temp_min",
    "humidity": "avg_humidity",
    "precipitation": "total_precipitation",
    "sunshine": "total_sunshine_seconds",
}

# One row per (year, month).
national_df = (
    daily_df.groupby(["year", "month"])
    .agg(national_reductions)
    .rename(columns=national_column_names)
)

# Convert the sunshine total from seconds to hours, then discard the seconds column.
national_sunshine_hours = (national_df["total_sunshine_seconds"] / 3600).round(2)
national_df = national_df.assign(total_sunshine_hours=national_sunshine_hours).drop(
    columns="total_sunshine_seconds"
)


def _round_national_partition(partition):
    """Round every numeric column of one partition to 2 decimal places."""
    return partition.round(2)


# Apply the rounding partition by partition.
national_df = national_df.map_partitions(_round_national_partition)

# Turn the group keys back into ordinary columns and materialise as pandas.
national_df = national_df.reset_index().compute()

# Write the national summary next to the notebook, without the row index.
national_df.to_csv(f"weather_national_monthly_{month_str}.csv", index=False)
