## Append/Load District and National Monthly Data to the Final Destination

In [None]:
import os
import pandas as pd

# Inputs expected to exist in the notebook namespace before this cell runs
# (e.g. produced by the transform notebooks), as pandas DataFrames:
# - `national_df` (columns: year, month, avg_temp_max, ...)
# - `agg_df` (columns: district, year, month, avg_temp_max, ...)
# Only `national_df` is used by this cell.

# -------------------------------------
# 1. National-Level Aggregated Data
# -------------------------------------
# Destination CSV that accumulates the national monthly rows across runs.
national_path = "weather_national_monthly.csv"

# `national_df` must NOT be rebound to the destination path here: pd.concat
# and sort_values below need a DataFrame. Fail fast with a clear message if
# something else (for example a file path string) was supplied.
if not isinstance(national_df, pd.DataFrame):
    raise TypeError(
        "national_df must be a pandas DataFrame, got "
        f"{type(national_df).__name__}"
    )

if os.path.exists(national_path):
    # Existing rows are placed first so that keep="last" below lets the
    # incoming rows replace any previously stored (year, month).
    national_existing = pd.read_csv(national_path)
    national_combined = pd.concat([national_existing, national_df], ignore_index=True)
else:
    # First load: nothing on disk yet, start from the incoming rows.
    national_combined = national_df

# De-duplicate on every path (not only when the file already exists) so a
# first load and a re-load produce the same result, then sort chronologically.
national_combined = (
    national_combined
    .drop_duplicates(subset=["year", "month"], keep="last")
    .sort_values(by=["year", "month"])
    .reset_index(drop=True)
)
national_combined.to_csv(national_path, index=False)
print("✅ Load complete for  national monthly weather data.")

In [4]:
import os
import pandas as pd

# Paths: the cumulative history file (read and rewritten) and the monthly
# extract being appended to it.
historical_path = "malaria_historical.csv"
new_data_path = "weather_district_monthly_2025_05.csv"

# Column layout shared by the incoming month and the history file, in the
# order they are written out.
column_order = [
    "year", "month", "district", "mal_cases",
    "avg_temp_max", "avg_temp_min", "avg_humidity",
    "total_precipitation", "total_sunshine_hours",
]
# Columns used both to detect duplicate rows and to sort the output.
key_columns = ["district", "year", "month"]

# Read the new month, set mal_cases to NA on every row (placeholder for
# future prediction), and select the columns in the shared order.
new_df = pd.read_csv(new_data_path).assign(mal_cases=pd.NA)[column_order]

if not os.path.exists(historical_path):
    # No history on disk yet: the new month is the whole dataset.
    combined_df = new_df
else:
    historical_df = pd.read_csv(historical_path)

    # Map the older column names onto the current schema; columns that are
    # already named correctly are left untouched by rename.
    legacy_names = {
        "sum_precipitation": "total_precipitation",
        "sum_sunshine_hours": "total_sunshine_hours",
    }
    historical_df = historical_df.rename(columns=legacy_names)

    # Guarantee the mal_cases column is present before selecting columns.
    if "mal_cases" not in historical_df:
        historical_df["mal_cases"] = pd.NA

    # Same columns, same order as new_df (raises KeyError if any is missing).
    historical_df = historical_df[column_order]

    # History first, new rows last: keep="last" means an incoming row replaces
    # a historical row with the same key.
    # NOTE: that replacement includes mal_cases, which is NA in new_df.
    combined_df = pd.concat(
        [historical_df, new_df], ignore_index=True
    ).drop_duplicates(subset=key_columns, keep="last")

# Order rows by district, then chronologically, and overwrite the history file.
combined_df = combined_df.sort_values(by=key_columns).reset_index(drop=True)
combined_df.to_csv(historical_path, index=False)

print("✅ District-level data appended and saved successfully.")


✅ District-level data appended and saved successfully.
