In [1]:
import xarray as xr
import pandas as pd
import geopandas as gpd
import numpy as np

In [18]:
def regions_from_xarray(
    hdd_dataset,
    pop_dataset,
    shapefile_path='../utils/files/USRegion/12Regions/DOI_12_Unified_Regions_20180801.shp',
    excluded_regions=('Alaska', 'Pacific Islands'),
):
    """Aggregate gridded values per region and normalise them by regional population.

    Every (longitude, latitude) node of ``hdd_dataset`` is assigned to the region
    polygon it falls within. For each region, ``hdd_dataset`` and ``pop_dataset``
    are summed over the region's nodes and the first sum is divided by the second.

    Parameters
    ----------
    hdd_dataset : xarray object with ``latitude`` and ``longitude`` coordinates
        Values to aggregate. The caller in this notebook passes HDD already
        multiplied by population, so the ratio is a population-weighted mean.
    pop_dataset : xarray object with ``latitude`` and ``longitude`` coordinates
        Population on the same grid (selected with ``method='nearest'``).
    shapefile_path : str, optional
        Path to the region shapefile; it must provide a ``REG_NAME`` column.
        Defaults to the DOI 12 unified regions file used by this notebook.
    excluded_regions : iterable of str or None, optional
        ``REG_NAME`` values to drop before the spatial join. ``None`` or an
        empty iterable keeps every region.

    Returns
    -------
    dict
        Maps each region name to ``sum(hdd_dataset) / sum(pop_dataset)`` over the
        region's grid points (all non-spatial dimensions are preserved).
    """
    # Load the region polygons and reproject them to plain lat/lon (EPSG:4326).
    gdf = gpd.read_file(shapefile_path)
    gdf = gdf.to_crs(epsg=4326)
    gdf = gdf[['REG_NAME', 'geometry']]
    if excluded_regions:
        gdf = gdf[~gdf['REG_NAME'].isin(list(excluded_regions))]

    # Flatten the dataset grid into one point per (longitude, latitude) node.
    lon_grid, lat_grid = np.meshgrid(hdd_dataset.longitude.values, hdd_dataset.latitude.values)
    lon_flat = lon_grid.ravel()
    lat_flat = lat_grid.ravel()

    # Point GeoDataFrame sharing the polygons' CRS so the spatial join is valid.
    # points_from_xy takes the flattened arrays directly; no per-point Python loop.
    gdf_points = gpd.GeoDataFrame({
        'latitude': lat_flat,
        'longitude': lon_flat,
        'geometry': gpd.points_from_xy(lon_flat, lat_flat),
    }, crs=gdf.crs)

    # Keep only the grid points that fall within one of the region polygons.
    gdf_joined = gpd.sjoin(gdf_points, gdf, how='inner', predicate='within')

    zone_means = {}
    for zone in gdf_joined['REG_NAME'].unique():
        zone_pts = gdf_joined[gdf_joined['REG_NAME'] == zone]

        # Point-wise (vectorised) indexers: both arrays share the 'points' dim, so
        # .sel picks one value per (lat, lon) pair instead of their outer product.
        point_indexers = {
            'latitude': xr.DataArray(zone_pts['latitude'].values, dims='points'),
            'longitude': xr.DataArray(zone_pts['longitude'].values, dims='points'),
        }
        zone_data = hdd_dataset.sel(point_indexers, method='nearest')
        zone_pop = pop_dataset.sel(point_indexers, method='nearest')

        # NOTE(review): sums skip NaN by default, so if zone_data is NaN at points
        # where zone_pop is not, the denominator counts population that the
        # numerator ignores - confirm both inputs share the same missing-data mask.
        zone_means[zone] = zone_data.sum(dim='points') / zone_pop.sum(dim='points')

    return zone_means

In [None]:
# Open the ERA5-Land daily Zarr store published by Earth Data Hub (DestinE).
ERA5_LAND_DAILY_URL = "https://data.earthdatahub.destine.eu/era5/era5-land-daily-utc-v1.zarr"

ds = xr.open_dataset(
    ERA5_LAND_DAILY_URL,
    engine="zarr",
    chunks={},  # keep the variables chunked (dask-backed) rather than loading them eagerly
    # client_kwargs are handed to the HTTP client; trust_env presumably lets it read
    # proxy/credential settings from the environment - confirm against the hub docs.
    storage_options={"client_kwargs": {"trust_env": True}},
)

In [40]:
# Bring longitudes into the -180..180 convention (no-op when already there),
# then re-sort so that slicing by longitude works.
if ds.longitude.max() > 180:
    ds = ds.assign_coords(longitude=((ds.longitude + 180) % 360) - 180).sortby("longitude")

# Bounding box used for the US: latitude 50 -> 24 (the slice is written
# north-to-south) and longitude -125 -> -67.
ds_us = ds.sel(latitude=slice(50, 24), longitude=slice(-125, -67))

# 2 m temperature, converted from kelvin to degrees Fahrenheit.
t2m_us = (ds_us.t2m - 273.15) * 1.8 + 32
t2m_us.attrs['units'] = '°F'

# Keep 1990 through 2025 (despite the variable name this is more than 30 years).
t2m_30years_us = t2m_us.sel(valid_time=slice('1990', '2025'))

# Heating degree days with a 65 °F base: max(0, 65 - t2m).
hdd = (65 - t2m_30years_us).clip(min=0)

# Population regridded onto the ERA5-Land grid
# (produced by the reproject_and_align_pop function in tools).
pop = xr.open_dataarray('../utils/files/population_regridded_01deg.nc')

# Weight HDD by population at every grid point and valid_time.
hdd_weighted = hdd * pop

# National figure: population-weighted HDD summed over the grid, divided by total population.
# NOTE(review): us_pop_sum covers every cell of `pop`, whereas the weighted sum only
# covers cells where `hdd * pop` is defined (NaNs are skipped) - confirm that both
# share the same footprint, otherwise the denominator is too large.
us_pop_sum = pop.sum(dim=['latitude', 'longitude'])
us_hdd_sum = hdd_weighted.sum(dim=['latitude', 'longitude'])
us_hdd_sum_per_pop = us_hdd_sum / us_pop_sum

# One-column frame indexed by valid_time (any extra coordinate columns are dropped).
us_hdd_sum_per_pop_df = us_hdd_sum_per_pop.to_dataframe(name='US Sum')[['US Sum']]

In [43]:
# Per-region population-weighted HDD: one single-column frame per region,
# then glued side by side into one wide frame (one column per region).
zone_means = regions_from_xarray(hdd_weighted, pop)
region_list_hdd = [
    zone_data.to_dataframe(name=zone_name)[[zone_name]]  # keep only the value column
    for zone_name, zone_data in zone_means.items()
]
region_hdd = pd.concat(region_list_hdd, axis=1)

In [54]:
# Wide table: the national column next to the regional columns, with the
# time index turned into a regular 'time' column.
full_hdds = (
    pd.concat([us_hdd_sum_per_pop_df, region_hdd], axis=1)
    .reset_index()
    .rename(columns={'valid_time': 'time'})
)

# Long table: one row per (time, region), tagged with its provenance.
full_hdds_row = (
    full_hdds
    .melt(id_vars=['time'], var_name='region', value_name='hdd')
    .assign(source='era5_land', data_type='observation')
)

In [55]:
# Preview the long-format HDD table (one row per time and region).
full_hdds_row

Unnamed: 0,time,region,hdd,source,data_type
0,1990-01-01,US Sum,24.909163,era5_land,observation
1,1990-01-02,US Sum,27.400915,era5_land,observation
2,1990-01-03,US Sum,23.516171,era5_land,observation
3,1990-01-04,US Sum,21.049710,era5_land,observation
4,1990-01-05,US Sum,23.059405,era5_land,observation
...,...,...,...,...,...
144634,2025-12-27,South Atlantic Gulf,2.035970,era5_land,observation
144635,2025-12-28,South Atlantic Gulf,4.079060,era5_land,observation
144636,2025-12-29,South Atlantic Gulf,6.975467,era5_land,observation
144637,2025-12-30,South Atlantic Gulf,21.338607,era5_land,observation


In [3]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))  # set vscode notebook path for module imports

from db.repo import ResultsRepository
from db.mongo import MongoWrapper
from pipeline.config import settings
from pipeline.downloader import ERA5LandDownloader

In [None]:
# Request template for ERA5-Land: 2 m temperature at every hour of the day,
# delivered as one unarchived GRIB file.
request_era5 = {
    "variable": ["2m_temperature"],
    "time": [f"{hour:02d}:00" for hour in range(24)],  # "00:00" ... "23:00"
    "data_format": "grib",
    "download_format": "unarchived",
    # Same bounds as the latitude 50/24 and longitude -125/-67 slice used above;
    # presumably ordered [north, west, south, east] - confirm against the API docs.
    "area": [50, -125, 24, -67],
}

# Downloader writing to a local GRIB file.
d = ERA5LandDownloader(request_era5, '../data/hist_observation.grib')

# Results repository; note that this uses the PROD Mongo URI from settings.
mongo = MongoWrapper(settings.MONGO_URI_PROD, settings.MONGO_DB)
repo = ResultsRepository(mongo_client=mongo, repo=settings.MONGO_COLLECTION)

In [None]:
d = ERA5LandDownloader(request_era5, '../data/hist_observation.grib')

# Daily dates from 2026-01-01 up to the latest date the downloader reports,
# made timezone-naive so it can bound the date range.
latest_available = pd.Timestamp(d.check_latest_available()).tz_localize(None)
dates = pd.date_range('2026-01-01', latest_available)

hist_obs_list = []
for date in dates:  # date_range already yields pd.Timestamp objects
    # Skip dates the downloader rejects ...
    if not d.is_valid_run(date):
        continue
    # ... and dates the repository already holds for this downloader.
    if repo.exists_for_date(date, d.name):
        continue
    filepath, dt = d.download(date)
    df = d.compute(filepath, date)
    hist_obs_list.append(df)

In [28]:
# Stack the per-date results into a single frame. When every date was skipped
# (for example, all of them are already stored) the list is empty and pd.concat
# would raise ValueError, so fall back to an empty DataFrame in that case.
hist_df = pd.concat(hist_obs_list) if hist_obs_list else pd.DataFrame()