Compute the historical mean heating degree days (HDD) baseline

In [4]:
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np

import math

In [5]:
def regions_from_xarray(hdd_dataset, pop_dataset):
    """Return the population-weighted mean of ``hdd_dataset`` for each DOI region.

    The grid points of ``hdd_dataset`` are assigned to the DOI unified regions
    (Alaska and Pacific Islands excluded) with a spatial join. For every
    region the result is ``sum(hdd_dataset) / sum(pop_dataset)`` over the
    region's grid points.

    Args:
        hdd_dataset: xarray object with ``latitude`` and ``longitude``
            coordinates. Because its sum is divided by the population sum, it
            is expected to hold values already multiplied by population (the
            caller in this notebook passes ``hdd * pop``).
        pop_dataset: xarray object with ``latitude`` and ``longitude``
            coordinates holding the population weights.

    Returns:
        dict mapping each ``REG_NAME`` to the weighted mean of that region.
        The ``points`` dimension is reduced; any other dimension of
        ``hdd_dataset`` is kept. Keys follow the order in which regions first
        appear in the spatial join result.
    """
    #Load subregion shapefile
    gdf = gpd.read_file('../utils/files/USRegion/12Regions/DOI_12_Unified_Regions_20180801.shp')
    gdf = gdf.to_crs(epsg=4326) #reproject file into classical lat lon coordinates
    gdf = gdf[['REG_NAME', 'geometry']]
    gdf = gdf[~gdf['REG_NAME'].isin(['Alaska', 'Pacific Islands'])] #Remove both of these regions, irrelevant

    #Make a point grid from dataset (flattened once, reused for columns and geometry)
    lon_grid, lat_grid = np.meshgrid(hdd_dataset.longitude.values, hdd_dataset.latitude.values)
    lon_flat = lon_grid.ravel()
    lat_flat = lat_grid.ravel()

    #Make a gdf of dataset grid to prepare for spatial join
    gdf_points = gpd.GeoDataFrame({
        'latitude': lat_flat,
        'longitude': lon_flat,
        'geometry': gpd.points_from_xy(lon_flat, lat_flat)
    }, crs=gdf.crs) #Make sure we have the same crs

    #Spatial join between the full US grid and the subregions
    gdf_joined = gpd.sjoin(gdf_points, gdf, how='inner', predicate='within')

    #Population-weighted mean of each subregion
    zone_means = {}
    #sort=False keeps the regions in order of first appearance
    for zone, zone_pts in gdf_joined.groupby('REG_NAME', sort=False):
        #Point-wise indexers shared by both selections
        lat_idx = xr.DataArray(zone_pts['latitude'].values, dims='points')
        lon_idx = xr.DataArray(zone_pts['longitude'].values, dims='points')

        zone_data = hdd_dataset.sel(latitude=lat_idx, longitude=lon_idx, method='nearest')
        zone_pop = pop_dataset.sel(latitude=lat_idx, longitude=lon_idx, method='nearest')

        #The HDD sum below skips NaN cells, so the population of those cells must be
        #left out of the denominator too; otherwise the weighted mean is biased low
        zone_pop = zone_pop.where(zone_data.notnull())

        zone_hdd_sum = zone_data.sum(dim='points') #sum of the (already weighted) HDD
        zone_pop_sum = zone_pop.sum(dim='points') #sum of the population actually used

        zone_means[zone] = zone_hdd_sum / zone_pop_sum

    return zone_means

In [2]:
#Open the ERA5-Land daily Zarr store from earthdatahub destine EU (chunks={} keeps it lazy)
ds = xr.open_dataset(
    "https://data.earthdatahub.destine.eu/era5/era5-land-daily-utc-v1.zarr",
    engine="zarr",
    chunks={},
    storage_options={"client_kwargs":{"trust_env":True}},
)

#Move longitudes from the 0..360 convention to -180..180 when needed
if ds.longitude.max() > 180:
    wrapped_longitude = ((ds.longitude + 180) % 360) - 180
    ds = ds.assign_coords(longitude=wrapped_longitude).sortby("longitude")

#Bounding box used for the US (latitude sliced from 50 down to 24, presumably because it is stored north to south)
ds_us = ds.sel(latitude=slice(50, 24), longitude=slice(-125, -67))

#2m temperature, converted from kelvin to °F
t2m_us = ds_us["t2m"]
t2m_us = (t2m_us - 273.15) * 1.8 + 32
t2m_us.attrs['units'] = '°F'

#Baseline period
#NOTE(review): label slices are inclusive on both ends, so '1995'..'2025' spans 31 calendar years, not 30 - confirm the intended window
t2m_30years_us = t2m_us.sel(valid_time=slice('1995', '2025'))

#Heating degree days with a 65 °F base: max(0, 65 - t2m)
hdd = (65 - t2m_30years_us).clip(min=0)

#Population regridded to the ERA5-Land grid (from reproject_and_align_pop function in tools)
pop = xr.open_dataarray('../utils/files/population_regridded_01deg.nc')

#Weight the HDD by population for each grid point and each valid_time
hdd_weighted = hdd * pop

#Average each grid point per day of year, then materialise the result:
#this is the array reused for the US and subregion aggregations
day_of_year = hdd_weighted.valid_time.dt.dayofyear
hdd_weighted_yearly_mean = hdd_weighted.groupby(day_of_year).mean()
hdd_weighted_yearly_mean_computed = hdd_weighted_yearly_mean.compute()

  super().__init__(**codec_config)


In [6]:
#US-wide population-weighted HDD for each day of year: sum(hdd * pop) / sum(pop)
us_yearly_sum = hdd_weighted_yearly_mean_computed.sum(dim=['latitude', 'longitude'])
#Only count population where a weighted HDD value exists: the sum above skips NaN cells and
#only covers the HDD grid, so the denominator has to cover exactly the same cells
us_pop_sum = pop.where(hdd_weighted_yearly_mean_computed.notnull()).sum(dim=['latitude', 'longitude'])
base_us_weighted_hdd = us_yearly_sum / us_pop_sum
base_us_weighted_hdd_df = base_us_weighted_hdd.to_dataframe(name='US Sum')
base_us_weighted_hdd_df = base_us_weighted_hdd_df[['US Sum']] #keep only the value column

#Population-weighted HDD of each subregion
zone_means = regions_from_xarray(hdd_weighted_yearly_mean_computed, pop)

#Make a dataframe from the dict with day of year as index and regions as columns
subregion_yearly_mean_df = pd.DataFrame(
    {zone: data.values for zone, data in zone_means.items()},
    index=hdd_weighted_yearly_mean_computed.dayofyear.values,
)

#Put the US column next to the subregion columns
us_base_hdd = pd.concat([base_us_weighted_hdd_df, subregion_yearly_mean_df], axis=1)

#Leap years add a 366th day of year; drop it when present
us_base_hdd = us_base_hdd.drop(index=366, errors='ignore')

In [7]:
#Save the base HDD table (US and subregion columns, day of year as index) to CSV
base_csv_path = '../utils/files/base_revised.csv'
us_base_hdd.to_csv(base_csv_path)