In [1]:
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry import mapping
from time import time

import rioxarray  # for the extension to load
import rasterio

crs = "EPSG:3310"

In [2]:
# load geodataframe which maps WRF grid points to census tracts
census_tracts = gpd.read_file("loca_points_to_geoid.gdb")

## Parameters

In [3]:
# above 90

ds = xr.open_dataset(
    "avg_extreme_heat_days_over_90.nc", engine="netcdf4"
)
name = "avg annual # extreme heat days above 90 F"

ds = ds.sel(time_slice_name='past')


# # above 100

# ds = xr.open_dataset("avg_extreme_heat_days_over_100.nc", engine="netcdf4")
# name = "avg annual # extreme heat days above 100 F"

# # extreme heat

# ds = xr.open_dataset("avg_extreme_heat_days_above_98th_percentile.nc", engine="netcdf4")
# name = "avg annual # extreme heat days above 98th percentile"

# # warm nights

# ds = xr.open_dataset("avg_warm_nights_above_98th_percentile.nc", engine="netcdf4")
# name = "avg annual # warm nights above 98th percentile"

In [4]:
# ds.rio.write_crs(
#     "EPSG:4326", inplace=True
# )  # EPSG for LOCA2: https://analytics.cal-adapt.org/data/access/
# ds = ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
# ds = ds.rio.reproject(crs)  # reproject to CA equal albers

### Run it

In [5]:
# we want to load the dataset into memory here; it will make future computations much faster
ds = ds.compute()

display(ds)

In [6]:
# convert to geodataframe
gdf_begin_time = time()
df = ds.squeeze().to_dataframe().reset_index().set_index("time_slice_name")
gdf = gpd.GeoDataFrame(data=df[name], geometry=gpd.points_from_xy(df.lon, df.lat)).set_crs(
    "EPSG:4326"
)  # CRS of original LOCA data
gdf_time = time()
print(f"gdf made in {gdf_time-gdf_begin_time} seconds")
display(gdf)

gdf made in 0.08677124977111816 seconds


Unnamed: 0_level_0,avg annual # extreme heat days above 90 F,geometry
time_slice_name,Unnamed: 1_level_1,Unnamed: 2_level_1
past,0.0,POINT (-124.39062 32.54688)
past,0.0,POINT (-124.35938 32.54688)
past,0.0,POINT (-124.32812 32.54688)
past,0.0,POINT (-124.29688 32.54688)
past,0.0,POINT (-124.26562 32.54688)
...,...,...
past,0.0,POINT (-114.26562 41.98438)
past,0.0,POINT (-114.23438 41.98438)
past,0.0,POINT (-114.20312 41.98438)
past,0.0,POINT (-114.17188 41.98438)


In [7]:
# join heat metric geodataframe with the geodataframe
# which maps LOCA points to tracts

joined_gdf = (
    census_tracts[["GEOID", "geometry"]]
    .merge(gdf.reset_index(), on="geometry")
    .set_index("time_slice_name")
)

#!! Why no common geometries??
display(joined_gdf)

Unnamed: 0_level_0,GEOID,geometry,avg annual # extreme heat days above 90 F
time_slice_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
# take the average of grid points in each tract
climate_gdf_tracts_avg = (
    joined_gdf.reset_index().groupby(["GEOID", "time_slice_name"])[[name]].aggregate("mean")
)
climate_gdf_tracts_avg = climate_gdf_tracts_avg.reset_index()
avg_time = time()
display(climate_gdf_tracts_avg)

In [None]:
climate_gdf_tracts_avg[name].max()

In [None]:
prelim = climate_gdf_tracts_avg.set_index(["time_slice_name","GEOID"])

In [None]:
# convert into dataset
final_ds = prelim.to_xarray()

## Export

In [None]:
# first, import from local
# final_ds.to_netcdf("census_tract_avg_extreme_heat_days_over_90.nc", engine="netcdf4")
# final_ds.to_netcdf("census_tract_avg_extreme_heat_days_over_100.nc", engine="netcdf4")
# # final_ds.to_netcdf(
#     "census_tract_avg_extreme_heat_days_above_98th_percentile.nc", engine="netcdf4"
# )
final_ds.to_netcdf(
    "census_tract_avg_warm_nights_above_98th_percentile.nc", engine="netcdf4"
)

## Merge with geometries

In [None]:
# Load census tract shapefile or GeoJSON
census_tracts = gpd.read_file(
    "CA_tiger_2023_tract/"
)  # GeoDataFrame of census tracts for LA County

### we want to convert the census tract polygons from lon-lat coordinates
### to x-y coordinates like we have for our WRF output.
### this is because sjoin_nearest() can better calculate the distances
### between grid points and polygons when we use an area-preserving projection,
### which WRF natively uses.

# subset geodataframe to the minimum of what we need
# so we save memory
census_tracts = census_tracts[["geometry", "GEOID"]]
census_tracts = census_tracts.to_crs(crs)

In [None]:
census_tracts = gpd.read_file("CA_tiger_2023_tract/")

In [None]:
merged = census_tracts.merge(climate_gdf_tracts_avg, on="GEOID", how="inner")

In [None]:
merged

In [None]:
prelim = merged.set_index(["time_slice_name", "GEOID",'geometry'])

In [None]:
final_ds = prelim.to_xarray()

In [None]:
final_ds

In [None]:
final_ds

## Scraps

In [None]:
def reproject_to_tracts(ds_delta, ca_boundaries):
    df = ds_delta.to_dataframe().reset_index()

    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lat, df.lon))
    gdf = gdf.set_crs("EPSG:3310")

    ca_boundaries = ca_boundaries.to_crs("EPSG:3310")
    ca_boundaries = ca_boundaries.set_index(["GEOID"])

    clipped_gdf = gpd.sjoin_nearest(ca_boundaries, gdf, how="left")
    clipped_gdf = clipped_gdf.drop(["index_right"], axis=1)
    clipped_gdf = clipped_gdf.reset_index()[["GEOID", name, "geometry"]]

    ### some coastal tracts do not contain any land grid cells ###
    ### due to the WRF's underlying surface type for a given grid cell. ###

    # aggregate the gridded data to the tract level
    clipped_gdf_diss = clipped_gdf.reset_index().dissolve(by="GEOID", aggfunc="mean")
    clipped_gdf_diss = clipped_gdf_diss.rename(columns={f"{name}_right": name})

    # separate tracts with data from tracts without data
    clipped_gdf_nan = clipped_gdf_diss[np.isnan(clipped_gdf_diss[name])]
    clipped_gdf_nan = clipped_gdf_nan[["geometry", name]]
    clipped_gdf_valid = clipped_gdf_diss[~np.isnan(clipped_gdf_diss[name])]
    clipped_gdf_valid = clipped_gdf_valid[["geometry", name]]

    # compute the centroid of each tract
    clipped_gdf_nan["centroid"] = clipped_gdf_nan.centroid
    clipped_gdf_nan = clipped_gdf_nan.set_geometry("centroid")
    clipped_gdf_valid["centroid"] = clipped_gdf_valid.centroid
    clipped_gdf_valid = clipped_gdf_valid.set_geometry("centroid")

    # fill in missing tracts with values from the closest tract
    # in terms of distance between the tract centroids
    clipped_gdf_filled = clipped_gdf_nan.sjoin_nearest(clipped_gdf_valid, how="left")
    clipped_gdf_filled = clipped_gdf_filled[["geometry_left", f"{name}_right"]]
    clipped_gdf_filled = clipped_gdf_filled.rename(
        columns={"geometry_left": "geometry", f"{name}_right": name}
    )
    clipped_gdf_valid = clipped_gdf_valid.drop(columns="centroid")

    # concatenate filled-in tracts with the original tract which had data
    gdf_all_tracts = pd.concat([clipped_gdf_valid, clipped_gdf_filled])

    return gdf_all_tracts