# PHI Data Request

The purpose of this notebook is to generate heat metrics for all of CA within four 30-year target periods. Steps 0-2 cover the dataset generation. These are not meant to be run again, but rather to illustrate what the process involved. Step 3 covers loading in the data and visualizing the results. This step is meant to be run and used to explore the data. 

NOTE:
This notebook focuses on the gridded data, before converting to the census tract level. 

The dataset was generated within the following parameters.

- Downscaling: LOCA2

- Temporal scale: Annual

- Time periods: 1 value per 30 year spread

    - Past (1961-1990)

    - Current (2005-2034) 
    
    - Mid-century (2035-2064)

    - End-century (2070-2099) 

- Variable: Average (across models) count of days or nights over a given threshold per time period

    - extreme heat days over 90° F

    - extreme heat days over 100° F
    
    - extreme heat days over 98th percentile 

    - warm nights over 98th percentile

- GCMs: All

- SSPs: SSP 5-8.5

- Format: NetCDF

## 0. Setup

In [2]:
# import climakitae as ck
# from climakitae.core.data_interface import (
#     get_data_options,
#     get_subsetting_options,
#     get_data,
# )
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd

In [None]:
# Select the run mode here; the next cell uses it to pick the area and time slices.
## "test" mode retrieves data for the short test periods, and for the smallest CA county (San Francisco County)
## "full_run" mode retrieves data meeting the complete data request criteria - all of CA, for the four 30-year target periods

mode = "full_run"  # 'test' or 'full_run'

In [None]:
# Short slices used only in "test" mode so a trial run stays cheap.
test_time_slices_dict = {"test1": (1995, 2000), "test2": (2005, 2010)}

# The four 30-year target periods of the data request.
time_slices_dict = {
    "past": (1961, 1990),
    "current": (2005, 2034),
    "mid-century": (2035, 2064),
    "end-century": (2070, 2099),
}

# Pick the spatial extent and the set of time slices for the selected mode.
if mode == "test":
    area = "San Francisco County"
    dict_slices_run = test_time_slices_dict
elif mode == "full_run":
    area = "CA"
    dict_slices_run = time_slices_dict
else:
    # Fail here rather than with a NameError on `area` further down.
    raise ValueError(f"mode must be 'test' or 'full_run', got {mode!r}")

# Retrieval parameters for the target-period data (time_slice is supplied per call).
max_temp_params = {
    "variable": "Maximum air temperature at 2m",
    "resolution": "3 km",
    "timescale": "daily",
    "downscaling_method": "Statistical",
    "scenario": ["Historical Climate", "SSP 5-8.5"],
    "units": "degF",
    "cached_area": area,
}

min_temp_params = {
    "variable": "Minimum air temperature at 2m",
    "resolution": "3 km",
    "timescale": "daily",
    "downscaling_method": "Statistical",
    "scenario": ["Historical Climate", "SSP 5-8.5"],
    "units": "degF",
    "cached_area": area,
}

# Retrieval parameters for the 1981-2010 historical baseline used for the
# percentile thresholds.
hist_max_temp_params = {
    "variable": "Maximum air temperature at 2m",
    "resolution": "3 km",
    "timescale": "daily",
    "downscaling_method": "Statistical",
    "scenario": ["Historical Climate"],
    "units": "degF",
    "cached_area": area,
    "time_slice": (1981, 2010),
}

hist_min_temp_params = {
    "variable": "Minimum air temperature at 2m",
    "resolution": "3 km",
    "timescale": "daily",
    "downscaling_method": "Statistical",
    "scenario": ["Historical Climate"],
    "units": "degF",
    "cached_area": area,
    "time_slice": (1981, 2010),
}

percentile = 0.98
# months April through October, over which the 98th percentile is computed
months_to_measure = list(range(4, 11))

In [None]:
# Retrieve the historical daily maximum temperature once; later cells reuse it
print("Loading historical data...")
hist_data_max = get_data(**hist_max_temp_params)
print("Historical data loaded (lazy).")

# Keep only the months listed in `months_to_measure` (April-October), take the
# requested quantile over time, and compute it right away so the threshold is
# held as concrete values rather than as a large pending computation graph.
print("  Computing threshold...")
hist_thresh_max = (
    hist_data_max.sel(time=hist_data_max.time.dt.month.isin(months_to_measure))
    .quantile(percentile, dim="time")
    .compute()
)
print("  Threshold computed.")

In [None]:
# Load and compute minimum temperature historical data once
print("Loading historical data...")
hist_data_min = get_data(**hist_min_temp_params)
print("Historical data loaded (lazy).")
# compute 98th percentile historical minimum temperature between April and October
hist_thresh_min = hist_data_min.sel(
    time=hist_data_min.time.dt.month.isin(months_to_measure)
).quantile(percentile, dim="time")

# CRITICAL: Compute the threshold immediately to avoid huge computation graph.
# The result must be stored back into `hist_thresh_min`; assigning it to
# `hist_thresh_max` would overwrite the maximum-temperature threshold.
print("  Computing threshold...")
hist_thresh_min = hist_thresh_min.compute()
print("  Threshold computed.")

In [None]:
# Build a template holding the NaN pattern of the source data; the metric cells
# use it to put NaNs back into grid cells that were NaN originally.
# Take the first time step by position: `time` is a datetime-like axis, so a
# label lookup with the integer 0 (`.sel(time=0)`) does not match any label.
nan_ds = hist_data_max.isel(time=0)
nan_ds = nan_ds.isel(scenario=0)
# NOTE(review): this assumes `simulation` is a droppable (non-index) coordinate
# at this point - confirm against the structure returned by get_data.
nan_ds = nan_ds.reset_coords("simulation", drop=True)
nan_ds = nan_ds.reset_coords("time", drop=True)

## 1. Functions

The following functions are used to calculate:

- average annual number of extreme heat days above 90 deg F
- average annual number of extreme heat days above 100 deg F
- average annual number of extreme heat days above historical baseline (98th percentile)
- average annual number of warm nights (based on historical 98th percentile)


In [None]:
# extreme heat days above given temperature threshold

def avg_days_above_X_degF(data, temp_threshold_in_degF):
    """Return the average annual number of days above a fixed temperature threshold.

    Days are counted separately for each simulation and the counts are then
    averaged across years and across simulations, matching how
    avg_days_above_percentile aggregates. Averaging the temperatures across
    simulations before thresholding would smooth out each model's extremes and
    count exceedances of the ensemble-mean temperature instead.

    Parameters
    ----------
    data : xarray object with `time` and `simulation` dimensions
        Daily temperature values, compared directly against the threshold
        (the retrieval parameters in this notebook request degF).
    temp_threshold_in_degF : float
        Threshold a day must exceed to be counted.

    Returns
    -------
    xarray object
        Mean yearly exceedance count with `time` and `simulation` reduced
        and any length-1 dimensions squeezed out.
    """
    exceeds = data > temp_threshold_in_degF
    days_per_year = exceeds.groupby("time.year").sum("time")
    return days_per_year.mean("year").mean("simulation").squeeze()

In [None]:
# extreme heat days or warm nights above historical baseline (98th percentile)

def avg_days_above_percentile(hist_data, data, percentile, hist_thresh):
    """Return the average annual number of days on which `data` exceeds `hist_thresh`.

    Parameters
    ----------
    hist_data : unused
        Kept in the signature for existing callers; not read here.
    data : xarray object with `scenario`, `time` and `simulation` dimensions
        Daily values for the time slice being evaluated.
    percentile : unused
        Kept in the signature for existing callers; the threshold is
        supplied already computed via `hist_thresh`.
    hist_thresh : xarray object
        Precomputed threshold that `data` is compared against.

    Returns
    -------
    xarray object
        Exceedance count per year, averaged over years and then over
        simulations, squeezed and computed.
    """
    # only the first scenario entry is used, which drops the scenario dimension
    single_scenario = data.isel(scenario=0)

    # flag each day above the threshold, total the flags per calendar year,
    # then average over the years of the slice and over the simulations
    print("  Computing daily counts...")
    exceedance_flags = xr.where(single_scenario > hist_thresh, x=1, y=0)
    yearly_totals = exceedance_flags.groupby("time.year").sum("time")
    mean_count = yearly_totals.mean(dim="year").mean(dim="simulation").squeeze()

    # CRITICAL: evaluate now so the caller receives computed values
    print("  Computing final result...")
    mean_count = mean_count.compute()
    print("  Result computed.")

    return mean_count

## 2. Generate metrics

### Average annual number of extreme heat days above 90 deg F

In [None]:
temp_threshold = 90

results = []

for label, time_slice in dict_slices_run.items():

    # retrieve daily maximum temperature for this time slice
    data = get_data(**max_temp_params, time_slice=time_slice)

    # for that time slice, calculate the average number of days above a temperature threshold in the time slice
    result = avg_days_above_X_degF(data, temp_threshold_in_degF=temp_threshold)

    # add time slice name as a dimension
    result = result.expand_dims({"time_slice_name": [label]})

    # now append this result to the ongoing list of results
    results.append(result)

# concatenate all along the new "time_slice_name" dimension
combined_90 = xr.concat(results, dim="time_slice_name")

# add in the time ranges to the dataset coordinates
combined_90 = combined_90.assign_coords(
    {
        "time_slice_range": (
            "time_slice_name",
            [f"{t[0]}–{t[1]}" for t in dict_slices_run.values()],
        )
    }
)

# add in nan values from the original LOCA dataset that were removed in the process
combined_90 = xr.where(np.isnan(nan_ds), x=np.nan, y=combined_90)

# rename AFTER masking: xr.where combines several inputs and does not reliably
# carry the variable name through, so a name set before it can be lost
combined_90.name = f"avg annual # extreme heat days above {temp_threshold} F"

# export to local, as netCDF
combined_90.to_netcdf(
    path=f"avg_extreme_heat_days_over_{temp_threshold}.nc", engine="netcdf4"
)

### Average annual number of extreme heat days above 100 deg F

In [None]:
temp_threshold = 100

results = []

for label, time_slice in dict_slices_run.items():

    # retrieve daily maximum temperature for this time slice
    data = get_data(**max_temp_params, time_slice=time_slice)

    # for that time slice, calculate the average number of days above a temperature threshold in the time slice
    result = avg_days_above_X_degF(data, temp_threshold_in_degF=temp_threshold)

    # add time slice name as a dimension
    result = result.expand_dims({"time_slice_name": [label]})

    # now append this result to the ongoing list of results
    results.append(result)

# concatenate all along the new "time_slice_name" dimension
combined_100 = xr.concat(results, dim="time_slice_name")

# add in the time ranges to the dataset coordinates
combined_100 = combined_100.assign_coords(
    {
        "time_slice_range": (
            "time_slice_name",
            [f"{t[0]}–{t[1]}" for t in dict_slices_run.values()],
        )
    }
)

# add in nan values from the original LOCA dataset that were removed in the process
combined_100 = xr.where(np.isnan(nan_ds), x=np.nan, y=combined_100)

# rename AFTER masking: xr.where combines several inputs and does not reliably
# carry the variable name through, so a name set before it can be lost
combined_100.name = f"avg annual # extreme heat days above {temp_threshold} F"

# export to local, as netCDF
combined_100.to_netcdf(
    path=f"avg_extreme_heat_days_over_{temp_threshold}.nc", engine="netcdf4"
)

### Average annual number of extreme heat days above historical baseline (98th percentile)

In [None]:
results = []

for idx, (label, time_slice) in enumerate(dict_slices_run.items(), 1):
    print(f"\n{'='*60}")
    print(f"Processing {label} ({idx}/{len(dict_slices_run)}): {time_slice}")
    print(f"{'='*60}")

    # retrieve daily maximum temperature for this time slice
    print(f"Retrieving data for {label}...")
    data = get_data(**max_temp_params, time_slice=time_slice)

    # for that time slice, calculate the average number of days above the
    # historical 98th percentile of maximum temperature
    result = avg_days_above_percentile(
        hist_data_max, data, percentile, hist_thresh_max
    )

    # add time slice name as a dimension
    result = result.expand_dims({"time_slice_name": [label]})

    # append this result to the list
    results.append(result)

    # Clean up: drop the reference to this slice's data before the next retrieval
    del data
    print(f"Completed {label}")

print("\n" + "=" * 60)
print("All time slices processed")
print("=" * 60)

# concatenate all along the new "time_slice_name" dimension
print("Concatenating results...")
combined_hot = xr.concat(results, dim="time_slice_name")

# add in the time ranges to the dataset coordinates
combined_hot = combined_hot.assign_coords(
    {
        "time_slice_range": (
            "time_slice_name",
            [f"{t[0]}–{t[1]}" for t in dict_slices_run.values()],
        )
    }
)

# add in nan values from the original LOCA dataset that were removed in the process
combined_hot = xr.where(np.isnan(nan_ds), x=np.nan, y=combined_hot)

# rename variable AFTER masking: xr.where combines several inputs and does not
# reliably carry the variable name through, so a name set before it can be lost
combined_hot.name = "avg annual # extreme heat days above 98th percentile"

# export to local, as netCDF
print("Exporting to netCDF...")
combined_hot.to_netcdf(
    path="avg_extreme_heat_days_above_98th_percentile.nc", engine="netcdf4"
)

print("Complete!")

### Average annual number of warm nights (based on historical 98th percentile)

In [None]:
results = []

for idx, (label, time_slice) in enumerate(dict_slices_run.items(), 1):
    print(f"\n{'='*60}")
    print(f"Processing {label} ({idx}/{len(dict_slices_run)}): {time_slice}")
    print(f"{'='*60}")

    # retrieve daily minimum temperature for this time slice
    print(f"Retrieving data for {label}...")
    data = get_data(**min_temp_params, time_slice=time_slice)

    # for that time slice, calculate the average number of nights above the
    # historical 98th percentile of minimum temperature; pass the
    # minimum-temperature baseline so all arguments describe the same variable
    result = avg_days_above_percentile(
        hist_data_min, data, percentile, hist_thresh_min
    )

    # add time slice name as a dimension
    result = result.expand_dims({"time_slice_name": [label]})

    # append this result to the list
    results.append(result)

    # Clean up: drop the reference to this slice's data before the next retrieval
    del data
    print(f"Completed {label}")

print("\n" + "=" * 60)
print("All time slices processed")
print("=" * 60)

# concatenate all along the new "time_slice_name" dimension
print("Concatenating results...")
combined_warm = xr.concat(results, dim="time_slice_name")

# add in the time ranges to the dataset coordinates
combined_warm = combined_warm.assign_coords(
    {
        "time_slice_range": (
            "time_slice_name",
            [f"{t[0]}–{t[1]}" for t in dict_slices_run.values()],
        )
    }
)

# add in nan values from the original LOCA dataset that were removed in the process
combined_warm = xr.where(np.isnan(nan_ds), x=np.nan, y=combined_warm)

# rename variable AFTER masking: xr.where combines several inputs and does not
# reliably carry the variable name through, so a name set before it can be lost
combined_warm.name = "avg annual # warm nights above 98th percentile"

# export to local, as netCDF
print("Exporting to netCDF...")
combined_warm.to_netcdf(
    path="avg_warm_nights_above_98th_percentile.nc", engine="netcdf4"
)

print("Complete!")

## 3. Visualize gridded results

In [None]:
# Open the four gridded result files from the local directory.
# NOTE(review): these names carry a "nans_" prefix that the export cells in
# section 2 do not write - confirm which files are the intended inputs.
above_90, above_100, extreme_heat, warm_nights = (
    xr.open_dataset(nc_path, engine="netcdf4")
    for nc_path in (
        "nans_avg_extreme_heat_days_over_90.nc",
        "nans_avg_extreme_heat_days_over_100.nc",
        "nans_avg_extreme_heat_days_above_98th_percentile.nc",
        "nans_avg_warm_nights_above_98th_percentile.nc",
    )
)

In [None]:
# choose which time slice you would like to visualize
# OPTIONS (the keys of time_slices_dict):
# 'past'
# 'current'
# 'mid-century'
# 'end-century'

time_slice = 'mid-century'

In [None]:
# map of the average annual days over 90 F for the chosen time slice

above_90["avg annual # extreme heat days above 90 F"].sel(
    time_slice_name=time_slice
).squeeze().plot()

In [None]:
# map of the average annual days over 100 F for the chosen time slice

above_100["avg annual # extreme heat days above 100 F"].sel(
    time_slice_name=time_slice
).squeeze().plot()

In [None]:
# map of the average annual extreme heat days over the 98th percentile for the chosen time slice

extreme_heat["avg annual # extreme heat days above 98th percentile"].sel(
    time_slice_name=time_slice
).squeeze().plot()

In [None]:
# map of the average annual warm nights over the 98th percentile for the chosen time slice

warm_nights["avg annual # warm nights above 98th percentile"].sel(
    time_slice_name=time_slice
).squeeze().plot()

In [3]:
# Open the four census-tract result files from the local directory; this
# rebinds the same names used for the gridded datasets.
above_90, above_100, extreme_heat, warm_nights = (
    xr.open_dataset(nc_path, engine="netcdf4")
    for nc_path in (
        "census_tract_avg_extreme_heat_days_over_90.nc",
        "census_tract_avg_extreme_heat_days_over_100.nc",
        "census_tract_avg_extreme_heat_days_above_98th_percentile.nc",
        "census_tract_avg_warm_nights_above_98th_percentile.nc",
    )
)

In [15]:
# variable (column) name of the 90 F metric, used for the lookup below
name = "avg annual # extreme heat days above 90 F"

In [4]:
# convert the 90 F dataset to a pandas DataFrame
df_90 = above_90.to_dataframe()

In [7]:
# move the index levels into regular columns so they can be filtered on
df_90 = df_90.reset_index()

In [13]:
# keep only the rows belonging to the 'current' time slice
test = df_90.loc[df_90["time_slice_name"] == "current"]

In [17]:
# number of distinct metric values among the 'current' rows
test[name].nunique()

2115