In [1]:
import xarray as xr
import dask
import glob
import matplotlib.pyplot as plt
import numpy as np
from scipy import integrate
import matplotlib
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import pandas as pd

In [2]:
cluster_data_path = "/lus/scratch/shao/data/NEP36_extremes/processed/daily/by_year/by_cluster"
n_clusters = range(5,10)

shallows_ds_dict = {}
canyons_ds_dict = {}

for n_cluster in n_clusters:
    print(n_cluster)
    shallows_ds_dict[n_cluster] = xr.open_mfdataset(
                f"{cluster_data_path}/n_clusters_{n_cluster}/*shallows.nc",
                engine="h5netcdf",
                chunks = {"time":730}
    ).persist()
    canyons_ds_dict[n_cluster] = xr.open_mfdataset(
                f"{cluster_data_path}/n_clusters_{n_cluster}/*canyons.nc",
                engine="h5netcdf",
                chunks = {"time":730}
    ).persist()

5
6
7
8
9


In [3]:
downwelling_months = [10, 11, 12, 1, 2, 3]
upwelling_months   = [4, 5, 6, 7, 8, 9]
timeseries_vars = ['O2','OmegaA','T']

base_percentile = 0.1
percentiles = {
    'O2':base_percentile,
    'OmegaA':base_percentile,
    'T':1-base_percentile
}

In [4]:
def filter_by_season_and_values(ds, month_range):       
    ds_out = ds.where(ds['time.month'].isin(month_range),drop=True)
    return ds_out
    
def calculate_threshold(ds, percentile, nbins=1000):
    data = ds.to_numpy().flatten()
    hist, edges = np.histogram(data, bins=nbins, density=True)
    cdf = integrate.cumulative_trapezoid(hist,edges[1:])
    return np.interp(percentile, cdf, edges[2:])    


In [5]:
heading = "-"*5
# threshold_df = pd.DataFrame(columns = ["Number of Clusters", "Variable", "Season", "Threshold"])
thresholds = []

for n_cluster in n_clusters:
    print(n_cluster)
    
    downwelling_ds = filter_by_season_and_values(
        shallows_ds_dict[n_cluster],
        downwelling_months
    )
    upwelling_ds = filter_by_season_and_values(
        shallows_ds_dict[n_cluster],
        upwelling_months
    )

    for var in timeseries_vars:
        upwelling_threshold = calculate_threshold(upwelling_ds[var], percentiles[var])
        downwelling_threshold = calculate_threshold(downwelling_ds[var], percentiles[var])

        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "upwelling",
                "Threshold": upwelling_threshold
            }
        )
        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "downwelling",
                "Threshold": downwelling_threshold
            }
        )

shallows_threshold_df = pd.DataFrame(thresholds)

5
6
7
8
9


In [6]:
thresholds = []
for n_cluster in n_clusters:
    print(n_cluster)
    
    downwelling_ds = filter_by_season_and_values(
        canyons_ds_dict[n_cluster],
        downwelling_months
    )
    upwelling_ds = filter_by_season_and_values(
        canyons_ds_dict[n_cluster],
        upwelling_months
    )

    for var in timeseries_vars:
        upwelling_threshold = calculate_threshold(upwelling_ds[var], percentiles[var])
        downwelling_threshold = calculate_threshold(downwelling_ds[var], percentiles[var])

        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "upwelling",
                "Threshold": upwelling_threshold
            }
        )
        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "downwelling",
                "Threshold": downwelling_threshold
            }
        )

canyons_threshold_df = pd.DataFrame(thresholds)

5
6
7
8
9


In [7]:
shallows_threshold_df

Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,233.405417
1,5,O2,downwelling,257.417836
2,5,OmegaA,upwelling,1.435456
3,5,OmegaA,downwelling,1.343577
4,5,T,upwelling,13.967632
5,5,T,downwelling,10.642964
6,6,O2,upwelling,241.276602
7,6,O2,downwelling,260.055514
8,6,OmegaA,upwelling,1.480033
9,6,OmegaA,downwelling,1.348746


In [8]:
canyons_threshold_df

Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,60.407168
1,5,O2,downwelling,70.684417
2,5,OmegaA,upwelling,0.618708
3,5,OmegaA,downwelling,0.644158
4,5,T,upwelling,6.554591
5,5,T,downwelling,7.284056
6,6,O2,upwelling,58.861594
7,6,O2,downwelling,68.581387
8,6,OmegaA,upwelling,0.615094
9,6,OmegaA,downwelling,0.638242


In [9]:
print("Canyons")
for var in timeseries_vars:
    display(canyons_threshold_df[(canyons_threshold_df["Season"] == "upwelling") & (canyons_threshold_df["Variable"] == var)])
for var in timeseries_vars:
    display(canyons_threshold_df[(canyons_threshold_df["Season"] == "downwelling") & (canyons_threshold_df["Variable"] == var)])

Canyons


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,60.407168
6,6,O2,upwelling,58.861594
12,7,O2,upwelling,54.900212
18,8,O2,upwelling,54.105735
24,9,O2,upwelling,59.429871


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
2,5,OmegaA,upwelling,0.618708
8,6,OmegaA,upwelling,0.615094
14,7,OmegaA,upwelling,0.604319
20,8,OmegaA,upwelling,0.60187
26,9,OmegaA,upwelling,0.618757


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
4,5,T,upwelling,6.554591
10,6,T,upwelling,6.387404
16,7,T,upwelling,6.208723
22,8,T,upwelling,6.193952
28,9,T,upwelling,6.252322


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
1,5,O2,downwelling,70.684417
7,6,O2,downwelling,68.581387
13,7,O2,downwelling,64.229052
19,8,O2,downwelling,63.421058
25,9,O2,downwelling,68.768146


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
3,5,OmegaA,downwelling,0.644158
9,6,OmegaA,downwelling,0.638242
15,7,OmegaA,downwelling,0.624864
21,8,OmegaA,downwelling,0.622325
27,9,OmegaA,downwelling,0.640035


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
5,5,T,downwelling,7.284056
11,6,T,downwelling,7.058875
17,7,T,downwelling,6.826975
23,8,T,downwelling,6.810201
29,9,T,downwelling,6.891159


In [10]:
print("Shallows")
for var in timeseries_vars:
    display(shallows_threshold_df[(shallows_threshold_df["Season"] == "upwelling") & (shallows_threshold_df["Variable"] == var)])
for var in timeseries_vars:
    display(shallows_threshold_df[(shallows_threshold_df["Season"] == "downwelling") & (shallows_threshold_df["Variable"] == var)])

Shallows


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,233.405417
6,6,O2,upwelling,241.276602
12,7,O2,upwelling,242.17079
18,8,O2,upwelling,250.478553
24,9,O2,upwelling,249.25022


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
2,5,OmegaA,upwelling,1.435456
8,6,OmegaA,upwelling,1.480033
14,7,OmegaA,upwelling,1.485151
20,8,OmegaA,upwelling,1.536534
26,9,OmegaA,upwelling,1.527673


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
4,5,T,upwelling,13.967632
10,6,T,upwelling,14.086865
16,7,T,upwelling,14.10354
22,8,T,upwelling,14.310648
28,9,T,upwelling,14.267605


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
1,5,O2,downwelling,257.417836
7,6,O2,downwelling,260.055514
13,7,O2,downwelling,260.400275
19,8,O2,downwelling,263.422334
25,9,O2,downwelling,262.904769


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
3,5,OmegaA,downwelling,1.343577
9,6,OmegaA,downwelling,1.348746
15,7,OmegaA,downwelling,1.34941
21,8,OmegaA,downwelling,1.35392
27,9,OmegaA,downwelling,1.353448


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
5,5,T,downwelling,10.642964
11,6,T,downwelling,10.684057
17,7,T,downwelling,10.691169
23,8,T,downwelling,10.773682
29,9,T,downwelling,10.757295
