In [1]:
import xarray as xr
import dask
import glob
import matplotlib.pyplot as plt
import numpy as np
from scipy import integrate
import matplotlib
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import pandas as pd

In [3]:
cluster_data_path = "/lus/scratch/shao/data/NEP36_extremes/processed/daily/by_year/by_cluster"
n_clusters = range(5,10)

shallows_ds_dict = {}
canyons_ds_dict = {}

for n_cluster in n_clusters:
    print(n_cluster)
    shallows_ds_dict[n_cluster] = xr.open_mfdataset(
                f"{cluster_data_path}/n_clusters_{n_cluster}/*shallows.nc",
                engine="h5netcdf",
                chunks = {"time":730}
    ).persist()
    canyons_ds_dict[n_cluster] = xr.open_mfdataset(
                f"{cluster_data_path}/n_clusters_{n_cluster}/*canyons.nc",
                engine="h5netcdf",
                chunks = {"time":730}
    ).persist()

5
6
7
8
9


In [4]:
downwelling_months = [10, 11, 12, 1, 2, 3]
upwelling_months   = [4, 5, 6, 7, 8, 9]
timeseries_vars = ['O2','OmegaA','T']

base_percentile = 0.1
percentiles = {
    'O2':base_percentile,
    'OmegaA':base_percentile,
    'T':1-base_percentile
}

In [5]:
def filter_by_season_and_values(ds, month_range):       
    ds_out = ds.where(ds['time.month'].isin(month_range),drop=True)
    return ds_out
    
def calculate_threshold(ds, percentile, nbins=1000):
    data = ds.to_numpy().flatten()
    hist, edges = np.histogram(data, bins=nbins, density=True)
    cdf = integrate.cumulative_trapezoid(hist,edges[1:])
    return np.interp(percentile, cdf, edges[2:])    


In [6]:
heading = "-"*5
# threshold_df = pd.DataFrame(columns = ["Number of Clusters", "Variable", "Season", "Threshold"])
thresholds = []

for n_cluster in n_clusters:
    print(n_cluster)
    
    downwelling_ds = filter_by_season_and_values(
        shallows_ds_dict[n_cluster],
        downwelling_months
    )
    upwelling_ds = filter_by_season_and_values(
        shallows_ds_dict[n_cluster],
        upwelling_months
    )

    for var in timeseries_vars:
        upwelling_threshold = calculate_threshold(upwelling_ds[var], percentiles[var])
        downwelling_threshold = calculate_threshold(downwelling_ds[var], percentiles[var])

        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "upwelling",
                "Threshold": upwelling_threshold
            }
        )
        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "downwelling",
                "Threshold": downwelling_threshold
            }
        )

shallows_threshold_df = pd.DataFrame(thresholds)

5
6
7
8
9


In [7]:
thresholds = []
for n_cluster in n_clusters:
    print(n_cluster)
    
    downwelling_ds = filter_by_season_and_values(
        canyons_ds_dict[n_cluster],
        downwelling_months
    )
    upwelling_ds = filter_by_season_and_values(
        canyons_ds_dict[n_cluster],
        upwelling_months
    )

    for var in timeseries_vars:
        upwelling_threshold = calculate_threshold(upwelling_ds[var], percentiles[var])
        downwelling_threshold = calculate_threshold(downwelling_ds[var], percentiles[var])

        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "upwelling",
                "Threshold": upwelling_threshold
            }
        )
        thresholds.append(
            {
                "Number of Clusters":n_cluster,
                "Variable": var,
                "Season": "downwelling",
                "Threshold": downwelling_threshold
            }
        )

canyons_threshold_df = pd.DataFrame(thresholds)

5
6
7
8
9


In [8]:
shallows_threshold_df

Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,222.724201
1,5,O2,downwelling,254.739485
2,5,OmegaA,upwelling,1.368367
3,5,OmegaA,downwelling,1.314121
4,5,T,upwelling,13.862464
5,5,T,downwelling,10.697802
6,6,O2,upwelling,226.890312
7,6,O2,downwelling,255.84453
8,6,OmegaA,upwelling,1.393057
9,6,OmegaA,downwelling,1.316575


In [9]:
canyons_threshold_df

Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,61.486474
1,5,O2,downwelling,71.954129
2,5,OmegaA,upwelling,0.62213
3,5,OmegaA,downwelling,0.648083
4,5,T,upwelling,6.608051
5,5,T,downwelling,7.34404
6,6,O2,upwelling,60.976231
7,6,O2,downwelling,71.401635
8,6,OmegaA,upwelling,0.620277
9,6,OmegaA,downwelling,0.646248


In [10]:
print("Canyons")
for var in timeseries_vars:
    display(canyons_threshold_df[(canyons_threshold_df["Season"] == "upwelling") & (canyons_threshold_df["Variable"] == var)])
for var in timeseries_vars:
    display(canyons_threshold_df[(canyons_threshold_df["Season"] == "downwelling") & (canyons_threshold_df["Variable"] == var)])

Canyons


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,61.486474
6,6,O2,upwelling,60.976231
12,7,O2,upwelling,59.000292
18,8,O2,upwelling,57.384056
24,9,O2,upwelling,57.27719


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
2,5,OmegaA,upwelling,0.62213
8,6,OmegaA,upwelling,0.620277
14,7,OmegaA,upwelling,0.615402
20,8,OmegaA,upwelling,0.611283
26,9,OmegaA,upwelling,0.61096


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
4,5,T,upwelling,6.608051
10,6,T,upwelling,6.594428
16,7,T,upwelling,6.405305
22,8,T,upwelling,6.301916
28,9,T,upwelling,6.297086


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
1,5,O2,downwelling,71.954129
7,6,O2,downwelling,71.401635
13,7,O2,downwelling,68.769992
19,8,O2,downwelling,66.949096
25,9,O2,downwelling,66.827039


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
3,5,OmegaA,downwelling,0.648083
9,6,OmegaA,downwelling,0.646248
15,7,OmegaA,downwelling,0.638837
21,8,OmegaA,downwelling,0.633421
27,9,OmegaA,downwelling,0.633033


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
5,5,T,downwelling,7.34404
11,6,T,downwelling,7.330374
17,7,T,downwelling,7.080547
23,8,T,downwelling,6.956731
29,9,T,downwelling,6.950252


In [11]:
print("Shallows")
for var in timeseries_vars:
    display(shallows_threshold_df[(shallows_threshold_df["Season"] == "upwelling") & (shallows_threshold_df["Variable"] == var)])
for var in timeseries_vars:
    display(shallows_threshold_df[(shallows_threshold_df["Season"] == "downwelling") & (shallows_threshold_df["Variable"] == var)])

Shallows


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
0,5,O2,upwelling,222.724201
6,6,O2,upwelling,226.890312
12,7,O2,upwelling,237.308586
18,8,O2,upwelling,246.88918
24,9,O2,upwelling,247.483651


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
2,5,OmegaA,upwelling,1.368367
8,6,OmegaA,upwelling,1.393057
14,7,OmegaA,upwelling,1.452108
20,8,OmegaA,upwelling,1.50693
26,9,OmegaA,upwelling,1.511439


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
4,5,T,upwelling,13.862464
10,6,T,upwelling,13.904125
16,7,T,upwelling,14.046591
22,8,T,upwelling,14.253007
28,9,T,upwelling,14.293538


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
1,5,O2,downwelling,254.739485
7,6,O2,downwelling,255.84453
13,7,O2,downwelling,258.976386
19,8,O2,downwelling,262.34335
25,9,O2,downwelling,262.687377


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
3,5,OmegaA,downwelling,1.314121
9,6,OmegaA,downwelling,1.316575
15,7,OmegaA,downwelling,1.323862
21,8,OmegaA,downwelling,1.331994
27,9,OmegaA,downwelling,1.332693


Unnamed: 0,Number of Clusters,Variable,Season,Threshold
5,5,T,downwelling,10.697802
11,6,T,downwelling,10.697261
17,7,T,downwelling,10.721378
23,8,T,downwelling,10.789874
29,9,T,downwelling,10.81283
