In [None]:
import scipy.stats
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import xarray as xr
import tqdm

## set plotting style: white axes background, no grid lines
plot_style = {"axes.facecolor": "white", "axes.grid": False}
sns.set(rc=plot_style)

## initialize random number generator
## (seeded so the bootstrap samples, and hence the confidence bounds, are
## identical on every Restart & Run All; change SEED to get a different draw)
SEED = 42
rng = np.random.default_rng(SEED)

## Functions

In [None]:
def fit_model(data, model_class):
    """Fit 'model_class' to 'data' and return the fitted distribution.

    The parameters estimated by 'model_class.fit' are passed positionally
    back into 'model_class'; for a scipy.stats distribution this gives a
    frozen random variable (for genextreme the order is [c, loc, scale])."""

    ## estimate the parameters, then build the random variable from them
    fitted_params = model_class.fit(data)
    return model_class(*fitted_params)


def get_return_levels(rv, return_periods=None):
    """Get return levels of a random variable at the given return periods.

    Parameters
    ----------
    rv : object exposing 'isf' (e.g. a frozen scipy.stats distribution)
    return_periods : array-like, optional
        Return periods T; the matching return level is the value exceeded
        with probability 1 / T. Defaults to np.logspace(0.01, 3.5).

    Returns
    -------
    (return_levels, return_periods) : return_levels is rv.isf(1 / T) and
        return_periods is the array of periods that was actually used.
    """

    if return_periods is None:
        ## build the default on every call: a default array created in the
        ## signature would be shared, so a caller editing the returned periods
        ## in place would silently change later calls
        return_periods = np.logspace(0.01, 3.5)
    else:
        ## accept lists/tuples too ('1 / [2, 10]' raises TypeError)
        return_periods = np.asarray(return_periods)

    return rv.isf(1 / return_periods), return_periods


def draw_sample(data, n=None):
    """Resample 'data' with replacement (bootstrap sample).
    'n' is the size of the returned sample; when it is not given the
    sample has as many elements as 'data'. Uses the module-level 'rng'."""

    sample_size = len(data) if n is None else n
    return rng.choice(data, replace=True, size=sample_size)


def get_return_period_bnds(data, model_class, n_samples=1000, alpha=0.05):
    """Bootstrap bounds on the return levels of a fitted model.

    'n_samples' times, a bootstrap sample of 'data' is drawn, 'model_class'
    is fitted to it and the return levels are evaluated at the default
    return periods of get_return_levels. Returns (lb, ub): the alpha / 2 and
    1 - alpha / 2 quantiles across samples, one value per return period."""

    ## one row of return levels per bootstrap replicate (with progress bar)
    return_levels_samples = np.stack(
        [
            get_return_levels(
                fit_model(draw_sample(data), model_class=model_class)
            )[0]
            for _ in tqdm.tqdm(range(n_samples))
        ],
        axis=0,
    )

    ## quantiles across replicates give the lower / upper bound
    lower_q = alpha / 2
    upper_q = 1 - alpha / 2
    lb, ub = np.quantile(return_levels_samples, q=[lower_q, upper_q], axis=0)

    return lb, ub

## Load data

In [None]:
# Load from CSV and use the "time" column as index
csv_path = "../data/train/A_tas_lat.41_lon.272_spatial.2_reduce.max.csv"
frame = pd.read_csv(csv_path).set_index("time")

# convert to xarray, keeping only the "tas" variable
data = xr.Dataset.from_dataframe(frame)["tas"]

# update time axis (string to cftime object):
# daily steps starting from the first time value in the file
first_time = data["time"].values[0]
n_times = len(data["time"])
updated_time = xr.cftime_range(start=first_time, periods=n_times, freq="1D")
data["time"] = updated_time

## Prep data (get blocks or peak over threshold)

In [None]:
## block maxima: largest value within each year of the time axis
yearly_groups = data.groupby("time.year")
data_blocked = yearly_groups.max()

## Empirical PDF (normalized histogram)

In [None]:
## histogram of the block maxima on fixed-width bins
bin_width = 1
bin_edges = np.arange(304, 326, bin_width)
counts = np.histogram(data_blocked, bins=bin_edges)[0]

## empirical PDF: divide by the histogram's total area so it integrates to one
total_area = (counts * bin_width).sum()
pdf_empirical = counts / total_area

## Fit model

In [None]:
model_class = scipy.stats.genextreme

## Fit to the full set of block maxima and evaluate the return levels
model = fit_model(data_blocked, model_class)
return_level_curve = get_return_levels(model)
Xr, tr = return_level_curve[0], return_level_curve[1]

## Bootstrap bounds on the return levels
N_BOOTSTRAP = 30
bootstrap_bounds = get_return_period_bnds(
    data_blocked, model_class=model_class, n_samples=N_BOOTSTRAP
)
Xr_lb, Xr_ub = bootstrap_bounds

## Empirical return period

In [None]:
## sort the block maxima in ascending order
sort_order = np.argsort(data_blocked.values)
data_sorted = data_blocked.isel(year=sort_order)

## rank of each sorted value (1 = smallest, n = largest)
n = len(data_sorted)
m = 1 + np.arange(n)

## empirical CDF from the ranks, and the return period it implies
cdf_empirical = m / (n + 1)
exceedance_prob = 1 - cdf_empirical
tr_empirical = 1 / exceedance_prob

## Plot result

In [None]:
fig, ax = plt.subplots(figsize=(4, 3))

## empirical PDF as filled steps
hist_style = dict(color="gray", fill=True, alpha=0.3)
ax.stairs(pdf_empirical, edges=bin_edges, **hist_style)

## fitted PDF evaluated on a fine grid
xvals = np.linspace(304, 326, 200)
pdf_model = model.pdf(xvals)
ax.plot(xvals, pdf_model, c="k")

## label and format axes
ax.set_xlabel(r"Annual max ($K$)")
ax.set_ylabel("Prob.")
ax.set_ylim([-0.01, None])
ax.set_yticks([0, 0.05, 0.1])

## mark the largest block maximum on the x-axis
largest_block_max = data_blocked.max()
ax.scatter(largest_block_max, 0, marker="x", c="r", s=50)

plt.show()

## Plot return level as a function of return period (number of annual blocks)

In [None]:
fig, ax = plt.subplots(figsize=(4, 3))

## modeled return levels, with the band between the bootstrap bounds
band_style = dict(color="k", alpha=0.1)
ax.plot(tr, Xr)
ax.fill_between(tr, Xr_ub, Xr_lb, **band_style)

## empirical return periods of the sorted block maxima
point_style = dict(c="k", s=1.5)
ax.scatter(tr_empirical, data_sorted, **point_style)

## label axes; return period is shown on a log scale
ax.set_xlabel("Return period (years)")
ax.set_ylabel(r"$T_{2m}$ ($K$)")
ax.set_xscale("log")

plt.show()