In [None]:
import scipy.stats
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import xarray as xr
import tqdm
import os
import pickle
import glob
from src.evt import *

## set plotting style (`sns.set` is only a legacy alias of `sns.set_theme`)
sns.set_theme()

## initialize random number generator with a fixed seed so that anything
## drawn from `rng` is reproducible after "Restart Kernel -> Run All"
## NOTE(review): helpers imported from src.evt may use their own random
## state; seeding `rng` here does not necessarily make them deterministic
SEED = 0
rng = np.random.default_rng(SEED)


def load_var_for_sim(varname, sim):
    """Read one variable of one simulation from CSV into an xarray Dataset.

    Parameters
    ----------
    varname : str
        Variable name used as the CSV filename prefix (either "tas" or "hus").
    sim : int
        Simulation index; selects the folder ``sim<two-digit index>`` under
        ``$DATA_FP/plasim``.

    Returns
    -------
    xarray.Dataset
        The CSV contents indexed by a "time" coordinate that has been replaced
        by a daily cftime range starting at the first time value in the file.
    """

    ## full path of the CSV file for this variable / simulation
    csv_path = os.path.join(
        os.environ["DATA_FP"],
        "plasim",
        f"sim{sim:02d}",
        f"{varname}_lat.45_lon.240_spatial.2_reduce.max.csv",
    )

    ## read the table; a "Day" column (if present) is renamed to "time"
    ## (renaming a column that does not exist is a no-op) and used as index
    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns={"Day": "time"}).set_index("time")

    ## convert to xarray
    ds = xr.Dataset.from_dataframe(frame)

    ## rebuild the time axis: same start and length, one step per day
    first_step = ds["time"].values[0]
    daily_axis = xr.cftime_range(
        start=first_step, periods=ds.sizes["time"], freq="1D"
    )
    ds["time"] = pd.Index(daily_axis, name="time")

    return ds


def load_vars_for_sim(sim):
    """Load the "tas" and "hus" variables of simulation `sim` and merge them
    into a single xarray Dataset."""

    per_variable = []
    for varname in ("tas", "hus"):
        per_variable.append(load_var_for_sim(varname, sim))

    return xr.merge(per_variable)


def get_valid_sim_indices():
    """Return the sorted integer indices of the available simulation folders.

    Scans ``$DATA_FP/plasim`` for directories named ``sim<digits>`` and parses
    everything after the ``sim`` prefix as the index, so indices with more than
    two digits (e.g. ``sim100``) are read correctly. Entries that are not
    directories, or whose suffix is not purely numeric (e.g. ``sim_backup``),
    are skipped instead of raising.

    Returns
    -------
    list of int
        Simulation indices in ascending order (empty if nothing matches).

    Raises
    ------
    KeyError
        If the ``DATA_FP`` environment variable is not set.
    """

    prefix = "sim"

    ## get pattern for simulation folders
    pattern = os.path.join(os.environ["DATA_FP"], "plasim", f"{prefix}*")

    index_list = []
    for path in glob.glob(pattern):

        ## simulations live in folders; ignore stray files matching the pattern
        if not os.path.isdir(path):
            continue

        ## index = whole numeric suffix after the prefix (not just last 2 chars)
        suffix = os.path.basename(path)[len(prefix):]
        if suffix.isascii() and suffix.isdigit():
            index_list.append(int(suffix))

    return sorted(index_list)


def get_annual_max_for_sim(sim):
    """Compute the per-year maximum of every variable of simulation `sim`.

    The "year" dimension of the result is renamed to "sample" and labelled
    with strings of the form ``<2-digit sim>_<4-digit year>``.
    """

    ## load the simulation and reduce each calendar year to its maximum
    yearly_max = load_vars_for_sim(sim).groupby("time.year").max()

    ## build one label per year before the dimension is renamed
    labels = [f"{sim:02d}_{year:04d}" for year in yearly_max["year"].values]

    ## swap the year dimension for the labelled sample dimension
    yearly_max = yearly_max.rename({"year": "sample"})
    yearly_max["sample"] = labels

    return yearly_max


def get_annual_maxes_from_csv():
    """Build the annual-max dataset for every available simulation from the
    CSV files, concatenated along the "sample" dimension."""

    ## process simulations in ascending index order, with a progress bar
    per_sim = []
    for idx in tqdm.tqdm(get_valid_sim_indices()):
        per_sim.append(get_annual_max_for_sim(idx))

    return xr.concat(per_sim, dim="sample")


def get_annual_maxes():
    """Return the annual maxes for all simulations, using an on-disk cache.

    If ``$DATA_FP/plasim/PNW_max.nc`` does not exist yet, the data are
    computed from the CSV files and written to that file first.
    """

    ## location of the cached NetCDF file
    cache_path = os.path.join(os.environ["DATA_FP"], "plasim", "PNW_max.nc")

    ## cache miss: compute from CSV, store, and hand back the fresh result
    if not os.path.isfile(cache_path):
        annual_maxes = get_annual_maxes_from_csv()
        annual_maxes.to_netcdf(cache_path)
        return annual_maxes

    ## cache hit: open the precomputed file
    return xr.open_dataset(cache_path)


def compute_mse():
    """Placeholder for computing MSE/Tw; not implemented yet.

    Takes no arguments and always returns None. The commented sketch below is
    the starting point left for the implementation.
    """

    ## TO-DO: compute MSE/Tw as well!
    ## Sketch (not executed): get pressure, then relative humidity via metpy
    #
    # p = np.exp(load_var_for_sim("pl",0)["pl"])
    #
    # import metpy.calc
    # rh = metpy.calc.relative_humidity_from_specific_humidity(
    #     pressure=p.values * metpy.units.units.Pa,
    #     temperature=d["tas"].values * metpy.units.units.kelvin,
    #     specific_humidity = d["hus"].values * metpy.units.units("kg/kg")
    # )

    return None

## Set parameters and load data

In [None]:
# PARAMS
model_class = scipy.stats.genextreme
n_train = 80  # number of points to train on
n_mc = 100  # number of Monte Carlo sims

## parameter bounds handed to the fitting helpers (temperatures in deg C)
bounds_gev = dict(c=[-5, 5], loc=[0, 100], scale=[1e-5, 10])

## the scale (standard deviation) of a normal distribution must be strictly
## positive, so use the same small positive lower bound as for the GEV
## instead of allowing negative values
bounds_gauss = dict(loc=[0, 100], scale=[1e-5, 10])

## LOAD DATA
data = get_annual_maxes()

## convert from Kelvin to celsius
data["tas"] = data["tas"] - 273.15
X_raw = data["tas"].values

## get version of X with values >cutoff (see PDF plots below)
cutoff = 26  # deg C
X = X_raw[X_raw > cutoff]

## Plot empirical PDF

In [None]:
## Two panels: empirical PDF of the raw sample (left) and of the sample
## trimmed at `cutoff` (right)
fig, axs = plt.subplots(1, 2, figsize=(8, 3))

for ax, X_ in zip(axs, [X_raw, X]):

    ## Compute empirical PDF of the sample shown in this panel
    pdf_empirical, bin_edges = get_empirical_pdf(X_)

    ## plot empirical pdf
    ax.stairs(pdf_empirical, edges=bin_edges, color="gray", fill=True, alpha=0.3)

    ## plot max value of this panel's own sample (previously always read
    ## from the trimmed array `X`, whichever panel was being drawn)
    ax.scatter(X_.max(), 0, marker="|", c="r", s=50)

    ## plot marker at cutoff
    ax.scatter(cutoff, 0, marker="|", c="r", s=50)

    ## label; identical axis limits so the two panels are directly comparable
    ax.set_xlabel(r"Annual max ($^{\circ}C$)")
    ax.set_ylim([-0.005, 0.18])
    ax.set_xlim([18, 43])

## add some labels
axs[0].set_title("Before trimming")
axs[1].set_title("After trimming")
axs[0].set_ylabel("Prob.")
axs[1].set_yticks([])

plt.show()

## First plot: return-period curve fitted with 80 years of data

In [None]:
## Empirical return periods of the trimmed annual maxima
tr_empirical, Xr_empirical = get_empirical_return_period(X)

## Throwaway fit on the first n_train samples: the model itself is not
## used, only the return-period axis `tr` it yields for plotting
model_temp = fit_model(X[:n_train], model_class, bounds_gev)
tr = get_return_levels(model_temp)[1]

## settings shared by both Monte Carlo runs
mc_settings = dict(n_train=n_train, n_mc=n_mc)

## Monte Carlo return-level curve (mean, median, lower/upper bound) with GEV
Xr_mean, Xr_median, Xr_lb, Xr_ub = compute_MC_return_period_bnds(
    X, model_class=model_class, bounds=bounds_gev, **mc_settings
)

## Same computation with a Gaussian fit
gauss_curves = compute_MC_return_period_bnds(
    X, model_class=scipy.stats.norm, bounds=bounds_gauss, **mc_settings
)
Xr_mean_gauss, Xr_median_gauss, Xr_lb_gauss, Xr_ub_gauss = gauss_curves

In [None]:
## Return-level plot: Monte Carlo GEV and Gaussian fits against the
## empirical return periods of the sample
fig, ax = plt.subplots(figsize=(10, 6))

## plot modeled return period (GEV): mean curve and lower/upper band
ax.plot(tr, Xr_mean, c="indianred", label=f"GEV fit ({n_train} years)")
ax.fill_between(tr, Xr_ub, Xr_lb, color="coral", alpha=0.1)

## plot modeled return period (Gaussian): mean curve and lower/upper band
ax.plot(tr, Xr_mean_gauss, c="deepskyblue", label=f"Gaussian fit ({n_train} years)")
ax.fill_between(tr, Xr_ub_gauss, Xr_lb_gauss, alpha=0.1, color="deepskyblue")

## label axes (temperatures are in deg C; same degree notation as the
## empirical-PDF figure)
ax.set_xlabel("Return period (years)")
ax.set_ylabel(r"$T_{2m}$-return value ($^{\circ}C$)")
ax.set_xscale("log")

## add the ground truth: empirical return periods of the sample
t_test_r_empirical, X_test_r_empirical = get_empirical_return_period(X)
ax.scatter(
    t_test_r_empirical,
    X_test_r_empirical,
    c="black",
    s=1.5,
    label="55K-years Ground truth",
)

ax.set_title(
    f"Return-levels plot \n Monte Carlo estimation with M={n_mc}", y=1.05, fontsize=15
)
ax.legend()
plt.show()

## Second plot: boxplots of bias vs training data size

In [None]:
## Monte Carlo estimates of the target return level for increasing
## training-set sizes, for both the GEV and the Gaussian model
n_train_list = [40, 80, 160, 320, 640, 1280, 5000, 10000]
n_mc = 100

target_return_period = 1000  # We should probably stick to 1000 years
target_return_value = get_target_return_value(X, target_return_period)[0]

Xr_samples_list = []
Xr_samples_list_gauss = []

## NOTE: the loop variable is deliberately NOT called `n_train`, so the
## global `n_train` set in the parameter cell (used in the labels of the
## return-level plot) is not overwritten by this cell
for n_train_i in n_train_list:
    print(f"Training on {n_train_i} samples...")

    ## GEV fit
    Xr_samples = compute_MC_return_value(
        X,
        model_class,
        bounds_gev,
        n_train=n_train_i,
        n_mc=n_mc,
        return_periods=target_return_period,
    )
    Xr_samples_list.append(Xr_samples)

    ## Gaussian fit
    ## NOTE(review): bounds=None here, whereas the return-level curves pass
    ## `bounds_gauss` for the Gaussian model -- confirm this is intentional
    Xr_samples_gauss = compute_MC_return_value(
        X,
        scipy.stats.norm,
        bounds=None,
        n_train=n_train_i,
        n_mc=n_mc,
        return_periods=target_return_period,
    )
    Xr_samples_list_gauss.append(Xr_samples_gauss)
    print("\n")

## one row per training size
Xr_samples_list = np.stack(Xr_samples_list, axis=0)
Xr_samples_list_gauss = np.stack(Xr_samples_list_gauss, axis=0)

In [None]:
## Box plots of the bias (MC estimate minus ground-truth return level) for
## each training-set size; GEV and Gaussian boxes are overlaid
fig, ax = plt.subplots(figsize=(10, 8))

## GEV model: one box per training size, with a dark median line
bias = (Xr_samples_list - target_return_value).T
ax.boxplot(
    bias,
    vert=True,
    patch_artist=True,
    tick_labels=n_train_list,
    boxprops=dict(facecolor="lightcoral"),
    medianprops=dict(color="maroon"),
)

## Gaussian model: drawn on top in a different, slightly transparent color
bias_gauss = (Xr_samples_list_gauss - target_return_value).T
ax.boxplot(
    bias_gauss,
    vert=True,
    patch_artist=True,
    tick_labels=n_train_list,
    boxprops=dict(facecolor="lightblue", alpha=.8),
)

## custom legend handles (the box plots themselves carry no labels)
handles = [
    plt.Line2D([0], [0], color="lightcoral", lw=4, label="GEV fit"),
    plt.Line2D([0], [0], color="lightblue", lw=4, label="Gaussian fit"),
]
ax.legend(handles=handles, loc="upper right")

## labels and title (the earlier placeholder title was dropped: it was
## overwritten by this one before the figure was shown)
ax.set_xlabel("Nb of training data years")
ax.set_ylabel("Bias (K)")
ax.set_title(
    f"Difference between the MC estimation of {target_return_period}-year return level \n and the ground truth {target_return_period}-year return level",
    y=1.05,
    fontsize=15,
)

## zero-bias reference line
ax.axhline(y=0, color="black", linestyle="--")
ax.set_ylim(-5, 5)

plt.show()

## Look at PDF and fit (full data)

In [None]:
## Normalized histogram (empirical PDF) of the sample
pdf_empirical, bin_edges = get_empirical_pdf(X)

## Empirical return periods of the sample
tr_empirical, Xr_empirical = get_empirical_return_period(X)

## NOTE(review): `bounds` is defined here but not passed to the fit below,
## which uses `bounds_gev` -- confirm whether it can be dropped
bounds = dict(c=[-1, 1], loc=[200, 400], scale=[-1e5, 1e5])

## Fit the model on the full sample and evaluate its return levels on a
## log-spaced grid of return periods
model = fit_model(X, model_class, bounds=bounds_gev)
return_period_grid = np.logspace(0.01, 5, 100)
Xr, tr = get_return_levels(model, return_periods=return_period_grid)

## estimated return time for the max event: the grid point whose fitted
## level is closest to the last empirical return level
## (presumably the largest one -- confirm the ordering of `Xr_empirical`)
largest_empirical = Xr_empirical[-1]
closest = np.argmin(np.abs(largest_empirical - Xr))
tr_max = tr[closest]

In [None]:
## test points to plot curve (extends one unit past the last bin edge)
X_test = np.linspace(bin_edges.min(), bin_edges.max() + 1, 100)

## setup plot
fig, ax = plt.subplots(figsize=(4, 3))

## plot empirical pdf
ax.stairs(pdf_empirical, edges=bin_edges, color="gray", fill=True, alpha=0.3)

## plot distribution fit
ax.plot(X_test, model.pdf(X_test), c="k")

## plot max value
ax.scatter(X.max(), 0, marker="x", c="r", s=50)

## label: `X` was converted from Kelvin to deg C when the data were loaded,
## so the axis unit is deg C (it was previously labelled as K)
ax.set_xlabel(r"Annual max ($^{\circ}C$)")
ax.set_ylabel("Prob.")

plt.show()