In [31]:
#---
# Modules
#---
import numpy as np
import xarray as xr
import pandas as pd

from data import data_loader
from data import gesla_preprocessing
from data import era5_preprocessing

from gesla import GeslaDataset


In [32]:
#---
# Load Predictors
#----

# Selection of the ERA5 data: years, region, season, preprocessing and variable
range_of_years, subregion = "1999-2008", "lon-0530_lat7040"
season, preprocess = "autumn", "preprocess1"
predictor = "sp"

# Load the daily mean data of the selected predictor
sp_dmean = data_loader.load_daymean_era5(
    predictor=predictor,
    preprocess=preprocess,
    season=season,
    subregion=subregion,
    range_of_years=range_of_years,
)

# Show the loaded data as cell output
sp_dmean

In [66]:
#---
# Load Predictand
#---

# Create GESLA Dataset
meta_file = "resources/gesla/GESLA3_ALL.csv"
data_path = "resources/gesla/GESLA3.0_ALL.zip"

g3 = GeslaDataset(meta_file=meta_file, data_path=data_path)

# Select Stations
filenames = [
    'hanko-han-fin-cmems',
    # 'vahemadal-vah-est-cmems', just nan
    'pori-por-fin-cmems',
]

ds = g3.files_to_xarray(filenames)

# Select a season.
# Fail loudly for seasons without a filter: otherwise `get_season` would be
# undefined (NameError) or, worse, silently reused from an earlier cell run.
if season == "autumn":
    get_season = gesla_preprocessing.is_autumn
elif season == "winter":
    get_season = gesla_preprocessing.is_winter
else:
    raise ValueError(
        f"Unsupported season {season!r}: expected 'autumn' or 'winter'."
    )

season_ds = ds.sel(date_time=get_season(ds['date_time.month']))

# Select only sea_level analysis data
df = gesla_preprocessing.get_analysis(season_ds)
df = df["sea_level"]

# Detrend data grouped by station
df_anom = gesla_preprocessing.detrend(df, level="station")

# Apply one hot encoding
# NOTE(review): presumably flags values above the 95th percentile per station
# as extreme -- confirm against gesla_preprocessing.apply_dummies.
df_isextreme = gesla_preprocessing.apply_dummies(df_anom, percentile=0.95, level="station")

# Convert to dataset
# nan values: no measurement at that timestamp for specific station
ds_extremes = df_isextreme.to_xarray()

ds_extremes

# # Select data of one station
# nstation = 0
# sea_level = ds_extremes.sel(station=nstation).values

In [89]:
#---
# Predictor and predictand values of overlapping time series
#
# GESLA data is hourly. It needs to be daily, like ERA5.
#---
predictor_time = pd.to_datetime(sp_dmean.time.values).date
predictand_time = pd.to_datetime(ds_extremes.date_time.values).date
sp = sp_dmean["sp"].values # Daily data; assumes time is the first axis -- TODO confirm
sl = ds_extremes.values # Hourly data, indexed below as (station, time)

# Choose the maximum per day, i.e. if one hour of a day indicates an
# extreme surge, the whole day is seen as an extreme surge.
# np.max propagates nan, so a day with a missing hour at a station
# stays nan for that station and is removed by the filter below.
n_stations = sl.shape[0]
sl_dmax = []
for date in predictor_time:
    time_idx = np.where(predictand_time == date)[0] # Hours of the predictand on this day
    if time_idx.size == 0:
        # No predictand timestamps on this day: np.max would raise on the
        # empty slice, so mark the day as missing for every station.
        slmax = np.full(n_stations, np.nan)
    else:
        slmax = np.max(sl[:, time_idx], axis=1)
    sl_dmax.append(slmax)

# Shape (n_days, n_stations); the reshape keeps it 2-D even without any days
sl_dmax = np.array(sl_dmax).reshape(len(predictor_time), n_stations)

#---
# Get only data without nan
#---
# Filter whole days with one boolean mask and apply it to predictand,
# predictor and time alike. Indexing with np.where(...) on the 2-D array
# would flatten the station axis and leave the three arrays misaligned.
is_valid_day = ~np.isnan(sl_dmax).any(axis=1)

sl_dmax = sl_dmax[is_valid_day]
predictor_time = predictor_time[is_valid_day]
sp = sp[is_valid_day]

In [75]:
# Short aliases: x = predictor values, y = daily predictand maxima, t = predictor dates
x, y, t = sp, sl_dmax, predictor_time