# Preprocess 1

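One-hot (binary) encoding of GESLA sea-level data based on percentiles: a value is labeled 1 if it lies at or above a chosen per-station percentile, otherwise 0.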

For documentation see:

https://www.notion.so/kai-bellinghausen/Preprocessing-GESLA-One-Hot-Encoding-097b987b7fce4febb12f80898ea61842



In [8]:
import numpy as np
import pandas as pd
from gesla import GeslaDataset

In [9]:
def is_winter(month):
    """
    Description:
        Flags the months that belong to the winter season, i.e. December,
        January and February.

    Parameters:
        month (xr.DataArray): Month number(s) of a timeseries

    Returns:
        Boolean mask that is True wherever the month equals 1, 2 or 12
    """
    january = month == 1
    february = month == 2
    december = month == 12

    # Element-wise OR keeps this usable for scalars and array-like inputs.
    return january | february | december

def is_autumn(month):
    """
    Description:
        Flags the months that belong to the autumn season, i.e. September,
        October and November.

    Parameters:
        month (xr.DataArray): Month number(s) of a timeseries

    Returns:
        Boolean mask that is True wherever the month equals 9, 10 or 11
    """
    september = month == 9
    october = month == 10
    november = month == 11

    # Element-wise OR keeps this usable for scalars and array-like inputs.
    return september | october | november

In [10]:
def get_analysis_df(ds):
    """
    Description:
        Keeps only the GESLA entries whose use_flag equals 1, converts the
        result to a dataframe and removes every row in which all columns
        are NaN.

    Parameters:
        ds (xr.Dataset): GESLA Dataset for several stations

    Returns:
        df (pd.DataFrame): Flag-filtered data without all-NaN rows
    """
    # Analysis flag: entries failing the condition are masked / dropped.
    flagged = ds.where(ds.use_flag == 1., drop=True)
    table = flagged.to_dataframe()

    # how="all": a row is removed only if every one of its columns is NaN.
    return table.dropna(how="all")

In [11]:
def detrend_df(df, level="station"):
    """
    Description:
        Removes the group mean from the data: values are grouped by the
        given index level and the mean of each group is subtracted from
        the members of that group. No trend is fitted, only the mean is
        removed.

    Parameters:
        df (pd.DataFrame): Dataframe with timeseries data
        level (str): Index level used for grouping (Default: "station")

    Returns:
        pd.DataFrame: Data with the per-group mean subtracted
    """
    group_means = df.groupby(level=level).mean()

    # The subtraction aligns the group means with df on the shared level.
    return df - group_means

In [12]:
def apply_dummies(df, percentile=0.95, level="station"):
    """
    Description:
        Labels each datapoint relative to the quantile of its group. Data
        is grouped by the given index level; a datapoint gets 1 if it is
        greater than or equal to its group's quantile and 0 if it is below.
        NaN values stay NaN.

    Parameters:
        df (pd.DataFrame): Dataframe with timeseries data
        percentile (float): Quantile used as threshold (Default: 0.95)
        level (str): Index level along which the quantile is computed (Default: "station")

    Returns:
        dummies (pd.DataFrame): DataFrame with labeled data (1 if data is in percentile, 0 if not.)
    """
    thresholds = df.groupby(level=level).quantile(percentile)
    excess = df - thresholds

    # Both conditions are False for NaN, so missing values are left untouched.
    dummies = excess.mask(excess >= 0, 1).mask(excess < 0, 0)

    return dummies

In [13]:
# Create GESLA Dataset
# meta_file: path of the GESLA metadata CSV.
# data_path: path of the zip archive (presumably holds the station records —
# confirm against the gesla package).
meta_file = "resources/gesla/GESLA3_ALL.csv"
data_path = "resources/gesla/GESLA3.0_ALL.zip"

# Loader object; used below to read the selected station files.
g3 = GeslaDataset(meta_file=meta_file, data_path=data_path)

In [14]:
# Select Stations
# Names of the GESLA record files to load.
filenames = [
    "durban-181a-zaf-uhslc",
    "dutch_harbor_ak-041b-usa-uhslc", 
    "duxbury-8446166-usa-noaa", 
]

# Load the selected records (presumably into an xarray object, going by the
# method name — confirm against the gesla package).
ds = g3.files_to_xarray(filenames)
ds  # last expression of the cell: displays the loaded data in the notebook

In [15]:
# Select a season
# Keeps only the timestamps whose month is September, October or November.
season_ds = ds.sel(date_time=is_autumn(ds['date_time.month']))

# Select only sea_level analysis data
# (entries with use_flag == 1, converted to a dataframe; then only the
# "sea_level" column is kept)
df = get_analysis_df(season_ds)
df = df["sea_level"]

# Detrend data
# NOTE: detrend_df only subtracts the mean of each station; no trend is fitted.
df_anom = detrend_df(df, level="station")

# Apply one hot encoding
# 1 where the anomaly is at or above the 0.95 quantile of its station, 0 below.
df_isextreme = apply_dummies(df_anom, percentile=0.95, level="station")

# Convert to xarray (a pandas Series converts to an xr.DataArray)
# nan values: no measurement at that timestamp
ds_extremes = df_isextreme.to_xarray()

ds_extremes  # last expression of the cell: displays the result in the notebook

In [152]:
# Select data of station
nstation = 2  # value of the "station" coordinate to select
# .values extracts the underlying array of labels (1 / 0 / nan) for that station.
sea_level = ds_extremes.sel(station=nstation).values
sea_level.shape  # last expression of the cell: shows the array shape

array([nan, nan, nan, ..., nan, nan, nan])