# Select stations
***

__Author__: Chus Casado<br>
__Date__:   16-02-2024<br>

__Introduction__:<br>
This notebook selects reporting points based on the correlation between their reanalysis discharge time series. The selection is done catchment by catchment. From every pair of reporting points whose Spearman correlation coefficient is larger than a given value, either the upstream or the downstream point is kept, depending on the value of the attribute `upstream` in the configuration file.

As a result, the notebook generates a folder for each catchment with a series of plots (hydrograph with flood events, correlation matrix, maps of reporting points...), a CSV file with the original and selected number of reporting points and observed events, and a PARQUET file with the table of attributes of the selected reporting points.

In [1]:
import os
# set before geopandas is imported below (geopandas reads this variable at import time)
os.environ['USE_PYGEOS'] = '0'
# working directory of the notebook; restored after importing the project modules
path_root = os.getcwd()
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import yaml
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# The project modules are imported from '../py/' by temporarily changing the
# working directory. The `finally` clause guarantees that the notebook returns
# to its own directory even if one of the imports fails; otherwise a re-run of
# this cell would capture the wrong `path_root` and every relative path used
# later in the notebook would silently point somewhere else.
os.chdir('../py/')
try:
    from config import Config
    from compute import identify_events
    from selection import filter_correlation_matrix
    from plot.timeseries import plot_events_timeseries, exceedances_timeline
    from plot.maps import create_cmap, map_events, map_stations
    from plot.results import plot_correlation_matrix
finally:
    os.chdir(path_root)

## 1 Configuration

In [2]:
# folder that holds the YAML configuration files (relative to the notebook)
config_path = Path('../conf')

# configuration file that drives every step of this notebook
config_file = config_path / 'config_COMB_leadtime_ranges.yml'
config = Config.load_from_yaml(config_file)

### 1.1 Reporting points

In [3]:
# section of the configuration devoted to the reporting points
cfg_points = config.reporting_points

# area threshold (it is part of the names of the input/output files)
area_threshold = cfg_points['area']

# table of reporting points: directory and file
path_stations = cfg_points['output']
file_stations = path_stations / f'reporting_points_over_{area_threshold}km2.parquet'

# catchments to be analysed (None means "all", see the data section)
catchments = cfg_points['catchments']

# selection settings: correlation threshold and sorting order of the points
cfg_selection = cfg_points['selection']
rho = cfg_selection['rho']
ascending = cfg_selection['upstream']

# river network; NOTE: `rivers` is deliberately left undefined when no
# shapefile is configured, since the analysis loop tests for its existence
rivers_shp = cfg_points['rivers']
if rivers_shp is not None:
    rivers = gpd.read_file(rivers_shp)

# minimum performance (KGE) required from the reporting points
min_kge = cfg_points['KGE']

### 1.2 Discharge

In [5]:
# section of the configuration devoted to the discharge data
cfg_discharge = config.discharge

# local directory with the reanalysis discharge (one NetCDF file per point)
path_discharge = cfg_discharge['output']['reanalysis']

# limits of the study period
study_period = cfg_discharge['study_period']
start, end = study_period['start'], study_period['end']

# return period used as event threshold, and name of the column
# in which the number of observed events will be stored
rp = cfg_discharge['return_period']['threshold']
col_events = f'obs_events_{rp}'

## 2 Data
### 2.1 Reporting points

In [14]:
# read the table of fixed reporting points and store coordinates/area as integers
stations = pd.read_parquet(file_stations)
int_cols = ['X', 'Y', 'area']
stations[int_cols] = stations[int_cols].astype(int)

# discard the points whose KGE does not exceed the threshold; the mask is
# written as a negation so that points with a missing KGE are kept
if min_kge is not None:
    mask_kge = ~(stations.KGE <= min_kge)
    stations = stations[mask_kge]

# catchments to analyse: all those in the table, or the one(s) in the configuration
if catchments is None:
    catchments = stations.catchment.unique().tolist()
elif isinstance(catchments, str):
    catchments = [catchments]
stations = stations[stations.catchment.isin(catchments)]

n_points, n_catchments = stations.shape[0], len(catchments)
print(f'no. reporting points:\t\t{n_points}')
print(f'no. catchments:\t\t\t{n_catchments}')

no. reporting points:		1979
no. catchments:			327


## 3 Analysis: selection of reporting points

In [None]:
# Per-catchment workflow:
#   1. import the reanalysis discharge of the reporting points in the catchment
#   2. identify the events over the `rp` return level and plot them
#   3. filter the reporting points with `filter_correlation_matrix`
#   4. export a summary table, maps, an exceedance timeline and the table of
#      selected reporting points
for catchment in tqdm_notebook(catchments):

    print(f'\n{catchment.upper()}')
    print('-' * len(catchment))

    path_out = path_stations / f'{catchment}'
    path_out.mkdir(parents=True, exist_ok=True)
    # The table of selected points is the last product written for a catchment,
    # so its presence marks the catchment as finished. The name must match the
    # file exported at the end of the loop, otherwise nothing is ever skipped.
    file_out = path_out / f'reporting_points_selected_over_{area_threshold}km2.parquet'
    if file_out.exists():
        continue

    # EXTRACT/IMPORT DATA
    # ---------------------

    # extract stations in the catchment, sorted by 'pfafstetter' and, within
    # ties, by 'area'. A single multi-key sort is used because two consecutive
    # single-key sorts with the default (non-stable) algorithm lose the first key
    stns_all = stations.loc[stations.catchment == catchment].sort_values(['pfafstetter', 'area'],
                                                                         ascending=ascending)
    print(f'original no. reporting points:\t\t{stns_all.shape[0]}')

    # a configured catchment may have no reporting points left (e.g. after the
    # KGE filter); there is nothing to analyse in that case
    if stns_all.empty:
        print('no reporting points in the catchment: skipped')
        continue

    # extract rivers in the catchment, if the GeoDataFrame exists
    rivers_all = globals().get('rivers')
    if rivers_all is not None:
        rivers_ctch = rivers_all.loc[rivers_all.BASIN == catchment]
    else:
        rivers_ctch = None

    # import timeseries of reanalysis discharge; every file is closed as soon
    # as its data is in memory to avoid accumulating open file handles
    dis = {}
    for stn in stns_all.index:
        with xr.open_dataarray(path_discharge / f'{stn:04}.nc') as da:
            dis[stn] = da.to_pandas()
    dis = pd.DataFrame(dis)
    print('discharge timeseries:\t\t\t{0} timesteps\t{1} stations'.format(*dis.shape))

    # IDENTIFY EVENTS
    # ---------------

    events = identify_events(dis, stns_all[f'rl{rp}'])
    stns_all[col_events] = events.sum()
    n_events = stns_all[col_events].sum()
    print('no. stations with at least one event:\t{0}'.format((stns_all[col_events] > 0).sum()))
    print('total no. events:\t\t\t{0}'.format(n_events))

    if n_events > 0:

        # plot timeseries
        # ...............
        mask = stns_all[col_events] > 0 # select the stations with flood events
        for stn in stns_all.loc[mask].index:
            title = '{0} - {1} ({2}) - {3:.0f} km2 ({4:.0f})'.format(stn, *stns_all.loc[stn, ['river', 'subcatchment', 'area', 'pfafstetter']])
            plot_events_timeseries(dis[stn],
                                   events[stn],
                                   thresholds=stns_all.loc[stn, ['rl1.5', 'rl2', 'rl5', 'rl20']],
                                   title=title,
                                   save=path_out / f'{stn:04}_observed_events.jpg')

    # SELECT STATIONS
    # ---------------

    if stns_all.shape[0] > 1:
        # correlation matrix
        corr = dis.corr(method='spearman')
        # keep only upper diagonal
        corr = filter_correlation_matrix(corr,
                                         rho=None)
        # plot correlation matrix of all reporting points
        plot_correlation_matrix(corr,
                                rho,
                                save=path_out / 'correlation_matrix_all_points.jpg')

        # filter out highly correlated stations with fewer observed events
        # ................................................................

        # sort stations according to number of flood events. The sort is stable
        # so that stations with the same number of events (most of them have
        # none) keep the upstream/downstream order set by `ascending`
        stns_sel = stns_all.sort_values(col_events, ascending=True, kind='stable')
        # correlation matrix
        corr_sel = dis[stns_sel.index.to_list()].corr(method='spearman')
        # remove highly correlated stations
        corr_sel = filter_correlation_matrix(corr_sel,
                                             rho=rho)
        stns_sel = stns_sel.loc[corr_sel.index]
        # plot correlation matrix of selected reporting points
        if corr_sel.shape[0] > 1:
            plot_correlation_matrix(corr_sel,
                                    rho,
                                    save=path_out / 'correlation_matrix_selected_points.jpg')
        print('selected no. reporting points:\t\t{0}'.format(stns_sel.shape[0]))

        # organize all sets of stations in a single dictionary
        stns_sets = {'all': stns_all,
                     'selected': stns_sel}
    else:
        # a single reporting point cannot be correlated with any other, so it
        # is selected as is. Defining `stns_sel` here prevents the export below
        # from failing or from writing the selection of the previous catchment
        stns_sel = stns_all.copy()
        stns_sets = {'all': stns_all}

    # sort all subsets equally (in place, so that `stns_all` and `stns_sel` are sorted too)
    for stns in stns_sets.values():
        stns.sort_values(['pfafstetter', 'area'], ascending=ascending, inplace=True)

    # ANALYSE RESULTS
    # ---------------

    # table summarizing no. stations and events with every station set
    summary = pd.DataFrame(index=stns_sets.keys(),
                           columns=['no_stations', 'p_stations_event', 'no_events'])
    summary.index.name = 'set'
    for key, stns in stns_sets.items():
        summary.loc[key] = stns.shape[0], (stns[col_events] > 0).sum(), stns[col_events].sum()
    # convert the count of stations with events into a proportion
    summary.p_stations_event /= summary.no_stations
    summary.p_stations_event = summary.p_stations_event.astype(float)
    print()
    print(summary.round(3))
    # saved inside the folder of the catchment
    summary.to_csv(path_out / 'summary.csv', float_format='%.3f')

    # plot maps of observed events
    if n_events > 0:
        extent = [stns_all.lon.min() - 1,
                  stns_all.lon.max() + 1,
                  stns_all.lat.min() - 1,
                  stns_all.lat.max() + 1]
        cmap, norm = create_cmap('Oranges',
                                 np.arange(stns_all[col_events].max() + 2),
                                 'no. events',
                                 [0, (0.41176, 0.41176, 0.41176, 1)])
        for label, stns in stns_sets.items():
            map_events(stns.X,
                       stns.Y,
                       stns[col_events],
                       rivers=rivers_ctch,
                       s=20,
                       alpha=1,
                       cmap=cmap,
                       norm=norm,
                       title=f'{catchment} - {label}',
                       extent=extent,
                       save=path_out / f'map_observed_events_{label}_points.jpg')

    # plot timeline of threshold exceedances: one panel per set of stations,
    # with a height proportional to the number of stations in the set
    height_ratios = [stns.shape[0] for stns in stns_sets.values()]
    figsize = (12, max(2, np.sum(height_ratios) / 20))
    fig = plt.figure(figsize=figsize, constrained_layout=True)
    gs = fig.add_gridspec(nrows=len(height_ratios), height_ratios=height_ratios)
    for i, (key, stns) in enumerate(stns_sets.items()):
        ax = fig.add_subplot(gs[i])
        exceedances_timeline(dis,
                             stns,
                             thresholds=['rl2', 'rl5', 'rl20'],
                             grid=True,
                             title=f'{key} reporting points',
                             ax=ax)
        # only the bottom panel keeps the tick labels of the time axis
        if i != len(height_ratios) - 1:
            ax.set_xticklabels([])
    fig.savefig(path_out / 'exceedance_timeline.jpg', dpi=300, bbox_inches='tight')
    # the figure is already on disk; close it so that figures do not pile up
    # in memory over the loop
    plt.close(fig)

    # export selected points (last step: this file marks the catchment as done)
    stns_sel.to_parquet(file_out)

  0%|          | 0/327 [00:00<?, ?it/s]


DANUBE
------
original no. reporting points:		432
discharge timeseries:			3919 timesteps	432 stations
no. stations with at least one event:	203
total no. events:			300
selected no. reporting points:		147

         no_stations  p_stations_event no_events
set                                             
all              432             0.470       300
selected         147             0.537       131

ELBE
----
original no. reporting points:		65
discharge timeseries:			3919 timesteps	65 stations
no. stations with at least one event:	7
total no. events:			12
selected no. reporting points:		26

         no_stations  p_stations_event no_events
set                                             
all               65             0.108        12
selected          26             0.231        10

ODER
----
original no. reporting points:		85
discharge timeseries:			3919 timesteps	85 stations
no. stations with at least one event:	28
total no. events:			29
selected no. reporting points:		35

         