# Select stations
***

__Author__: Chus Casado<br>
__Date__:   14-06-2023<br>

__Introduction__:<br>
This notebook selects reporting points based on the correlation between their reanalysis discharge time series. The selection is done catchment by catchment. From every pair of reporting points whose Spearman correlation coefficient is larger than a given value, only one is kept: either the upstream or the downstream one, depending on the value of the attribute `upstream` in the configuration file.

As a result, the notebook generates a folder for each catchment with a series of plots (hydrograph with flood events, correlation matrix, maps of reporting points...), a CSV file with the original and selected number of reporting points and observed events, and a PARQUET file with the table of attributes of the selected reporting points.

In [5]:
import os
path_root = os.getcwd()
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import yaml

os.chdir('../py/')
from compute import identify_events
from extract import filter_correlation_matrix
from plot.timeseries import plot_events_timeseries, exceedances_timeline
from plot.maps import create_cmap, map_events
from plot.results import plot_correlation_matrix
os.chdir(path_root)

## 1 Configuration

In [6]:
# read the configuration file into `cfg`
# NOTE(review): the full PyYAML loader is used; `yaml.safe_load` would be the
# safer choice if this file could ever come from an untrusted source
with open("../conf/config.yml", mode="r", encoding='utf8') as stream:
    cfg = yaml.full_load(stream)

### 1.1 Reporting points

In [7]:
# sections of the configuration file that control the reporting points
cfg_points = cfg.get('reporting_points', {})
cfg_selection = cfg_points.get('selection', {})

# area threshold; it is part of the name of the input and output files
area_threshold = cfg_points.get('area', 500)

# catchments to be analysed
# `None` means that no catchment was configured; the default must not be an
# empty collection, because the check `catchments is not None` done when
# loading the reporting points would then filter out every station
catchments = cfg_points.get('catchments', None)
if isinstance(catchments, str):
    catchments = [catchments]
elif catchments is not None and len(catchments) == 0:
    catchments = None

# correlation threshold
rho = cfg_selection.get('rho', None)
# value of the attribute `upstream`; it is used as the `ascending` argument
# when the reporting points are sorted
ascending = cfg_selection.get('upstream', True)

# reporting points
path_stations = cfg_points.get('output', '../results/reporting_points/')
file_stations = f'{path_stations}reporting_points_over_{area_threshold}km2.parquet'

# rivers (optional, `None` if not configured)
file_rivers = cfg_points.get('input', {}).get('rivers', None)

### 1.2 Discharge

In [None]:
# section of the configuration file that controls the discharge data
cfg_dis = cfg.get('discharge', {})

# local directory with the raw reanalysis discharge data
path_discharge = cfg_dis.get('output', {}).get('reanalysis', '../data/discharge/reanalysis/')

# limits of the study period; values given as text are converted into `datetime`,
# anything else (including a missing value, i.e. `None`) is kept as it is
study_period = cfg_dis.get('study_period', {})
start, end = [
    datetime.strptime(limit, '%Y-%m-%d %H:%M') if isinstance(limit, str) else limit
    for limit in (study_period.get('start', None), study_period.get('end', None))
]

# return period; the column `rl{rp}` of the reporting points is the threshold
# passed to `identify_events`
rp = cfg_dis.get('return_period', {}).get('threshold', 5)

## 2 Data
### 2.1 Reporting points

In [None]:
# load table of fixed reporting points
stations = pd.read_parquet(file_stations)

# extract reporting points of the catchments of interest
# when no catchment was configured (`None` or an empty collection), all the
# catchments in the table are analysed; an empty collection must not be passed
# to `isin`, as it would remove every reporting point
if catchments is not None and len(catchments) > 0:
    stations = stations.loc[stations.catchment.isin(catchments)]
else:
    catchments = np.sort(stations.catchment.unique())

print(f'no. stations:\t{stations.shape[0]}')

# shapefile of rivers of Europe (optional, `None` if no file was configured)
rivers_shp = gpd.read_file(file_rivers) if file_rivers is not None else None

## 3 Analysis: selection of reporting points

In [None]:
def _sort_network(stns, ascending):
    """Return the reporting points sorted by the columns `pfafstetter` and `area`.

    Both keys are sorted in a single call, so `area` reliably breaks the ties
    in `pfafstetter`. Two consecutive single-key sorts do not guarantee that,
    because the default sorting algorithm is not stable.

    Parameters
    ----------
    stns : pandas.DataFrame
        Table of reporting points with the columns `pfafstetter` and `area`.
    ascending : bool
        Sorting direction applied to both columns.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of `stns`.
    """
    return stns.sort_values(['pfafstetter', 'area'], ascending=ascending)


def _read_discharge(path, station_ids):
    """Read the discharge time series of several reporting points.

    Parameters
    ----------
    path : str
        Prefix of the NetCDF files; the file of a reporting point is
        `{path}{ID:04}.nc`.
    station_ids : iterable
        IDs of the reporting points to be read.

    Returns
    -------
    pandas.DataFrame
        One column per reporting point.
    """
    series = {}
    for stn in station_ids:
        # the context manager closes the NetCDF file once the data is in memory
        with xr.open_dataarray(f'{path}{stn:04}.nc') as da:
            series[stn] = da.to_pandas()
    return pd.DataFrame(series)


# For every catchment: identify the events in the discharge series, select the
# reporting points according to their correlation, and save plots, a summary
# table (CSV) and the table of selected reporting points (PARQUET).
for catchment in catchments:

    print(f'\n{catchment.upper()}')
    print('-' * len(catchment))

    path_out = f'{path_stations}{catchment}/'
    os.makedirs(path_out, exist_ok=True)

    # skip the catchment if the file below already exists
    # NOTE(review): this name differs from the file exported at the end of the
    # loop (`reporting_points_selected_over_...`), so the outputs of this loop
    # never trigger the skip -- confirm which name is intended
    file_out = f'{path_out}reporting_points_over_{area_threshold}km2.parquet'
    if os.path.exists(file_out):
        continue

    # EXTRACT/IMPORT DATA
    # ---------------------

    # extract stations in the catchment (`_sort_network` returns a copy)
    stns_all = _sort_network(stations.loc[stations.catchment == catchment], ascending)
    print(f'original no. reporting points:\t\t{stns_all.shape[0]}')
    if stns_all.empty:
        # nothing to analyse; the plots below cannot be created without stations
        print('no reporting points in the catchment, skipped')
        continue

    # extract rivers in the catchment
    if file_rivers is not None:
        rivers = rivers_shp.loc[rivers_shp.BASIN == catchment]
    else:
        rivers = None

    # import timeseries of reanalysis discharge
    dis = _read_discharge(path_discharge, stns_all.index)
    print('discharge timeseries:\t\t\t{0} timesteps\t{1} stations'.format(*dis.shape))

    # IDENTIFY EVENTS
    # ---------------

    events = identify_events(dis, stns_all[f'rl{rp}'])
    stns_all['n_events_obs'] = events.sum()
    print(f'no. stations with at least one event:\t{(stns_all.n_events_obs > 0).sum()}')
    print(f'total no. events:\t\t\t{stns_all.n_events_obs.sum()}')

    # plot timeseries of the stations with flood events
    # ..................................................
    mask = stns_all.n_events_obs > 0
    for stn in stns_all.loc[mask].index:
        title = '{0} - {1} ({2}) - {3:.0f} km2 ({4:.0f})'.format(
            stn, *stns_all.loc[stn, ['river', 'subcatchment', 'area', 'pfafstetter']])
        plot_events_timeseries(dis[stn], events[stn],
                               thresholds=stns_all.loc[stn, ['rl1.5', 'rl2', 'rl5', 'rl20']],
                               title=title, save=f'{path_out}{stn:04}_observed_events.png')

    # SELECT STATIONS
    # ---------------

    # with a single reporting point there is nothing to compare: it is selected
    # as it is, so `stns_sel` is always defined and never carries over the
    # selection of the previous catchment
    stns_sel = stns_all
    several_points = stns_all.shape[0] > 1

    if several_points:
        # correlation matrix; keep only upper diagonal
        corr = filter_correlation_matrix(dis.corr(method='spearman'), rho=None)
        # plot correlation matrix of all reporting points
        plot_correlation_matrix(corr, rho, save=f'{path_out}correlation_matrix_all_points.jpg')

        # filter out highly correlated stations with fewer observed events
        # ................................................................

        # sort stations according to number of flood events; the stable sort
        # keeps the order set by `_sort_network` among stations with the same
        # number of events (assumes the filter below depends on the order of
        # the matrix -- TODO confirm)
        stns_sel = stns_all.sort_values('n_events_obs', ascending=True, kind='mergesort')
        # correlation matrix
        corr_sel = dis[stns_sel.index.to_list()].corr(method='spearman')
        # remove highly correlated stations
        corr_sel = filter_correlation_matrix(corr_sel, rho=rho)
        stns_sel = stns_sel.loc[corr_sel.index]
        # plot correlation matrix of selected reporting points
        if corr_sel.shape[0] > 1:
            plot_correlation_matrix(corr_sel, rho, save=f'{path_out}correlation_matrix_selected_points.jpg')
        print(f'selected no. reporting points:\t\t{stns_sel.shape[0]}')

    # organize all sets of stations in a single dictionary, all sorted equally
    stns_sets = {'all': stns_all}
    if several_points:
        stns_sets['selected'] = stns_sel
    stns_sets = {key: _sort_network(stns, ascending) for key, stns in stns_sets.items()}
    stns_all = stns_sets['all']
    stns_sel = stns_sets.get('selected', stns_all)

    # ANALYSE RESULTS
    # ---------------

    # table summarizing no. stations and events with every station set
    summary = pd.DataFrame(index=stns_sets.keys(), columns=['no_stations', 'p_stations_event', 'no_events'])
    summary.index.name = 'set'
    for key, stns in stns_sets.items():
        # the second column holds the count of stations with events for now
        summary.loc[key] = stns.shape[0], (stns.n_events_obs > 0).sum(), stns.n_events_obs.sum()
    # convert the count of stations with events into a proportion
    summary['p_stations_event'] = (summary.p_stations_event / summary.no_stations).astype(float)
    print()
    print(summary.round(3))
    summary.to_csv(f'{path_out}summary.csv', float_format='%.3f')

    # plot maps of observed events
    if stns_all.n_events_obs.sum() > 0:
        cmap, norm = create_cmap('Oranges', np.arange(stns_all.n_events_obs.max() + 2), 'no. events',
                                 [0, (0.41176, 0.41176, 0.41176, 1)])
        for label, stns in stns_sets.items():
            map_events(stns, 'n_events_obs', rivers=rivers, s=10, alpha=1, cmap=cmap, norm=norm, title=catchment,
                       save=f'{path_out}map_observed_events_{label}_points.jpg')

    # plot timeline of threshold exceedances: one panel per set of stations,
    # with a height proportional to the number of stations in the set
    height_ratios = [stns.shape[0] for stns in stns_sets.values()]
    figsize = (12, max(2, np.sum(height_ratios) / 20))
    fig = plt.figure(figsize=figsize, constrained_layout=True)
    gs = fig.add_gridspec(nrows=len(height_ratios), height_ratios=height_ratios)
    for i, (key, stns) in enumerate(stns_sets.items()):
        ax = fig.add_subplot(gs[i])
        exceedances_timeline(dis, stns, thresholds=['rl2', 'rl5', 'rl20'], grid=True,
                             title=f'{key} reporting points', ax=ax)
        # only the bottom panel shows the labels of the X axis
        if i != len(height_ratios) - 1:
            ax.set_xticklabels([])
    plt.savefig(f'{path_out}exceedance_timeline.jpg', dpi=300, bbox_inches='tight')

    # export selected points
    stns_sel.to_parquet(f'{path_out}reporting_points_selected_over_{area_threshold}km2.parquet')