# Select reservoirs and study period
***

**Author:** Chus Casado Rodríguez<br>
**Date:** 06-09-2024<br>

**Introduction:**<br>
This notebook reads all the attributes and time series in the dataset and selects the reservoirs appropriate for testing the different reservoir routines. Several conditions need to be met for a reservoir to be selected:

1. It must contain observed time series of the variables `inflow`, `storage` and `outflow`.
2. The longest period without gaps in those three time series needs to be at least 8 years long.
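3. The bias between the observed inflow and outflow time series (ratio of mean outflow to mean inflow) needs to be between 0.7 and 1.3.
4. If a minimum degree of regulation is configured (`MIN_DOR`), the ratio between the storage capacity and the mean annual inflow volume must exceed it.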

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import pickle

from lisfloodreservoirs.utils import DatasetConfig
from lisfloodreservoirs import read_attributes, read_timeseries
from lisfloodreservoirs.utils.timeseries import define_period

## Configuration

In [2]:
# observed variables that every selected reservoir must include
variables = ['inflow', 'storage', 'outflow']

# dataset configuration read from the YAML file
cfg = DatasetConfig('config_dataset.yml')

# directory where the selection (reservoirs and periods) is saved
PATH_OUT = cfg.PATH_RESOPS.joinpath(cfg.VERSION, 'selection')
PATH_OUT.mkdir(parents=True, exist_ok=True)
print(f'Selected reservoirs and periods will be saved in:\n\t{PATH_OUT}\n')

Selected reservoirs and periods will be saved in:
	Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\v2.0\selection



## Data

### Attributes

In [3]:
# import all tables of attributes
attributes = read_attributes(cfg.PATH_ATTRS, reservoirs=None)
print(f'{attributes.shape[0]} reservoirs in the attribute tables')

# keep only reservoirs with all observed variables:
# a reservoir passes when the flag column of every variable (e.g. INFLOW) equals 1
mask = pd.concat([attributes[var.upper()] == 1 for var in variables], axis=1).all(axis=1)
# filter and sort in one chained expression instead of sorting a filtered frame in place
attributes = attributes.loc[mask].sort_index(axis=0)
print('{0} reservoirs include observed time series for all variables: {1}'.format(attributes.shape[0],
                                                                                ', '.join(variables)))

# NOTE: the filters on minimum catchment area (CATCH_SKM >= cfg.MIN_AREA) and minimum
# storage capacity (CAP_MCM >= cfg.MIN_VOL) are not repeated here because they were
# already done in notebook 1

528 reservoirs in the attribute tables
231 reservoirs include observed time series for all variables: inflow, storage, outflow


In [5]:
# count how often each column name occurs; a count above 1 reveals duplicated column names
attributes.columns.value_counts()

ReservoirRnormqMult    2
adjustNormalFlood      2
DAM_NAME               1
DIS_AVG_LS             1
DOR_PC                 1
                      ..
soildepth2             1
lambda1                1
soildepth3             1
lambda3                1
TIME_SERIES_START      1
Length: 150, dtype: int64

In [None]:
# preview the first rows only, instead of rendering the whole attribute table
attributes.head()

### Time series

In [4]:
# read the time series of the reservoirs kept in the attribute table
timeseries = read_timeseries(cfg.PATH_TS / 'csv',
                             attributes.index)
print(f'{len(timeseries)} reservoirs with timeseries\n')

# remove reservoirs with excessively low degree of regulation
if cfg.MIN_DOR is not None:
    # degree of regulation: storage capacity divided by the mean inflow accumulated over a year
    # (presumably CAP_MCM is in hm3 and inflow in m3/s -- TODO confirm units)
    dor = pd.Series(
        {
            grand_id: attributes.loc[grand_id, 'CAP_MCM'] * 1e6 / (ts.inflow.mean() * 365 * 24 * 3600)
            for grand_id, ts in timeseries.items()
        },
        name='DOR'
    )
    mask_dor = dor > cfg.MIN_DOR
    # `mask_dor` only covers reservoirs for which time series were read; align it with the
    # attribute table (missing reservoirs are dropped) so that the boolean selection cannot
    # fail with an unalignable-indexer error
    attributes = attributes.loc[mask_dor.reindex(attributes.index, fill_value=False)]
    timeseries = {grand_id: ts for grand_id, ts in timeseries.items() if mask_dor.loc[grand_id]}
    print('{0} reservoirs comply with the minimum degre of regulation: {1}'.format(attributes.shape[0],
                                                                                       cfg.MIN_DOR))

231 reservoirs with timeseries

217 reservoirs comply with the minimum degre of regulation: 0.08


## Selection

In [7]:
bias = {}     # outflow/inflow ratio of every reservoir whose study period is long enough
periods = {}  # study period of the selected reservoirs, keyed by the ID as a string
for grand_id, ts in tqdm(timeseries.items(), desc='select reservoirs', total=len(timeseries)):

    # select study period
    start, end = define_period(ts[variables])
    # pd.isna handles NaN, NaT and None alike, whereas np.isnan raises a TypeError
    # for inputs such as pd.Timestamp or None
    if pd.isna(start) or pd.isna(end):
        print(f'{grand_id:>4} discarded for lack of records')
        continue

    # discard reservoirs whose study period is shorter than the minimum number of years
    duration = (end - start) / np.timedelta64(1, 'D')
    if not duration >= cfg.MIN_YEARS * 365:
        print(f'{grand_id:>4} discarded for lack of records:\t{duration:.0f} days')
        continue

    # trim the time series to the study period (new name: the loop variable is not rebound)
    ts_period = ts.loc[start:end]

    # bias between inflow and outflow
    bias[grand_id] = ts_period.outflow.mean() / ts_period.inflow.mean()
    if not (1 - cfg.TOL_BIAS) <= bias[grand_id] <= (1 + cfg.TOL_BIAS):
        print(f'{grand_id:>4} discarded for excesive bias:\t{bias[grand_id]:.2f}')
        continue

    # save periods
    periods[str(grand_id)] = {
        'start_dates': [pd.Timestamp(start)],
        'end_dates': [pd.Timestamp(end)]
    }

print(f'\n{len(periods)} reservoirs selected')

select reservoirs:   0%|          | 0/217 [00:00<?, ?it/s]

 135 discarded for lack of records:	0 days
 138 discarded for lack of records:	1 days
 144 discarded for lack of records:	444 days
 148 discarded for lack of records:	602 days
 158 discarded for lack of records:	958 days
 169 discarded for lack of records:	1003 days
 173 discarded for lack of records:	206 days
 180 discarded for lack of records:	604 days
 185 discarded for excesive bias:	0.68
 190 discarded for lack of records:	302 days
 191 discarded for lack of records:	594 days
 193 discarded for lack of records:	503 days
 203 discarded for lack of records:	89 days
 210 discarded for excesive bias:	0.49
 214 discarded for lack of records:	594 days
 223 discarded for lack of records:	606 days
 299 discarded for lack of records
 320 discarded for lack of records:	1163 days
 338 discarded for lack of records:	512 days
 347 discarded for lack of records:	3 days
 374 discarded for lack of records:	1444 days
 382 discarded for lack of records:	354 days
 385 discarded for lack of records:	

In [14]:
# list the names of all the columns in the attribute table
attributes.columns.tolist()

['CAP',
 'CAP_GLWD',
 'GLOFAS_ID',
 'GLWD_ID',
 'LAT_LISFLOOD',
 'LON_LISFLOOD',
 'Qf',
 'Qmin',
 'Qn',
 'Qn_adj',
 'ReservoirRnormqMult',
 'Vf',
 'Vmin',
 'Vn',
 'Vn_adj',
 'adjustNormalFlood',
 'ADMIN_UNIT',
 'ALT_HGT_M',
 'ALT_LEN_M',
 'ALT_NAME',
 'ALT_RIVER',
 'ALT_YEAR',
 'AREA_SKM',
 'CAP_MCM',
 'CATCH_SKM',
 'COMMENTS',
 'COUNTRY',
 'DAM_HGT_M',
 'DAM_LEN_M',
 'DAM_NAME',
 'DEPTH_M',
 'DIS_AVG_LS',
 'DOR_PC',
 'ELEV_MASL',
 'LAKE_CTRL',
 'LAT',
 'LON',
 'MAIN_BASIN',
 'MAIN_ELEC',
 'MAIN_FCON',
 'MAIN_FISH',
 'MAIN_IRRI',
 'MAIN_NAVI',
 'MAIN_OTHR',
 'MAIN_RECR',
 'MAIN_SUPP',
 'MULTI_DAMS',
 'NEAR_CITY',
 'QUALITY',
 'RES_NAME',
 'RIVER',
 'SEC_ADMIN',
 'SINGLE_USE',
 'USE_ELEC',
 'USE_FCON',
 'USE_FISH',
 'USE_IRRI',
 'USE_LIVE',
 'USE_NAVI',
 'USE_OTHR',
 'USE_PCON',
 'USE_RECR',
 'USE_SUPP',
 'YEAR',
 'CAP_RESOPS',
 'ELEVATION',
 'ELEVATION_END',
 'ELEVATION_START',
 'EVAPORATION',
 'EVAPORATION_END',
 'EVAPORATION_START',
 'INCONSISTENCIES_NOTED',
 'INFLOW',
 'INFLOW_END',

In [16]:
# attribute columns kept in the table of points, mapped to their new names
# NOTE(review): the attribute table also contains LAT_LISFLOOD / LON_LISFLOOD;
# confirm that LAT / LON are the intended coordinates
rename_cols = dict([
    ('CATCH_SKM', 'area'),
    ('LAT', 'lat'),
    ('LON', 'lon'),
])

In [19]:
# table of the selected reservoirs: only the columns in `rename_cols`, under their new names
selected_ids = [int(ID) for ID in periods]  # keys of `periods` are strings
csv = (
    attributes
    .loc[selected_ids, list(rename_cols)]
    .rename(columns=rename_cols)
    .rename_axis(index='ID')
)

In [20]:
# preview the first rows of the table of selected reservoirs
csv.head()

Unnamed: 0_level_0,area,lat,lon
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
41,2599,48.732466,-121.067305
63,485,46.654792,-121.128322
131,1789,40.802949,-122.760687
132,19018,40.720459,-122.422031
133,514,40.599432,-122.540564


In [None]:
# NOTE(review): leftover exploratory expression; `P` looks like an incomplete attribute
# name of the configuration object -- TODO complete it or remove this cell
cfg.P

In [21]:
# export the table of selected reservoirs as CSV in the output directory
points_file = PATH_OUT / 'points_lfcoords.csv'
csv.to_csv(points_file)

### Export

In [17]:
# export list of selected reservoirs
with open(PATH_OUT / 'reservoirs.txt', 'w') as f:
    for grand_id in periods.keys():
        f.write(f'{grand_id}\n')

In [18]:
# export selected study period
with open(PATH_OUT / 'periods.pkl', 'wb') as f:
    pickle.dump(periods, f)