# Create dataset - time series
***

**Author:** Chus Casado Rodríguez<br>
**Date:** 06-09-2024<br>

**Introduction:**<br>
This code creates the time series for the reservoirs in ResOpsUS. The time series include records from ResOpsUS and simulations from GloFAS.

The result is a time series that combines the observed data from ResOpsUS with the simulation from GloFASv4 (when possible). For each reservoir, these time series are exported in both CSV and NetCDF format.

Records are cleaned to avoid errors:
    * Outliers in the **storage** time series are filtered by comparison with a moving median (window of 7 days). If the relative difference between a given storage value and the moving median exceeds a threshold, the value is removed. This procedure is encapsulated in the function `lisfloodreservoirs.utils.timeseries.clean_storage()`.
    * Outliers in the **inflow** time series are removed using two conditions: one based on the gradient, and the other using an estimated inflow based on the water balance. When both conditions are met, the value is removed. Since inflow time series cannot contain missing values when used in the reservoir simulation, a simple linear interpolation is used to fill in gaps of up to 7 days. This procedure is encapsulated in the function `lisfloodreservoirs.utils.timeseries.clean_inflow()`.

**To do:**<br>
* [ ] 8 reservoirs that should be in GloFAS don't have time series.
* [x] Plot time series
* [x] Make sure that there aren't negative values in the time series, nor zeros in storage.
* [x] Check the quality of the data by closing the mass balance when possible. <font color='steelblue'>I've used the mass balance to identify errors in the inflow time series (function `clean_inflow`).</font>.
* [x] Fill in the inflow time series with the mass balance, if possible. <font color='steelblue'>I've filled in gaps in the inflow time series with linear interpolation up to 7-day gaps (function `clean_inflow`).</font>.

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
import yaml
from pathlib import Path
from tqdm.auto import tqdm
from copy import deepcopy

from lisfloodreservoirs.utils import DatasetConfig
from lisfloodreservoirs import read_attributes
from lisfloodreservoirs.utils.plots import plot_resops, reservoir_analysis, compare_flows
from lisfloodreservoirs.utils.timeseries import clean_storage, clean_inflow

## Configuration

In [3]:
# load the dataset configuration from 'config_dataset.yml'; the rest of the
# notebook reads paths (PATH_TS, PATH_ATTRS, PATH_OBS_TS, PATH_SIM_TS), the
# study period (START, END) and the NORMALIZE flag from this object
cfg = DatasetConfig('config_dataset.yml')

# report the directory where the time series will be written
print(f'Time series will be saved in {cfg.PATH_TS}')

Time series will be saved in Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\v2.0\time_series


## Data

### Attributes


In [4]:
# read every attribute table into a single table indexed by reservoir ID
attributes = read_attributes(cfg.PATH_ATTRS)
# one row per reservoir, so the number of rows is the number of reservoirs
print(f'{len(attributes)} reservoirs in the attribute tables')

528 reservoirs in the attribute tables


### Time series
#### ResOpsUS

In [5]:
# folder where the line plots of the observed time series are saved
path_plots = cfg.PATH_TS / 'plots'
path_plots.mkdir(parents=True, exist_ok=True)

# read, clean and plot the observed time series of every reservoir
resops_ts = {}
for grand_id in tqdm(attributes.index, desc='Reading observed time series'): # ID refers to GRanD
    # load timeseries; a reservoir without file is skipped so that the records
    # of the previously processed reservoir are never stored under its ID
    file = cfg.PATH_OBS_TS / f'ResOpsUS_{grand_id}.csv'
    if not file.is_file():
        print(f"{file} doesn't exist")
        continue
    series = pd.read_csv(file, parse_dates=True, index_col='date')

    # trim to GloFAS long run period
    series = series.loc[cfg.START:cfg.END]
    # remove duplicated index
    series = series[~series.index.duplicated(keep='first')]
    # remove negative values
    series[series < 0] = np.nan

    # clean storage time series (outliers with respect to a 7-day moving median)
    series['storage'] = clean_storage(series['storage'], w=7, error_thr=0.1)

    # clean inflow time series; storage and outflow are only passed on when the
    # attribute table flags them as available (value 1) for this reservoir
    has_storage = attributes.loc[grand_id, 'STORAGE'] == 1
    has_outflow = attributes.loc[grand_id, 'OUTFLOW'] == 1
    # NOTE(review): the keyword `outlfow` looks like a typo of `outflow`; it is
    # kept unchanged because the signature of `clean_inflow` is not visible
    # here -- confirm against the library before renaming it
    series['inflow'] = clean_inflow(series['inflow'],
                                    storage=series['storage'] if has_storage else None,
                                    outlfow=series['outflow'] if has_outflow else None,
                                    grad_thr=1e4,
                                    balance_thr=5,
                                    int_method='linear')

    # save in dictionary
    resops_ts[grand_id] = series

    # plot observed time series
    plot_resops(series['storage'],
                series['elevation'],
                series['inflow'],
                series['outflow'],
                attributes.loc[grand_id, ['CAP_MCM', 'CAP_GLWD']].values,
                title=grand_id,
                save=path_plots / f'{grand_id:04}_lineplot.jpg'
               )

print(f'{len(resops_ts)} reservoirs in ResOpsUS time series')

Reading observed time series:   0%|          | 0/528 [00:00<?, ?it/s]

528 reservoirs in ResOpsUS time series


In [6]:
# convert to xarray.Dataset: one Dataset per reservoir, labelled with its
# GRanD ID, all of them stacked along the new dimension 'GRAND_ID'
xarray_list = [
    xr.Dataset.from_dataframe(timeseries).assign_coords(GRAND_ID=reservoir_id)
    for reservoir_id, timeseries in resops_ts.items()
]
obs = xr.concat(xarray_list, dim='GRAND_ID')

#### GloFAS

In [27]:
# import time series
glofas_ts = {}
# only reservoirs with a GloFAS ID can have a simulated time series
mask = ~attributes.GLOFAS_ID.isnull()
# `Series.items()` is used instead of `Series.iteritems()`, which was removed in pandas 2.0
for grand_id, glofas_id in tqdm(attributes[mask].GLOFAS_ID.items(), total=mask.sum(), desc='Reading simulated time series'):
    # files are named after the GloFAS ID, zero-padded to 3 digits
    file = cfg.PATH_SIM_TS / f'{glofas_id:03.0f}.csv'
    if not file.is_file():
        print(f"{file} doesn't exist")
        continue
    series = pd.read_csv(file, parse_dates=True, dayfirst=False, index_col='date')
    # shift the dates one day back
    # NOTE(review): presumably the simulation is stamped at the end of the time step -- confirm
    series.index -= timedelta(days=1)
    # scale storage by the reservoir capacity in the attribute CAP_GLWD
    # NOTE(review): presumably the simulated storage is a fraction of capacity -- confirm
    series['storage'] *= attributes.loc[grand_id, 'CAP_GLWD']
    # remove negative values
    series[series < 0] = np.nan
    glofas_ts[grand_id] = series

print(f'{len(glofas_ts)} reservoirs in GloFAS time series')

# convert to xarray.Dataset
new_dim = 'GRAND_ID'
xarray_list = []
for key, df in glofas_ts.items():
    ds = xr.Dataset.from_dataframe(df)
    ds = ds.assign_coords({new_dim: key})
    xarray_list.append(ds)
sim = xr.concat(xarray_list, dim=new_dim)

# rename variables in the simulated time series
sim = sim.rename_vars({var: f'{var}_glofas' for var in list(sim)})

Reading simulated time series:   0%|          | 0/119 [00:00<?, ?it/s]

Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\296.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\179.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\197.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\323.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\068.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\185.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\512.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\465.csv doesn't exist
111 reservoirs in GloFAS time series


## Prepare dataset

### Convert units

In [28]:
# add normalized copies (suffix '_norm') of the storage and flow variables,
# only when requested in the configuration
if cfg.NORMALIZE:

    # reservoir attributes used to normalize the dataset
    # (area_sm and catchment_sm are defined but not used within this cell)
    area_sm = xr.DataArray.from_series(attributes.AREA_SKM) * 1e6 # m2
    capacity_cm = xr.DataArray.from_series(attributes.CAP_MCM) * 1e6 # m3
    catchment_sm = xr.DataArray.from_series(attributes.CATCH_SKM) * 1e6 # m2

    # Observed timeseries
    # -------------------
    # iterate over a snapshot of the variable names, because new variables are
    # added to the dataset inside the loop
    for var in list(obs):
        # convert variables in hm3 to fraction of reservoir capacity [-]
        if var in ['storage', 'evaporation']:
            obs[f'{var}_norm'] = obs[var] * 1e6 / capacity_cm
        # convert variables in m3/s to fraction of reservoir capacity [-]
        elif var in ['inflow', 'outflow']:
            obs[f'{var}_norm'] = obs[var] * 24 * 3600 / capacity_cm

    # Simulated timeseries
    # -------------------
    # the snapshot also guarantees that a newly created '<var>_norm' variable,
    # whose name has the same prefix, is never normalized a second time
    for var in list(sim):
        # variable names carry a suffix (e.g. 'storage_glofas'); match on the prefix
        prefix = var.split('_')[0]
        # convert variables in hm3 to fraction of reservoir capacity [-]
        if prefix in ['storage']:
            sim[f'{var}_norm'] = sim[var] * 1e6 / capacity_cm
        # convert variables in m3/s to fraction of reservoir capacity [-]
        elif prefix in ['inflow', 'outflow']:
            sim[f'{var}_norm'] = sim[var] * 24 * 3600 / capacity_cm

### Export

In [29]:
# output folders, one per file format
path_csv = cfg.PATH_TS / 'csv'
path_csv.mkdir(parents=True, exist_ok=True)
path_nc = cfg.PATH_TS / 'netcdf'
path_nc.mkdir(parents=True, exist_ok=True)

# export one CSV and one NetCDF file per reservoir
for grand_id in tqdm(attributes.index, desc='Exporting time series'):

    # reservoirs without observed time series cannot be exported
    if grand_id not in obs.GRAND_ID.data:
        print(f'Reservoir {grand_id} has no observed time series')
        continue

    # concatenate time series: observations plus, when available, the simulation
    # (`drop_vars` replaces the deprecated `Dataset.drop`)
    ds = obs.sel(GRAND_ID=grand_id).drop_vars('GRAND_ID')
    if grand_id in sim.GRAND_ID.data:
        ds = xr.merge((ds, sim.sel(GRAND_ID=grand_id).drop_vars('GRAND_ID')))

    # empty (all-NaN) variables are not removed from the dataset

    # trim time series to the observed period
    start, end = attributes.loc[grand_id, ['TIME_SERIES_START', 'TIME_SERIES_END']]
    ds = ds.sel(date=slice(start, end))

    # create time series of temporal attributes; they are derived from the
    # dates actually present in the trimmed dataset, so their length always
    # matches the 'date' dimension
    dates = pd.DatetimeIndex(ds['date'].values)
    ds['year'] = xr.DataArray(dates.year.values, dims='date', name='year')
    ds['month'] = xr.DataArray(dates.month.values, dims='date', name='month')
    ds['weekofyear'] = xr.DataArray(dates.isocalendar().week.values, dims='date', name='weekofyear')
    ds['dayofyear'] = xr.DataArray(dates.dayofyear.values, dims='date', name='dayofyear')
    ds['dayofweek'] = xr.DataArray(dates.dayofweek.values, dims='date', name='dayofweek')

    # export CSV
    # ..........
    ds.to_pandas().to_csv(path_csv / f'{grand_id}.csv')

    # export NetCDF
    # .............
    ds.to_netcdf(path_nc / f'{grand_id}.nc')

Exporting time series:   0%|          | 0/528 [00:00<?, ?it/s]