# Create dataset - time series
***

**Author:** Chus Casado<br>
**Date:** 21-06-2024<br>

**Introduction:**<br>
This code creates the time series for the reservoirs in ResOpsUS. The time series include records from ResOpsUS and simulations from GloFAS.

The result is a time series that combines the observed data from ResOpsUS with the simulation from GloFASv4 (when available). For each reservoir, these time series are exported in both CSV and NetCDF formats.

**To do:**<br>
* [ ] 8 reservoirs that should be in GloFAS don't have time series.
* [ ] Plot time series?

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
from shapely import Point
import seaborn as sns
from datetime import datetime, timedelta
import spotpy
# from spotpy.objectivefunctions import kge
import yaml
from pathlib import Path
from tqdm.auto import tqdm
from copy import deepcopy

## Configuration

In [2]:
# read the configuration file of the dataset
with open('config_dataset.yml', 'r', encoding='utf8') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

# paths: the observed and simulated time series are stored relative to the ResOpsUS root
cfg_paths = cfg['paths']
cfg_resops = cfg_paths['ResOpsUS']
PATH_GLOFAS = Path(cfg_paths['GloFAS']['root'])
PATH_RESOPS = Path(cfg_resops['root'])
PATH_OBS_TS = PATH_RESOPS / cfg_resops['obs_timeseries']
PATH_SIM_TS = PATH_RESOPS / cfg_resops['sim_timeseries']
PATH_GRAND = Path(cfg_paths['GRanD'])

# study period: first and last date kept in the time series
START, END = (cfg['period'][limit] for limit in ('start', 'end'))

# selection conditions (currently not used in this notebook)
# MIN_AREA = cfg['conditions']['min_area'] # km2
# MIN_VOL = cfg['conditions']['min_volume'] # hm3
# MIN_TS = cfg['conditions']['min_length'] * 365 # days

# the outputs are saved in a subfolder named after the dataset version
VERSION = cfg['version']
PATH_OUT = PATH_RESOPS / VERSION / 'time_series'
print(f'Time series will be saved in {PATH_OUT}')

Time series will be saved in Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\v1.0\time_series


## Data

### ResOpsUS

#### Attributes

In [3]:
# table of ResOpsUS reservoir attributes, indexed by the GRanD identifier
resops_attrs_file = PATH_RESOPS / VERSION / 'attributes' / 'resops_attributes.csv'
resops = pd.read_csv(resops_attrs_file, index_col='GRAND_ID')
print(f'{resops.shape[0]} reservoirs in ResOpsUS attributes')

526 reservoirs in ResOpsUS attributes


#### Time series

In [4]:
# Read the observed time series of every reservoir in the ResOpsUS attributes.
# The result is a dictionary {GRAND_ID: DataFrame indexed by date}.
resops_ts = {}
for ID in tqdm(resops.index, desc='Reading observed time series'): # ID refers to GRanD
    # load timeseries
    file = PATH_OBS_TS / f'ResOpsUS_{ID}.csv'
    if not file.is_file():
        # skip the reservoir: carrying on would silently reuse the `series` of the
        # previous iteration (or raise a NameError if this is the first one)
        print(f"{file} doesn't exist")
        continue
    series = pd.read_csv(file, parse_dates=True, index_col='date')
    # trim to GloFAS long run period
    series = series.loc[START:END]
    # remove duplicated index, keeping the first occurrence of each date
    series = series[~series.index.duplicated(keep='first')]
    # convert storage from hm3 to m3
    # NOTE(review): later cells use lowercase variable names ('storage'), so this
    # branch looks like it is never taken -- confirm before relying on the units
    if 'STORAGE' in series.columns:
        series['STORAGE'] *= 1e6
    # save in dictionary
    resops_ts[ID] = series

print(f'{len(resops_ts)} reservoirs in ResOpsUS time series')

# convert to xarray.Dataset: one Dataset per reservoir, labelled with its
# GRAND_ID and concatenated along that new dimension
xarray_list = [
    xr.Dataset.from_dataframe(df).assign_coords(GRAND_ID=key)
    for key, df in resops_ts.items()
]
obs = xr.concat(xarray_list, dim='GRAND_ID')

Reading observed time series:   0%|          | 0/526 [00:00<?, ?it/s]

526 reservoirs in ResOpsUS time series


### GloFAS

#### Attributes

In [5]:
# table of GloFAS reservoir attributes, indexed by the GRanD identifier
glofas_attrs_file = PATH_RESOPS / VERSION / 'attributes' / 'glofas_attributes.csv'
glofas = pd.read_csv(glofas_attrs_file, index_col='GRAND_ID')
print(f'{glofas.shape[0]} reservoirs in GloFAS attributes')

118 reservoirs in GloFAS attributes


#### Time series 

In [14]:
# import time series
# The result is a dictionary {GRAND_ID: DataFrame indexed by date} with the
# GloFAS simulation of every reservoir whose CSV file exists.
glofas_ts = {}
# `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0
for grand_id, glofas_id in tqdm(glofas.GLOFAS_ID.items(), total=glofas.shape[0], desc='Reading simulated time series'):
    # the files are named after the GloFAS ID, zero-padded to 3 digits
    file = PATH_SIM_TS / f'{glofas_id:03}.csv'
    if not file.is_file():
        print(f"{file} doesn't exist")
        continue
    series = pd.read_csv(file, parse_dates=True, dayfirst=False, index_col='date')
    # shift the dates one day back
    # NOTE(review): presumably GloFAS timestamps mark the end of the time step -- confirm
    series.index -= timedelta(days=1)
    # scale the simulated storage by the GloFAS reservoir capacity
    # NOTE(review): assumes the simulated storage is a fraction filled -- confirm
    series['storage'] *= glofas.loc[grand_id, 'CAP_GLWD']
    glofas_ts[grand_id] = series

print(f'{len(glofas_ts)} reservoirs in GloFAS time series')

# convert to xarray.Dataset: one Dataset per reservoir, labelled with its
# GRAND_ID and concatenated along that new dimension
new_dim = 'GRAND_ID'
xarray_list = [
    xr.Dataset.from_dataframe(df).assign_coords({new_dim: key})
    for key, df in glofas_ts.items()
]
sim = xr.concat(xarray_list, dim=new_dim)

Reading simulated time series:   0%|          | 0/118 [00:00<?, ?it/s]

Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\296.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\179.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\197.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\323.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\068.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\185.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\512.csv doesn't exist
Z:\nahaUsers\casadje\datasets\reservoirs\ResOpsUS\ancillary\LISFLOOD\465.csv doesn't exist
110 reservoirs in GloFAS time series


### GRanD
#### Attributes

In [7]:
# table of GRanD reservoir attributes, indexed by the GRanD identifier
grand_attrs_file = PATH_RESOPS / VERSION / 'attributes' / 'grand_attributes.csv'
grand = pd.read_csv(grand_attrs_file, index_col='GRAND_ID')
print(f'{grand.shape[0]} reservoirs in GRanD attributes')

526 reservoirs in GRanD attributes


### Correct reservoir capacity

In [8]:
# try:
#     # import DataFrame with the fraction fill and the selected data source
#     ff = pd.read_excel('fraction_fill.xlsx', index_col='ResID')
# except:

#     # create DataFrame with the fraction fill according to each data source
#     ff = pd.DataFrame(columns=['GLOFAS', 'GRAND'], dtype=float)
#     ff.index.name = 'ResID'
#     for ID in glofas.index:
#         cap_glofas = glofas.loc[ID, ['ResID', 'CAP_GLWD']]
#         cap_resops = resops.loc[ID, 'CAP_RESOPS']
#         cap_grand = grand.loc[ID, 'CAP_RESOPS']
#         if np.isnan(cap_resops):
#             continue
#         ff.loc[ResID, :] = cap_resops / cap_glofas, cap_resops / cap_grand
#      # export
#     ff.to_excel('fraction_fill.xlsx', index=True)

# # define the capacity  ('CAP') as that of the most reliable source
# glofas['CAP'] = np.nan
# for ID in glofas.index:
#     ResID = glofas.loc[ID, 'ResID']
#     if ff.loc[ResID, 'selection'] == 'GLOFAS':
#         glofas.loc[ID, 'CAP'] = glofas.loc[ID, 'CAP_GLWD']
#     elif ff.loc[ResID, 'selection']:
#         glofas.loc[ID, 'CAP'] = glofas.loc[ID, 'CAP_GRAND']

## Time series

In [9]:
# GRanD attributes used to normalize the dataset, one value per GRAND_ID.
# Every column is multiplied by 1e6: reservoir area (m2), capacity (m3) and catchment area (m2)
area_sm, capacity_cm, catchment_sm = (
    xr.DataArray.from_series(grand[column]) * 1e6
    for column in ('AREA_SKM', 'CAP_MCM', 'CATCH_SKM')
)

### Observed

In [10]:
# work on a deep copy so that `obs` keeps its original values
obs_norm = obs.copy(deep=True)

# factors that express each variable as a fraction of reservoir capacity [-]
hm3_to_fraction = 1e6 / capacity_cm  # variables in hm3
cms_to_fraction = 24 * 3600 / capacity_cm  # variables in m3/s, accumulated over one day

# in-place multiplication, variable by variable
for variable, factor in (
    ('storage', hm3_to_fraction),
    ('evaporation', hm3_to_fraction),
    ('inflow', cms_to_fraction),
    ('outflow', cms_to_fraction),
):
    obs_norm[variable] *= factor

### Simulated

In [11]:
# work on a deep copy so that `sim` keeps its original values
sim_norm = sim.copy(deep=True)

# bring every variable to m3 (storage from hm3, flows from m3/s to volume per day)
storage_m3 = sim_norm['storage'] * 1e6
inflow_m3 = sim_norm['inflow'] * 24 * 3600
outflow_m3 = sim_norm['outflow'] * 24 * 3600

# express the volumes as a fraction of reservoir capacity [-]
sim_norm['storage'] = storage_m3 / capacity_cm
sim_norm['inflow'] = inflow_m3 / capacity_cm
sim_norm['outflow'] = outflow_m3 / capacity_cm

# tag every simulated variable with the suffix '_glofas'
glofas_names = {var: f'{var}_glofas' for var in sim_norm.data_vars}
sim_norm = sim_norm.rename_vars(glofas_names)

### Export

In [12]:
# output folders, one per format
path_csv = PATH_OUT / 'csv'
path_csv.mkdir(parents=True, exist_ok=True)
path_nc = PATH_OUT / 'netcdf'
path_nc.mkdir(parents=True, exist_ok=True)

# Export one CSV and one NetCDF file per reservoir, combining the normalized
# observations with the normalized GloFAS simulation (when available)
for ID in tqdm(resops.index, desc='Exporting time series'):

    # reservoirs without observed time series cannot be exported
    if ID not in obs_norm.GRAND_ID.data:
        print(f'Reservoir {ID} has no observed time series and is skipped')
        continue

    # concatenate time series
    # (`drop_vars` replaces the deprecated `Dataset.drop`)
    ds = obs_norm.sel(GRAND_ID=ID).drop_vars('GRAND_ID')
    if ID in sim.GRAND_ID.data:
        ds = xr.merge((ds, sim_norm.sel(GRAND_ID=ID).drop_vars('GRAND_ID')))

    # delete empty variables
    empty_vars = [var for var in ds.data_vars if ds[var].isnull().all()]
    ds = ds.drop_vars(empty_vars)

    # trim time series to the observed period
    start, end = resops.loc[ID, ['TIME_SERIES_START', 'TIME_SERIES_END']]
    ds = ds.sel(date=slice(start, end))

    # create time series of temporal attributes
    # The dates are taken from the trimmed dataset itself instead of from
    # `pd.date_range(start, end)`, so the new variables always have the same
    # length as the 'date' dimension
    dates = ds.indexes['date']
    ds['year'] = xr.DataArray(dates.year.values, dims='date', name='year')
    ds['month'] = xr.DataArray(dates.month.values, dims='date', name='month')
    ds['weekofyear'] = xr.DataArray(dates.isocalendar().week.values, dims='date', name='weekofyear')
    ds['dayofyear'] = xr.DataArray(dates.dayofyear.values, dims='date', name='dayofyear')
    ds['dayofweek'] = xr.DataArray(dates.dayofweek.values, dims='date', name='dayofweek')

    # export CSV
    # ..........
    ds.to_pandas().to_csv(path_csv / f'{ID}.csv')

    # export NetCDF
    # .............
    ds.to_netcdf(path_nc / f'{ID}.nc')

Exporting time series:   0%|          | 0/526 [00:00<?, ?it/s]