In [4]:
import sys
sys.path.append('../..')
import numpy as np
import pandas as pd
from pathlib import Path
from model_utils import split_samples, split_periods
from neuralhydrology.utils.config import Config
import xarray as xr
from tqdm.auto import tqdm
import pickle

In [5]:
import matplotlib.pyplot as plt

## Configuración

In [6]:
# cargar archivo de configuración
config_file = Path('config_HP.yml')
cfg = Config(config_file)

target = cfg.target_variables[0]

print(cfg.experiment_name)
print('epochs:', cfg.epochs, sep='\t\t')
print('hidden size:', cfg.hidden_size, sep='\t')
print('batch size:', cfg.batch_size, sep='\t')
print('dropout:', cfg.output_dropout, sep='\t')
print('clip gradients:', cfg.clip_gradient_norm, sep='\t')

FileNotFoundError: config_HP.yml

## Samples 

In [110]:
# load attribute table
attrs = pd.read_csv(cfg.data_dir / 'attributes' / 'attributes_combined.csv', index_col='reservoir_id')
print(attrs.shape)

split_samples(attrs.index.tolist(), cal=.6, val=.2, path=Path('./data'), seed=0)

(291, 7)


## Periods

In [90]:
# define train, validation and tes periods
periods = {}
for file in tqdm((cfg.data_dir / 'time_series').glob('*.nc')):
    df = xr.open_dataset(file)[cfg.target_variables].to_pandas()
    periods[file.stem] = split_periods(df, cal=.6, val=.2)
periods_da = xr.concat([da.assign_coords(id=id) for id, da in periods.items()], dim='id')

# make sure there's 1 year of data before the start date
cutoff_date = np.datetime64('1992-01-02', 'ns')
periods_da = periods_da.where(periods_da >= cutoff_date, other=cutoff_date)
del periods

# reorganize periods as a dictionary
periods_dct = {}
for period in periods_da.period.data:
    periods_dct[period] = {}
    for ID in periods_da.id.data:
        periods_dct[period][ID] = {}
        for date in periods_da.date.data:
            periods_dct[period][ID][f'{date}_dates'] = [pd.Timestamp(periods_da.sel(period=period, id=ID, date=date).to_numpy().item())]

# export as Pickle
for period, dct in periods_dct.items():
    with open(f'./data/periods_{period}.pkl', 'wb') as file:
        pickle.dump(dct, file)

0it [00:00, ?it/s]

***

In [49]:
min_date = pd.Timestamp(1900, 1, 1)
max_date = pd.Timestamp(2100, 1, 1)
for period, ids_dct in periods_dct.items():
    for ID, dates_dct in ids_dct.items():
        min_date = max(min_date, dates_dct['start_dates'][0])
        max_date = min(max_date, dates_dct['end_dates'][0])

In [50]:
min_date, max_date

(Timestamp('2015-04-29 00:00:00'), Timestamp('1993-05-08 00:00:00'))

In [57]:
min_date = pd.Timestamp(1900, 1, 1)
max_date = pd.Timestamp(2100, 1, 1)
for id in periods_da.id.data:
    min_date = max(min_date, periods_da.sel(id=id).min().to_numpy())
    max_date = min(max_date, periods_da.sel(id=id).max().to_numpy())
min_date, max_date

(array('2011-09-13T00:00:00.000000000', dtype='datetime64[ns]'),
 array('2006-09-30T00:00:00.000000000', dtype='datetime64[ns]'))

In [93]:
periods_da.sel(id='2511').to_pandas()

period,train,validation,test
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
start,2007-11-19,2002-06-11,1997-01-01
end,2024-03-12,2007-11-18,2002-06-10


In [81]:
periods_da.sel(id=2511).to_pandas()

period,train,validation,test
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
start,2007-11-19,2002-06-11,1997-01-01
end,2024-03-12,2007-11-18,2002-06-10


In [82]:
periods_da.sel(id=2536).to_pandas()

period,train,validation,test
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
start,2002-04-22,1996-02-26,1992-01-02
end,2020-09-30,2002-04-21,1996-02-25


***

In [67]:
PATH_RESOPS = Path('./data/ResOpsES')

In [68]:
ID = 2511

In [71]:
cfg.dynamic_inputs

['inflow_efas5', 'evaporation_emo1']

In [72]:
cfg.target_variables

['volume']

In [84]:
with open('./data/periods_train.pkl', 'rb') as file:
    periods_train = pickle.load(file)

In [85]:
periods_train

{10: {'start_dates': [Timestamp('2003-09-13 00:00:00')],
  'end_dates': [Timestamp('2020-09-30 00:00:00')]},
 1019: {'start_dates': [Timestamp('2002-04-22 00:00:00')],
  'end_dates': [Timestamp('2020-10-01 00:00:00')]},
 1026: {'start_dates': [Timestamp('2002-04-22 00:00:00')],
  'end_dates': [Timestamp('2020-10-01 00:00:00')]},
 1033: {'start_dates': [Timestamp('2006-08-09 00:00:00')],
  'end_dates': [Timestamp('2020-10-01 00:00:00')]},
 1038: {'start_dates': [Timestamp('2002-04-22 00:00:00')],
  'end_dates': [Timestamp('2020-09-30 00:00:00')]},
 1047: {'start_dates': [Timestamp('2002-04-22 00:00:00')],
  'end_dates': [Timestamp('2020-09-30 00:00:00')]},
 1062: {'start_dates': [Timestamp('2009-07-14 00:00:00')],
  'end_dates': [Timestamp('2024-03-12 00:00:00')]},
 1064: {'start_dates': [Timestamp('2002-04-22 00:00:00')],
  'end_dates': [Timestamp('2020-10-01 00:00:00')]},
 1065: {'start_dates': [Timestamp('2002-04-22 00:00:00')],
  'end_dates': [Timestamp('2020-09-30 00:00:00')]},
 10

In [None]:
period = 'validation'

sample_train = pd.read_csv(f'./data/sample_{period}.txt', header=None).squeeze().tolist()

PATH_PLOT = PATH_RESOPS / 'time_series' / 'plots' / period
PATH_PLOT.mkdir(parents=True, exist_ok=True)

for ID in tqdm(sample_train):
    df = xr.open_dataset(PATH_RESOPS / 'time_series' / f'{ID}.nc').to_pandas()
    df = df[cfg.dynamic_inputs + cfg.target_variables]

    fig, ax = plt.subplots(figsize=(12, 4))
    df.plot(ax=ax, lw=1)
    ax.set(title=ID,
           xlim=(df.index.min(), df.index.max()));
    for x in periods_da.sel(id=str(ID), date='start').to_numpy():
        plt.axvline(x, ls='-', lw=.5, color='k', zorder=0)
    plt.savefig(PATH_PLOT / f'{ID}.jpg', dpi=300, bbox_inches='tight')

***

In [94]:
from neuralhydrology.datasetzoo import BaseDataset

In [96]:
BaseDataset(cfg, is_train=True, period='train')

NotImplementedError: 