In [1]:
%load_ext autoreload
%autoreload 2

In [58]:
from pathlib import Path
from typing import Union, Optional, Dict, List, Tuple
import pickle
import xarray as xr
from neuralhydrology.utils.config import Config
from neuralhydrology.evaluation.evaluate import start_evaluation
import pandas as pd

def eval_run(run_dir: Path, period: str, epoch: Optional[int] = None, gpu: Optional[int] = None, data_dir: Optional[str] = None) -> pd.DataFrame:
    """
    Start evaluating a trained model.

    Parameters
    ----------
    run_dir : Path
        Path to the run directory.
    period : {'train', 'validation', 'test'}
        The period to evaluate.
    epoch : int, optional
        Define a specific epoch to use. By default, the weights of the last epoch are used.
    gpu : int, optional
        GPU id to use. Will override config argument 'device'. A value less than zero indicates CPU.
    data_dir : str, optional
        Directory containing the data. If provided, it updates the 'data_dir' in the configuration.

    Returns
    -------
    pd.DataFrame
        DataFrame containing evaluation results.
    """
    config = Config(run_dir / "config.yml")

    if data_dir is not None:
        config.update_config({'data_dir': data_dir})

    config.update_config({'run_dir': run_dir})

    # check if a GPU has been specified as a command line argument. If yes, overwrite config
    if gpu is not None and gpu >= 0:
        config.device = f"cuda:{gpu}"
    if gpu is not None and gpu < 0:
        config.device = "cpu"

    return start_evaluation(cfg=config, run_dir=run_dir, epoch=epoch, period=period)

def cached_eval_run(run_dir: Path, period: str, data_dir: Optional[str] = None) -> pd.DataFrame:
    """
    Load cached data or generate and save if not found.

    Parameters
    ----------
    run_dir : Path
        Path to the run directory.
    period : {'train', 'validation', 'test'}
        The period to load or generate data for.
    data_dir : str, optional
        Directory containing the data. If provided, it updates the 'data_dir' in the configuration.

    Returns
    -------
    pd.DataFrame
        DataFrame containing cached or generated data.
    """
    postprocess_dir = Path(run_dir, "postprocess")
    postprocess_dir.mkdir(exist_ok=True, parents=True)

    cache_file = postprocess_dir / f"{period}_cached_data.pkl"

    if cache_file.is_file():
        print(f"Loading cached data for {period} from {cache_file}")
        with open(cache_file, 'rb') as file:
            return pickle.load(file)
    else:
        print(f"Cached data for {period} not found. Generating and saving...")
        data = eval_run(run_dir=run_dir, period=period, data_dir=data_dir)
        with open(cache_file, 'wb') as file:
            pickle.dump(data, file)
        return data

def combine_datasets(results: Dict[int, Dict[str, Dict[str, Union[str, pd.DataFrame]]]]) -> pd.DataFrame:
    """
    Combine datasets from multiple folds.

    Parameters
    ----------
    results : Dict[int, Dict[str, Dict[str, Union[str, pd.DataFrame]]]]
        Dictionary containing results for different folds.

    Returns
    -------
    pd.DataFrame
        Combined DataFrame.
    """
    basins = [basin for fold in results.values() for basin in fold]
    data_values = [entry['1D']['xr'] for fold in results.values() for entry in fold.values()]
    combined_dataset = xr.concat(data_values, dim='basin').assign_coords(basin=basins)
    return combined_dataset.to_dataframe()

def create_pivot_tables(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Create pivot tables for simulated and observed streamflow.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the streamflow data.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Tuple containing pivot tables for simulated and observed streamflow.
    """
    pivot_table_simulated = df.pivot_table(values='streamflow_mmd_sim', index='date', columns='basin')
    pivot_table_observed = df.pivot_table(values='streamflow_mmd_obs', index='date', columns='basin')
    return pivot_table_simulated, pivot_table_observed


In [59]:

# Set parameters
predictors = 'pe'
period = "test"
folds = [0, 1]
bp = Path('/Users/sho108/projects/neuralhydrology/workflows/camelaus_lstm/petrichore/runs/')

result = {
    fold: cached_eval_run(
        run_dir=bp/f'spatial_twofold_{fold}_{predictors}_2501_181026',
        period=period,
        data_dir='/Users/sho108/Desktop/z/Data/CAMELS_AUS'
    )
    for fold in folds
}


# Extract basins and corresponding data values from the result dictionary
df = combine_datasets(result)

# Create pivot tables for simulated and observed streamflow
pivot_table_simulated, pivot_table_observed = create_pivot_tables(df)

# Display the resulting DataFrames
pivot_table_simulated.head(), pivot_table_observed.head()


Cached data for test not found. Generating and saving...
# Evaluation: 100%|██████████| 111/111 [08:37<00:00,  4.66s/it]


FileNotFoundError: [Errno 2] No such file or directory: '/Users/sho108/projects/neuralhydrology/workflows/camelaus_lstm/petrichore/runs/spatial_twofold_0_pe_2501_181026/postprocess/test_cached_data.pkl'

In [None]:
/datasets/work/d61-coastal-forecasting-wp3/work/sho108/neuralhydrology/workflows/camelaus_lstm/runs/spatial_twofold_0_e_2501_181026/train_data
/Users/sho108/projects/neuralhydrology/workflows/camelaus_lstm/petrichore/runs/spatial_twofold_0_e_2501_181026/train_data