In [1]:
%load_ext lab_black

In [137]:
import pickle
import warnings
from pathlib import Path
from typing import Union, Dict
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from camelsml.metrics import calc_nse

In [89]:
def load_results(folder: Union[Path, str], model_type: str, seed: int) -> Dict:
    """Compute the per-basin NSE for every validation epoch stored in ``folder``.

    The number of epochs is the number of files matching
    ``{model_type}_seed{seed}_epoch_*.p`` found by a recursive search below
    ``folder``. The files ``..._epoch_1.p`` up to ``..._epoch_<count>.p`` are
    then read from ``folder`` itself, so the epoch files are expected to be
    numbered contiguously starting at 1.

    Each file is unpickled (``pickle.load`` can execute arbitrary code, so only
    point this at files you trust) and iterated as a mapping of basin to an
    object whose ``"qobs"`` and ``"qsim"`` entries expose ``to_numpy()``.

    Args:
        folder: Directory holding the pickled validation results.
        model_type: Prefix of the result file names.
        seed: Seed that is part of the result file names.

    Returns:
        A ``defaultdict`` mapping ``"epoch_<n>"`` to ``{basin: calc_nse(...)}``.

    Raises:
        FileNotFoundError: If ``folder`` does not exist, if no matching result
            file is found, or if one of the numbered epoch files is missing.
    """
    run_dir = Path(folder) if isinstance(folder, str) else folder
    if not run_dir.exists():
        raise FileNotFoundError(f"No directory found at {run_dir}")

    stem = f"{model_type}_seed{seed}_epoch_"
    n_epochs = sum(1 for _ in run_dir.rglob(f"{stem}*.p"))
    if n_epochs == 0:
        raise FileNotFoundError("No validation runs found")

    nse_values = defaultdict(dict)
    for epoch in range(1, n_epochs + 1):
        with open(run_dir / f"{stem}{epoch}.p", "rb") as handle:
            per_basin = pickle.load(handle)
        epoch_key = f"epoch_{epoch}"
        for basin, series in per_basin.items():
            observed = series["qobs"].to_numpy()
            simulated = series["qsim"].to_numpy()
            nse_values[epoch_key][basin] = calc_nse(obs=observed, sim=simulated)
    return nse_values


def load_cv_results(
    main_folder: Union[Path, str], k: int, model_type: str, seed: int, epoch: int = 30
) -> Dict:
    """Merge the per-basin NSE values of all ``k`` cross-validation folds.

    For every fold ``i`` in ``0..k-1`` the directory ``main_folder/<i>`` is
    expected to contain exactly one entry (the run directory). That entry is
    passed to ``load_results`` and the returned per-epoch, per-basin values are
    merged into one mapping. If a basin shows up in more than one fold, the
    value from the later fold wins.

    Args:
        main_folder: Directory containing one sub-directory per fold, named
            ``0`` .. ``k-1``.
        k: Number of folds to read.
        model_type: Forwarded to ``load_results``.
        seed: Forwarded to ``load_results``.
        epoch: Only the first ``epoch`` epoch entries returned by
            ``load_results`` (in insertion order) are kept.

    Returns:
        A ``defaultdict`` mapping ``"epoch_<n>"`` to ``{basin: nse}`` across
        all folds.

    Raises:
        FileNotFoundError: If ``main_folder`` does not exist or a fold
            directory contains no entry at all.

    Warns:
        UserWarning: If a fold directory holds more than one entry; the first
            one in sorted order is used.
    """
    if isinstance(main_folder, str):
        main_folder = Path(main_folder)
    if not main_folder.exists():
        raise FileNotFoundError(f"No directory found at {main_folder}")

    full_results = defaultdict(dict)
    for i in tqdm(range(k)):
        fold_dir = main_folder / str(i)
        # Sorted so that the chosen run directory is deterministic when the
        # fold directory unexpectedly holds several entries.
        run_dirs = sorted(fold_dir.glob("*"))
        if not run_dirs:
            raise FileNotFoundError(f"No run directory found in {fold_dir}")
        if len(run_dirs) != 1:
            warnings.warn(f"Check your file structure in {i}")
        fold_results = load_results(run_dirs[0], model_type, seed)
        for key in list(fold_results.keys())[:epoch]:
            for basin, nse in fold_results[key].items():
                full_results[key][basin] = nse
    return full_results

In [126]:
def parse_results(folder: Union[str, Path], k: int = 5) -> list:
    """Load the pickled permutation results of every cross-validation fold.

    Reads ``<folder>/<i>/i_list.pickle`` for ``i`` in ``0..k-1``.

    Args:
        folder: Directory containing one sub-directory per fold, named
            ``0`` .. ``k-1``.
        k: Number of folds to read.

    Returns:
        A list of length ``k`` holding the unpickled object of each fold, in
        fold order.

    Raises:
        FileNotFoundError: If one of the expected pickle files is missing.
    """
    folder = Path(folder)
    results = []
    for i in range(k):
        # pickle.load can execute arbitrary code: only read trusted files.
        with open(folder / str(i) / "i_list.pickle", "rb") as infile:
            results.append(pickle.load(infile))
    return results


def calc_i(
    permutations: Dict, nse_values: Dict, epoch: int = 30, k: int = 5
) -> pd.DataFrame:
    """Compute the per-basin importance of each feature.

    For every feature and basin the permuted values are averaged over the
    permutation runs of that feature, and the importance is
    ``nse_values["epoch_<epoch>"][basin] - mean``.

    Args:
        permutations: Indexable by fold number ``0..k-1``. Each fold maps
            ``feature -> permutation run -> basin -> value``.
        nse_values: Mapping ``"epoch_<n>" -> {basin: nse}`` with the baseline
            values.
        epoch: Epoch whose baseline values are used.
        k: Number of folds to read from ``permutations``.

    Returns:
        A DataFrame with one column per feature and one row per basin. Cells
        for which a feature has no value for a basin are NaN.

    Raises:
        KeyError: If the requested epoch, or a basin that occurs in
            ``permutations``, is missing from ``nse_values``.
    """
    baseline = nse_values[f"epoch_{epoch}"]
    means = defaultdict(lambda: defaultdict(float))
    for i in range(k):
        fold = permutations[i]
        for feature, feature_results in fold.items():
            n_runs = len(feature_results)
            for run_values in feature_results.values():
                for basin, value in run_values.items():
                    means[feature][basin] += value / n_runs
    # Iterate over every feature that was accumulated, not only the features of
    # the last fold, so that all columns are turned into importances.
    for basin_means in means.values():
        for basin in basin_means:
            basin_means[basin] = baseline[basin] - basin_means[basin]
    return pd.DataFrame.from_dict(means)

In [182]:
# Load the pickled per-fold permutation results and the per-fold validation NSE
# values of the "all_features_cv" run, then derive the per-basin importance
# table from both.
# NOTE(review): both paths are absolute and machine-specific; the cell will only
# run where this directory layout exists.
permutations = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_cv/permutation/",
    k=5,
)
nse_values = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_cv/",
    model_type="ealstm",
    # Passed as a string although the parameter is annotated as int; the value
    # is only interpolated into the result file names.
    seed="19970204",
    k=5,
)
# epoch is left at calc_i's default of 30.
importance_all_features = calc_i(permutations, nse_values, k=5)

100%|██████████| 5/5 [00:07<00:00,  1.52s/it]


In [215]:
# Print, for each feature column, the basin with the smallest importance value,
# then display the epoch-30 NSE stored for basin "39036".
print(importance_all_features.idxmin())
nse_values["epoch_30"]["39036"]

gauge_easting     39036
gauge_northing    39036
dtype: object


-20.826532323937354

In [225]:
def importance_metrics(importances: pd.DataFrame, limit: float = 0.1) -> pd.DataFrame:
    """Summarise the importance values of every feature column.

    Args:
        importances: One column per feature, one row per basin.
        limit: Threshold applied to the absolute importance value.

    Returns:
        A DataFrame with one row per feature and the columns

        * ``"Amount i > <limit>"``: number of rows with ``|i| > limit``
        * ``"Fraction i > <limit>"``: that number divided by the column length
        * ``"Max |i|"``: largest absolute importance
        * ``"i"``: string ``"|<mean>|+-<std>"`` built from the signed mean and
          the standard deviation of the column
        * ``"median(i)"``: ``np.median`` of the column
    """
    fracs = defaultdict(dict)
    for feature in importances.columns:
        values = importances[feature]
        magnitudes = values.abs()
        n_above = (magnitudes > limit).sum()

        stats = fracs[feature]
        stats[f"Amount i > {limit}"] = n_above
        stats[f"Fraction i > {limit}"] = n_above / len(values)
        # The label promises the maximum of the absolute values, so a strongly
        # negative importance must count as well.
        stats["Max |i|"] = magnitudes.max()
        stats["i"] = f"|{values.mean():.4f}|+-{values.std():.2f}"
        stats["median(i)"] = np.median(values)
    return pd.DataFrame.from_dict(fracs).T


# Summarise the importance table with a threshold of 0.5 and display the result.
fracs = importance_metrics(importance_all_features, limit=0.5)
fracs

Unnamed: 0,Amount i > 0.5,Fraction i > 0.5,Max |i|,i,median(i)
gauge_easting,11,0.0218688,0.709588,|0.0091|+-0.26,0.0105563
gauge_northing,15,0.0298211,0.853332,|-0.0045|+-0.31,0.00707913
