In [1]:
%load_ext lab_black

In [2]:
import pickle
from pathlib import Path
from typing import Union, Dict, Optional, List
from collections import defaultdict
import warnings

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
from tqdm import tqdm

from camelsml.metrics import calc_nse

In [3]:
plt.style.use("../figstyle.mplstyle")

In [4]:
def load_results(folder: Union[Path, str], model_type: str, seed: int) -> Dict:
    if isinstance(folder, str):
        folder = Path(folder)
    if not folder.exists():
        raise FileNotFoundError(f"No directory found at {folder}")
    nse_values = defaultdict(dict)
    epochs = len(list(folder.rglob(f"{model_type}_seed{seed}_epoch_*.p")))
    if epochs == 0:
        raise FileNotFoundError(f"No validation runs found")
    for epoch in range(1, epochs + 1):
        file = folder / f"{model_type}_seed{seed}_epoch_{epoch}.p"
        with open(file, "rb") as results_file:
            results = pickle.load(results_file)
        for basin in results.keys():
            nse_values[f"epoch_{epoch}"][basin] = calc_nse(
                obs=results[basin]["qobs"].to_numpy(),
                sim=results[basin]["qsim"].to_numpy(),
            )
    return nse_values


def load_cv_results(
    main_folder: Union[Path, str], k, model_type: str, seed: int, epoch: int = 30
) -> Dict:
    if isinstance(main_folder, str):
        main_folder = Path(main_folder)
    if not main_folder.exists():
        raise FileNotFoundError(f"No directory found at {main_folder}")
    results = {}
    full_results = defaultdict(dict)
    for i in tqdm(range(k)):
        folder = main_folder / str(i)
        folder = list(folder.glob("*"))
        if len(folder) != 1:
            warnings.warn(f"Check your file structure in {i}")
        folder = folder[0]
        results[i] = load_results(folder, model_type, seed)
        for key in list(results[i].keys())[:epoch]:
            for basin in results[i][key].keys():
                full_results[key][basin] = results[i][key][basin]
    return full_results

In [5]:
def parse_results(
    folder: Union[str, Path], k: int = 5, skip: Optional[Union[int, List[int]]] = None
) -> Dict:
    if isinstance(skip, int):
        skip = [skip]
    folder = Path(folder)
    results = {}
    for i in range(k):
        if not skip is None and i in skip:
            continue
        file = folder / f"{i}" / "i_list.pickle"
        with open(file, "rb") as infile:
            results[i] = pickle.load(infile)
    return results


def calc_i(
    permutations: List,
    nse_values: Dict,
    epoch: int = 30,
    k: int = 5,
    min_nse=0,
    skip: Optional[Union[int, List[int]]] = None,
) -> pd.DataFrame:
    if isinstance(skip, int):
        skip = [skip]
    features = list(
        permutations[list(permutations.keys())[0]].keys()
    )  # permutations.keys()[0]])
    nse_values = nse_values[f"epoch_{epoch}"]
    means = defaultdict(lambda: defaultdict(lambda: 0))
    index = -1
    for i in range(k):
        if not skip is None and i in skip:
            print(f"skipped {i}!")
            index += 1
            continue
        else:
            index += 1
        fold = permutations[index]
        # features = list(fold.keys())
        for feature in features:
            feature_results = fold[feature]
            for k_ in list(feature_results.keys()):
                # print(k_, len(list(feature_results.keys())))
                for basin in feature_results[k_].keys():
                    means[feature][basin] += feature_results[k_][basin] / len(
                        list(feature_results.keys())
                    )

    for feature in features:
        for basin in means[feature].keys():
            if nse_values[basin] >= min_nse:
                means[feature][basin] = nse_values[basin] - means[feature][basin]
            else:
                means[feature][basin] = np.nan
    return pd.DataFrame.from_dict(means).dropna()

In [6]:
def boxplot_feature_importance(
    importances: pd.DataFrame, features: List[str], ax: Optional[plt.Axes] = None
):
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax.boxplot(importances[features].values)
    ax.set_xticklabels(
        [feature.replace("_", "\_") for feature in features], rotation=45
    )
    # ax.set_yscale("log")
    ax.grid("on")

In [7]:
k = 5
permutations = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_cv/permutation/",
    k=k,
)
nse_values = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_cv/",
    model_type="ealstm",
    seed="19970204",
    k=k,
)
importance_all_features = calc_i(permutations, nse_values, k=k, min_nse=0.5, epoch=20)

100%|██████████| 5/5 [00:18<00:00,  3.70s/it]


In [8]:
def importance_metrics(importances: pd.DataFrame) -> pd.DataFrame:
    fracs = defaultdict(dict)
    for feature in importances.columns:
        # fracs[feature][f"Amount i > {limit}"] = (importances[feature] > limit).sum()
        # fracs[feature][f"Fraction i > {limit}"] = fracs[feature][
        #    f"Amount i > {limit}"
        # ] / len(importances[feature])
        # fracs[feature][f"Max"] = importances[feature].max()
        # fracs[feature][
        #    "|i|"
        # ] = f"{importances[feature].mean():.4f}+-{importances[feature].std():.2f}"
        fracs[feature]["95%"] = np.percentile(importances[feature], q=95)
        fracs[feature]["75%"] = np.percentile(importances[feature], q=75)
        fracs[feature]["Median"] = np.median(importances[feature])
        fracs[feature]["25%"] = np.percentile(importances[feature], q=25)
        fracs[feature]["5%"] = np.percentile(importances[feature], q=5)
    # df = pd.DataFrame.from_dict(fracs).T.sort_values(
    #    f"Fraction i > {limit}", ascending=False
    # )
    df = pd.DataFrame.from_dict(fracs).T.sort_values("Median", ascending=False)
    return df


fracs_all_features = importance_metrics(importance_all_features)
folder = Path("../doc/thesis/tables/results_section")
folder.mkdir(parents=True, exist_ok=True)
fracs_all_features.drop(fracs_all_features.index[20:]).to_latex(
    folder / "overfit_importance.tex", float_format="%.2f"
)
fracs_all_features.drop(fracs_all_features.index[20:])

Unnamed: 0,95%,75%,Median,25%,5%
Q95,9.965331,0.423366,0.122163,0.039146,-0.01222
baseflow_index_ceh,1.223817,0.213468,0.079608,0.021573,-0.008008
p_mean,0.329857,0.112807,0.042575,0.007947,-0.017017
aridity,0.104584,0.036468,0.013234,0.000704,-0.021357
Q5,0.60951,0.059715,0.009635,0.000348,-0.027473
p_seasonality,0.118962,0.016138,0.003102,-0.000706,-0.021852
area,0.113298,0.014959,0.002624,-0.000407,-0.012037
inwater_perc,0.119842,0.015252,0.002568,-0.001016,-0.02058
elev_10,0.100327,0.015951,0.002489,-0.001558,-0.022612
low_prec_dur,0.061404,0.010816,0.0023,-0.001176,-0.020019


In [9]:
def plot_importances(
    importances: pd.DataFrame,
    feature: str,
    ax: Optional[plt.Axes] = None,
    color: str = "blue",
    label: Optional[str] = None,
):
    x = importances[feature][importances[feature] <= 1]
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax.hist(x, facecolor=color, bins=100, density=True, label=label)


"""
fig, axes = plt.subplots(2, 2, sharex=True, sharey=False, figsize=[4.7747, 4.7747])
axes = axes.flatten()

features = ["Q95", "baseflow_index_ceh", "porosity_hypres_5", "num_reservoir"]

save_path = Path("../doc/thesis/figures/permutation/all_features_cv")
save_path.mkdir(exist_ok=True, parents=True)

for i, feature in enumerate(features):
    plot_importances(importance_all_features, feature, ax=axes[i])
    print(
        feature.replace("_", "\\_"),
        feature.replace("_", "\\_").encode(encoding="UTF-8"),
    )
    axes[i].set_title(feature.replace("_", "\_"))
    axes[i].grid()
fig.tight_layout()
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
fig.savefig(save_path / "histogram_all.pgf")
fig.savefig(save_path / "histogram_all.pdf")
# plt.show()
"""

'\nfig, axes = plt.subplots(2, 2, sharex=True, sharey=False, figsize=[4.7747, 4.7747])\naxes = axes.flatten()\n\nfeatures = ["Q95", "baseflow_index_ceh", "porosity_hypres_5", "num_reservoir"]\n\nsave_path = Path("../doc/thesis/figures/permutation/all_features_cv")\nsave_path.mkdir(exist_ok=True, parents=True)\n\nfor i, feature in enumerate(features):\n    plot_importances(importance_all_features, feature, ax=axes[i])\n    print(\n        feature.replace("_", "\\_"),\n        feature.replace("_", "\\_").encode(encoding="UTF-8"),\n    )\n    axes[i].set_title(feature.replace("_", "\\_"))\n    axes[i].grid()\nfig.tight_layout()\nmatplotlib.use("pgf")\nmatplotlib.rcParams.update(\n    {\n        "pgf.texsystem": "pdflatex",\n        "font.family": "serif",\n        "text.usetex": True,\n        "pgf.rcfonts": False,\n    }\n)\nfig.savefig(save_path / "histogram_all.pgf")\nfig.savefig(save_path / "histogram_all.pdf")\n# plt.show()\n'

## The above results seem to imply that the EA-LSTM model likes extra precipitation info.
Possibly because the timeseries do not contain enough information? Or maybe sequence_length should be increased?

## The results below indicate that not many basins care about the "physical" features
This is model dependent, will do the same analysis of the reduced model to see if this is still the case

In [10]:
chosen_features = np.genfromtxt(
    "../runs/correlation_reduction/chosen_features/use_features.txt", dtype="str"
)
# BUG!!
chosen_features = chosen_features[
    np.logical_and(chosen_features != "dpsbar", chosen_features != "dom_land_cover")
]
fracs_all_features.loc[chosen_features, :].sort_values(
    "Amount i > 0.1", ascending=False
)

KeyError: 'Amount i > 0.1'

In [None]:
"""fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
axes = axes.flatten()

features = ["area", "elev_10", "elev_50", "urban_perc"]

for i, feature in enumerate(features):
    plot_importances(importance_all_features, feature, ax=axes[i])
    axes[i].set_title(feature.replace("_", "\_"))
    axes[i].grid()
# fig.tight_layout()
plt.show()"""

## Importance of reduced, physically based system.

In [None]:
k = 5
permutations_chosen_features = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/correlation_reduction/chosen_features_cv/permutation/",
    k=k,
)
nse_values_chosen_features = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/correlation_reduction/chosen_features_cv/",
    model_type="ealstm",
    seed="19970204",
    k=k,
    epoch=13,
)
importance_chosen_features = calc_i(
    permutations_chosen_features, nse_values_chosen_features, k=k, min_nse=0.5, epoch=13
)

In [None]:
fracs_chosen_features = importance_metrics(importance_chosen_features, limit=0.1)
fracs_chosen_features

In [None]:
"""fig, axes = plt.subplots(
    2, 2, sharex=True, sharey=False, figsize=[10, 10]
)  # , figsize=[4.7747, 4.7747])
axes = axes.flatten()

features = ["inwater_perc", "no_gw_perc", "area", "nsig_low_perc"]

for i, feature in enumerate(features):
    plot_importances(
        importance_chosen_features, feature, ax=axes[i], color="blue", label="Chosen"
    )
    plot_importances(
        importance_all_features, features, ax=axes[i], color="red", label="All"
    )
    axes[i].legend()
    axes[i].set_title(feature.replace("_", "\_"))
    axes[i].grid()
fig.tight_layout()
plt.show()"""

## Importance of fixed all feature run

In [None]:
k = 5
permutations_all_features_fixed = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_fixed_cv/permutation/",
    k=k,
)
nse_values_all_features_fixed = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_fixed_cv/",
    model_type="ealstm",
    seed="19970204",
    k=k,
    epoch=20,
)
importance_all_features_fixed = calc_i(
    permutations_all_features_fixed,
    nse_values_all_features_fixed,
    k=k,
    min_nse=0.7,
    epoch=20,
)

In [None]:
fracs_all_features_fixed = importance_metrics(importance_all_features_fixed, limit=0.1)
fracs_all_features_fixed

In [None]:
"""fig, axes = plt.subplots(
    2, 2, sharex=True, sharey=False, figsize=[10, 10]
)  # , figsize=[4.7747, 4.7747])
axes = axes.flatten()

features = ["inwater_perc", "frac_high_perc", "clay_perc", "low_nsig_perc"]

for i, feature in enumerate(features):
    plot_importances(
        importance_chosen_features, feature, ax=axes[i], color="blue", label="Chosen"
    )
    plot_importances(
        importance_all_features_fixed, features, ax=axes[i], color="red", label="All"
    )
    axes[i].legend()
    axes[i].set_title(feature.replace("_", "\_"))
    axes[i].grid()
    axes[i].set_yscale("log")
fig.tight_layout()
plt.show()"""

## Importance CAMELS-US vs CAMELS-GB
### CAMELS-US:

There is a bug in the permutation test code that caused the test to skip root_depth in camels-us...

In [None]:
k = 5
permutations_common_features_us = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/camels_us/chosen_features_cv_us/permutation/",
    k=k,
)
nse_values_common_features_us = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/camels_us/chosen_features_cv_us/",
    model_type="ealstm",
    seed="19970204",
    k=k,
    epoch=20,
)
importance_common_features_us = calc_i(
    permutations_common_features_us,
    nse_values_common_features_us,
    k=k,
    min_nse=0.7,
    epoch=20,
)

In [None]:
fracs_common_features_us = importance_metrics(importance_common_features_us)
fracs_common_features_us

### CAMELS-GB:

In [None]:
k = 5
permutations_common_features_gb = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/camels_us/chosen_features_cv_gb/permutation/",
    k=k,
)
nse_values_common_features_gb = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/camels_us/chosen_features_cv_gb/",
    model_type="ealstm",
    seed="19970204",
    k=k,
    epoch=20,
)
importance_common_features_gb = calc_i(
    permutations_common_features_gb,
    nse_values_common_features_gb,
    k=k,
    min_nse=0.7,
    epoch=20,
)

In [None]:
fracs_common_features_gb = importance_metrics(importance_common_features_gb, limit=0.1)
fracs_common_features_gb

In [None]:
"""fig, axes = plt.subplots(
    5, 2, sharex=True, sharey=False, figsize=[10, 30]
)  # , figsize=[4.7747, 4.7747])
axes = axes.flatten()

features = {
    "root_depth": "missing",
    "clay_perc": "clay_frac",
    "conductivity_cosby": "soil_conductivity",
    "elev_50": "elev_mean",
    "organic_perc": "organic_frac",
    "inwater_perc": "water_frac",
    "area": "area_gages2",
    "silt_perc": "silt_frac",
    "sand_perc": "sand_frac",
    "soil_depth_pelletier": "soil_depth_pelletier",
}

for i, feature in enumerate(features.keys()):
    plot_importances(
        importance_common_features_gb, feature, ax=axes[i], color="blue", label="GB"
    )
    axes[i].set_title(
        f"{feature} (GB), missing in US because of NaN".replace("_", "\_")
    )
    try:
        plot_importances(
            importance_common_features_us,
            features[feature],
            ax=axes[i],
            color="red",
            label="US",
        )
        axes[i].set_title(
            f"{feature} (GB), {features[feature]} (US)".replace("_", "\_")
        )
    except KeyError:
        print(f"Skipped {feature} in CAMELS-US")
    axes[i].legend()

    axes[i].grid()
    # axes[i].set_yscale("log")
fig.tight_layout()
plt.show()
"""

### Kratzert third paper features

In [None]:
k = 5
permutations_kratzert_features_us = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/camels_us/kratzert_features_cv_us/permutation/",
    k=k,
)

nse_values_kratzert_features_us = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/camels_us/kratzert_features_cv_us/",
    model_type="lstm",
    seed="19970204",
    k=k,
    epoch=13,
)
importance_kratzert_features_us = calc_i(
    permutations_kratzert_features_us,
    nse_values_kratzert_features_us,
    k=k,
    min_nse=0.5,
    epoch=13,
)

In [None]:
fracs_kratzert_features_us = importance_metrics(importance_kratzert_features_us)
fracs_kratzert_features_us.to_latex(
    folder / "kratzert_features_importance.tex", float_format="%.2f"
)
fracs_kratzert_features_us

In [None]:
fig, ax = plt.subplots(1, 1)
boxplot_feature_importance(
    importances=importance_kratzert_features_us,
    features=[
        "frac_snow",
        "elev_mean",
        "aridity",
        "max_water_content",
        "carbonate_rocks_frac",
    ],
    ax=ax,
)
ax.set_ylim([-0.1, 1])

fracs_kratzert_features_us = importance_metrics(
    importance_kratzert_features_us, limit=0.1
)
fracs_kratzert_features_us

## Notes on feature importances
Kratzert et al did not run a static feature importance analysis on the third paper (the one with testing on ungauged basins). On the second paper when he ran a feature importance analysis he observed that mean precipitation was the most important feature. To me this could in theory imply that the gauged basin model uses the mean precipitation as a simple way to "recognize" each basin (this possibility Felix mentioned earlier too). My results here indicate that the snow fraction is more important than in Kratzert's analysis. What these results do agree with however is that hydrological information is more important than land coverage and soil attributes. A bit dissapointing for physical understanding, perhaps? Could always train a new model without these hydrological features to see how that performs, though!

Also, the fact that static information about precipitation like high_prec_frec is still deemed important to me implies that there is potential for improving our LSTM model in the future. All that information should be able to be taken from the time series, implying to me that the LSTM has potential to learn more time dependencies than it does now. 

## Combined dataset. Some notes.

* Times series are differnent. Only 3 in common. (I use 6 right now for CAMELS-GB, Kratzert uses 5 for CAMELS-US)
    - This I already knew though, and it would be interesting to look at a reduced set of itme series anyway.
* Land coverage information is structured differently. 
    - CAMELS-US only has fractions for the dominant land cover for each basin. CAMELS-GB has much more information here as it contains fractions from several land covers.
* There are some attributes I think could be comparable to each other. 
    - |CAMELS-US | CAMELS-GB |
      |-------------------|-----------------|
      |sand_frac|sand_perc|
      |silt_frac|silt_perc|
      |clay_frac|clay_perc|
      |organic_frac|organic_perc|
      |max_water_content|tawc (unsure)|
      |root_depth_XX|root_depth (Both are available as percentiles)|
      |soil_depth_pelletier|soil_depth_pelletier|
      |soil_conductivity|conductivity_(hypres or cosby?)|
      |soil_porosity|soil_porosity_(hypres or cosby?)|
      |||



Camels US: Dominating land cover [string],   CAMELS GB: all land covers
           coverage fraction of dominating,  coverage fraction of several types
           
This leads to:

1: The model gets the fraction of dominating, but no context for it.

2: Same as above, but also include land cover type as input (this input is non-numerical)

In [None]:
k = 5
permutations_mixed = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/combined_dataset/mixed/permutation/",
    k=k,
)
nse_values_mixed = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/combined_dataset/mixed/",
    model_type="ealstm",
    seed="19970204",
    k=k,
    epoch=30,
)

In [None]:
def calc_i_transfer(
    permutations: List,
    nse_values: Dict,
    epoch: int = 30,
    k: int = 5,
    min_nse=0,
    skip: Optional[Union[int, List[int]]] = None,
) -> pd.DataFrame:
    if isinstance(skip, int):
        skip = [skip]
    if isinstance(skip, list):
        dividor = k - len(skip)
    else:
        dividor = k
    nse_values = nse_values[f"epoch_{epoch}"]
    means = defaultdict(lambda: defaultdict(lambda: 0))
    index = -1
    for i in range(k):
        if not skip is None and i in skip:
            print(f"skipped {i}!")
            continue
        else:
            index += 1
        fold = permutations[index]
        features = list(fold.keys())
        for feature in features:
            feature_results = fold[feature]
            for k_ in list(feature_results.keys()):
                for basin in feature_results[k_].keys():
                    feature_results[k_][basin]
                    means[feature][basin] += feature_results[k_][basin] / (
                        len(list(feature_results.keys())) * dividor
                    )
        for feature in features:
            for basin in means[feature].keys():
                if nse_values[basin] >= min_nse:
                    means[feature][basin] = nse_values[basin] - means[feature][basin]
                else:
                    means[feature][basin] = np.nan

    return pd.DataFrame.from_dict(means).dropna()

In [None]:
importance_mixed = calc_i(
    permutations_mixed,
    nse_values_mixed,
    k=k,
    min_nse=0,
    epoch=30,
)
importance_mixed

In [None]:
fracs_mixed = importance_metrics(importance_mixed, limit=0.1)
fracs_mixed

In [None]:
def split_importances(importance_mixed: pd.DataFrame) -> pd.DataFrame:
    importance_us = importance_mixed[importance_mixed.index.str.contains("us")]
    importance_gb = importance_mixed[importance_mixed.index.str.contains("gb")]
    return importance_us, importance_gb

In [None]:
importance_mixed_us, importance_mixed_gb = split_importances(importance_mixed)

In [None]:
fracs_mixed_us = importance_metrics(importance_mixed_us, limit=0.1)
fracs_mixed_us

In [None]:
fracs_mixed_gb = importance_metrics(importance_mixed_gb, limit=0.1)
fracs_mixed_gb

In [None]:
k = 5
permutations_train_us_val_gb_no_organic_no_gvf = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/combined_dataset/train_us_val_gb_no_organic_no_gvf/permutation/",
    k=k,
)

nse_values_train_us_val_gb_no_organic_no_gvf = defaultdict(lambda: defaultdict(int))

for i in tqdm(range(k)):  # (range(1, 3)):  # Remember to fix
    results_ = load_results(
        folder=f"/home/bernhard/git/Master-Thesis/runs/combined_dataset/train_us_val_gb_no_organic_no_gvf/val_gb/{i}",
        model_type="ealstm",
        seed="19970204",
    )
    for epoch in results_:
        for basin in results_[epoch]:
            nse_values_train_us_val_gb_no_organic_no_gvf[epoch][basin] += (
                results_[epoch][basin] / k
            )

In [None]:
importance_train_us_val_gb_no_organic_no_gvf = calc_i_transfer(
    permutations_train_us_val_gb_no_organic_no_gvf,
    nse_values_train_us_val_gb_no_organic_no_gvf,
    k=k,
    min_nse=0.5,
    epoch=14,
)
importance_train_us_val_gb_no_organic_no_gvf

In [None]:
fracs_train_us_val_gb_no_organic_no_gvf = importance_metrics(
    importance_train_us_val_gb_no_organic_no_gvf, limit=0.6
)
fracs_train_us_val_gb_no_organic_no_gvf

In [None]:
k = 5
permutations_all_features_concat_seq_len_gb = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_concat_seq_len_cv/permutation/",
    k=k,
)


nse_values_common_features_gb = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/correlation_reduction/all_features_concat_seq_len_cv/",
    model_type="lstm",
    seed="19970204",
    k=k,
    epoch=15,
)

In [None]:
importance_all_features_concat_seq_len_cv = calc_i(
    permutations_all_features_concat_seq_len_gb,
    nse_values_common_features_gb,
    k=k,
    min_nse=0.5,
    epoch=15,
)
importance_all_features_concat_seq_len_cv

In [None]:
fracs_all_features_concat_seq_len_cv = importance_metrics(
    importance_all_features_concat_seq_len_cv
)
folder = Path("../doc/thesis/tables/results_section")
folder.mkdir(parents=True, exist_ok=True)
fracs_all_features_concat_seq_len_cv.drop(
    fracs_all_features_concat_seq_len_cv.index[20:]
).to_latex(folder / "all_features_concat_seq_len_importance.tex", float_format="%.2f")

In [None]:
fig, ax = plt.subplots(1, 1)
boxplot_feature_importance(
    importances=importance_all_features_concat_seq_len_cv,
    features=[
        "low_prec_dur",
        "low_prec_freq",
        "tawc",
        "no_gw_perc",
        "inter_high_perc",
        "num_reservoir",
    ],
    ax=ax,
)
ax.set_ylim([-0.1, 0.2])

In [None]:
k = 5
permutations_mixed_concat = parse_results(
    "/home/bernhard/git/Master-Thesis/runs/combined_dataset/mixed_concat/permutation/",
    k=k,
)


nse_values_mixed_concat = load_cv_results(
    main_folder="/home/bernhard/git/Master-Thesis/runs/combined_dataset/mixed_concat",
    model_type="lstm",
    seed="19970204",
    k=k,
    epoch=15,
)

In [None]:
importance_mixed_concat = calc_i(
    permutations_mixed_concat,
    nse_values_mixed_concat,
    k=k,
    min_nse=0.5,
    epoch=15,
)
importance_mixed_concat

In [None]:
def importance_metrics_mixed(importances: pd.DataFrame) -> pd.DataFrame:
    gb = importances.loc[importances.index.str.contains("gb_")]
    us = importances.loc[importances.index.str.contains("us_")]
    us = importance_metrics(importances=us)
    gb = importance_metrics(importances=gb)
    mixed = (
        pd.concat({"GB": gb, "US": us}, axis=1)
        .swaplevel(0, 1, axis=1)
        .sort_index(axis=1)
    )
    mixed = mixed[["95%", "75%", "Median", "25%", "5%"]]
    return mixed

In [None]:
fracs_mixed_concat = importance_metrics_mixed(importance_mixed_concat)
folder = Path("../doc/thesis/tables/results_section")
folder.mkdir(parents=True, exist_ok=True)
fracs_mixed_concat.to_latex(
    folder / "mixed_concat_importance.tex",
    float_format="%.2f",
    multicolumn_format="c",
)
fracs_mixed_concat