In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from panda.utils.eval_utils import get_summary_metrics_dict
from panda.utils.plot_utils import (
    apply_custom_style,
    make_box_plot,
    plot_all_metrics_by_prediction_length,
)

apply_custom_style("../../config/plotting.yaml")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DEFAULT_COLORS = list(plt.rcParams["axes.prop_cycle"].by_key()["color"])

In [4]:
figs_save_dir = os.path.join("../../figures", "eval_metrics")
os.makedirs(figs_save_dir, exist_ok=True)

outputs_save_dir = os.path.join("../../outputs", "eval_metrics")
os.makedirs(outputs_save_dir, exist_ok=True)

In [5]:
WORK_DIR = os.getenv("WORK", "")
DATA_DIR = os.path.join(WORK_DIR, "data")

In [18]:
data_split = "test_zeroshot"

run_metrics_dir_dict = {
    "Panda": os.path.join(
        WORK_DIR,
        "eval_results",
        "patchtst",
        # "pft_stand_rff_only_pretrained-0",
        # "pft_chattn_noembed_pretrained_correct-0",
        "pft_chattn_emb_w_poly-0",
        # "pft_linattnpolyemb_from_scratch-0",
        data_split,
    ),
    # "Panda Univariate": os.path.join(
    #     WORK_DIR,
    #     "eval_results",
    #     "patchtst_univariate",
    #     # "pft_stand_rff_only_pretrained-0",
    #     # "pft_chattn_noembed_pretrained_correct-0",
    #     "pft_chattn_emb_w_poly-0",
    #     # "pft_linattnpolyemb_from_scratch-0",
    #     data_split,
    # ),
    "Dynamix": os.path.join(WORK_DIR, "eval_results", "dynamix", data_split),
    "Chronos 20M SFT": os.path.join(
        WORK_DIR,
        "eval_results",
        # "chronos_nondeterministic",
        # "chronos_sft",
        "chronos",
        "chronos_t5_mini_ft-0",
        data_split,
    ),
    "Chronos 46M SFT": os.path.join(WORK_DIR, "eval_results", "chronos", "chronos_small_ft_equalized-13", data_split),
    # "Chronos 20M": os.path.join(
    #     WORK_DIR,
    #     "eval_results",
    #     # "chronos_nondeterministic",
    #     "chronos",
    #     "chronos_mini_zeroshot",
    #     data_split,
    # ),
    # "Time MOE 50M": os.path.join(
    #     WORK_DIR,
    #     "eval_results",
    #     "timemoe",
    #     "timemoe-50m",
    #     data_split,
    # ),
    # "TimesFM 200M": os.path.join(
    #     WORK_DIR,
    #     "eval_results",
    #     "timesfm",
    #     "timesfm-200m",
    #     data_split,
    # ),
    "Chronos 200M": os.path.join(
        WORK_DIR,
        "eval_results",
        "chronos",
        # "chronos_nondeterministic",
        "chronos_base_zeroshot",
        data_split,
    ),
    "Chronos 200M Probabilistic": os.path.join(
        WORK_DIR,
        "eval_results",
        # "chronos",
        "chronos_nondeterministic",
        "chronos_base_zeroshot",
        data_split,
    ),
    # "Mean": os.path.join(
    #     WORK_DIR,
    #     "eval_results",
    #     "baselines",
    #     "mean",
    #     data_split,
    # ),
    # "Fourier": os.path.join(
    #     WORK_DIR,
    #     "eval_results",
    #     "baselines",
    #     "fourier",
    #     data_split,
    # ),
}

In [19]:
metrics_all_runs = defaultdict(dict)
for model_name, run_metrics_dir in run_metrics_dir_dict.items():
    if not os.path.exists(run_metrics_dir):
        print(f"Run metrics dir does not exist: {run_metrics_dir}")
        continue
    for file in sorted(
        filter(lambda x: x.endswith(".csv"), os.listdir(run_metrics_dir)),
        key=lambda x: int(x.split("_pred")[1].split(".csv")[0]),
    ):
        if file.endswith(".csv"):
            prediction_length = int(file.split("_pred")[1].split(".csv")[0])
            with open(os.path.join(run_metrics_dir, file)) as f:
                metrics = pd.read_csv(f).to_dict()
                metrics_all_runs[model_name][prediction_length] = metrics

Run metrics dir does not exist: /stor/work/Gilpin/eval_results/chronos/chronos_base_zeroshot/test_zeroshot
Run metrics dir does not exist: /stor/work/Gilpin/eval_results/chronos_nondeterministic/chronos_base_zeroshot/test_zeroshot


In [25]:
metrics_all_runs["Panda"][64]["mse"]

{0: '[0.039060335606336594, 0.00021469825878739357, 0.07476677745580673, 0.00015758676454424858, 0.00032846187241375446, 0.004642735235393047, 0.0016302639851346612, 0.0006162309437058866, 0.004991636145859957, 0.0006318216910585761, 0.01746196486055851, 0.0015416244277730584, 0.000758585287258029, 0.008591764606535435, 0.007268272805958986, 0.043941184878349304]',
 1: '[0.6305766105651855, 0.0018407216994091868, 0.002493147272616625, 1806.2879638671875, 50.76014709472656, 0.008383686654269695, 8731.7802734375, 0.5351545214653015]',
 2: '[0.03741774335503578, 1715743096832.0, 1666418606080.0, 971819.9375, 796926476288.0, 861473931264.0, 1639271890944.0, 172.25254821777344, 949483274240.0, 0.025774912908673286, 479133564928.0, 946831949824.0, 427193139200.0, 598081470464.0, 23376490.0, 533747105792.0, 349.0740661621094, 0.07465428858995438]',
 3: '[0.0012141873594373465, 0.008048392832279205, 0.004864222835749388, 0.007624994497746229, 0.0024682495277374983, 0.0035352774430066347, 0.002

In [9]:
len(metrics_all_runs["Chronos 20M SFT"][64]["smape"])

480

In [10]:
unrolled_metrics = defaultdict(dict)
for model_name, all_metrics_of_model in metrics_all_runs.items():
    for prediction_length, metrics in all_metrics_of_model.items():
        systems = metrics["system"]
        metrics_unrolled = {k: list(v.values()) for k, v in metrics.items() if k != "system"}
        unrolled_metrics[model_name][prediction_length] = metrics_unrolled

n_runs = len(unrolled_metrics.keys())

In [16]:
unrolled_metrics["Dynamix"]

{64: {'mse': ['[0.10186439752578735, 6.542663322761655e-05, 0.3471556603908539, 2.0847486666752957e-05, 0.0006396964308805764, 0.009281417354941368, 0.0017131129279732704, 0.00033775833435356617, 0.004785067867487669, 0.008032598532736301, 0.02606048434972763, 0.0018039558781310916, 0.002819779561832547, 0.06932687014341354, 43.38600158691406, 0.1157594695687294]',
   '[0.3594861328601837, 0.0009028299245983362, 0.004229324404150248, 0.0002809033903758973, 0.017037471756339073, 0.26934629678726196, 0.010307474993169308, 0.6570448279380798]',
   '[0.03526995703577995, 0.013427559286355972, 0.004806571640074253, 0.04012104123830795, 0.08698278665542603, 0.0699833482503891, 0.02894260734319687, 0.004522006958723068, 0.005330398678779602, 0.13669027388095856, 0.009549432434141636, 0.0742625966668129, 0.07204984873533249, 0.1130780503153801, 0.0660998597741127, 0.07639845460653305, 0.056472573429346085, 0.11079556494951248]',
   '[0.027310820296406746, 4.38641881942749, 5.214029788970947, 3

In [12]:
import pandas as pd
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

p_values = defaultdict(dict)
for baseline, baseline_metrics_per_pred_len in unrolled_metrics.items():
    if baseline == "Panda":
        continue
    for prediction_length, baseline_metrics in baseline_metrics_per_pred_len.items():
        for metric, baseline_metric_values in baseline_metrics.items():
            if metric not in ["mae", "mse", "smape"]:
                continue
            panda_metric_values = unrolled_metrics["Panda"][prediction_length][metric]
            print(panda_metric_values)
            result = wilcoxon(panda_metric_values, baseline_metric_values, correction=True)

            for key in ["pvalue", "statistic"]:
                metric_key = f"{metric}_{key}"
                value = getattr(result, key)
                if metric_key in p_values[prediction_length]:
                    p_values[prediction_length][metric_key][baseline] = value
                else:
                    p_values[prediction_length][metric_key] = {baseline: value}


pvals_128 = pd.DataFrame(p_values[128]).dropna()
pvals_256 = pd.DataFrame(p_values[256]).dropna()
pvals_512 = pd.DataFrame(p_values[512]).dropna()

for df in [pvals_128, pvals_256, pvals_512]:
    for col in filter(lambda x: "pvalue" in x, df.columns):
        correction = multipletests(df[col])
        df[f"{col}_pval_adj"] = correction[1]
        df[f"{col}_reject"] = correction[0]

pvals_128.to_csv(f"{outputs_save_dir}/pvals_128.csv")
pvals_256.to_csv(f"{outputs_save_dir}/pvals_256.csv")
pvals_512.to_csv(f"{outputs_save_dir}/pvals_512.csv")

['[0.039060335606336594, 0.00021469825878739357, 0.07476677745580673, 0.00015758676454424858, 0.00032846187241375446, 0.004642735235393047, 0.0016302639851346612, 0.0006162309437058866, 0.004991636145859957, 0.0006318216910585761, 0.01746196486055851, 0.0015416244277730584, 0.000758585287258029, 0.008591764606535435, 0.007268272805958986, 0.043941184878349304]', '[0.6305766105651855, 0.0018407216994091868, 0.002493147272616625, 1806.2879638671875, 50.76014709472656, 0.008383686654269695, 8731.7802734375, 0.5351545214653015]', '[0.03741774335503578, 1715743096832.0, 1666418606080.0, 971819.9375, 796926476288.0, 861473931264.0, 1639271890944.0, 172.25254821777344, 949483274240.0, 0.025774912908673286, 479133564928.0, 946831949824.0, 427193139200.0, 598081470464.0, 23376490.0, 533747105792.0, 349.0740661621094, 0.07465428858995438]', '[0.0012141873594373465, 0.008048392832279205, 0.004864222835749388, 0.007624994497746229, 0.0024682495277374983, 0.0035352774430066347, 0.002747929887846112

DTypePromotionError: The DType <class 'numpy._FloatAbstractDType'> could not be promoted by <class 'numpy.dtypes.StrDType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes.StrDType'>, <class 'numpy._FloatAbstractDType'>)

In [None]:
pvals_128

In [None]:
pvals_256

In [None]:
pvals_512

In [None]:
default_colors = DEFAULT_COLORS[: n_runs + 1]
default_colors = default_colors[:3] + default_colors[4:]
print(default_colors)

In [None]:
selected_metric = "smape"
legend_handles = make_box_plot(
    unrolled_metrics=unrolled_metrics,
    prediction_length=128,
    metric_to_plot=selected_metric,  # Specify which metric to plot
    sort_runs=True,  # Optionally sort runs by their metric values
    colors=default_colors,
    title=None,
    title_kwargs={"fontsize": 10},
    use_inv_spearman=True,
    order_by_metric="smape",
    save_path=f"{figs_save_dir}/{selected_metric}_128.pdf",
    ylabel_fontsize=12,
    show_xlabel=False,
    box_percentile_range=(25, 75),
    whisker_percentile_range=(5, 95),
    alpha_val=0.8,
    fig_kwargs={"figsize": (2, 4)},
    box_width=1.0,
)

In [None]:
selected_metric = "mae"
legend_handles = make_box_plot(
    unrolled_metrics=unrolled_metrics,
    prediction_length=128,
    metric_to_plot=selected_metric,  # Specify which metric to plot
    sort_runs=True,  # Optionally sort runs by their metric values
    colors=default_colors,
    title=None,
    title_kwargs={"fontsize": 10},
    use_inv_spearman=True,
    order_by_metric="smape",
    save_path=f"{figs_save_dir}/{selected_metric}_128.pdf",
    ylabel_fontsize=12,
    show_xlabel=False,
    box_percentile_range=(25, 75),
    whisker_percentile_range=(10, 80),
    alpha_val=0.8,
    fig_kwargs={"figsize": (2, 4)},
    box_width=1.0,
)

In [None]:
plt.figure(figsize=(6, 1))

# Add the legend with the combined handles
legend = plt.legend(
    handles=legend_handles,
    loc="upper center",
    frameon=True,
    ncol=5,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_horizontal_patches.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
plt.figure(figsize=(3, 2))

# Add the legend with the combined handles
legend = plt.legend(
    handles=legend_handles,
    loc="upper center",
    frameon=True,
    ncol=1,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_vertical_patches.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
smape_metrics_dict, has_nans = get_summary_metrics_dict(unrolled_metrics, "smape")

In [None]:
metrics = ["mse", "mae", "smape", "spearman"]
metrics_dicts, has_nans = zip(*[get_summary_metrics_dict(unrolled_metrics, metric) for metric in metrics])
all_metrics_dict = {m: metrics_dicts[i] for i, m in enumerate(metrics)}
has_nans_dict = {m: has_nans[i] for i, m in enumerate(metrics)}

In [None]:
# Count the number of NaNs for each metric and model
nan_counts = {}
for metric_name, metric_data in all_metrics_dict.items():
    nan_counts[metric_name] = {}
    for model_name, model_data in metric_data.items():
        # all_vals = np.concatenate(model_data["all_vals"])
        all_vals_pred128 = model_data["all_vals"][1]
        nan_count = np.isnan(all_vals_pred128).sum()
        nan_counts[metric_name][model_name] = nan_count
        if nan_count > 0:
            print(f"Found {nan_count} NaNs in {model_name} for {metric_name}")

print(nan_counts)

In [None]:
has_nans_dict

Order model names by sMAPE

In [None]:
model_names_ordering = []  # sorted by median smape at 128
for model_name, data in all_metrics_dict["smape"].items():
    median_metrics_128 = data["medians"][1]
    model_names_ordering.append((model_name, median_metrics_128))
model_names_ordering = sorted(model_names_ordering, key=lambda x: x[1])
model_names_ordering = [x[0] for x in model_names_ordering]
print(model_names_ordering)

# Reorder all_metrics_dict according to model_names_ordering for each metric
reordered_metrics_dict = {}
for metric_name, metric_data in all_metrics_dict.items():
    reordered_metric_data = {}

    # Add models in the order specified by model_names_ordering
    for model_name in model_names_ordering:
        if model_name in metric_data:
            reordered_metric_data[model_name] = metric_data[model_name]
        else:
            raise ValueError(f"Model {model_name} not found in {metric_name}")

    reordered_metrics_dict[metric_name] = reordered_metric_data
all_metrics_dict = reordered_metrics_dict

In [None]:
legend_handles = plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    ["mse", "mae", "smape", "spearman"],
    metrics_to_show_envelope=["mae", "smape"],
    n_cols=4,
    n_rows=1,
    save_path=f"{figs_save_dir}/zeroshot_metrics_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"loc": "upper left", "frameon": True, "fontsize": 10},
    colors=default_colors,
    use_inv_spearman=True,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
plt.figure(figsize=(6, 1))

# Add the legend with the combined handles
legend = plt.legend(
    handles=legend_handles,
    loc="upper center",
    frameon=True,
    ncol=5,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_horizontal.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
plt.figure(figsize=(3, 2))

# Add the legend with the combined handles
legend = plt.legend(
    handles=legend_handles,
    loc="upper center",
    frameon=True,
    ncol=1,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_vertical.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
metric_to_plot = "smape"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
metric_to_plot = "mae"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
metric_to_plot = "mse"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    # metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
metric_to_plot = "spearman"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
    use_inv_spearman=True,
)