# Figuras y Tablas

In [None]:
# TODO: train with d2_log_loss instead of neg_log_loss score (!)
# TODO: De-ignore from gitinfo docs/img and docs/data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import logging
import pickle
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import close
from sklearn.datasets import make_circles

from fkdc import config
from fkdc.config import _get_run_seeds, clasificadores, grillas, main_seed
from fkdc.datasets import Dataset
from fkdc.datasets import synth_datasets, found_datasets, datasets

from fkdc.tarea import Tarea
from fkdc.viz import (
    decision_boundary,
    default_palette,
    load_infos,
    loss_contour,
    parse_basic_info,
)

# Los experimentos del 29.10.2024 corrieron con main_seed = 2024
# Los experimentos del 19.05.2025 corrieron con main_seed = 2206
# Los experimentos del 23.05.2025 corrieron con main_seed = 2411
# Los experimentos del 07.06.2025 corrieron con main_seed = 1312
# Los experimentos del 12.06.2025 corrieron con main_seed = 1312 (igual a 07.06.2025)
# Seeds of the individual runs; the first one is the default seed for plots.
run_seeds = _get_run_seeds()
plotting_seed = run_seeds[0]

# NOTE(review): absolute, machine-specific path — the notebook only runs as-is
# on a machine with this layout. Consider deriving it from the repo location.
root_dir = Path("/Users/gonzalo/Git/fkdc")
run_dir = root_dir / "sandbox/v5/infos"
datasets_dir = run_dir / "../datasets"
img_dir = root_dir / "docs/img"
data_dir = root_dir / "docs/data"
# `parents=True` so that missing intermediate directories (e.g. `docs/` or
# `sandbox/v5/`) are created instead of raising FileNotFoundError.
# `run_dir` stays before `datasets_dir`, which is reached through `run_dir/..`.
for directory in [data_dir, img_dir, run_dir, datasets_dir]:
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Root logging config: records at INFO and above go to a stream handler, with
# timestamp, logger name and level. DEBUG records (e.g. `logger.debug(...)` in
# the loops below) are therefore not shown.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
# Logger used by the cells of this notebook.
logger = logging.getLogger("ipynb")

In [None]:
import warnings

from sklearn.exceptions import InconsistentVersionWarning

warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

infos = load_infos(run_dir)
bi = basic_info = parse_basic_info(infos, main_seed)

# Datasets


### For 2D datasets, for every clf & seeds
#### Decision boundaries


In [None]:
dataset, clf = "lunas_lo", "fkdc"
info, ds, tarea = decision_boundary(dataset, plotting_seed, clf)

In [None]:
def R2(mean_log_loss: pd.Series, logvero_base: float, n_eval: int) -> pd.Series:
    """Rescale a mean per-sample score into an R^2-like ratio against a baseline.

    Computes ``1 - (mean_log_loss * n_eval) / logvero_base``.

    Args:
        mean_log_loss: mean score per sample (scalar or Series).
        logvero_base: total score of the baseline model, used as denominator.
        n_eval: number of samples, used to turn the mean into a total.

    Returns:
        The ratio, with the same shape as ``mean_log_loss``.
    """
    total = mean_log_loss * n_eval
    return 1 - total / logvero_base


n_eval = len(tarea.y_eval)
logvero_base = info["base"].logvero
# Sanity check: the stored r2 must match 1 - logvero / base logvero. Compared
# with a tolerance because exact equality between floats is brittle.
assert np.isclose(1 - info[clf].logvero / logvero_base, info[clf].r2)
# R2 is plain arithmetic, so it is applied to the whole vector of CV scores at
# once instead of element by element.
R2(
    pd.Series(info[clf].busqueda.cv_results_["mean_test_score"]),
    logvero_base=logvero_base,
    n_eval=n_eval,
).describe()

In [None]:
datasets_D2 = [
    "circulos_lo",
    "circulos_hi",
    "lunas_lo",
    "lunas_hi",
    "espirales_lo",
    "espirales_hi",
    "anteojos",
]

In [None]:
# One decision-boundary figure per (2D dataset, classifier) pair, saved as SVG.
for dataset in datasets_D2:
    for clf in clasificadores:
        logger.debug([dataset, clf])
        fpath = img_dir / f"{dataset}-{clf}-decision_boundary.svg"
        fig, ax = plt.subplots(layout="tight")
        decision_boundary(dataset, plotting_seed, clf, ax=ax)
        logger.info(fpath)
        fig.savefig(fpath)
        # Release the figure so the loop does not accumulate open figures.
        close(fig)
        plt.close()

#### contourplot of score (loss?) surface



In [None]:
ret = loss_contour(
    "espirales_lo",
    seed=run_seeds[10],
    clf="fkdc",
    x="bandwidth",
    y="alpha",
    # other_params={"weights": "uniform"},
    cmap="viridis",
)
# TODO: loss in R^2 terms & clear color gradient
ax = ret[1]
ax.set_xscale("log")
# ret[1].set_xlim(1, 30)

In [None]:
ret = loss_contour(
    "espirales_lo",
    seed=run_seeds[5],
    clf="fkn",
    x="n_neighbors",
    y="alpha",
    other_params={"weights": "uniform"},
    cmap="viridis",
)
# TODO: loss in R^2 terms & clear color gradient
# ax.set_xscale("log")
# ret[1].set_xlim(1, 30)

In [None]:
for dataset, seed in product(datasets_D2, run_seeds):
    logger.info([dataset, seed])
    clf, x, y = "fkdc", "bandwidth", "alpha"
    fig, ax, *_ = loss_contour(dataset, seed, clf, x, y)
    ax.set_xscale("log")
    fpath = img_dir / f"{dataset}-{seed}-{clf}-{x}-{y}-loss_contour.svg"
    logger.info(fpath)
    fig.savefig(fpath)
    close(fig)
    plt.close()

In [None]:
# Score surface over (n_neighbors, alpha) for `fkn` with uniform weights:
# one SVG per (2D dataset, run seed).
for dataset, seed in product(datasets_D2, run_seeds):
    logger.info([dataset, seed])
    clf, x, y = "fkn", "n_neighbors", "alpha"
    fig, ax, *_ = loss_contour(
        dataset, seed, clf, x, y, other_params={"weights": "uniform"}
    )
    ax.set_xscale("log")
    # Same `-loss_contour.svg` suffix as the fkdc figures generated above.
    fpath = img_dir / f"{dataset}-{seed}-{clf}-{x}-{y}-loss_contour.svg"
    logger.info(fpath)
    fig.savefig(fpath)
    # Release the figure so the loop does not accumulate open figures.
    close(fig)
    plt.close()

### Custom Plots

In [None]:
ds = Dataset.de_fabrica(make_circles, n_samples=800, noise=0.05)
sns.jointplot(x=ds.X[:, 0], y=ds.X[:, 1], hue=ds.y, legend=False)
plt.xlim(-1.5, 1.5)
plt.ylim(-1.5, 1.5)
plt.savefig(img_dir / "dos-circulos-jointplot.svg")
# plt.close()

In [None]:
fpath = datasets_dir / f"espirales_lo-{plotting_seed}.pkl"
with open(fpath, "rb") as fp:
    ds = pickle.load(fp)
sns.jointplot(x=ds.X[:, 0], y=ds.X[:, 1], hue=ds.y, legend=False)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.savefig(img_dir / (fpath.stem + "-jointplot.svg"))
# plt.close()

In [None]:
# Rank the seeds of `dataset` by how much `compare` and `base` differ in R^2.
dataset = "espirales_lo"
compare = "fkn"
base = "kn"
# NOTE(review): the list is sorted by |delta_r2| in ASCENDING order although the
# variable name ends in `_desc` — confirm which of the two is intended.
semillas_segun_delta_r2_desc = (
    bi[bi.dataset.eq(dataset) & bi.clf.isin([compare, base])]
    .set_index(["semilla", "clf"])
    .r2.unstack()  # one row per seed, one column per classifier
    .assign(delta_r2=lambda row: np.abs(row[compare] - row[base]))
    .sort_values("delta_r2", ascending=True)
    .index.tolist()
)
# First three seeds = smallest absolute gap; last three = largest absolute gap.
mejores_semillas, peores_semillas = (
    semillas_segun_delta_r2_desc[:3],
    semillas_segun_delta_r2_desc[-3:],
)

In [None]:
dataset = "helices_0"

# CV results (one DataFrame per `infos` entry) for the classifiers whose name
# ends in "kdc" on `dataset`. Keys of `infos` are tuples: k[0] is matched
# against the dataset name and k[2] is used as the classifier name.
infos_relevantes = {
    k: pd.DataFrame(v[k[2]]["busqueda"].cv_results_)
    for k, v in infos.items()
    if k[0] == dataset and k[2].endswith("kdc")
}
# Stack them into a single frame; the `infos` key components become the outer
# index levels and the original row index becomes the innermost level ("run").
df = pd.concat(
    infos_relevantes.values(),
    keys=infos_relevantes.keys(),
    names=("dataset", "seed", "clf", "main_seed", "scoring", "run"),
)

In [None]:
# Best estimator found by each "*kdc" search on `dataset`.
best_estimators = {
    k: v[k[2]]["busqueda"].best_estimator_
    for k, v in infos.items()
    if k[0] == dataset and k[2].endswith("kdc")
}
# One record per search with its selected hyperparameters. The seed is read
# from k[1] when `dataset` is in `synth_datasets` and from k[3] otherwise.
best_params = [
    {
        "clf": k[2],
        "semilla": int(k[1 if dataset in synth_datasets else 3]),
        "alpha": v.get_params()["alpha"],  # if k[2] == "fkdc" else 1,
        "bandwidth": v.get_params()["bandwidth"],
    }
    for k, v in best_estimators.items()
]
# Indexed by (seed, classifier) so it can be aligned with the scores later on.
best_params = pd.DataFrame.from_records(best_params).set_index(
    ["semilla", "clf"]
)  # .unstack("clf")

In [None]:
scores = (
    bi[bi.dataset.eq(dataset) & bi.clf.str.endswith("kdc")]
    .set_index(["semilla", "clf"])
    .r2
)  # .unstack("clf")

In [None]:
best_params["r2"] = scores

In [None]:
best_params.reset_index()[
    ["clf", "alpha", "bandwidth"]
].value_counts().sort_index().reset_index().round(4).to_csv(
    data_dir / f"{dataset}-best_params.csv", index=False
)

In [None]:
best_params.reset_index().groupby(
    ["clf", "alpha", "bandwidth"]
).r2.agg(["count", "mean", "median"]).round(4)

In [None]:
fig, ax = plt.subplots(layout="tight")
best_params["r2"] = scores
sns.scatterplot(best_params, x="bandwidth", y="r2", hue="clf", ax=ax)
fig.savefig(img_dir / f"{dataset}-[f]kdc-r2-vs-bandwidth.png")

In [None]:
bi[bi.dataset.eq(dataset)].groupby("clf").r2.median().round(4).sort_values(ascending=False)

In [None]:
per_seed = best_params.unstack("clf").assign(
    delta_h=lambda df: df.bandwidth.kdc.sub(df.bandwidth.fkdc),
    delta_r2=lambda df: df.r2.kdc.sub(df.r2.fkdc),
)

In [None]:
fig, ax = plt.subplots(layout="tight")
sns.scatterplot(per_seed, x="delta_h", y="delta_r2")
fig.savefig(img_dir / f"{dataset}-[f]kdc-delta_r2-vs-delta_h.png")

In [None]:
best_params.groupby(["clf", "bandwidth"]).r2.agg("median").reset_index().sort_values(
    "bandwidth"
)

fkdc:
  - 0.03162277660168379
  - 0.05623413251903491
  - 0.1
  - 0.1778279410038923
  - 0.31622776601683794
  - 0.5623413251903491
  - 1.0

kdc:
  - 0.055994858066093015
  - 0.06755066513294422
  - 0.08149127469020749
  - 0.09830884473994828
  - 0.11859710123376706
  - 0.14307229891937587
  - 0.17259850793256232
  - 0.20821811885006605
  - 0.25118864315095824
  - 0.3030271082866399
  - 0.36556361467894144
  - 0.44100594541767413

In [None]:
(1.00000456 ** 5) / 1.00000456

In [None]:
df = pd.concat(
    infos_relevantes.values(),
    keys=infos_relevantes.keys(),
    names=("dataset", "seed", "clf", "main_seed", "scoring", "run"),
)

In [None]:
# How often each (alpha, bandwidth) pair ranked first in CV for fkdc.
# `df` was built from the runs of `dataset`, so the file is named after it
# (same convention as `{dataset}-best_params.csv`).
df.xs("fkdc", level="clf").query("rank_test_score == 1")[
    ["param_alpha", "param_bandwidth"]
].value_counts().reset_index().to_csv(
    data_dir / f"{dataset}-best_test_params.csv", index=False
)

## 2D, low noise: lunas_lo, circulos_lo, espirales_lo
d = 1, D = 2, k = 2, n = 800 con split 50/50

In [None]:
clfs = config.clasificadores.keys()
tuple(clfs)

In [None]:
("fkdc", "kdc", "gnb", "kn", "fkn", "lr", "slr", "svc", "gbt")

In [None]:
# Per-seed scatter of the Fermat variant (y) against its Euclidean base (x),
# for each metric, each low-noise 2D dataset and each classifier family.
for metric, dataset, sufijo in product(
    ("accuracy", "r2"), ("lunas_lo", "circulos_lo", "espirales_lo"), ("kn", "kdc")
):
    x = sufijo
    y = f"f{sufijo}"
    data = (
        bi[bi.dataset.eq(dataset) & bi.clf.str.endswith(sufijo)]
        .set_index(["semilla", "clf"])[metric]
        .unstack()
    )
    fig, ax = plt.subplots(layout="tight")
    data.plot(kind="scatter", y=y, x=x, ax=ax)
    # 10% of the overall value range as margin. Kept in a dedicated name so the
    # builtin `range` is not shadowed for the rest of the notebook.
    margen = 0.1 * (data.max().max() - data.min().min())
    x_left = data.min()[x] - margen
    ax.set_xlim(x_left)
    ax.set_ylim(data.min()[y] - margen)
    # Identity line: points above it are seeds where the Fermat variant wins.
    ax.axline((x_left, x_left), slope=1, color="gray", linestyle="dotted")
    # The title names the metric actually plotted (it is not always R^2).
    etiqueta = "$R^2$" if metric == "r2" else metric
    ax.set_title(f"{etiqueta} por semilla para {y} y {x} en `{dataset}`")
    fpath = img_dir / f"{dataset}-{x}-{y}-{metric}-scatter.svg"
    logger.info(fpath)
    fig.savefig(fpath)
    close(fig)
    plt.close()

In [None]:
# For each metric, find the classifier with the highest median per dataset and
# export how many (and which) datasets each classifier wins.
for metric in ["r2", "accuracy"]:
    best = (
        bi.dropna(subset=metric)
        .groupby(["dataset", "clf"])[metric]
        .median()
        .sort_values()  # ascending, so the last row of each dataset is its best
        .reset_index("clf")
        .groupby("dataset")
        .last()
        .clf.reset_index()
        .groupby("clf")
        .agg([len, ", ".join])
    )
    best.columns = ["cant", "datasets"]
    # Pass the path itself: pandas opens and closes the file, whereas a bare
    # `open(...)` handle would never be closed.
    best.sort_values("cant", ascending=False).to_csv(
        data_dir / f"mejor-clf-por-dataset-segun-{metric}-mediano.csv"
    )

In [None]:
bi.clf.unique()

In [None]:
bi.dataset.nunique() * bi.clf.nunique() * 25

In [None]:
(
    bi[bi.dataset.eq("helices_0") & bi.clf.str.endswith("kn")]
    .set_index(["semilla", "clf"])
    .r2.unstack("clf")
    .plot(x="kn", y="fkn", kind="scatter")
)

In [None]:
bi[bi.dataset.eq("eslabones_0") & bi.clf.str.endswith("kn")].set_index(
    ["semilla", "clf"]
).r2.unstack().plot(kind="scatter", y="fkn", x="kn")
# .assign(delta=lambda df: df.fkn.sub(df.kn)).mul(100).round(2)
plt.axline((0.5, 0.5), slope=1, color="gray", linestyle="dotted")
plt.title("$R^2$ por semilla para FKN y KN en `eslabones_0`")

In [None]:
pd.DataFrame(
    {
        k[1:3]: info[k[2]].busqueda.best_estimator_.get_params()
        for k, info in infos.items()
        if k[0] == "helices_0" and k[2].endswith("kn")
    }
).T[["alpha", "n_neighbors"]].unstack().reorder_levels([1, 0], axis=1).sort_index(
    axis=1
).drop(columns=("kn", "alpha"))  # .value_counts().reset_index().round(4)

In [None]:
pd.DataFrame(
    {
        k[1:3]: info[k[2]].busqueda.best_estimator_.get_params()
        for k, info in infos.items()
        if k[0] == "helices_0" and k[2].endswith("kn")
    }
)

In [None]:
# Detalle en espirales_lo, perfiles pérdida
run_seed = 7060
alpha = 2.25
clave = ("espirales_lo", str(run_seed), "fkdc", str(main_seed), "neg_log_loss")
orig_info = infos[clave]
busqueda = pd.DataFrame(orig_info.fkdc.busqueda.cv_results_)

In [None]:
orig_mean_test_scores = (
    busqueda[busqueda.param_alpha == alpha].set_index("param_bandwidth").mean_test_score
)  # / len(info.fkdc.preds)
orig_mean_test_scores.plot(figsize=(18, 5))
plt.scatter(orig_mean_test_scores.index, orig_mean_test_scores)
plt.xscale("symlog")

In [None]:
espirales_lo = Dataset.cargar(datasets_dir / f"espirales_lo-{run_seed}.pkl")
espirales_lo.scatter()

In [None]:
from fkdc.fermat import KDClassifier

clf = KDClassifier(metric="fermat", alpha=1.5)

In [None]:
tarea = Tarea(
    espirales_lo,
    {
        bandwidth: (clf, {"bandwidth": [bandwidth]})
        for bandwidth in orig_mean_test_scores.index
    },
    seed=main_seed,
    scoring="neg_log_loss",
    split_evaluacion=0.5,
)
tarea.entrenar()
tarea.evaluar()
new_info = tarea.info

In [None]:
n_eval = len(new_info[0.001].preds)
new_metrics = pd.DataFrame.from_dict(
    {
        bw: {
            "eval_logvero": info.logvero,
            "eval_r2": info.r2,
            "mean_test_score": info.busqueda.cv_results_["mean_test_score"][0],
            "std_test_score": info.busqueda.cv_results_["std_test_score"][0],
            "eval_accuracy": info.accuracy,
        }
        for bw, info in new_info.items()
        if bw != "base"
    },
    orient="index",
)
new_metrics["rank_test_score"] = (-new_metrics.mean_test_score).rank().astype(int)
# assert orig_mean_test_scores.equals(new_metrics.mean_test_score)
fig, ax1 = plt.subplots(figsize=(12, 5))
new_metrics.eval_logvero.plot(label="eval")
new_metrics.mean_test_score.mul(n_eval).plot(label="test")
ax1.set_ylabel("logvero (solid lines)")
plt.legend()
ax2 = ax1.twinx()
new_metrics.eval_accuracy.plot(ax=ax2, linestyle="dotted")
ax2.set_ylabel("accuracy (dotted lines)")
plt.xscale("log")

In [None]:
max_score, sd = busqueda[busqueda.rank_test_score.eq(1)][
    ["mean_test_score", "std_test_score"]
].min()
busqueda[busqueda.mean_test_score.ge(max_score - sd)].sort_values(
    ["param_alpha", "param_bandwidth"], ascending=[True, False]
)[
    [
        "rank_test_score",
        "param_alpha",
        "param_bandwidth",
        "mean_test_score",
        "std_test_score",
        "mean_train_score",
    ]
]

In [None]:
new_max_score, new_sd = new_metrics[new_metrics.rank_test_score.eq(1)][
    ["mean_test_score", "std_test_score"]
].min()
new_metrics[new_metrics.mean_test_score.ge(new_max_score - new_sd)].sort_index(
    ascending=False
)

### 2D, high noise

In [None]:
import pickle

# Side-by-side scatter plots of the three high-noise 2D datasets for one seed.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
seed = run_seeds[0]
datasets_2d = ["lunas", "espirales", "circulos"]
for nombre, ax in zip(datasets_2d, axs, strict=False):
    # `read_bytes` opens and closes the file; `pickle.load(open(...))` left the
    # handle open.
    ds = pickle.loads((datasets_dir / f"{nombre}_hi-{seed}.pkl").read_bytes())
    ds.scatter(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-lunas-circulos-espirales-hi-new.svg")

In [None]:
datasets = [ds for ds in bi.dataset.unique() if ds.endswith("_hi")]

In [None]:
(
    bi[bi.dataset.isin(datasets)]
    .groupby(["clf", "dataset"])[["r2", "accuracy"]]
    .mean()
    .unstack()
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)
    .sort_values(("circulos_hi", "r2"), ascending=False)
    .mul(100)
    .round(2)
)
# TBL 2: datsets 2d low noise

- R^2 consistentemente el mejor para (f)KDC, pero sin diferencias entre fermat y euclideo, pero muy jodido en gral
- accuracy no mucho peor que SVC (que sigue siendo el rey)
  - en lunas_hi gana FKDC!
  - en circulos_hi, lunas_hi GBT competitivo

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
metric = "r2"
for idx, dataset in enumerate(datasets):
    ax = axs[idx]
    data = bi[
        bi.dataset.eq(dataset) & bi[metric].notna()
    ].sort_values("clf")
    sns.boxplot(data, hue="clf", y=metric, gap=0.2, ax=ax, palette=default_palette)
    ax.set_title(dataset)
    ax.axhline(
        data.groupby("clf")[metric].median().max(), linestyle="dotted", color="gray"
    )
    ybot, ytop = np.percentile(data[metric].dropna(), [25, 100])
    ax.set_ylim(ybot * 0.99, ytop * 1.01)
    if idx != 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "boxplot-lunas-espirales-circulos-new.svg")
# IMG 2: Detalle metric boxplot

In [None]:
bi2d = bi[bi.dataset.str.endswith(("_lo", "_hi"))].copy()
bi2d[["figura", "ruido"]] = bi2d.dataset.str.split("_", expand=True)

In [None]:
drops = (
    bi2d.groupby(["figura", "ruido", "clf"])[["r2", "accuracy"]]
    .mean()
    .unstack("ruido")
    
)
drops


In [None]:

for figura, metric in product(bi2d.figura.unique(), ["r2", "accuracy"]):
    fig, ax = plt.subplots(layout="tight")
    (
        drops.xs(figura)[metric][["lo", "hi"]]
        .sort_values("hi", ascending=False)
        .plot(kind="bar", ax=ax)
    )
    plt.title(f"Caída absoluta de {metric} en {figura}")
    fig.savefig(img_dir / f"{figura}-caida_{metric}.svg")

- Los órdenes se mantienen: quien andaba mejor en lo, anda mejor en hi, +- un cachito, pero los que _mejor_ andaban, más pierden

### 3d, low dim

In [None]:
import pickle

fig = plt.figure(figsize=(10, 10))
seed = run_seeds[0]  # 1134
nombres_datasets = ["pionono", "eslabones", "helices", "hueveras"]

datasets = {
    nombre: pickle.load(open(datasets_dir / f"{nombre}_0-{seed}.pkl", "rb"))
    for nombre in nombres_datasets
}
for idx, (nombre, ds) in enumerate(datasets.items(), start=1):
    ax = fig.add_subplot(2, 2, idx, projection="3d")
    ds.scatter_3d(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-3d-0.svg")
# IMG 2: datasets 2d, low noise

In [None]:
import pickle

nombres_datasets = ["pionono", "eslabones", "helices", "hueveras"]

datasets = {
    nombre: pickle.load(open(datasets_dir / f"{nombre}_0-{plotting_seed}.pkl", "rb"))
    for nombre in nombres_datasets
}
for nombre, ds in datasets.items():
    fig, ax = plt.subplots(layout="tight", subplot_kw={'projection': '3d'})
    ds.scatter_3d(ax=ax)
    ax.set_title(nombre)
    fpath = img_dir / f"{nombre}-scatter-3d.svg"
    logger.debug(fpath)
    fig.savefig(fpath)
    close(fig)
    plt.close()

In [None]:
for nombre, ds in datasets.items():
    ds.pairplot(dims=[2, 1, 0], height=2, plot_kws={"alpha": 0.5, "s": 5}, corner=True)
    plt.suptitle(nombre)
    break

In [None]:
(
    bi[bi.dataset.isin(f"{nombre}_0" for nombre in nombres_datasets)]
    .groupby(["clf", "dataset"])[["r2", "accuracy"]]
    .mean()
    .unstack()
    .reorder_levels([1, 0], 1)
    .sort_index(axis=1)
    .mul(100)
    .round(2)
    .sort_values(("pionono_0", "r2"), ascending=False)
)  # TBL 2: datsets 2d low noise

- helices y eslabones muy fáciles
- en hueveras a fKDC le duele la varianza agregada versus eKDC
- (f)KDC best in class for acc & r2 (acc like SVC, plus R^2), but mostly no diff
  - r^2 empeora algo en helices con fkdc, sigue siendo mucho mejor que (f)KN, resto ni computa

In [None]:
# Classifiers left out of the boxplots. NOTE(review): no earlier cell defines
# this name, so it is set here; empty means "keep every classifier" — TODO
# confirm the intended list.
exclude_clfs: list[str] = []

# R^2 boxplots per classifier, one panel per 3D dataset without noise dims.
fig, axs = plt.subplots(2, 2, figsize=(8, 8))
metric = "r2"
for (idx, nombre), ax in zip(enumerate(nombres_datasets), axs.flatten(), strict=False):
    data = bi[
        bi.dataset.eq(f"{nombre}_0") & ~bi.clf.isin(exclude_clfs) & bi[metric].notna()
    ].sort_values("clf")
    sns.boxplot(data, hue="clf", y=metric, gap=0.2, ax=ax, palette=default_palette)
    ax.set_title(nombre)
    # Dotted reference line at the best median across classifiers.
    ax.axhline(
        data.groupby("clf")[metric].median().max(), linestyle="dotted", color="gray"
    )
    # Zoom in: hide everything below the 40th percentile of the metric.
    ybot = np.percentile(data[metric].dropna(), 40)
    ax.set_ylim(ybot, None)
    # Keep the legend on the first panel only.
    if idx != 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "3d-low-r2.svg")
# IMG 2: Detalle metric boxplot

In [None]:
clave_fkdc = ("pionono_0", str(run_seeds[6]), "fkdc", str(main_seed), "neg_log_loss")
clave_kdc = ("pionono_0", str(run_seeds[6]), "kdc", str(main_seed), "neg_log_loss")

Algún comentario sobre pionono de sapienza???

### 3d, high dim
- no hace falta reproducir, pues el resto son puro ruido

In [None]:
import pickle

# Load the four 3D datasets with 12 extra dimensions for the first run seed.
# No figure is created here: nothing in this cell draws on one.
seed = run_seeds[0]  # 1134
nombres_datasets = ["pionono", "eslabones", "helices", "hueveras"]

# `read_bytes` opens and closes each file; `pickle.load(open(...))` left the
# handles open.
datasets = {
    nombre: pickle.loads((datasets_dir / f"{nombre}_12-{seed}.pkl").read_bytes())
    for nombre in nombres_datasets
}

In [None]:
for nombre, ds in datasets.items():
    ds.pairplot(dims=[3, 4, 5], height=2, plot_kws={"alpha": 0.5, "s": 5}, corner=True)
    plt.suptitle(nombre)
    break

In [None]:
(
    bi[bi.dataset.isin(f"{nombre}_12" for nombre in nombres_datasets)]
    .groupby(["clf", "dataset"])[["r2", "accuracy"]]
    .mean()
    .unstack()
    .reorder_levels([1, 0], 1)
    .sort_index(axis=1)
    .mul(100)
    .round(2)
    .sort_values(("eslabones_12", "accuracy"), ascending=False)
)  # TBL 2: datsets 2d low noise

- eslabones: (f)[kdc|kn] _zafan_ contra svc, pero pierden con gnb y gbt, que brillan:
  - gnb: las dimensiones de ruido le duelen muy poquito, pues todo es independiente
  - gbt: bien no paramétrico, puede "ignorar" fácilmente las dimensiones extra
Una manera de verlo: generic feature importances (permutation importance)

- helices, hueveras: muy dificiles para todos (en acc y r2), pero (f)kdc encima dan r^2 _negativo_, con confianza le erran
- NO parece culpa de fermat: en fkn vs kn, r^2 da igualito
  - interesante: ¿por qué el r^2 de kdc es tanto peor que el de fkdc en hueveras?

- pionono: gbt brilla+, gnb brilla, resto malísimo: acá de hecho LR anda mejor que (f)[kdc|kn]

LA MALDICIÓN DE LA DIMENSIONALIDAD SIGUE JODIENDO...

In [None]:
from sklearn.inspection import permutation_importance

# https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html

In [None]:
nombre = "pionono_12"
semilla = 7060
ds = Dataset.cargar(datasets_dir / f"{nombre}-{semilla}.pkl")

In [None]:
main_seed

In [None]:
# Mean permutation importance of every feature of `ds`, per classifier, using
# the best estimator of the corresponding stored search.
imps = {}
for clf in clasificadores:
    logger.info(f"Processing permutation importance for {clf}")
    # svc runs are stored under "accuracy"; every other classifier under
    # "neg_log_loss".
    scoring = "neg_log_loss" if clf != "svc" else "accuracy"
    info = infos[(nombre, str(semilla), clf, str(main_seed), scoring)]
    best = info[clf].busqueda.best_estimator_
    # Fixed `random_state` so the shuffles, and hence the importances, are
    # reproducible across runs of the notebook.
    importances = permutation_importance(
        best, ds.X, ds.y, scoring=scoring, random_state=main_seed
    )
    imps[clf] = importances["importances_mean"]

In [None]:
imps = pd.DataFrame(imps)
imps[["fkdc", "gbt", "svc", "gnb"]].loc[:5].plot(kind="bar", color=default_palette)
plt.yscale("log")

In [None]:
# R^2 boxplots per classifier, one panel per 3D dataset with 12 extra dims.
fig, axs = plt.subplots(2, 2, figsize=(8, 8))
metric = "r2"
for (idx, nombre), ax in zip(enumerate(nombres_datasets), axs.flatten(), strict=False):
    data = bi[
        bi.dataset.eq(f"{nombre}_12") & ~bi.clf.isin(exclude_clfs) & bi[metric].notna()
    ].sort_values("clf")
    sns.boxplot(data, hue="clf", y=metric, gap=0.2, ax=ax, palette=default_palette)
    ax.set_title(nombre)
    # Dotted reference line at the best median across classifiers.
    ax.axhline(
        data.groupby("clf")[metric].median().max(), linestyle="dotted", color="gray"
    )
    # Zoom in: hide everything below the 10th percentile of the metric.
    ybot = np.percentile(data[metric].dropna(), 10)
    ax.set_ylim(ybot, None)
    # Keep the legend on the first panel only.
    if idx != 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
# Own file name: "3d-low-r2.svg" is the figure of the `_0` datasets and must
# not be overwritten by this one.
fig.savefig(img_dir / "3d-high-r2.svg")
# IMG 2: Detalle metric boxplot

In [None]:
r2s = bi[bi.dataset.eq("hueveras_12")].set_index(["semilla", "clf"]).r2.unstack()
r2s[["fkn", "kn"]].plot(kind="bar")
# ínfima diferencia, pero en genral pareja p/ambos

In [None]:
(-r2s[["fkdc", "kdc"]]).plot(kind="bar")
# mucha diferencia, siempre en contra de KDC. los best params?

In [None]:
nombre = "hueveras_12"
clf = "fkdc"
scoring = "neg_log_loss" if clf != "svc" else "accuracy"
info = infos[(nombre, str(semilla), clf, str(main_seed), scoring)][clf]

In [None]:
# Mean CV score of every grid point, arranged as an alpha x bandwidth matrix.
busqueda = pd.DataFrame(info.busqueda.cv_results_)
data = busqueda.set_index(["param_alpha", "param_bandwidth"]).mean_test_score.unstack()
X = data.columns.values
Y = data.index.values
Z = data.values

fig, ax = plt.subplots(layout="constrained")
CS = ax.contourf(X, Y, Z, 15, cmap="viridis")
# The surface is the search's `mean_test_score` under `scoring`, so the labels
# name that quantity rather than accuracy.
ax.set_title(f"mean_test_score ({scoring}) para $\\alpha$ y $h$")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
# Red crosses: best-scoring bandwidth for each alpha (row-wise argmax).
ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel(f"mean_test_score ({scoring})")
ax.set_xscale("log")
# File name carries classifier, dataset and seed so that the other heatmap
# cells do not overwrite this figure.
fig.savefig(img_dir / f"heatmap-{clf}-{nombre}-{semilla}.svg")

da todo igual!

In [None]:
nombre = "hueveras_12"
clf = "kdc"
scoring = "neg_log_loss" if clf != "svc" else "accuracy"
info = infos[(nombre, str(semilla), clf, str(main_seed), scoring)][clf]

In [None]:
busqueda = pd.DataFrame(info.busqueda.cv_results_)
busqueda.set_index("param_bandwidth")[
    [
        "mean_test_score",
        "mean_train_score",
    ]
].plot()
plt.axvline(info.busqueda.best_estimator_.bandwidth, color="gray", linestyle="dotted")
plt.xscale("log")

In [None]:
pd.concat(
    [
        pd.DataFrame.from_dict(
            {
                k[1]: v.fkdc.busqueda.best_params_
                for k, v in infos.items()
                if k[0] == "hueveras_12" and k[2] == "fkdc"
            },
            orient="index",
        ).add_prefix("fkdc_"),
        pd.DataFrame.from_dict(
            {
                k[1]: v.kdc.busqueda.best_params_
                for k, v in infos.items()
                if k[0] == "hueveras_12" and k[2] == "kdc"
            },
            orient="index",
        ).add_prefix("kdc_"),
        bi[bi.clf.str.endswith("kdc") & bi.dataset.eq("hueveras_12")]
        .assign(semilla=lambda df: df.semilla.astype(str))
        .set_index(["semilla", "clf"])
        .r2.mul(10000)
        .astype(int)
        .unstack()
        .add_suffix("_logvero"),
    ],
    axis=1,
).sort_index(axis=1)

Los dos estimadores están contra los "bordes" de su caja de hiperparámetros, y el paisaje/superficie de pérdida es súper plana, la diferencia es que por alguna razón fkdc paró mejor en el borde de máxima parsimonia (1, 100), mientras que kdc encontró mínimas diferencias numéricas y está agarrando "óptimos" muy chiquitos que son marginalmente mejores que 10K (y cuando lo agarra, da simplemente 0 el R^2). 

Otro caso más para empujar una regla tipo 1SD rule arriba del refit parsimonioso.

In [None]:
bi3d = bi[bi.dataset.str.endswith(("_0", "_12"))].copy()
bi3d[["figura", "dims_ruido"]] = bi3d.dataset.str.split("_", expand=True)

In [None]:
metric = "accuracy"
drops = (
    bi3d.groupby(["figura", "dims_ruido", "clf"])[["r2", "accuracy"]]
    .mean()[metric]
    .unstack("dims_ruido")
    .assign(rel_drop=lambda df: (df["0"] - df["12"]).div(df["0"]))
)
drops.rel_drop.unstack("figura").dropna().mul(100).round(2)

In [None]:
for figura in bi3d.figura.unique():
    (
        drops.xs(figura)
        .drop(columns="rel_drop")[["0", "12"]]
        .sort_values("12", ascending=False)
        .plot(kind="bar")
    )
    plt.title(f"Caída absoluta de {metric} en {figura}")

### Multi-k

In [None]:
import pickle

fig, axs = plt.subplots(2, 2, figsize=(10, 10))
datasets_multik = ["anteojos", "iris", "vino", "pinguinos"]
for nombre, ax in zip(datasets_multik, axs.flatten(), strict=False):
    ds = pickle.load(open(datasets_dir / f"{nombre}.pkl", "rb"))
    ds.scatter(ax=ax)
    ax.set_title(f"{nombre} (n={ds.n}; p={ds.p}, k={ds.k})")
plt.tight_layout()
# fig.savefig(img_dir / "datasets-lunas-circulos-espirales-hi-new.svg")

In [None]:
(
    bi[bi.dataset.isin(datasets_multik)]
    .groupby(["clf", "dataset"])[["r2", "accuracy"]]
    .mean()
    .unstack()
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)
    .sort_values(("iris", "r2"), ascending=False)
    .mul(100)
    .round(2)
)
# TBL 2: datsets 2d low noise

#### Anteojos
espectacular performance de fKDC en R2, con acc ==SVC, pero a esta altura es imposible encontrar una diferencia entre ambos, clarito.

In [None]:
nombre_dataset = "anteojos"
seed = 1434
# clfs = ["fkdc", "kdc", "kn", "fkn", "gbt", "lr", "slr", "gnb", "svc"]
clfs = basic_info.clf.unique().tolist()
dataset = pickle.load(open(datasets_dir / f"{nombre_dataset}.pkl", "rb"))
# dataset = pickle.load(open(datasets_dir / f"{nombre_dataset}-{seed}.pkl", "rb"))

In [None]:
tarea = Tarea(dataset, {}, seed=seed, split_evaluacion=0.5)


#### Iris
Acc esta OK, pero R2 sufre bocha en el promedio por algunas muy malas semillas que llevan el R2 a terreno negativo

In [None]:
bi[bi.clf.str.endswith("kdc") & bi.dataset.eq("iris")].pivot(
    index="semilla", columns="clf", values="r2"
).sort_values("fkdc").plot(kind="bar")

In [None]:
bad_seed = 1182
clf = "fkdc"
clave = ("iris", str(None), clf, str(bad_seed), "neg_log_loss")
info = infos[clave][clf]

In [None]:
# Mean CV score of every grid point, arranged as an alpha x bandwidth matrix.
busqueda = pd.DataFrame(info.busqueda.cv_results_)
data = busqueda.set_index(["param_alpha", "param_bandwidth"]).mean_test_score.unstack()
X = data.columns.values
Y = data.index.values
Z = data.values

fig, ax = plt.subplots(layout="constrained")
CS = ax.contourf(X, Y, Z, 15, cmap="viridis")
# The surface is the search's `mean_test_score` under the scoring stored in the
# last component of `clave`, so the labels name that quantity, not accuracy.
ax.set_title(f"mean_test_score ({clave[-1]}) para $\\alpha$ y $h$")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
# Red crosses: best-scoring bandwidth for each alpha (row-wise argmax).
ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel(f"mean_test_score ({clave[-1]})")
ax.set_xscale("log")
# File name carries classifier, dataset (first component of `clave`) and seed
# so that the other heatmap cells do not overwrite this figure.
fig.savefig(img_dir / f"heatmap-{clf}-{clave[0]}-{bad_seed}.svg")

In [None]:
bad_seed = 5640
clf = "fkdc"
clave = ("iris", str(None), clf, str(bad_seed), "neg_log_loss")
info = infos[clave][clf]

In [None]:
busqueda = pd.DataFrame(info.busqueda.cv_results_)
data = busqueda.set_index(["param_alpha", "param_bandwidth"]).mean_test_score.unstack()
X = data.columns.values
Y = data.index.values
Z = data.values

fig, ax = plt.subplots(layout="constrained")
zmin, zmax = Z.min(), Z.max()
CS = ax.contourf(X, Y, Z, 15, cmap="viridis")
# CS = ax.contour(X, Y, Z, 15, cmap="viridis")
ax.set_title("Exactitud para $\\alpha$ y $h$")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
# Make a colorbar for the ContourSet returned by the contourf call.
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel("Exactitud")
ax.set_xscale("log")
# plt.tight_layout()
# fig.savefig(img_dir / "heatmap-fkdc-2d-lo-new.svg")

In [None]:
nombre = "iris"
clf = "kdc"
scoring = "neg_log_loss" if clf != "svc" else "accuracy"
# seeds = [1134, 3923, 6825, 4505]  # good, bad, bad, good
seeds = _get_run_seeds()[:4]
height = 5
fig, axs = plt.subplots(1, len(seeds), figsize=(height * len(seeds), height))
for seed, ax in zip(seeds, axs, strict=False):
    info = infos[(nombre, str(None), clf, str(seed), scoring)][clf]
    busqueda = pd.DataFrame(info.busqueda.cv_results_)
    busqueda.set_index("param_bandwidth")[
        [
            "mean_test_score",
            "mean_train_score",
        ]
    ].plot(ax=ax)
    ax.set_title(f"{seed}")
    ax.axvline(
        info.busqueda.best_estimator_.bandwidth, color="gray", linestyle="dotted"
    )
    ax.axhline(info.logvero / len(info.preds), color="C2", label="mean_eval_score")
    ax.legend()
    ax.set_xscale("log")

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(8, 8))
metric = "r2"
for idx, dataset in enumerate(datasets_multik):
    ax = axs.flatten()[idx]
    data = bi[
        bi.dataset.eq(dataset) & ~bi.clf.isin(exclude_clfs) & bi[metric].notna()
    ].sort_values("clf")
    sns.boxplot(data, hue="clf", y=metric, gap=0.2, ax=ax, palette=default_palette)
    ax.set_title(dataset)
    ax.axhline(
        data.groupby("clf")[metric].median().max(), linestyle="dotted", color="gray"
    )
    # ybot, ytop = np.percentile(data[metric].dropna(), [25, 100])
    # ax.set_ylim(ybot * 0.99, ytop * 1.01)
    if idx != 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
# fig.savefig(img_dir / "boxplot-lunas-espirales-circulos-new.svg")
# IMG 2: Detalle metric boxplot

In [None]:
bad_seed = 5640
clf = "fkdc"
clave = ("vino", str(None), clf, str(bad_seed), "neg_log_loss")
info = infos[clave][clf]

In [None]:
# Mean CV score of every grid point, arranged as an alpha x bandwidth matrix.
busqueda = pd.DataFrame(info.busqueda.cv_results_)
data = busqueda.set_index(["param_alpha", "param_bandwidth"]).mean_test_score.unstack()
X = data.columns.values
Y = data.index.values
Z = data.values

fig, ax = plt.subplots(layout="constrained")
CS = ax.contourf(X, Y, Z, 15, cmap="viridis")
# The surface is the search's `mean_test_score` under the scoring stored in the
# last component of `clave`, so the labels name that quantity, not accuracy.
ax.set_title(f"mean_test_score ({clave[-1]}) para $\\alpha$ y $h$")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
# Red crosses: best-scoring bandwidth for each alpha (row-wise argmax).
ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel(f"mean_test_score ({clave[-1]})")
ax.set_xscale("log")
# File name carries classifier, dataset (first component of `clave`) and seed
# so that the other heatmap cells do not overwrite this figure.
fig.savefig(img_dir / f"heatmap-{clf}-{clave[0]}-{bad_seed}.svg")

In [None]:
# Train/test CV score profiles on iris for kdc and gnb, each along the single
# hyperparameter its grid sweeps.
nombre = "iris"
clf_metrics = {"kdc": "bandwidth", "gnb": "var_smoothing"}
seed = 1182
height = 5
fig, axs = plt.subplots(1, 2, figsize=(height * len(clf_metrics), height))
for (clf, metric), ax in zip(clf_metrics.items(), axs, strict=False):
    # Resolved per classifier inside the loop, so it does not depend on
    # whatever value `clf` held from an earlier cell.
    scoring = "neg_log_loss" if clf != "svc" else "accuracy"
    info = infos[(nombre, str(None), clf, str(seed), scoring)][clf]
    busqueda = pd.DataFrame(info.busqueda.cv_results_)
    busqueda.set_index(f"param_{metric}")[
        [
            "mean_test_score",
            "mean_train_score",
        ]
    ].plot(ax=ax)
    ax.set_title(f"{clf} by {metric}")
    # Dotted vertical line at the hyperparameter value of the best estimator.
    ax.axvline(
        getattr(info.busqueda.best_estimator_, metric), color="gray", linestyle="dotted"
    )
    # Horizontal line: log-likelihood on evaluation divided by the number of
    # predictions, for comparison with the mean CV scores.
    ax.axhline(info.logvero / len(info.preds), color="C2", label="mean_eval_score")
    ax.legend()
    ax.set_xscale("log")

#### Pinguinos
OK, pongámosle que en `iris` (f)KDC da tan mal porque tuvimos mala suerte con las semillas, una regla más parsimoniosa hubiese ayudado. Pero en `pinguinos`, da _todo_ mal ("peor"); ¿por qué? ¡Tiene 3 clases fácilmente diferenciables a ojo en el eje `(0, 1)`!

In [None]:
pinguinos = pickle.load(open(datasets_dir / "pinguinos.pkl", "rb"))
pinguinos.pairplot(height=2)

Hmmm, en el eje `(0, 1)` (o `(0, 2)`, o `(0, 3)`) las tres clases son fácilmente diferenciables a ojo, pero en las dims `(1,2,3)` Adelie y Chinstrap están muy encimadas. ¿Será que funcionan como "dimensiones de ruido" y confunden al clasificador más de lo que lo ayudan?
En ese caso, esperaríamos ver que en la matriz de confusión "Gentoo" aparece bien clasificada, y "Adelie"/"Chinstrap" están confundidas. Si así fuese,

1. La matriz de confusión debería verse "diagonal por bloques", y
2. Reentrenar el clasificador (digamos que KDC para simplificar) con sólo dos dimensiones debería mejorar la performance.

Veamos

In [None]:
mejor_semilla = (
    bi[bi.dataset.eq("pinguinos") & bi.clf.eq("kdc")]
    .sort_values("r2", ascending=False)
    .iloc[0]
    .semilla
)

In [None]:
tarea_pinguinos = Tarea(
    pinguinos, algoritmos={}, seed=mejor_semilla, split_evaluacion=0.5
)

In [None]:
clave = ("pinguinos", "None", "kdc", str(mejor_semilla), "neg_log_loss")
info = infos[clave].kdc
mejor_kdc = info.busqueda.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

# Sanity check: re-predicting on the rebuilt evaluation split must reproduce the
# accuracy stored with the run; otherwise the split or the estimator differs.
expected_accuracy = info.accuracy
y_true = tarea_pinguinos.y_eval
y_pred = mejor_kdc.predict(tarea_pinguinos.X_eval)
actual_accuracy = accuracy_score(y_true, y_pred)
assert actual_accuracy == expected_accuracy

In [None]:
# Confusion table: rows are true labels, columns are predicted labels (NaN = no cases).
pd.DataFrame({"true": y_true, "pred": y_pred}).value_counts().unstack()

In [None]:
# Class proportions in the evaluation split vs. the whole penguins dataset.
# Uses `pinguinos.y` explicitly: the previous `ds.y` read whatever dataset an
# unrelated cell had last bound to `ds` (or failed on a fresh kernel).
pd.DataFrame(
    {
        "eval": pd.Series(y_true).value_counts(normalize=True),
        "pop": pd.Series(pinguinos.y).value_counts(normalize=True),
    }
)

Efectivamente, Gentoo y Adelie están esencialmente bien clasificados, pero la clase Chinstrap está completamente predicha como Adelie, que es la clase mayoritaria en su cluster.
Las proporciones son muy similares en evaluación que en la población en general, también.

In [None]:
# Reduced dataset: keep only feature columns 0 and 1, with the same labels.
pinguinos_corto = Dataset(pinguinos.X[:, [0, 1]], pinguinos.y)

In [None]:
# Retrain KDC on the two-feature penguins dataset with the same seed, scoring and
# evaluation split fraction used above, so results are comparable.
# The classifier and grid come from `clasificadores` / `grillas`; the former
# `clf = KDClassifier(...)` line was never passed to the task and has been removed
# (it also referenced a name this notebook does not visibly import).
tarea = Tarea(
    pinguinos_corto,
    {"kdc": (clasificadores["kdc"], grillas["kdc"])},
    seed=mejor_semilla,
    scoring="neg_log_loss",
    split_evaluacion=0.5,
)
tarea.entrenar()
tarea.evaluar()
pinguinos_corto_info = tarea.info
n_eval = len(tarea.y_eval)

In [None]:
# Accuracy of the stored full-feature run vs. the retrained reduced-feature run.
info.accuracy, tarea.info["kdc"].accuracy

In [None]:
# Same comparison for the r2 metric.
info.r2, tarea.info["kdc"].r2

In [None]:
# Side-by-side predictions of the original and the retrained estimator.
# NOTE(review): pairing `tarea.y_eval` with `y_pred` (computed on `tarea_pinguinos`)
# assumes both tasks produce the same evaluation rows in the same order — confirm.
comp = pd.DataFrame(
    {
        "true": tarea.y_eval,
        "orig_pred": y_pred,
        "new_pred": tarea.info.kdc.busqueda.best_estimator_.predict(tarea.X_eval),
    }
)

In [None]:
# Confusion table of the original (all features) estimator.
comp[["true", "orig_pred"]].value_counts().unstack()

In [None]:
# Confusion table of the retrained (two features) estimator.
comp[["true", "new_pred"]].value_counts().unstack()

¡Increíble! Debería repetirse similarmente para `vino`. Al examinar el pairplot, sabemos que de las 13 dimensiones, `(6, 9, 12)` son bastante diferentes de a pares:

#### Vino

In [None]:
# Load the pickled wine dataset. The context manager closes the file handle,
# which the bare `open(...)` call used to leak.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
with open(datasets_dir / "vino.pkl", "rb") as pkl_file:
    vino = pickle.load(pkl_file)
vino.pairplot(dims=[6, 9, 12])

In [None]:
# Seed of the KDC run on "vino" with the highest accuracy in the summary table `bi`.
# This rebinds `mejor_semilla`, previously set for "pinguinos".
mejor_semilla = (
    bi[bi.dataset.eq("vino") & bi.clf.eq("kdc")]
    .sort_values("accuracy", ascending=False)
    .iloc[0]
    .semilla
)

In [None]:
# Task with no algorithms: built only to obtain the evaluation split for this seed.
tarea_vino = Tarea(vino, algoritmos={}, seed=mejor_semilla, split_evaluacion=0.5)

In [None]:
# Look up the stored KDC results on "vino" for that seed; rebinds `clave`, `info`
# and `mejor_kdc` from the penguins analysis.
clave = ("vino", "None", "kdc", str(mejor_semilla), "neg_log_loss")
info = infos[clave].kdc
mejor_kdc = info.busqueda.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

# Sanity check: re-predicting on the rebuilt evaluation split must reproduce the
# accuracy stored with the run.
expected_accuracy = info.accuracy
y_true = tarea_vino.y_eval
y_pred = mejor_kdc.predict(tarea_vino.X_eval)
actual_accuracy = accuracy_score(y_true, y_pred)
assert actual_accuracy == expected_accuracy

In [None]:
actual_accuracy

In [None]:
# Confusion table: rows are true labels, columns are predicted labels.
pd.DataFrame({"true": y_true, "pred": y_pred}).value_counts().unstack()

In [None]:
# Class proportions in the evaluation split vs. the whole wine dataset.
pd.DataFrame(
    {
        "eval": pd.Series(y_true).value_counts(normalize=True),
        "pop": pd.Series(vino.y).value_counts(normalize=True),
    }
)

In [None]:
# Reduced dataset: keep only feature columns 6, 9 and 12, with the same labels.
vino_corto = Dataset(vino.X[:, [6, 9, 12]], vino.y)

In [None]:
# Retrain KDC on the three-feature wine dataset with the same seed and scoring as
# the stored run.
# `split_evaluacion=0.5` is passed explicitly, as in `tarea_vino` and in the penguins
# retraining: the comparison below pairs `tarea.y_eval` with predictions made on the
# 0.5 split, so both tasks must use the same evaluation fraction.
# The former `clf = KDClassifier(...)` line was never passed to the task and has been
# removed (it also referenced a name this notebook does not visibly import).
tarea = Tarea(
    vino_corto,
    {"kdc": (clasificadores["kdc"], grillas["kdc"])},
    seed=mejor_semilla,
    scoring="neg_log_loss",
    split_evaluacion=0.5,
)
tarea.entrenar()
tarea.evaluar()
vino_corto_info = tarea.info
n_eval = len(tarea.y_eval)

In [None]:
# Accuracy of the stored full-feature run vs. the retrained reduced-feature run.
info.accuracy, tarea.info["kdc"].accuracy

In [None]:
# Same comparison for the r2 metric.
info.r2, tarea.info["kdc"].r2

In [None]:
# Side-by-side predictions of the original and the retrained estimator.
# NOTE(review): pairing `tarea.y_eval` with `y_pred` (computed on `tarea_vino`)
# assumes both tasks produce the same evaluation rows in the same order — confirm.
comp = pd.DataFrame(
    {
        "true": tarea.y_eval,
        "orig_pred": y_pred,
        "new_pred": tarea.info.kdc.busqueda.best_estimator_.predict(tarea.X_eval),
    }
)

In [None]:
# Confusion table of the original (all features) estimator.
comp[["true", "orig_pred"]].value_counts().unstack()

In [None]:
# Confusion table of the retrained (three features) estimator.
comp[["true", "new_pred"]].value_counts().unstack()

Mejor, pero no _mucho_ mejor. Evidentemente hay más cosas a examinar...

### Digitos

In [None]:
# Mean r2 and accuracy per classifier on "digitos" and "mnist", shown as percentages
# with two decimals; columns are grouped by dataset and rows sorted by mnist accuracy.
(
    bi[bi.dataset.isin(["digitos", "mnist"])]
    .groupby(["clf", "dataset"])[["r2", "accuracy"]]
    .mean()
    .unstack()
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)
    .sort_values(("mnist", "accuracy"), ascending=False)
    .mul(100)
    .round(2)
)
# TBL 2: datasets 2d low noise
# NOTE(review): the label above looks copied from another table; this cell
# summarizes digitos/mnist — confirm.

#### digitos
bomba! aunque también andan muy bien todos, hasta LR tiene excelente acc y R2; GNB pifia en R2 pero still competitivo y los tiempos deben ser mínimos
TODO: mirar tiempos, maybe su propia sección

#### mnist
curioso: KDC super competitivo, fKDC un desastre. analizar

In [None]:
# One row per mnist run: best hyperparameters of fKDC and KDC next to their scores.
pd.concat(
    [
        # Best fKDC hyperparameters, indexed by the second element of the `infos` key
        # (presumably the run seed for mnist — TODO confirm).
        pd.DataFrame.from_dict(
            {
                k[1]: v.fkdc.busqueda.best_params_
                for k, v in infos.items()
                if k[0] == "mnist" and k[2] == "fkdc"
            },
            orient="index",
        ).add_prefix("fkdc_"),
        # Same for KDC.
        pd.DataFrame.from_dict(
            {
                k[1]: v.kdc.busqueda.best_params_
                for k, v in infos.items()
                if k[0] == "mnist" and k[2] == "kdc"
            },
            orient="index",
        ).add_prefix("kdc_"),
        # r2 scaled by 10000 and truncated to int, one column per classifier.
        # NOTE(review): the columns are suffixed "_logvero" but hold r2 — confirm the label.
        bi[bi.clf.str.endswith("kdc") & bi.dataset.eq("mnist")]
        .assign(semilla=lambda df: df.semilla.astype(str))
        .set_index(["semilla", "clf"])
        .r2.mul(10000)
        .astype(int)
        .unstack()
        .add_suffix("_logvero"),
    ],
    axis=1,
).sort_index(axis=1)

In [None]:
grillas["fkdc"]["bandwidth"].max(), grillas["kdc"]["bandwidth"].max()

In [None]:
# `run_seed` is not assigned in this cell; it must come from an earlier cell.
# TODO confirm which seed is intended here.
display(run_seed)
# Load the mnist dataset saved for that run seed.
mnist_ds = Dataset.cargar(datasets_dir / f"mnist-{run_seed}.pkl")

In [None]:
mnist_ds.X.min(), mnist_ds.X.shape

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA

# Number of principal components to keep when reducing MNIST.
n_components = 96
# Fetch MNIST from OpenML as arrays rather than a DataFrame (`as_frame=False`).
# This binds the notebook-level names `X` and `y`, which later cells rebind.
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X.shape, y.shape

In [None]:
ratio_components = n_components / X.shape[1]

In [None]:
# Fit a PCA with an integer number of components (n_components = 96).
pca = PCA(n_components).fit(X)

In [None]:
# Refit with a float in (0, 1); this overwrites the `pca` fitted in the previous cell.
# NOTE(review): per the scikit-learn docs a float n_components is read as the fraction
# of variance to explain, not as a fraction of the number of features — confirm that
# this is the intended meaning of `ratio_components`.
pca = PCA(ratio_components).fit(X)

In [None]:
# Peek at the key layout. The value is discarded because it is not the cell's last expression.
list(infos.keys())[:5]
# One row per (seed, clf, dataset) for the *kdc classifiers, holding the best
# hyperparameters as a list of {"param", "value"} records (exploded in the next cell).
best_params = pd.DataFrame(
    [
        {
            # Use the key's second slot as seed unless it is the string "None",
            # in which case fall back to the key's fourth slot.
            "seed": int(main_seed if run_seed == "None" else run_seed),
            "clf": clf,
            "ds": ds,
            "best_params": [
                {"param": key, "value": value}
                for key, value in info[clf].busqueda.best_params_.items()
            ],
        }
        # The names bound here (`main_seed`, `run_seed`, `clf`, `ds`) are local to the
        # comprehension and shadow the notebook-level ones only inside it.
        for (ds, run_seed, clf, main_seed, _), info in infos.items()
        if clf.endswith("kdc")
    ]
).set_index(["seed", "clf", "ds"])

In [None]:
best_params

In [None]:
# Reshape the long table into one row per (seed, ds), with columns grouped by
# classifier and then hyperparameter.
# NOTE: this rebinds `best_params`, so the cell cannot be re-run without first
# re-running the cell that builds the long table.
idx = best_params.index
# One row per (seed, clf, ds, hyperparameter record).
data = best_params.explode("best_params")
best_params = (
    pd.DataFrame.from_records(data.best_params.values, index=data.index)
    .pivot(columns="param", values="value")
    .unstack("clf")
    .reorder_levels(order=[1, 0], axis=1)
    .sort_index(axis=1)
    .sort_index(level="ds")
)

In [None]:
best_params

In [None]:
best_params.xs("helices_12", level="ds")

In [None]:
# CV train/test score curves for KDC and GNB on "helices_12", one panel per classifier.
nombre = "helices_12"
clf_metrics = {"kdc": "bandwidth", "gnb": "var_smoothing"}
seed = run_seed  # 3031
height = 5
fig, axs = plt.subplots(1, 2, figsize=(height * len(clf_metrics), height))
for (clf, metric), ax in zip(clf_metrics.items(), axs, strict=False):
    # Scoring depends on the classifier being plotted, so it is chosen inside the loop;
    # it used to be computed once before the loop from whatever `clf` an earlier cell
    # had left behind.
    scoring = "neg_log_loss" if clf != "svc" else "accuracy"
    # For this dataset the key carries the run seed in the second slot and the main
    # seed in the fourth; the alternative layout is kept below for reference.
    info = infos[(nombre, str(seed), clf, str(main_seed), scoring)][clf]
    # info = infos[(nombre, str(None), clf, str(seed), scoring)][clf]
    busqueda = pd.DataFrame(info.busqueda.cv_results_)
    busqueda.set_index(f"param_{metric}")[
        [
            "mean_test_score",
            "mean_train_score",
        ]
    ].plot(ax=ax)
    ax.set_title(f"{clf} by {metric}")
    # Dotted vertical line: hyperparameter value of the best estimator found by the search.
    ax.axvline(
        getattr(info.busqueda.best_estimator_, metric), color="gray", linestyle="dotted"
    )
    # Horizontal line: `logvero` divided by the number of predictions.
    ax.axhline(info.logvero / len(info.preds), color="C2", label="mean_eval_score")
    ax.legend()
    ax.set_xscale("log")

In [None]:
clasificadores["gnb"]

In [None]:
grillas["gnb"]

In [None]:
from sklearn.naive_bayes import GaussianNB

# Sweep GaussianNB's `var_smoothing` by registering one "algorithm" per value, each
# with a single-point grid, so that every value gets its own evaluation metrics.
# The former `clf = KDClassifier(...)` line was overwritten immediately and has been
# removed (it also referenced a name this notebook does not visibly import).
clf = GaussianNB()
ds = Dataset.cargar(datasets_dir / f"helices_0-{run_seed}.pkl")

tarea = Tarea(
    ds,
    {x: (clf, {"var_smoothing": [x]}) for x in np.logspace(-9, 9, 181)},
    # {"gnb": (clf, {"var_smoothing": np.logspace(-9, 9, 181)})},
    seed=main_seed,
    # With scoring="accuracy", the CV `mean_test_score` collected below is an accuracy.
    scoring="accuracy",
    # scoring="neg_log_loss",
    split_evaluacion=0.5,
)
tarea.entrenar()
tarea.evaluar()
tarea_info = tarea.info
n_eval = len(tarea.y_eval)
# One row per var_smoothing value. Index [0] is the only grid point of each search.
new_metrics = pd.DataFrame.from_dict(
    {
        x: {
            "eval_logvero": info.logvero,
            "eval_r2": info.r2,
            "mean_test_score": info.busqueda.cv_results_["mean_test_score"][0],
            "std_test_score": info.busqueda.cv_results_["std_test_score"][0],
            "eval_accuracy": info.accuracy,
        }
        for x, info in tarea_info.items()
        if x != "base"
    },
    orient="index",
)
# Rank 1 = highest mean CV score (ties get averaged ranks, truncated to int).
new_metrics["rank_test_score"] = (-new_metrics.mean_test_score).rank().astype(int)

In [None]:
grillas

In [None]:
from numbers import Number

# For every numeric hyperparameter grid with more than two values, record its first
# and last grid values. These are labelled "min"/"max", which assumes the grids are
# sorted ascending.
specs_limits = []
for clf, hipers in grillas.items():
    for hiper, values in hipers.items():
        # Grids with one or two values have no interior, so they are skipped.
        if len(values) <= 2:
            continue
        first, last = values[0], values[-1]
        # Only numeric grids are kept; the check looks at the first value only.
        if isinstance(first, Number):
            specs_limits.append((clf, hiper, first, last))
# Rebind the list as a DataFrame indexed by (clf, hiper).
specs_limits = pd.DataFrame(
    specs_limits, columns=["clf", "hiper", "min", "max"]
).set_index(["clf", "hiper"])

In [None]:
specs_limits

In [None]:
# Find the runs whose selected hyperparameter sits on an edge of its search grid
# (equal to the grid's first or last value); those are candidates for a wider grid.
limits = []
nolimits = 0  # count of selected hyperparameters that are NOT on a grid edge
for (ds, *_), info in infos.items():
    for clf in info.keys():
        if clf == "base":
            continue
        # Grid edges of this classifier, indexed by hyperparameter name.
        clf_limits = specs_limits.xs(clf)
        # Local name on purpose: binding `best_params` here would clobber the
        # notebook-level DataFrame of the same name built in an earlier cell.
        clf_best_params = info[clf].busqueda.best_params_
        for hiper, value in clf_best_params.items():
            if (hiper in clf_limits.index) and (value in clf_limits.xs(hiper).values):
                limits.append((clf, ds, hiper, value))
            else:
                nolimits += 1
# Column order matches the appended tuples (clf first, then dataset); the labels
# "ds" and "clf" used to be swapped.
limits = pd.DataFrame(limits, columns=["clf", "ds", "hiper", "value"])
print(nolimits)
limits

In [None]:
np.linspace(1, 4, 13)

In [None]:
np.logspace(-3, 5, 101)[-15:], np.logspace(-3, 6, 118)[-15:]

In [None]:
grillas["fkdc"]["bandwidth"]

In [None]:
# Count edge hits for fKDC: select "fkdc" on the first index level of the counts,
# then spread (hiper, value) across the columns.
limits.value_counts().xs("fkdc").unstack(("hiper", "value"))

In [None]:
# Mean CV score (left axis) and evaluation accuracy (right axis) vs. var_smoothing.
fig, ax1 = plt.subplots(figsize=(12, 5))
# new_metrics.eval_logvero.plot(label="eval")
# No `ax` is passed: this draws on the current axes, i.e. the `ax1` just created.
new_metrics.mean_test_score.plot(label="test")
# new_metrics.mean_test_score.mul(n_eval).plot(label="test")
# NOTE(review): the label says "logvero", but `mean_test_score` is whatever scoring the
# sweep was run with (the sweep cell sets scoring="accuracy") — confirm the label.
ax1.set_ylabel("logvero (solid lines)")
plt.legend()
ax2 = ax1.twinx()
new_metrics.eval_accuracy.plot(ax=ax2, linestyle="dotted")
ax2.set_ylabel("accuracy (dotted lines)")
plt.xscale("log")
# plt.xlim(None, 10**-2)

In [None]:
from sklearn.neighbors import KernelDensity

In [None]:
from fkdc.config import n_samples
from fkdc.utils import sample

# Refit the integer-component PCA (an earlier cell rebinds `pca` with a float
# argument) and project the full MNIST matrix.
pca = PCA(n_components).fit(X)
_X = pca.transform(X)
# One dataset per run seed, built from `sample(...)` on the projected data
# (presumably a random subsample of `n_samples` rows — TODO confirm against fkdc.utils).
datasets_mnist = {
    ("mnist", seed): Dataset(*sample(_X, y, n_samples=n_samples, random_state=seed))
    for seed in run_seeds
}

## Tomados de otros notebooks

### Sapienza's Swiss Roll
> We use the well-known example coined “Swiss roll”, Figure 1(a) and 1(b). We consider a dataset composed of 4 subsets steaming from independent Normal distributions (restricted to the unit square) with mean $\mu_1 = (.3, .3), \mu_2 = (.3, .7), \mu_3 = (.7, .3), \mu_4 = (.7, .7)$ respectively and constant variance, Figure 1(a). Then, we apply the Swiss Roll transformation, Figure 1(b).

## Sandbox Modesto

Distancia de Fermat en Clasificadores de Densidad por Núcleos

La mayoría de los algoritmos de clasificación asumen que las observaciones yacen sobre un espacio euclídeo: son escasos los métodos que también son válidos cuando el dominio de las variables aleatorias es una variedad arbitraria. En este ámbito, Loubes & Pelletier [1] proponen un clasificador basado en estimación de densidad por núcleos ("KDC") útil en variedades de Riemann conocidas.

Más aún, no siempre es conocida la variedad en que yacen los datos: una imagen de 1 megapíxel tiene 1.000.000 de píxeles, pero típicamente representa un objeto (un dígito, una letra, un animal) que - hipotetizamos - podríamos describir con (muchas) menos dimensiones. En estos contextos, se pueden aprender _distancias basadas en densidad_ (DBDs) que permiten estimar la variedad intrínseca de las observaciones a partir de la misma muestra, como la Distancia (muestral) de Fermat,  investigada por Groisman et al. [2]

En este trabajo, nos proponemos (1) programar el clasificador KDC, (2) extenderlo para utilizar la distancia muestral de Fermat ("F-KDC"), y (3) analizar comparativamente su _performance_ en distintas tareas de clasificación.

[1] J.-M. Loubes y B. Pelletier, «A Kernel-Based Classifier on a Riemannian Manifold», Statistics &
Decisions, vol. 26, n.º 1, pp. 35-51, mar. 2008, doi: 10.1524/stnd.2008.0911.
[2] P. Groisman, M. Jonckheere, y F. Sapienza, «Nonhomogeneous Euclidean First-Passage Percolation and Distance Learning», n.º arXiv:1810.09398. arXiv, diciembre de 2019.

In [None]:
from math import pi

import numpy as np
import scipy as sp

In [None]:
xs = np.arange(-6, 6, 0.01)


def phi(x):
    """Standard normal probability density evaluated at ``x`` (scalar or array)."""
    normalizer = (2 * pi) ** (-1 / 2)
    gaussian_kernel = np.exp(-1 / 2 * x**2)
    return normalizer * gaussian_kernel


def ind(x, lb=-1 / 2, ub=1 / 2):
    """Uniform density on ``[lb, ub)``: ``1 / (ub - lb)`` inside the interval, 0 outside."""
    height = 1 / (ub - lb)
    inside = (x >= lb) & (x < ub)
    return np.where(inside, height, 0)

In [None]:
from matplotlib import pyplot as plt

# Gaussian kernel vs. a uniform kernel on [-1.2, 1.2) over the same grid.
plt.plot(xs, phi(xs))
plt.plot(xs, ind(xs, -1.2, 1.2))

In [None]:
# Sorted sample of 200 standard-normal draws, reshaped to a single column.
# Seeded with `main_seed` so the KDE figures are reproducible across kernel restarts.
# This rebinds `xs`, which previously held the plotting grid.
xs = np.sort(sp.stats.norm().rvs(200, random_state=main_seed)).reshape(-1, 1)

In [None]:
import numpy as np

# Gaussian KDE of the sample `xs` for several bandwidths.
# The evaluation grid does not depend on the bandwidth, so it is built once.
grid = np.arange(-5, 5, 0.01).reshape(-1, 1)
for bw in [0.03, 0.1, 0.3, 1]:
    kde = KernelDensity(kernel="gaussian", bandwidth=bw).fit(xs)
    # `score_samples` returns log-densities, hence the exp.
    dens = np.exp(kde.score_samples(grid))
    plt.plot(grid, dens, label=f"h = {bw}")
plt.legend()
plt.show()

In [None]:
sp.stats.norm().pdf(grid).max()

In [None]:
import numpy as np
from sklearn.neighbors import KernelDensity

# KDE of `xs` with a gaussian and a tophat kernel (one panel each) for several
# bandwidths, over the true standard normal density as a dashed reference.
grid = np.arange(-5, 5, 0.01).reshape(-1, 1)
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
for kernel, ax in zip(["gaussian", "tophat"], axs, strict=False):
    ax.plot(
        grid, sp.stats.norm().pdf(grid), alpha=0.5, color="gray", linestyle="dashed"
    )
    for bw in [0.1, 0.3, 1, 3]:
        kde = KernelDensity(kernel=kernel, bandwidth=bw).fit(xs)
        # `score_samples` returns log-densities, hence the exp.
        dens = np.exp(kde.score_samples(grid))
        ax.plot(grid, dens, label=f"h = {bw}", alpha=0.5)
    # Title and legend are set once per panel, after all curves are drawn.
    ax.set_title(f"Kernel = {kernel}")
    ax.legend()
plt.show()

In [None]:
import numpy as np
import pandas as pd

# Long table of h**d for a few values of h and dimensions d.
# This rebinds the notebook-level name `ds`, which earlier cells use for a dataset.
hs = np.array([0.1, 0.5, 0.9, 0.98])
ds = np.array([1, 2, 5, 10, 20, 25])
df = pd.DataFrame([(h, d, h**d) for h in hs for d in ds], columns=["h", "d", "h**d"])

In [None]:
df.pivot(index="h", columns="d", values="h**d")

In [None]:
from scipy import stats

# N points drawn uniformly on the cube [lb, ub]^d.
# Seeded with `main_seed` so the estimates computed from it are reproducible.
# This rebinds the notebook-level name `X` (previously the MNIST data matrix).
lb, ub = -1, 1
span = ub - lb
N = 100_000
d = 3
X = stats.uniform(lb, span).rvs((N, d), random_state=main_seed)

In [None]:
def fhat_0(X, h):
    """Estimate the density at the origin with a uniform kernel on the cube (-h, h)^d.

    Counts the rows of ``X`` whose coordinates are all strictly smaller than ``h``
    in absolute value and divides the count by ``N * (2 * h) ** d``.

    Parameters
    ----------
    X : array of shape (N, d)
        Sample points, one per row.
    h : float
        Half-side of the cube centred at the origin.

    Returns
    -------
    float
        The fraction of points inside the cube divided by the cube's volume.
    """
    N, d = X.shape
    # Vectorized row-wise "all coordinates inside", replacing a Python-level
    # `all` call per row.
    inside = np.all(np.abs(X) < h, axis=1)
    return inside.sum() / (N * (2 * h) ** d)

In [None]:
# Estimated density at the origin for 201 values of h in [0.01, 1].
# NOTE(review): the "_1k" suffix does not match the current sample size (N = 100_000) — confirm.
expected_fhat_0_1k = pd.Series({h: fhat_0(X, h) for h in np.linspace(0.01, 1, 201)})

In [None]:
# Estimated density at the origin as a function of h. The commented lines plot
# series that are not defined in the visible part of this notebook.
# expected_fhat_0.plot()
# expected_fhat_0_10k.plot()
expected_fhat_0_1k.plot()

In [None]:
fhat_0(X, 0.3)

In [None]:
from sklearn.neighbors import KernelDensity

h = 0.88

# Compare scikit-learn's tophat KDE at the origin with the hand-rolled cube estimator.
# NOTE(review): the two need not coincide for d > 1 if sklearn's tophat kernel has a
# Euclidean-ball support of radius h rather than a cube of half-side h — confirm.
sk_fhat = KernelDensity(kernel="tophat", bandwidth=h).fit(X)
# `score` returns a log-likelihood, hence the exp; a single query point (the origin) is passed.
np.exp(sk_fhat.score([np.zeros_like(X[0, :])])), fhat_0(X, h)

In [None]:
import numpy as np
import pandas as pd

# Long table of h**d for a few values of h and every dimension d from 1 to 50.
# The dead `ds` assignment (overwritten on the next line) and the commented-out `hs`
# were removed. This rebinds the notebook-level names `hs`, `ds` and `df`.
hs = [0.25, 0.5, 0.9, 0.95]
ds = np.arange(1, 51, 1)
df = pd.DataFrame([(h, d, h**d) for h in hs for d in ds], columns=["h", "d", "h**d"])

In [None]:
from matplotlib import pyplot as plt

# One line per h: h**d as a function of the dimension d.
# This rebinds the notebook-level name `data`.
data = df.set_index(["d", "h"]).unstack()["h**d"]
data.plot(figsize=(12, 4))
plt.title("Proporción de las X dentro de un $d$-cubo de lado $h$")
# plt.xscale("log")

In [None]:
0.95**50

# p-wspm's

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot 2 ** ((p + 1) / p) for p = 1..99.
# Only `y` is passed to plot, so the x axis is the array index (p - 1), not p itself.
# This rebinds the notebook-level name `y` (previously the MNIST labels).
p = np.arange(1, 100)
y = np.power(2, (p + 1) / p)
plt.plot(y)

### Ideas
- gbt con mayor max_depth; empata top perfs?