### Tablas y Figuras

In [None]:
# Reload edited modules automatically before each cell is executed
# (IPython autoreload, mode 2), so changes to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_blobs

from fkdc.config import _get_run_seeds, clasificadores, grillas, main_seed
from fkdc.datasets import Dataset
from fkdc.tarea import Tarea
from fkdc.utils import refit_parsimoniously

from pathlib import Path

# Provenance of the experiment batches (seed each batch was run with):
#   29.10.2024 -> main_seed = 2024
#   19.05.2025 -> main_seed = 2206
#   23.05.2025 -> main_seed = 2411
# This assignment shadows the `main_seed` imported from fkdc.config above.
main_seed = 2411
run_seeds = _get_run_seeds()

# Root of the results tree. Defaults to the original checkout location; set the
# FKDC_ROOT_DIR environment variable to run the notebook from another machine.
root_dir = Path(os.environ.get("FKDC_ROOT_DIR", "/Users/gonzalo/Git/fkdc/sandbox/v2"))
# root_dir = Path("/Users/gonzalo/Git/fkdc/")
data_dir = root_dir / "data"
img_dir = root_dir / "img"
for directory in [data_dir, img_dir]:
    # parents=True: do not fail when root_dir itself does not exist yet.
    directory.mkdir(parents=True, exist_ok=True)
run_dir = root_dir / "infos"
datasets_dir = root_dir / "datasets"

In [None]:
# IMG 1: 2 blobs
# Two-blob toy dataset built through sklearn's make_blobs (400 points requested
# for each of the two centers), saved to disk and plotted.
seed = main_seed
blobs_kwargs = dict(
    n_samples=(400, 400),
    n_features=2,
    centers=((0, 0), (10, 0)),
    random_state=seed,
)
blobs = Dataset.de_fabrica(make_blobs, **blobs_kwargs)
blobs.guardar(datasets_dir / f"2blobs-{seed}-dataset.pkl")
blobs.scatter()

In [None]:
# Fit and evaluate every configured classifier on the 2-blobs dataset, caching
# the resulting info on disk. Set `force = True` to ignore the cache and refit.
run_2blobs = data_dir / f"2blobs-{seed}-info.pkl"
force = False
if run_2blobs.exists() and not force:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were written by this notebook.
    with open(run_2blobs, "rb") as fh:
        info = pickle.load(fh)
else:
    tareas = {}
    # One task for the classifiers exposing predict_proba (scored with
    # neg_log_loss) and one for the rest (scored with accuracy).
    for predicts_proba in [True, False]:
        tarea = tareas[predicts_proba] = Tarea(
            blobs,
            {
                nombre: (clf, grillas[nombre])
                for nombre, clf in clasificadores.items()
                if hasattr(clf, "predict_proba") is predicts_proba
            },
            seed=seed,
            scoring="neg_log_loss" if predicts_proba else "accuracy",
            busqueda_params=dict(
                # Matters when scoring = 'accuracy', because of ties
                refit=refit_parsimoniously,
                return_train_score=True,
                cv=5,
                n_jobs=-1,
            ),
        )
        tarea.entrenar()
        tarea.evaluar()
    info = {**tareas[True].info, **tareas[False].info}
    # The context manager guarantees the cache is flushed and closed.
    with open(run_2blobs, "wb") as fh:
        pickle.dump(info, fh)

In [None]:
# TBL 1: 2blobs - logvero, r2 & acc (one row per entry of `info`)
tabla_2blobs = pd.DataFrame(info).T
tabla_2blobs = tabla_2blobs[["logvero", "r2", "accuracy"]].astype(float)
tabla_2blobs.round(4)

In [None]:
# campos = {"logvero": "$cal(l)$", "r2": "$R^2$", "accuracy": "exac"}
# tabla = pd.DataFrame(info).T[campos.keys()].rename(columns=campos).astype(float)
# tabla.index = "#" + tabla.index
# tabla = pd.DataFrame(info)
# tabla.round(4)

In [None]:
# IMG 2: the three 2D datasets, low noise, for the first run seed.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
seed = run_seeds[0]
datasets_2d = ["lunas", "espirales", "circulos"]
for nombre, ax in zip(datasets_2d, axs):
    # Close each pickle as soon as it has been read instead of leaking the handle.
    with open(datasets_dir / f"{nombre}_lo-{seed}.pkl", "rb") as fh:
        ds = pickle.load(fh)
    ds.scatter(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-lunas-circulos-espirales.svg")

In [None]:
def load_infos(dir=run_dir):
    """Load every pickled run-info file found directly inside ``dir``.

    Parameters
    ----------
    dir : path-like, default ``run_dir``
        Directory scanned (non-recursively) for ``*.pkl`` files.

    Returns
    -------
    dict
        Maps ``tuple(stem.split("-"))`` of each file name to the unpickled
        object stored in that file.
    """
    loaded = {}
    for fn in Path(dir).glob("*.pkl"):
        # Open inside a context manager so the handle is closed right after the
        # read; the original left one open file per pickle.
        with open(fn, "rb") as fh:
            loaded[tuple(fn.stem.split("-"))] = pickle.load(fh)
    return loaded


infos = load_infos()

In [None]:
# def pluck(D, filter):
#     if callable(filter):
#         return {k: v for k, v in D.items() if filter(k)}
#     elif isinstance(filter, (list, tuple)):
#         return {k: v for k, v in D.items() if k in filter}

In [None]:
# Flatten the nested run infos into a table with one row per
# (dataset, ds_seed, clf, run_seed, score) key, keeping only the basic metrics.
basic_fields = ["accuracy", "r2", "logvero"]
basic_infos = {}
for run_key, run_info in infos.items():
    clf = run_key[2]
    basic_infos[run_key] = {
        campo: valor for campo, valor in run_info[clf].items() if campo in basic_fields
    }
    if clf != "fkdc":
        continue
    # fkdc runs also hold a "base" entry: file its metrics under a key of its own.
    base_key = (run_key[0], run_key[1], "base", run_key[3], run_key[4])
    basic_infos[base_key] = {
        campo: valor
        for campo, valor in run_info["base"].items()
        if campo in basic_fields
    }

index_names = ["dataset", "ds_seed", "clf", "run_seed", "score"]
basic_info = pd.DataFrame.from_records(
    list(basic_infos.values()),
    index=pd.MultiIndex.from_tuples(basic_infos.keys(), names=index_names),
).reset_index()
# Sanity check: a row either has no dataset seed ("None") or ran with main_seed.
assert all((basic_info.ds_seed == "None") | (basic_info.run_seed == str(main_seed)))
# "semilla": the run seed when there is no dataset seed, the dataset seed otherwise.
basic_info["semilla"] = np.where(
    basic_info.ds_seed == "None", basic_info.run_seed, basic_info.ds_seed
).astype(int)
bi = basic_info = basic_info.drop(columns=["ds_seed", "run_seed"])
basic_info.info()

### 2D, low noise

In [None]:
# Dataset names carrying the low-noise suffix "_lo".
datasets = [nombre for nombre in bi.dataset.unique() if nombre.endswith("_lo")]

In [None]:
# One fixed colour per classifier, shared by every box plot below.
paleta = sns.color_palette("Set3")
colors = {clf_name: color for clf_name, color in zip(bi.clf.unique(), paleta)}
# Classifiers left out of the box plots.
# exclude_clfs = ["base", "gnb", "lr"]
# exclude_clfs = []
exclude_clfs = ["base"]

In [None]:
# TBL 2: 2D datasets, low noise - mean r2 and accuracy (x100) per classifier
medias_lo = (
    bi[bi.dataset.isin(datasets)].groupby(["clf", "dataset"])[["r2", "accuracy"]].mean()
)
(
    medias_lo.unstack()
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)
    .mul(100)
    .round(2)
)

In [None]:
# IMG: R2 box plots per classifier, one panel per low-noise 2D dataset
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
for idx, dataset in enumerate(datasets):
    ax = axs[idx]
    incluidos = bi.dataset.eq(dataset) & ~bi.clf.isin(exclude_clfs)
    data = bi[incluidos].sort_values("clf")
    sns.boxplot(data, hue="clf", y="r2", gap=0.2, ax=ax, palette=colors)
    ax.set_title(dataset)
    # Dotted reference line at the highest per-classifier median.
    mejor_mediana = data.groupby("clf").r2.median().max()
    ax.axhline(mejor_mediana, linestyle="dotted", color="gray")
    # ybot = np.percentile(data.r2.dropna(), 35)
    # ax.set_ylim(ybot, None)
    if idx > 0:
        # Only the first panel keeps its legend.
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "boxplot-lunas-espirales-circulos-new.svg")

In [None]:
# Keys into `infos` for the same run (circulos_lo, seventh run seed), once for
# fkdc and once for kdc.
clave_fkdc, clave_kdc = (
    ("circulos_lo", str(run_seeds[6]), estimador, str(main_seed), "neg_log_loss")
    for estimador in ("fkdc", "kdc")
)

In [None]:
# Best hyper-parameters found by the search stored for fkdc on this run.
infos[clave_fkdc].fkdc.busqueda.best_params_

In [None]:
# Best hyper-parameters found by the search stored for kdc on the same run.
infos[clave_kdc].kdc.busqueda.best_params_

In [None]:
# import json
# import numpy as np


# class NumpyEncoder(json.JSONEncoder):
#     """Special json encoder for numpy types"""

#     def default(self, obj):
#         if isinstance(obj, np.integer):
#             return int(obj)
#         elif isinstance(obj, np.floating):
#             return float(obj)
#         elif isinstance(obj, np.ndarray):
#             return obj.tolist()
#         return json.JSONEncoder.default(self, obj)

In [None]:
# Stored r2 of fkdc and of kdc on the same run, side by side.
(infos[clave_fkdc].fkdc.r2, infos[clave_kdc].kdc.r2)

In [None]:
# Surface of the mean cross-validation score over the (alpha, bandwidth) grid
# for the fkdc run selected above.
busqueda = pd.DataFrame(infos[clave_fkdc]["fkdc"].busqueda.cv_results_)
data = busqueda.set_index(["param_alpha", "param_bandwidth"]).mean_test_score.unstack()
X = data.columns.values  # bandwidths (h)
Y = data.index.values  # alphas
Z = data.values

fig, ax = plt.subplots(layout="constrained")
CS = ax.contourf(X, Y, Z, 15, cmap="viridis")
# CS = ax.contour(X, Y, Z, 15, cmap="viridis")
# The run is keyed by "neg_log_loss", so mean_test_score is that score and not
# an accuracy: the labels used to say "Exactitud".
ax.set_title("Puntaje CV medio (neg. log-loss) para $\\alpha$ y $h$")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
# Red cross on the best-scoring bandwidth of every alpha row.
ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
# Make a colorbar for the ContourSet returned by the contourf call.
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel("Puntaje CV medio (neg. log-loss)")
ax.set_xscale("log")
# plt.tight_layout()
fig.savefig(img_dir / "heatmap-fkdc-2d-lo-new.svg")

In [None]:
# Same search results as score profiles: one line per alpha, bandwidth on a log axis.
# probably throw this away?
sns.lineplot(
    data=busqueda,
    x="param_bandwidth",
    y="mean_test_score",
    hue="param_alpha",
).set_xscale("log")

In [None]:
from itertools import product

# Seeds whose search surfaces are drawn (three per dataset). The random pick
# was dead code: it was overwritten on the next statement and, being unseeded,
# was not reproducible anyway.
# some_seeds = np.random.choice(run_seeds, 3)
# some_seeds = [5167, 7446, 9083]  # well behaved
# some_seeds = [1134, 7815, 9616]  # weird
some_seeds = [1134, 7815, 9083]  # mix
fig, axs = plt.subplots(3, 3, figsize=(35, 30), layout="constrained")
for (ds, seed), ax in zip(product(datasets_2d, some_seeds), axs.flatten()):
    key = (ds + "_lo", str(seed), "fkdc", str(main_seed), "neg_log_loss")
    df = pd.DataFrame(infos[key]["fkdc"].busqueda.cv_results_)
    data = df.pivot(
        index="param_alpha", columns="param_bandwidth", values="mean_test_score"
    )
    X = data.columns.values  # bandwidths (h)
    Y = data.index.values  # alphas
    Z = data.values
    CS = ax.contourf(X, Y, Z, 25, cmap="viridis")
    # The key already names the score (neg_log_loss); it is not an accuracy.
    ax.set_title(f"Puntaje CV medio para {key}")
    ax.set_xlabel("$h$")
    ax.set_ylabel("$\\alpha$")
    # Red cross on the best-scoring bandwidth of every alpha row.
    ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
    # Make a colorbar for the ContourSet returned by the contourf call,
    # explicitly attached to the panel it describes.
    cbar = fig.colorbar(CS, ax=ax)
    # cbar.ax.set_ylabel("Exactitud")
    ax.set_xscale("log")
    # plt.tight_layout()
fig.savefig(img_dir / "many-heatmaps-fkdc-2d-lo-new.svg")

In [None]:
# Detalle en espirales_lo, perfiles pérdida
run_seed = 7815
alpha = 1.5625
clave = ("espirales_lo", str(run_seed), "fkdc", str(main_seed), "neg_log_loss")
orig_info = infos[clave]
busqueda = pd.DataFrame(orig_info.fkdc.busqueda.cv_results_)

In [None]:
# Mean CV score as a function of the bandwidth, restricted to the chosen alpha.
en_alpha = busqueda.param_alpha == alpha
orig_mean_test_scores = busqueda[en_alpha].set_index("param_bandwidth")["mean_test_score"]
# / len(info.fkdc.preds)
orig_mean_test_scores.plot()
plt.scatter(orig_mean_test_scores.index, orig_mean_test_scores)
plt.xscale("log")

In [None]:
# Load the espirales_lo dataset file named after `run_seed` and plot it.
ruta_espirales = datasets_dir / f"espirales_lo-{run_seed}.pkl"
espirales_lo = Dataset.cargar(ruta_espirales)
espirales_lo.scatter()

In [None]:
from fkdc.fermat import KDClassifier


# Reuse `alpha` (set together with run_seed in the detail cell) instead of
# repeating the literal 1.5625: the scores of this refit are later compared
# against the original search profile filtered by that same alpha.
clf = KDClassifier(metric="fermat", alpha=alpha)

In [None]:
# One entry per bandwidth of the original profile, each with a single-point
# grid, all sharing the same classifier instance.
uno_por_bandwidth = {
    bw: (clf, {"bandwidth": [bw]}) for bw in orig_mean_test_scores.index
}
opciones_busqueda = dict(refit=True, return_train_score=True, cv=5, n_jobs=-1)
tarea = Tarea(
    espirales_lo,
    uno_por_bandwidth,
    seed=main_seed,
    scoring="neg_log_loss",
    busqueda_params=opciones_busqueda,
    split_evaluacion=0.5,
)
tarea.entrenar()
tarea.evaluar()
new_info = tarea.info

In [None]:
# Size of the evaluation set, read off the predictions stored for bandwidth 0.001.
n_eval = len(new_info[0.001].preds)

# One row per bandwidth: evaluation metrics plus the CV score of its single grid point.
filas = {}
for bw, corrida in new_info.items():
    if bw == "base":
        continue
    resultados_cv = corrida.busqueda.cv_results_
    filas[bw] = {
        "eval_logvero": corrida.logvero,
        "eval_r2": corrida.r2,
        "mean_test_score": resultados_cv["mean_test_score"][0],
        "std_test_score": resultados_cv["std_test_score"][0],
        "eval_accuracy": corrida.accuracy,
    }
new_metrics = pd.DataFrame.from_dict(filas, orient="index")
new_metrics["rank_test_score"] = (-new_metrics.mean_test_score).rank().astype(int)
# The refit must reproduce the CV profile of the original search exactly.
assert orig_mean_test_scores.equals(new_metrics.mean_test_score)

fig, ax1 = plt.subplots(figsize=(12, 5))
new_metrics.eval_logvero.plot(ax=ax1, label="eval")
# Mean CV score scaled by n_eval so it is comparable with the evaluation total.
new_metrics.mean_test_score.mul(n_eval).plot(ax=ax1, label="test")
ax1.set_ylabel("logvero (solid lines)")
ax1.legend()
ax2 = ax1.twinx()
new_metrics.eval_accuracy.plot(ax=ax2, linestyle="dotted")
ax2.set_ylabel("accuracy (dotted lines)")
plt.xscale("log")

In [None]:
# Grid points of the original search whose mean CV score is at least
# (best mean score - its standard deviation).
primeros = busqueda.loc[
    busqueda.rank_test_score.eq(1), ["mean_test_score", "std_test_score"]
]
max_score, sd = primeros.min()
columnas = [
    "rank_test_score",
    "param_alpha",
    "param_bandwidth",
    "mean_test_score",
    "std_test_score",
    "mean_train_score",
]
candidatos = busqueda[busqueda.mean_test_score.ge(max_score - sd)]
candidatos.sort_values(["param_alpha", "param_bandwidth"], ascending=[True, False])[
    columnas
]

In [None]:
# Same "within one std of the best" selection, on the per-bandwidth refit.
nuevos_primeros = new_metrics.loc[
    new_metrics.rank_test_score.eq(1), ["mean_test_score", "std_test_score"]
]
new_max_score, new_sd = nuevos_primeros.min()
umbral = new_max_score - new_sd
new_metrics[new_metrics.mean_test_score.ge(umbral)].sort_index(ascending=False)

### 2D, high noise

In [None]:
# IMG: the three 2D datasets, high noise, for the first run seed.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
seed = run_seeds[0]
datasets_2d = ["lunas", "espirales", "circulos"]
for nombre, ax in zip(datasets_2d, axs):
    # Close each pickle as soon as it has been read instead of leaking the handle.
    with open(datasets_dir / f"{nombre}_hi-{seed}.pkl", "rb") as fh:
        ds = pickle.load(fh)
    ds.scatter(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-lunas-circulos-espirales-hi-new.svg")

In [None]:
# Dataset names carrying the high-noise suffix "_hi".
datasets = [nombre for nombre in bi.dataset.unique() if nombre.endswith("_hi")]

In [None]:
# TBL: 2D datasets, high noise - mean r2 and accuracy (x100) per classifier,
# sorted by accuracy on circulos_hi.
medias_hi = (
    bi[bi.dataset.isin(datasets)].groupby(["clf", "dataset"])[["r2", "accuracy"]].mean()
)
tabla_hi = medias_hi.unstack().reorder_levels([1, 0], axis=1).sort_index(axis=1)
tabla_hi.sort_values(("circulos_hi", "accuracy"), ascending=False).mul(100).round(2)

- $R^2$ consistentemente el mejor para (f)KDC, pero sin diferencias entre Fermat y euclídeo; de todos modos, muy difícil en general
- accuracy no mucho peor que SVC (que sigue siendo el rey)
  - en lunas_hi gana FKDC!
  - en circulos_hi, lunas_hi GBT competitivo

In [None]:
# IMG: box plots of `metric` per classifier, one panel per high-noise 2D dataset
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
metric = "r2"
for idx, dataset in enumerate(datasets):
    ax = axs[idx]
    data = bi[
        bi.dataset.eq(dataset) & ~bi.clf.isin(exclude_clfs) & bi[metric].notna()
    ].sort_values("clf")
    sns.boxplot(data, hue="clf", y=metric, gap=0.2, ax=ax, palette=colors)
    ax.set_title(dataset)
    # Dotted reference line at the highest per-classifier median.
    ax.axhline(
        data.groupby("clf")[metric].median().max(), linestyle="dotted", color="gray"
    )
    # Zoom on the upper part of the distribution (from the 25th percentile up).
    ybot, ytop = np.percentile(data[metric].dropna(), [25, 100])
    ax.set_ylim(ybot * 0.99, ytop * 1.01)
    if idx != 0:
        # Only the first panel keeps its legend.
        ax.get_legend().set_visible(False)
fig.tight_layout()
# Distinct "-hi" file name: the previous name was the one already used by the
# low-noise box plot, so this figure silently overwrote it.
fig.savefig(img_dir / "boxplot-lunas-espirales-circulos-hi-new.svg")

In [None]:
# Rows of the 2D datasets only, with the dataset name split at "_" into
# shape ("figura") and noise level ("ruido").
es_2d = bi.dataset.str.endswith(("_lo", "_hi"))
bi2d = bi.loc[es_2d].copy()
bi2d[["figura", "ruido"]] = bi2d["dataset"].str.split("_", expand=True)

In [None]:
# Relative change of the mean `metric` when going from low to high noise:
# (hi - lo) / lo, per shape and classifier, shown in percent.
metric = "accuracy"
medias_2d = bi2d.groupby(["figura", "ruido", "clf"])[["r2", "accuracy"]].mean()
por_ruido = medias_2d[metric].unstack("ruido")
drops = por_ruido.assign(rel_drop=(por_ruido.hi - por_ruido.lo).div(por_ruido.lo))
drops.rel_drop.unstack("figura").dropna().mul(100).round(2)

In [None]:
# One bar chart per shape: lo vs hi mean `metric`, classifiers sorted by the hi value.
for figura in bi2d.figura.unique():
    niveles = drops.xs(figura).drop(columns="rel_drop")[["lo", "hi"]]
    niveles.sort_values("hi", ascending=False).plot.bar()
    plt.title(f"Caída absoluta de {metric} en {figura}")

- Los órdenes se mantienen: quien andaba mejor en lo, anda mejor en hi, +- un cachito, pero los que _mejor_ andaban, más pierden

### 3d, low dim

In [None]:
# Inspection cell: first run seed, the one used for the dataset figures.
run_seeds[0]

In [None]:
# IMG: the four 3D datasets ("_0" variants) for the first run seed.
fig = plt.figure(figsize=(10, 10))
seed = run_seeds[0]  # 1134
nombres_datasets = ["pionono", "eslabones", "helices", "hueveras"]
# NOTE: from here on `datasets` is a dict name -> dataset (it was a list of
# names in the 2D sections).
datasets = {}
for nombre in nombres_datasets:
    # Close each pickle as soon as it has been read instead of leaking the handle.
    with open(datasets_dir / f"{nombre}_0-{seed}.pkl", "rb") as fh:
        datasets[nombre] = pickle.load(fh)
for idx, (nombre, ds) in enumerate(datasets.items(), start=1):
    ax = fig.add_subplot(2, 2, idx, projection="3d")
    ds.scatter_3d(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-3d-0.svg")

In [None]:
# Pair plot of the first 3D dataset only (the loop stops after one iteration).
opciones_puntos = dict(alpha=0.5, s=5)
for nombre, ds in datasets.items():
    ds.pairplot(dims=[2, 1, 0], height=2, plot_kws=opciones_puntos, corner=True)
    plt.suptitle(nombre)
    break

In [None]:
# TBL: 3D datasets ("_0" variants) - mean r2 and accuracy (x100) per classifier,
# sorted by r2 on pionono_0.
datasets_3d = [f"{nombre}_0" for nombre in nombres_datasets]
medias_3d = (
    bi[bi.dataset.isin(datasets_3d)].groupby(["clf", "dataset"])[["r2", "accuracy"]].mean()
)
(
    medias_3d.unstack()
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1)
    .mul(100)
    .round(2)
    .sort_values(("pionono_0", "r2"), ascending=False)
)

- helices y eslabones muy fáciles
- en hueveras a fKDC le duele la varianza agregada versus eKDC
- (f)KDC best in class for acc & r2 (acc like SVC, plus R^2), but mostly no diff
  - r^2 empeora algo en helices con fkdc, sigue siendo mucho mejor que (f)KN, resto ni computa

In [None]:
# IMG: box plots of `metric` per classifier, one panel per 3D dataset
fig, axs = plt.subplots(2, 2, figsize=(8, 8))
metric = "r2"
for idx, (nombre, ax) in enumerate(zip(nombres_datasets, axs.flatten())):
    incluidos = (
        bi.dataset.eq(f"{nombre}_0") & ~bi.clf.isin(exclude_clfs) & bi[metric].notna()
    )
    data = bi[incluidos].sort_values("clf")
    sns.boxplot(data, hue="clf", y=metric, gap=0.2, ax=ax, palette=colors)
    ax.set_title(nombre)
    # Dotted reference line at the highest per-classifier median.
    mejor_mediana = data.groupby("clf")[metric].median().max()
    ax.axhline(mejor_mediana, linestyle="dotted", color="gray")
    # Zoom in: the y axis starts at the 40th percentile of the metric.
    ybot, ytop = np.percentile(data[metric].dropna(), [40, 100])
    ax.set_ylim(ybot, None)
    if idx > 0:
        # Only the first panel keeps its legend.
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "3d-low-r2.svg")

## Tomados de otros notebooks

### Decision Boundary Display

In [None]:
# Inspection cell: info of the most recently built `tarea`.
tarea.info

In [None]:
# Inspection cell: displays the `cool` colormap (the decision-boundary figure
# in this section uses `coolwarm`).
plt.cm.cool

In [None]:
# Inspection cell: `key` is the last infos key built by the heat-map grid loop.
key

In [None]:
# Dataset, seed and classifiers shown in the decision-boundary figure.
nombre_dataset = "lunas_lo"
seed = 5248
clfs = ["fkdc", "kdc", "kn", "fkn", "gbt", "lr", "gnb", "svc"]
# Context manager so the pickle file is closed after reading.
with open(datasets_dir / f"{nombre_dataset}-{seed}.pkl", "rb") as fh:
    dataset = pickle.load(fh)

In [None]:
# Task with an empty classifier mapping; the next cell reads its evaluation
# split (X_eval / y_eval).
tarea = Tarea(
    dataset,
    dict(),
    seed=seed,
    split_evaluacion=0.5,
)

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Set up a 3x3 grid for plotting: one panel per classifier.
fig, axs = plt.subplots(3, 3, figsize=(18, 18))
plt.subplots_adjust(wspace=0.1, hspace=0.1)

X = tarea.X_eval
y = tarea.y_eval
X0, X1 = X[:, 0], X[:, 1]

for clf, ax in zip(clfs, axs.flatten()):
    # svc is looked up under the "accuracy" results and drawn from hard
    # predictions; every other classifier uses its "neg_log_loss" results and
    # predict_proba.
    response_method, scoring = "predict_proba", "neg_log_loss"
    if clf == "svc":
        response_method, scoring = "predict", "accuracy"
    info = infos[(nombre_dataset, str(seed), clf, str(main_seed), scoring)][clf]
    disp = DecisionBoundaryDisplay.from_estimator(
        info.busqueda.best_estimator_,
        X,
        eps=0.05,
        response_method=response_method,
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=ax,
        xlabel="x",
        ylabel="y",
    )
    ax.scatter(X0, X1, c=y.astype(float), cmap=plt.cm.coolwarm, s=20, edgecolors="gray")
    ax.set_xticks(())
    ax.set_yticks(())
    # Looked up outside the f-string: reusing double quotes inside a
    # replacement field is a SyntaxError before Python 3.12.
    r2 = info.get("r2", 0)
    ax.set_title(f"{clf} ({(info.accuracy * 100):.2f}% acc., {r2:.3f} $R^2$)")

# Hide the panels left over when there are fewer classifiers than grid cells.
for sobrante in axs.flatten()[len(clfs) :]:
    sobrante.set_axis_off()

plt.show()

### Pair Plot

In [None]:
# helices_0 as a DataFrame: three coordinates plus the class label.
dataset = Dataset.cargar(datasets_dir / f"helices_0-{seed}.pkl")
df = pd.DataFrame(dataset.X, columns=list("xyz")).assign(clase=dataset.y)
df

In [None]:
# Pairwise scatter plots of the coordinates in `df`, coloured by class.
sns.pairplot(df, hue="clase")

### ¿El gráfico de abajo, pero en 3D?
https://matplotlib.org/stable/gallery/mplot3d/bars3d.html#sphx-glr-gallery-mplot3d-bars3d-py

### Sapienza's Swiss Roll
> We use the well-known example coined “Swiss roll”, Figure 1(a) and 1(b). We consider a dataset composed of 4 subsets steaming from independent Normal distributions (restricted to the unit square) with mean $\mu_1 = (.3, .3), \mu_2 = (.3, .7), \mu_3 = (.7, .3), \mu_4 = (.7, .7)$ respectively and constant variance, Figure 1(a). Then, we apply the Swiss Roll transformation, Figure 1(b).

In [None]:
# pionono_0 as a DataFrame: three coordinates plus the class label.
pionono = Dataset.cargar(datasets_dir / f"pionono_0-{seed}.pkl")
coordenadas = pd.DataFrame(pionono.X, columns=["x", "y", "z"])
df = coordenadas.assign(clase=pionono.y)
df

In [None]:
# Pairwise scatter plots of the pionono coordinates, coloured by class.
# plot_kws only needs a plain mapping of keyword arguments, so a dict is used
# (as in the 3D pair plot earlier) instead of importing sklearn's Bunch.
sns.pairplot(df, hue="clase", plot_kws=dict(alpha=0.2))

### FacetGrid of Scores

In [None]:
# Mean accuracy and r2 per (dataset, classifier) over the neg_log_loss runs,
# with a flag marking the fkdc rows so they can be highlighted.
medias_nll = (
    bi[bi.score.eq("neg_log_loss")].groupby(["dataset", "clf"])[["accuracy", "r2"]].mean()
)
data = medias_nll.reset_index().assign(is_fkdc=lambda tabla: tabla.clf.eq("fkdc"))
data

In [None]:
# Mean accuracy per classifier, one facet per dataset, fkdc highlighted.
orden_clfs = sorted(data.clf.unique())
sns.catplot(
    data=data,
    kind="bar",
    x="clf",
    y="accuracy",
    hue="is_fkdc",
    order=orden_clfs,
    col="dataset",
    col_wrap=5,
)

In [None]:
# Mean r2 per classifier, one facet per dataset; rows with missing values are dropped.
data_r2 = data.dropna()
sns.catplot(
    data=data_r2,
    kind="bar",
    x="clf",
    y="r2",
    hue="is_fkdc",
    order=sorted(data_r2.clf.unique()),
    col="dataset",
    col_wrap=5,
)

In [None]:
# Mean test score against bandwidth (log x axis) for kdc and for fkdc with
# param_alpha == 1, one line per estimator.
# NOTE(review): at this point of the notebook `df` is the x/y/z/clase frame
# built in an earlier cell, which has no `est`, `param_alpha`,
# `param_bandwidth` or `mean_test_score` columns. This cell presumably expects
# a cv_results_-style frame from the notebook it was copied from - confirm and
# define that frame before running.
sns.lineplot(
    df[(df.est == "kdc") | ((df.est == "fkdc") & (df.param_alpha == 1))],
    y="mean_test_score",
    x="param_bandwidth",
    hue="est",
)
plt.xscale("log")

In [None]:
# Mean test score against bandwidth (log x axis) for fkdc, one line per alpha.
# NOTE(review): `df` as defined earlier in this notebook (x/y/z/clase) has no
# `est`, `param_alpha`, `param_bandwidth` or `mean_test_score` columns; this
# cell presumably expects a cv_results_-style frame - confirm and define it
# before running.
sns.lineplot(
    df[(df.est == "fkdc")],
    y="mean_test_score",
    x="param_bandwidth",
    hue="param_alpha",
)
plt.xscale("log")