### Tablas y Figuras

In [None]:
from sklearn.datasets import make_blobs
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from fkdc.config import clasificadores, grillas
from fkdc.tarea import Tarea
from fkdc.datasets import Dataset

import os
import pickle

os.getcwd()
from pathlib import Path

# Seed identifying the experiment run whose artifacts this notebook reads.
# Alternative run: main_seed = 1732373523
main_seed = 3179636698

# NOTE(review): absolute, machine-specific checkout path — adjust when running elsewhere.
root_dir = Path("/Users/gonzalo/Git/fkdc")
data_dir = root_dir / "docs/data"
img_dir = root_dir / "docs/img"
run_dir = root_dir / f"runs/run-{main_seed}"

# Create both output directories (and any missing parents) up front so that the
# later `to_csv` / `savefig` calls do not fail on a fresh checkout.
data_dir.mkdir(parents=True, exist_ok=True)
img_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Seeds object pickled by the run. `read_bytes()` opens and closes the file
# itself, so no file handle is left dangling.
# NOTE(review): unpickling executes arbitrary code — only load trusted run files.
seeds = pickle.loads((run_dir / f"{main_seed}-run_seeds.pkl").read_bytes())

In [None]:
# Two-blob toy dataset: 400 samples per center, centers at (0, 0) and (10, 0).
# `Dataset.de_fabrica` presumably forwards these keyword arguments to
# `make_blobs` — confirm against fkdc.datasets.
seed = 2024
blobs_kwargs = {
    "n_samples": (400, 400),
    "n_features": 2,
    "centers": ((0, 0), (10, 0)),
    "random_state": seed,
}
ds = Dataset.de_fabrica(make_blobs, **blobs_kwargs)
ds.scatter()

In [None]:
# Display the classifier registry imported from fkdc.config.
clasificadores

In [None]:
# Cached two-blob task, stored one level above the run directory.
run_2blobs = run_dir / f"../2blobs-{seed}.pkl"
if run_2blobs.exists():
    # `read_bytes()` opens and closes the file itself, so no handle is leaked.
    # NOTE(review): unpickling executes arbitrary code — only load trusted files.
    tarea = pickle.loads(run_2blobs.read_bytes())
else:
    # No cache yet: build the task with every registered classifier paired with
    # its hyper-parameter grid, run it, and store it at `run_2blobs`.
    tarea = Tarea(
        ds,
        {nombre: (clf, grillas[nombre]) for nombre, clf in clasificadores.items()},
        seed=seed,
    )
    tarea.entrenar()
    tarea.evaluar()
    tarea.guardar(run_2blobs)

In [None]:
# Metric name in `tarea.info` -> column header used in the exported table.
campos = {"logvero": "$cal(l)$", "r2": "$R^2$", "accuracy": "exac"}
# One row per classifier, keeping only the metrics listed in `campos`.
info_por_clf = pd.DataFrame(tarea.info).T
tabla = info_por_clf[campos.keys()].rename(columns=campos).astype(float)
# Prefix every classifier name with "#".
tabla.index = "#" + tabla.index
tabla

In [None]:
# Export the two-blob metrics table, rounded to 4 decimals, as CSV.
tabla.round(4).to_csv(data_dir / "2-blobs.csv")

In [None]:
import pickle

# One scatter panel per 2-D dataset ('lo' variant) for a single seed.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
seed = 4107
datasets_2d = ["lunas", "espirales", "circulos"]
for nombre, ax in zip(datasets_2d, axs):
    ruta = run_dir / f"dataset-('{nombre}', {seed}, 'lo').pkl"
    # `read_bytes()` closes the file itself; no handle is leaked per iteration.
    ds = pickle.loads(ruta.read_bytes())
    ds.scatter(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-lunas-circulos-espirales.svg")

In [None]:
# Results of every (dataset, seed) run for the 'lo' variant, keyed by
# (nombre, seed). `read_bytes()` opens and closes each file itself, so no
# file handles are leaked while building the dict.
infos = {
    (nombre, seed): pickle.loads(
        (run_dir / f"info-('{nombre}', {seed}, 'lo').pkl").read_bytes()
    )
    for seed in seeds
    for nombre in datasets_2d
}

In [None]:
# Long-format accuracy table: one row per (dataset, seed, classifier).
exacs = pd.DataFrame(
    {clave: pd.DataFrame(info).loc["accuracy"] for clave, info in infos.items()}
).T
exacs = exacs.melt(ignore_index=False, var_name="clf", value_name="exac")
# The two index levels come from the (nombre, seed) keys of `infos`.
exacs = exacs.reset_index(names=["dataset", "semilla"])
exacs["exac"] = exacs["exac"].astype(float)
exacs

In [None]:
# Mean accuracy per classifier (rows) and dataset (columns), rounded to 3 decimals.
exac_media = exacs.groupby(["clf", "dataset"])["exac"].mean().reset_index()
exac_media.pivot(index="clf", columns="dataset", values="exac").round(3)

In [None]:
# Accuracy box plots: one panel per dataset, one box per classifier.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
seed = 4107
for idx, (nombre, ax) in enumerate(zip(datasets_2d, axs)):
    data = exacs[exacs["dataset"] == nombre]
    sns.boxplot(data, hue="clf", y="exac", gap=0.2, ax=ax)
    ax.set_title(nombre)
    # Dotted reference line at the highest per-classifier median.
    mejor_mediana = data.groupby("clf")["exac"].median().max()
    ax.axhline(mejor_mediana, linestyle="dotted", color="gray")
    # Keep the legend only on the first panel.
    if idx > 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "boxplot-lunas-espirales-circulos.svg")

In [None]:
# Mean and standard deviation of accuracy per (dataset, classifier), pivoted
# so classifiers are rows and (dataset, statistic) pairs are columns.
resumen = exacs.groupby(["dataset", "clf"])["exac"].agg(["mean", "std"]).reset_index()
tabla = resumen.pivot(index="clf", columns="dataset", values=["mean", "std"])
# pivot yields (statistic, dataset) columns; put the dataset level first.
tabla.columns = tabla.columns.reorder_levels([1, 0])

In [None]:
# Express accuracies as percentages with 2 decimals, with columns sorted by
# (dataset, statistic), then export.
columnas_ordenadas = tabla.columns.sortlevel()[0]
tabla = (100 * tabla).round(2)[columnas_ordenadas]
tabla.to_csv(data_dir / "exac-ds-2d.csv")
tabla

In [None]:
# Peek at the first five keys of `infos`.
list(infos.keys())[:5]

In [None]:
# Long-format R^2 table: one row per (dataset, seed, classifier).
r2_por_corrida = {k: pd.DataFrame(info).loc["r2"] for k, info in infos.items()}
rsqs = pd.DataFrame(r2_por_corrida).T
rsqs = rsqs.melt(ignore_index=False, var_name="clf", value_name="r2")
rsqs = rsqs.reset_index(names=["dataset", "semilla"])
rsqs["r2"] = rsqs["r2"].astype(float)
rsqs

In [None]:
# R^2 box plots: one panel per dataset, one box per classifier.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
seed = 4107
for idx, (nombre, ax) in enumerate(zip(datasets_2d, axs)):
    data = rsqs[rsqs["dataset"] == nombre]
    sns.boxplot(data, hue="clf", y="r2", gap=0.2, ax=ax)
    ax.set_title(nombre)
    # Dotted reference line at the highest per-classifier median.
    mejor_mediana = data.groupby("clf")["r2"].median().max()
    ax.axhline(mejor_mediana, linestyle="dotted", color="gray")
    # Clip the y axis at zero from below; the upper limit stays automatic.
    ax.set_ylim(0, None)
    if idx > 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "boxplot-r2-lunas-espirales-circulos.svg")

In [None]:
# Mean and standard deviation of R^2 per (dataset, classifier), pivoted so
# classifiers are rows and (dataset, statistic) pairs are columns.
resumen = rsqs.groupby(["dataset", "clf"])["r2"].agg(["mean", "std"]).reset_index()
tabla = resumen.pivot(index="clf", columns="dataset", values=["mean", "std"])
# pivot yields (statistic, dataset) columns; put the dataset level first.
tabla.columns = tabla.columns.reorder_levels([1, 0])

In [None]:
# Sort columns by (dataset, statistic), round to 3 decimals, and drop
# classifiers with any missing value before exporting.
columnas_ordenadas = tabla.columns.sortlevel()[0]
tabla = tabla[columnas_ordenadas].round(3)
tabla = tabla.dropna()
tabla.to_csv(data_dir / "r2-ds-2d.csv")
tabla

In [None]:
# Cross-validation results of the kdc and fkdc searches for one chosen run,
# stacked into a single frame with an "est" column identifying the estimator.
corrida = ("circulos", 4479)
cv_por_est = {
    est: pd.DataFrame(infos[corrida][est].busqueda.cv_results_)
    for est in ("kdc", "fkdc")
}
df = pd.concat(cv_por_est, names=["est", "index"]).reset_index()

In [None]:
# First distinct `param_alpha` value among the kdc rows.
# NOTE(review): `coso` is never read in the visible cells — possibly a leftover.
coso = df.query("est == 'kdc'").param_alpha.unique()[0]

In [None]:
# Fraction of missing `param_alpha` values per estimator.
df.groupby("est").param_alpha.agg(lambda x: x.isna().mean())

In [None]:
import numpy as np

In [None]:
# Row counts per (estimator, alpha), restricted to rows whose alpha is missing
# or equal to 1; dropna=False keeps the NaN-alpha group in the result.
df[df.param_alpha.isna() | (df.param_alpha == 1)].groupby(
    ["est", "param_alpha"], dropna=False
).size()

In [None]:
# CV score versus bandwidth: all kdc rows against the fkdc rows with alpha == 1.
es_kdc = df.est == "kdc"
es_fkdc_alpha_1 = (df.est == "fkdc") & (df.param_alpha == 1)
sns.lineplot(
    df[es_kdc | es_fkdc_alpha_1],
    y="mean_test_score",
    x="param_bandwidth",
    hue="est",
)
plt.xscale("log")

In [None]:
# CV score versus bandwidth for fkdc only, one line per alpha value.
solo_fkdc = df[df.est == "fkdc"]
sns.lineplot(solo_fkdc, y="mean_test_score", x="param_bandwidth", hue="param_alpha")
plt.xscale("log")

In [None]:
# Score grid for fkdc: rows indexed by alpha, columns by bandwidth.
solo_fkdc = df[df.est == "fkdc"]
data = solo_fkdc.pivot(
    index="param_alpha", columns="param_bandwidth", values="mean_test_score"
)
# X: bandwidths, Y: alphas, Z: mean CV score with shape (len(Y), len(X)).
X, Y, Z = data.columns.values, data.index.values, data.values

In [None]:
# Hyper-parameter columns of the top-ranked CV rows (rank_test_score == 1).
df[df.rank_test_score == 1].filter(like="param_")

In [None]:
# Best hyper-parameters found by the fkdc search for the chosen run.
infos[corrida].fkdc.busqueda.best_params_

In [None]:
import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes NumPy scalars and arrays.

    NumPy booleans, integers and floats are converted to the matching Python
    built-ins, and arrays to (nested) lists. Any other unsupported object is
    delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


In [None]:
# Summary of the chosen run: best hyper-parameters and accuracy for every
# estimator except the "base" entry.
sin_base = {est: info for est, info in infos[corrida].items() if est != "base"}
data = {
    "corrida": corrida,
    "best_params": {est: info.busqueda.best_params_ for est, info in sin_base.items()},
    "exac": {est: info.accuracy for est, info in sin_base.items()},
}

In [None]:
# Serialize exactly once. `dumped` already holds the JSON text, so it is
# written verbatim; running it through `json.dump` again would store a quoted,
# escaped JSON *string* rather than a JSON object.
dumped = json.dumps(data, cls=NumpyEncoder)

with open(data_dir / "best_params-2d-lo.json", "w") as f:
    f.write(dumped)

In [None]:
from matplotlib import colors

# Filled contour map of the fkdc score grid over (bandwidth h, alpha).
fig, ax = plt.subplots(layout="constrained")
zmin, zmax = Z.min(), Z.max()
# NOTE(review): the contours show Z**2 (squared scores) although the colorbar
# is labelled "Exactitud" — confirm whether the squaring is intentional.
niveles = 15
CS = ax.contourf(X, Y, Z**2, niveles, cmap="viridis")
# Red crosses: for each alpha (row of Z), the bandwidth with the highest score.
h_optimo = X[Z.argmax(axis=1)]
ax.scatter(h_optimo, Y, marker="x", color="red")
ax.set_xscale("log")
ax.set_title("Exactitud para $\\alpha$ y $h$")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
# Colorbar for the ContourSet returned by the contourf call.
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel("Exactitud")
fig.savefig(img_dir / "heatmap-fkdc-2d-lo.svg")

In [None]:
from matplotlib import colors

# Line-contour version of the fkdc score grid over (bandwidth h, alpha).
fig, ax = plt.subplots(layout='constrained')
zmin, zmax = Z.min(), Z.max()
# NOTE(review): as in the filled version, the plotted quantity is Z**2 while
# the colorbar is labelled "Exactitud" — confirm this is intended.
niveles = 15
CS = ax.contour(X, Y, Z**2, niveles, cmap="viridis")
ax.set_xlabel("$h$")
ax.set_ylabel("$\\alpha$")
ax.set_title('Exactitud para $\\alpha$ y $h$')

# Colorbar for the ContourSet returned by the contour call.
cbar = fig.colorbar(CS)
cbar.ax.set_ylabel('Exactitud')
plt.xscale("log")

In [None]:
# Same view as the heat maps, as lines: fkdc CV score versus bandwidth, one
# line per alpha value.
filas_fkdc = df.loc[df.est == "fkdc"]
sns.lineplot(
    filas_fkdc, x="param_bandwidth", y="mean_test_score", hue="param_alpha"
)
plt.xscale("log")

In [None]:
# Hand-picked subset of run seeds used for the grid of heat maps.
some_seeds = [5303, 1115, 7761]

In [None]:
from matplotlib import colors
from itertools import product

# 3x3 grid of fkdc score heat maps: one panel per (dataset, seed) combination.
fig, axs = plt.subplots(3, 3, figsize=(35, 30), layout="constrained")
for ax, corrida in zip(axs.flat, product(datasets_2d, some_seeds)):
    cv_por_est = {
        est: pd.DataFrame(infos[corrida][est].busqueda.cv_results_)
        for est in ("kdc", "fkdc")
    }
    df = pd.concat(cv_por_est, names=["est", "index"]).reset_index()
    coso = df.query("est == 'kdc'").param_alpha.unique()[0]
    # Score grid for fkdc: rows indexed by alpha, columns by bandwidth.
    data = df[df.est == "fkdc"].pivot(
        index="param_alpha", columns="param_bandwidth", values="mean_test_score"
    )
    X, Y, Z = data.columns.values, data.index.values, data.values
    zmin, zmax = Z.min(), Z.max()
    CS = ax.contourf(X, Y, Z, 15, cmap="viridis")
    # Red crosses: for each alpha (row of Z), the bandwidth with the highest score.
    ax.scatter(X[Z.argmax(axis=1)], Y, marker="x", color="red")
    ax.set_xscale("log")
    ax.set_title(f"Exactitud para {corrida}")
    ax.set_xlabel("$h$")
    ax.set_ylabel("$\\alpha$")
fig.savefig(img_dir / "many-heatmaps-fkdc-2d-lo.svg")

In [None]:
import pickle

# One scatter panel per 2-D dataset ('hi' variant) for a single seed.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
seed = 4107
datasets_2d = ["lunas", "espirales", "circulos"]
for nombre, ax in zip(datasets_2d, axs):
    ruta = run_dir / f"dataset-('{nombre}', {seed}, 'hi').pkl"
    # `read_bytes()` closes the file itself; no handle is leaked per iteration.
    ds = pickle.loads(ruta.read_bytes())
    ds.scatter(ax=ax)
    ax.set_title(nombre)
plt.tight_layout()
fig.savefig(img_dir / "datasets-lunas-circulos-espirales-hi.svg")

In [None]:
# Results of every (dataset, seed) run for the 'hi' variant, keyed by
# (nombre, seed). `read_bytes()` opens and closes each file itself, so no
# file handles are leaked while building the dict.
infos = {
    (nombre, seed): pickle.loads(
        (run_dir / f"info-('{nombre}', {seed}, 'hi').pkl").read_bytes()
    )
    for seed in seeds
    for nombre in datasets_2d
}

In [None]:
# Long-format accuracy table for the currently loaded `infos`: one row per
# (dataset, seed, classifier).
exac_por_corrida = {
    clave: pd.DataFrame(info).loc["accuracy"] for clave, info in infos.items()
}
exacs = pd.DataFrame(exac_por_corrida).T
exacs = exacs.melt(ignore_index=False, var_name="clf", value_name="exac")
exacs = exacs.reset_index(names=["dataset", "semilla"])
exacs["exac"] = exacs["exac"].astype(float)
exacs

In [None]:
# Accuracy box plots ('hi' figure): one panel per dataset, one box per classifier.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
for idx, (nombre, ax) in enumerate(zip(datasets_2d, axs)):
    data = exacs[exacs["dataset"] == nombre]
    sns.boxplot(data, hue="clf", y="exac", gap=0.2, ax=ax)
    ax.set_title(nombre)
    # Dotted reference line at the highest per-classifier median.
    mejor_mediana = data.groupby("clf")["exac"].median().max()
    ax.axhline(mejor_mediana, linestyle="dotted", color="gray")
    # Keep the legend only on the first panel.
    if idx > 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "boxplot-lunas-espirales-circulos-hi.svg")

In [None]:
# Mean and standard deviation of accuracy per (dataset, classifier), pivoted
# so classifiers are rows and (dataset, statistic) pairs are columns.
estadisticos = ["mean", "std"]
resumen = exacs.groupby(["dataset", "clf"])["exac"].agg(estadisticos).reset_index()
tabla = resumen.pivot(index="clf", columns="dataset", values=estadisticos)
# pivot yields (statistic, dataset) columns; put the dataset level first.
tabla.columns = tabla.columns.reorder_levels([1, 0])

In [None]:
# Express accuracies as percentages with 2 decimals, with columns sorted by
# (dataset, statistic).
columnas_ordenadas = tabla.columns.sortlevel()[0]
tabla = (100 * tabla).round(2)[columnas_ordenadas]
# Written with the "-hi" suffix used by the other 'hi' artifacts, so the 'lo'
# table in "exac-ds-2d.csv" is not silently overwritten.
# NOTE(review): update any document that expected the 'hi' numbers under the old name.
tabla.to_csv(data_dir / "exac-ds-2d-hi.csv")
tabla

In [None]:
# Long-format R^2 table for the currently loaded `infos`: one row per
# (dataset, seed, classifier).
rsqs = pd.DataFrame(
    {clave: pd.DataFrame(info).loc["r2"] for clave, info in infos.items()}
).T
rsqs = rsqs.melt(ignore_index=False, var_name="clf", value_name="r2")
rsqs = rsqs.reset_index(names=["dataset", "semilla"])
rsqs["r2"] = rsqs["r2"].astype(float)
rsqs

In [None]:
# R^2 box plots ('hi' figure): one panel per dataset, one box per classifier.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
seed = 4107
for idx, (nombre, ax) in enumerate(zip(datasets_2d, axs)):
    data = rsqs[rsqs["dataset"] == nombre]
    sns.boxplot(data, hue="clf", y="r2", gap=0.2, ax=ax)
    ax.set_title(nombre)
    # Dotted reference line at the highest per-classifier median.
    mejor_mediana = data.groupby("clf")["r2"].median().max()
    ax.axhline(mejor_mediana, linestyle="dotted", color="gray")
    # Keep the legend only on the first panel.
    if idx > 0:
        ax.get_legend().set_visible(False)
fig.tight_layout()
fig.savefig(img_dir / "boxplot-r2-lunas-espirales-circulos-hi.svg")

In [None]:
# Mean and standard deviation of R^2 per (dataset, classifier), pivoted so
# classifiers are rows and (dataset, statistic) pairs are columns.
estadisticos = ["mean", "std"]
resumen = rsqs.groupby(["dataset", "clf"])["r2"].agg(estadisticos).reset_index()
tabla = resumen.pivot(index="clf", columns="dataset", values=estadisticos)
# pivot yields (statistic, dataset) columns; put the dataset level first.
tabla.columns = tabla.columns.reorder_levels([1, 0])

In [None]:
# Sort columns by (dataset, statistic), round to 3 decimals, and drop
# classifiers with any missing value.
tabla = tabla[tabla.columns.sortlevel()[0]].round(3).dropna()
# Written with the "-hi" suffix used by the other 'hi' artifacts, so the 'lo'
# table in "r2-ds-2d.csv" is not silently overwritten.
# NOTE(review): update any document that expected the 'hi' numbers under the old name.
tabla.to_csv(data_dir / "r2-ds-2d-hi.csv")
tabla

In [None]:
def get_seeds(main_seed):
    """Load the seeds object pickled for the run identified by `main_seed`.

    The file is read from ``<root_dir>/runs/run-<main_seed>/<main_seed>-run_seeds.pkl``,
    where `root_dir` is the notebook-level project root.

    NOTE(review): unpickling executes arbitrary code — only load trusted run files.
    """
    run_dir = root_dir / ("runs/run-%i" % main_seed)
    # `read_bytes()` opens and closes the file itself, so no handle is leaked.
    return pickle.loads((run_dir / ("%i-run_seeds.pkl" % main_seed)).read_bytes())


def list_items(run_dir, kind):
    """Load every ``<kind>-<key>.pkl`` file found directly inside `run_dir`.

    Parameters
    ----------
    run_dir : pathlib.Path
        Directory to scan (non-recursive).
    kind : str
        File-name prefix before the dash, e.g. ``"info"``.

    Returns
    -------
    dict
        Maps each parsed ``<key>`` (a Python literal such as a tuple) to the
        unpickled file content.

    Raises
    ------
    ValueError, SyntaxError
        If the text after the prefix is not a valid Python literal.

    NOTE(review): unpickling executes arbitrary code — only point this at
    trusted run directories.
    """
    import ast  # local import keeps this cell self-contained

    prefix = f"{kind}-"
    items = {}
    for path in run_dir.glob(f"{prefix}*.pkl"):
        # The glob guarantees the stem starts with `prefix`, so slice it off
        # exactly. (`str.lstrip` removes a *set of characters* and can eat into
        # the key itself.)
        raw_key = path.stem[len(prefix):]
        # literal_eval only accepts literals, so a crafted file name cannot run code.
        key = ast.literal_eval(raw_key)
        # `read_bytes()` opens and closes the file itself; no handle is leaked.
        items[key] = pickle.loads(path.read_bytes())
    return items


def list_infos(run_dir):
    """Return the ``info-*.pkl`` items of `run_dir`, as loaded by `list_items`."""
    return list_items(run_dir, "info")

In [None]:
# Reload the run artifacts through the helpers: this time *every* info file in
# the run directory is loaded, keyed by whatever key its file name encodes.
run_dir = root_dir / ("runs/run-%i" % main_seed)
seeds = get_seeds(main_seed)
infos = list_infos(run_dir)

In [None]:
# Display the names of the 2-D datasets in use.
datasets_2d

## Tomados de otros notebooks

### Decision Boundary Display

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Set up a 2x3 grid for plotting: one panel per classifier.
fig, axs = plt.subplots(2, 3, figsize=(18, 12))
plt.subplots_adjust(wspace=0.1, hspace=0.1)

# Evaluation split of the task; the indexing below requires at least two feature columns.
X = tarea.X_eval
y = tarea.y_eval
X0, X1 = X[:, 0], X[:, 1]

for (nombre, clf), ax in zip(tarea.clasificadores.items(), axs.flatten()):
    # Predicted-class regions of the classifier over the evaluation data range.
    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        eps=0.05,
        response_method="predict",
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=ax,
        xlabel="x",
        ylabel="y",
    )
    # Overlay the evaluation points, colored by their true class.
    ax.scatter(X0, X1, c=y.astype(float), cmap=plt.cm.coolwarm, s=20, edgecolors="gray")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(f"{nombre} ({tarea.puntajes[nombre] * 100:.2f}% acc.)")

plt.show()

### Pair Plot

In [None]:
# NOTE(review): `X` and `y` are whatever earlier cells left in scope; the three
# column names below require X to have exactly three columns — confirm against
# the notebook this cell was taken from.
helices = Dataset("helices", X, y)
df = pd.DataFrame(X, columns=["x", "y", "z"])
df["clase"] = y
# Pairwise scatter matrix of the coordinates, colored by class.
sns.pairplot(df, hue="clase")

### ¿El gráfico de abajo, pero en 3D?
https://matplotlib.org/stable/gallery/mplot3d/bars3d.html#sphx-glr-gallery-mplot3d-bars3d-py

In [None]:
# Test-stage score versus bandwidth, one line per alpha value.
# NOTE(review): `grilla` is not defined in the visible cells — it must come from
# the notebook this cell was taken from.
solo_test = grilla[grilla.stage == "test"]
sns.lineplot(data=solo_test, x="clf__bandwidth", y="score", hue="clf__alpha")
plt.xlabel("$h$")
plt.ylabel("Exactitud [%]")
plt.legend(title="$\\alpha$")
# Dotted reference line at the fkdc score of the task, as a percentage.
plt.axhline(tarea.puntajes.fkdc * 100, linestyle="dotted", color="gray")
plt.xscale("log")

### Sapienza's Swiss Roll
> We use the well-known example coined “Swiss roll”, Figure 1(a) and 1(b). We consider a dataset composed of 4 subsets steaming from independent Normal distributions (restricted to the unit square) with mean $\mu_1 = (.3, .3), \mu_2 = (.3, .7), \mu_3 = (.7, .3), \mu_4 = (.7, .7)$ respectively and constant variance, Figure 1(a). Then, we apply the Swiss Roll transformation, Figure 1(b).

In [None]:
# Imported here because this cell was brought over from another notebook and
# `stats` is not imported anywhere else in this one.
import numpy as np
from scipy import stats

# Four Gaussian clusters centered on the 2x2 grid {0.3, 0.7} x {0.3, 0.7}.
n_samples = 200
limites = (0.3, 0.7)
centros = [(x, y) for x in limites for y in limites]
# Isotropic covariance: variance 0.01 on every axis.
varianza = 0.01 * np.identity(len(centros[0]))
Xs = np.vstack(
    [stats.multivariate_normal(mean=c, cov=varianza).rvs(n_samples) for c in centros]
)
# Class label i for the n_samples points drawn around the i-th center.
ys = np.concatenate([np.ones(n_samples) * i for i in range(len(centros))])
ds = Dataset("swissroll", Xs, ys)
df = pd.DataFrame(Xs, columns = ["x", "y"])
df["clase"] = ys.astype(str)
sns.scatterplot(data=df, x="x", y="y", hue="clase")

In [None]:
def swissroll(x, y, noise=0.005, return_t=False, random_state=None):
    """Map 2-D points onto a 3-D Swiss roll.

    Modified version of https://homepages.ecs.vuw.ac.nz/~marslast/Code/Ch6/lle.py

    Parameters
    ----------
    x, y : numpy arrays of equal length N
        Planar coordinates. `x` sets the angle along the spiral,
        ``t = 2*pi*(1 + 2*x)``, and `y` the height, ``h = 21*y``.
    noise : float, default 0.005
        Scale of the standard-normal noise added to every output coordinate.
    return_t : bool, default False
        If True, also return the spiral angle `t` of every point.
    random_state : None, int or numpy.random.Generator, default None
        Source of randomness for the noise. ``None`` draws from NumPy's global
        random state; any other value is passed to ``numpy.random.default_rng``,
        which makes the result reproducible.

    Returns
    -------
    numpy.ndarray of shape (N, 3)
        Columns are ``(t*cos(t), h, t*sin(t))`` plus noise. When `return_t` is
        True, a tuple ``(points, t)`` is returned instead.
    """
    N = len(x)

    t = 2 * np.pi * (1 + 2 * x)
    h = 21 * y
    if random_state is None:
        ruido = np.random.randn(3, N)
    else:
        ruido = np.random.default_rng(random_state).standard_normal((3, N))
    data = np.vstack((t * np.cos(t), h, t * np.sin(t))) + noise * ruido
    if return_t:
        return np.transpose(data), np.squeeze(t)
    return np.transpose(data)



In [None]:
# Roll the planar clusters into 3-D and attach the class labels as strings.
x_plano, y_plano = Xs[:, 0], Xs[:, 1]
swissed = pd.DataFrame(swissroll(x_plano, y_plano), columns=["x", "y", "z"])
swissed["clase"] = ys.astype(str)
# Show ten random rows as a sanity check.
swissed.sample(10)

In [None]:
# Pairwise scatter matrix of the rolled data, colored by class. A plain dict
# carries the plot keyword arguments, since `Bunch` is never imported in this notebook.
sns.pairplot(swissed, hue="clase", plot_kws={"alpha": 0.2})