# 00. Primer Acercamiento a Clasificación con Distancia de Fermat: LandmarksClassifier
La clase `fermat.Fermat` sólo devuelve distancias entre las observaciones de entrenamiento `X_train` pasadas a `Fermat.fit(X_train)`. Para usar la estimación $D_{\mathbb{X}_n}$ de la distancia de Fermat $\mathcal{D_f}$ en tareas de clasificación, hay que extenderla a puntos no observados.

En este _notebook_ propongo la versión más bruta posible: para cada clase de entrenamiento (`X_train[y_train == cls]`), agregar la observación a predecir a la clase, calcular las nuevas distancias $D_{\mathbb{X}_{n+1}}$, y tomar el promedio de las distancias de la observación a los elementos de la clase de entrenamiento. Asignar la observación a la clase con menor distancia promedio.

Una sofisticación inmediata: es posible calcular la distancia de una nueva observación `x` a cada punto de una muestra de tamaño `n` en orden `n`, reutilizando las distancias ya calculadas entre los puntos de la muestra.

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from fermat import Fermat
from sklearn.datasets import load_iris

In [None]:
# Load the iris data as pandas objects: X holds the features, y the target.
X, y = load_iris(return_X_y=True, as_frame=True)
# N = number of observations (rows), D = number of features (columns).
N, D = X.shape
N, D

In [None]:
# Distinct class labels found in the target.
classes = y.unique()
# Pairwise distance matrix over the whole sample.
all_distances = distance_matrix(X, X)
# One pairwise distance matrix per class, restricted to the rows of that class.
cls_distances = {cls: distance_matrix(X[y == cls], X[y == cls]) for cls in classes}


In [None]:
# Shape of the global distance matrix, then the shape of every per-class matrix.
all_distances.shape, {cls: mat.shape for cls, mat in cls_distances.items()}

**CUIDADO**: Si tomo `f = Fermat(alpha=4, path_method="FW", ...)`
Sucesivas llamadas a `f.fit(X)` retornan una nueva instancia Fermat fiteada, pero fitean también el `f` "de base". Es un comportamiento esperado para los BaseEstimator de sklearn. Es problemático, el `partial` te deja trabajar con un "factory" manejable.

In [None]:
from functools import partial
# Factory: every call to f() builds a fresh Fermat estimator with these settings.
f = partial(Fermat, alpha=4, path_method="FW")
# Estimator fitted on the distances of the whole sample.
f_all = f().fit(all_distances)
# One independently fitted estimator per class.
f_cls = {cls: f().fit(cls_distances[cls]) for cls in classes}

For any two points decide if they belong to the same class or not according to the distance in every class:

In [None]:
def get_distance_through(p, q, cls):
    """Return the Fermat distance between observations `p` and `q` through class `cls`.

    The estimator is fitted on the two observations together with every
    observation of class `cls`, so paths between `p` and `q` may only pass
    through points of that class.

    Parameters
    ----------
    p, q : index labels of the two observations in the notebook-level `X`.
    cls : class label used to select the intermediate points (`X[y == cls]`).

    Returns
    -------
    The value of `get_distance(0, 1)` on a Fermat estimator fitted on the
    restricted sample.
    """
    # p and q go first so that they sit at positions 0 and 1 of the matrix.
    nodes = pd.concat([X.loc[[p, q]], X[y == cls]])
    # The distance matrix must be built from `nodes`; using the full `X`
    # would make the result independent of p, q and cls.
    dist_mat = distance_matrix(nodes, nodes)
    return f().fit(dist_mat).get_distance(0, 1)

In [None]:
# For three random pairs of observations, compare the Fermat distance computed
# on the whole sample with the distance restricted to each class.
for p, q in np.random.randint(0, N, (3, 2)):
    print(f"p := {X.loc[p].values} (class {y[p]})")
    print(f"q := {X.loc[q].values} (class {y[q]})")
    dists = {"all": f_all.get_distance(p, q)}
    dists.update((cls, get_distance_through(p, q, cls)) for cls in classes)
    print(pd.Series(dists))

No tiene mucha utilidad el paquete de Fermat para clasificar as-is o estoy loco? No generaliza bien la distancia a nuevos puntos.

## Out-of-training predictions for FermatKMeans

In [None]:
import itertools as it
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import euclidean_distances
from sklearn.model_selection import train_test_split
from numpy.random import RandomState
from fermat.kmedoids import KMedoids

In [None]:
class LandmarksClassifier(BaseEstimator, ClassifierMixin):
    """Classifier based on the mean Fermat distance to per-class landmarks.

    For every class seen in `fit`, up to `k` training points ("landmarks")
    are kept. A new observation is assigned to the class whose landmarks are,
    on average, closest to it under the Fermat distance computed on the
    observation plus those landmarks.

    Parameters
    ----------
    alpha : exponent passed to `Fermat`.
    k : number of landmarks to take from each class (capped at the class size).
    method : "kmedoids" (medoids of a k-medoids clustering on Fermat
        distances) or "random" (uniform sample without replacement).
    seed : seed for `Fermat`, `KMedoids` and the random landmark choice;
        a random one is drawn when it is None.

    Raises
    ------
    ValueError
        If `method` is not one of the supported values.
    """

    def __init__(
        self, alpha=2, k=5, method="kmedoids", seed=None
    ):
        if method not in ["kmedoids", "random"]:
            # Use the argument: `self.method` is not assigned yet at this point.
            raise ValueError(f"{method} is not a valid landmarks' choosing method")
        self.alpha = alpha
        self.k = k
        self.method = method
        # `is None` instead of `or`, so that an explicit seed of 0 is honoured.
        self.seed = np.random.randint(2**32 - 1) if seed is None else seed
        self.rs = RandomState(self.seed)

    def fit(self, X, y):
        """Select the landmarks of every class in `y` and return `self`."""
        X, y = np.asarray(X), np.asarray(y)
        self.classes_ = np.unique(y)
        self.landmarks_ = {}
        for cls in self.classes_:
            X_cls = X[y == cls]
            n_cls = X_cls.shape[0]
            n_landmarks = min(self.k, n_cls)
            if self.method == "kmedoids":
                fmt = Fermat(alpha=self.alpha, path_method="FW", seed=self.seed)
                fmt.fit(euclidean_distances(X_cls))
                km = KMedoids(iterations=10, seed=self.seed)
                fermat_dists = fmt.get_distances()
                labels = km(fermat_dists, n_landmarks)
                centers = km._find_centers(fermat_dists, labels)
            else:
                # Sample k distinct rows; a single `randint` would keep only
                # one landmark regardless of `k`.
                centers = self.rs.choice(n_cls, size=n_landmarks, replace=False)
            self.landmarks_[cls] = X_cls[centers]
        return self

    def _distances(self, x):
        """Return {class: mean Fermat distance from `x` to the class landmarks}."""
        distances = {}
        for cls, landmarks in self.landmarks_.items():
            X_cls = np.vstack([x, landmarks])  # x is at the top of X_cls
            fmt = Fermat(alpha=self.alpha, path_method="FW", seed=self.seed)
            fmt.fit(euclidean_distances(X_cls))
            # Row 0 holds the distances from x. Its first entry is the distance
            # from x to itself, so it is left out of the mean; keeping it would
            # favour classes with fewer landmarks.
            distances[cls] = fmt.get_distances()[0][1:].mean()
        return distances

    def _predict(self, x):
        """Return the class whose landmarks are closest to `x` on average."""
        distances = self._distances(x)
        return min(distances, key=distances.get)

    def predict(self, X):
        """Return an array with the predicted class of every row of `X`."""
        # Must go through `self`, not through a notebook-level instance.
        return np.array([self._predict(x) for x in np.asarray(X)])


In [None]:
# Full experiment grid; the keys match the parameters of `run`.
# NOTE(review): "diabetes" is a regression dataset in scikit-learn — confirm it
# belongs in a classification benchmark.
config = dict(
    n_iter=16,
    test_sizes=[0.1, 0.5],
    datasets=["digits", "iris", "breast_cancer", "wine", "diabetes"],
    alphas=[0.5, 1, 1.5, 2, 3, 4],
    ks=[3, 10, 30, 100],
    methods=["kmedoids", "random"],
    # n_estimators=[3, 10, 30]   # TODO: LATER
    # max_depths=[3, 6, 9]
)
# Minimal grid (one dataset, one split, one parameter combination) for a quick run.
simple_config = dict(
    n_iter=1,
    test_sizes=[0.1],
    datasets=["digits"],
    alphas=[2],
    ks=[10],
    methods=["kmedoids"],
    # n_estimators=[3, 10, 30]   # TODO: LATER
    # max_depths=[3, 6, 9]
)

In [None]:
def run(n_iter, test_sizes, datasets, alphas, ks, methods):
    """Benchmark `LandmarksClassifier` against a random forest baseline.

    For every (dataset, test size) pair and every one of `n_iter` train/test
    splits, a 10-tree random forest is scored once, and a
    `LandmarksClassifier` is scored for every combination of `alphas`, `ks`
    and `methods`.

    Parameters
    ----------
    n_iter : number of splits per (dataset, test size); the split index is
        used as `random_state`.
    test_sizes : values forwarded to `train_test_split(test_size=...)`.
    datasets : names such that `sklearn.datasets.load_<name>` exists.
    alphas, ks, methods : grids of `LandmarksClassifier` parameters.

    Returns
    -------
    list of dict
        One record per successful fit, holding the score and the settings
        that produced it. Failed fits are reported on stdout and skipped.
    """
    from sklearn import datasets as sk_datasets

    runs = []
    for ds, size in it.product(datasets, test_sizes):
        ds_log = {"ds": ds, "size": size}
        print(ds_log)
        # Plain attribute lookup; no need to `eval` a string.
        loader = getattr(sk_datasets, f"load_{ds}")
        X, y = loader(return_X_y=True)
        for i in range(n_iter):
            # Separate dict per level, so that landmark settings from a
            # previous iteration cannot leak into the baseline record.
            split_log = {**ds_log, "i": i}
            print(split_log)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=size, random_state=i
            )
            try:
                rfclf = RandomForestClassifier(n_estimators=10)
                rfclf.fit(X_train, y_train)
                runs.append(
                    {"method": "rf", "score": rfclf.score(X_test, y_test), **split_log}
                )
            except Exception as e:
                print(f"Problem with RF for {split_log}: {e!r}")
            for alpha, k, method in it.product(alphas, ks, methods):
                log = {**split_log, "alpha": alpha, "k": k, "method": method}
                print(log)
                try:
                    lclf = LandmarksClassifier(alpha=alpha, k=k, method=method)
                    lclf.fit(X_train, y_train)
                    runs.append({"score": lclf.score(X_test, y_test), **log})
                except Exception as e:
                    print(f"Problem with Landmarks for {log}: {e!r}")

    return runs


In [None]:
# Run the benchmark on the minimal grid and keep the resulting records.
runs = run(**simple_config)

In [None]:
# from sklearn.model_selection import train_test_split
# Random forest baseline on digits with a single 80/20 split (no fixed seed).
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rfclf = RandomForestClassifier(n_estimators=10)
rfclf.fit(X_train, y_train)


In [None]:
# Score of the baseline on the held-out split.
rfclf.score(X_test, y_test)

In [None]:
# For every class of a stratified, seeded split: keep its training points
# ("verts") and the Fermat distances among them ("dists").
classes = {}
alpha = 2
seed = 34
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=seed
)
for cls in np.unique(y_train):
    # A dedicated name keeps the full data set in `X`, so this cell can be
    # re-run without splitting a single-class subset.
    X_cls = X_train[y_train == cls]
    fmt = Fermat(alpha=alpha, path_method="FW", seed=seed)
    fmt.fit(euclidean_distances(X_cls))
    classes[cls] = dict(verts=X_cls, dists=fmt.get_distances())

In [None]:
# Shape of the Fermat distance matrix stored for class 0.
classes[0]["dists"].shape

In [None]:
# Pick one random test observation together with its label.
ix = np.random.randint(0, X_test.shape[0])
# NOTE(review): this rebinds `y` (until now the full label vector) to a single
# label; cells that split `X, y` must not be re-run after this one.
x, y = X_test[ix], y_test[ix]
# plt.imshow(1- x.reshape(8,8), cmap="gray")
# plt.suptitle(y), plt.show()

In [None]:
# Show the chosen position, its label and its feature vector.
ix, y, x

In [None]:
# Work with a single class to check the incremental distance computation.
cls = 2
klass = classes[cls]
verts, dists = klass["verts"], klass["dists"]
n = verts.shape[0]
# Euclidean distance from x to every point of the class, raised to alpha.
to_verts = euclidean_distances(x.reshape(1, -1), verts)[0] ** alpha
# Reference: refit on x stacked on top of the class points, so row 0 belongs to x.
# `fmt` is the estimator left over from the per-class loop; it is refitted here.
all_dists = fmt.fit(euclidean_distances(np.vstack([x, verts]))).get_distances()


In [None]:
# Distance from x to point i = min over j of (|x - v_j|**alpha + dists[j, i]):
# broadcasting adds `to_verts` to every column of `dists`, then each column is
# reduced with a minimum.
new_dists = (to_verts[:, np.newaxis] + dists).min(axis=0).tolist()
# Must match row 0 of the refitted matrix (its first entry is x to itself).
assert np.allclose([0] + new_dists, all_dists[0])

### ÉXITO PAPÁÁÁÁÁÁÁ

In [None]:
from scipy.stats import norm
# Kernel used by the density estimate below: the normal pdf.
kern = norm.pdf

In [None]:
# Mean and standard deviation of the distances computed for the single class above.
mu, sigma = np.mean(all_dists), np.std(all_dists)
# Five evenly spaced candidates between mu - 2*sigma and mu + 2*sigma.
hs = np.linspace(mu - 2 * sigma, mu + 2 * sigma, 5)  # bandwidths

In [None]:
# Display the candidate bandwidths.
hs

In [None]:
# Bandwidth actually used below: the mean distance.
h = mu


In [None]:
# Number of test observations.
y_test.shape

In [None]:
# Kernel-density classification: for every test point, estimate a density per
# class from its Fermat distances to the class points, and predict the class
# with the highest estimate.
preds = []
fhats = []
for x in X_test:
    fhat = {}
    for cls, klass in classes.items():
        verts, dists = klass["verts"], klass["dists"]
        to_verts = euclidean_distances(x.reshape(1, -1), verts)[0] ** alpha
        # Distance from x to every class point: the cheapest first hop to some
        # class point plus the stored Fermat distance from there.
        fmt_dists = (to_verts[:, np.newaxis] + dists).min(axis=0)
        # NOTE(review): `D` was set from the iris data at the top of the
        # notebook, not from this data set. It only rescales all classes by
        # the same factor, so the prediction is unaffected — confirm intent.
        fhat[cls] = (1 / h**D) * np.mean(kern(fmt_dists / h))
    fhats.append(fhat)
    # idxmax returns the class label; argmax would return its position.
    preds.append(pd.Series(fhat).idxmax())

In [None]:
# One row per test point: per-class density estimates plus true and predicted labels.
densities = pd.DataFrame(fhats)
densities["true"] = y_test
densities["pred"] = preds
densities#[densities.true != densities.pred]

In [None]:
# Same table, showing only the misclassified test points.
densities = pd.DataFrame(fhats)
densities["true"] = y_test
densities["pred"] = preds
densities[densities.true != densities.pred]

In [None]:
# Display the full table.
densities

In [None]:
# Confusion table: rows are true labels, columns are predicted labels, cells
# count the test points; label pairs that never occur are filled with 0.
confusion = (
    pd.DataFrame({"y_test": y_test, "preds": preds})
    .groupby(["y_test", "preds"])
    .size()
    .rename("n")
    .reset_index()
    # Keyword arguments: recent pandas versions reject positional ones here.
    .pivot(index="y_test", columns="preds")
    .fillna(0)
    .astype(int)
)


In [None]:
# Display the confusion table.
confusion

In [None]:
# Draw the confusion table as a grayscale image.
plt.imshow(confusion, cmap="gray")

In [None]:
# How many test points were assigned to each predicted value.
pd.Series(preds).value_counts()

In [None]:
# Accuracy: fraction of test points whose prediction equals the true label.
(y_test == preds).mean()

In [None]:
# First ten true labels next to the first ten predictions.
y_test[:10], preds[:10]

In [None]:
# Density estimates of the last test point processed by the loop.
fhat

In [None]:
# Distribution of true labels next to the distribution of predictions.
pd.Series(y_test).value_counts(), pd.Series(preds).value_counts()

In [None]:
# Mean within-class Fermat distance for every class.
{i: np.mean(cls["dists"]) for i, cls in classes.items()}

In [None]:
# Persist the benchmark records to runs.csv, without the index column.
pd.DataFrame(runs).to_csv("runs.csv", index=False)

In [None]:
# Read the saved records back.
df = pd.read_csv("runs.csv")

In [None]:
# Rows with no alpha value, plus the rows with k == 100.
# Presumably the alpha-less rows are the random forest baseline — verify against `run`.
df[df.alpha.isna() | (df.k == 100)]