In [28]:
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from fermat import Fermat
from sklearn.datasets import load_iris

In [29]:
data, target = load_iris(return_X_y=True, as_frame=True)
N, D = data.shape
N, D

(150, 4)

In [30]:
classes = target.unique()
all_distances = distance_matrix(data, data)
cls_distances = {cls: distance_matrix(data[target == cls], data[target == cls]) for cls in classes}


In [31]:
all_distances.shape, {i: data.shape for i, data in cls_distances.items()}

((150, 150), {0: (50, 50), 1: (50, 50), 2: (50, 50)})

In [32]:
from functools import partial
f = partial(Fermat, alpha=4, path_method="FW")
# TODO: posible bug, si tomo 
# f = Fermat(alpha=4, path_method="FW")
# Sucesivas llamadas a f.fit(data) retornan una nueva instancia Fermat fiteada, pero fitean también el f "de base"
# Es problemático, el `partial` te deja trabajar con un "factory" manejable

f_all = f().fit(all_distances)
f_cls = {cls: f().fit(cls_distances[cls]) for cls in classes}

For any two points decide if they belong to the same class or not according to the distance in every class:

In [33]:
def get_distance_through(p, q, cls):
    X = pd.concat([data.loc[[p, q]], data[target == cls]])
    dist_mat = distance_matrix(X, X)
    return f().fit(dist_mat).get_distance(0, 1)

In [34]:
for p, q in np.random.randint(0, N, (3, 2)):
    print(f"p := {data.loc[p].values} (class {target[p]})")
    print(f"q := {data.loc[q].values} (class {target[q]})")
    dists = {}
    dists["all"] = f_all.get_distance(p, q)
    for cls in classes:
        dists[cls] = get_distance_through(p, q, cls)
    print(pd.Series(dists))

p := [5.  3.3 1.4 0.2] (class 0)
q := [4.8 3.4 1.6 0.2] (class 0)
all    0.0029
0      0.0029
1      0.0081
2      0.0081
dtype: float64
p := [5.5 2.4 3.8 1.1] (class 1)
q := [6.  3.4 4.5 1.6] (class 1)
all    0.0737
0      3.9601
1      0.0737
2      2.2903
dtype: float64
p := [5.2 3.4 1.4 0.2] (class 0)
q := [5.4 3.4 1.5 0.4] (class 0)
all    0.0081
0      0.0081
1      0.0081
2      0.0081
dtype: float64


No tiene mucha utilidad el paquete de Fermat para clasificar as-is o estoy loco? No generaliza bien la distancia a nuevos puntos.

## Out-out-training predictions for FermatKMeans

In [35]:
import itertools as it
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import euclidean_distances
from sklearn.model_selection import train_test_split
from numpy.random import RandomState
from fermat.kmedoids import KMedoids

In [36]:
class LandmarksClassifier(BaseEstimator, ClassifierMixin):
    def __init__(
        self, alpha=2, k=5, method="kmedoids", seed=None
    ):  # number of landmarks to take from each class
        self.alpha = alpha
        self.k = k
        self.seed = seed or np.random.randint(2**32 - 1)
        self.rs = RandomState(self.seed)
        if method not in ["kmedoids", "random"]:
            raise ValueError(f"{self.method} is not a valid landmarks' choosing method")
        self.method = method

    def fit(self, X, y):
        self.landmarks_ = {}
        for cls in np.unique(y):
            X_cls = X[y == cls]
            n_cls = X_cls.shape[0]
            if self.method == "kmedoids":
                fmt = Fermat(alpha=self.alpha, path_method="FW", seed=self.seed)
                fmt.fit(euclidean_distances(X_cls))
                km = KMedoids(iterations=10, seed=self.seed)
                distance_matrix = fmt.get_distances()
                labels = km(distance_matrix, min(self.k, n_cls))
                self.landmarks_[cls] = X_cls[km._find_centers(distance_matrix, labels)]
            else:
                self.landmarks_[cls] = X_cls[self.rs.randint(0, n_cls)]

    def _distances(self, x):
        distances = {}
        for cls, landmarks in self.landmarks_.items():
            X_cls = np.vstack([x, landmarks])
            fmt = Fermat(alpha=self.alpha, path_method="FW", seed=self.seed)
            fmt.fit(euclidean_distances(X_cls))
            distance_matrix = fmt.get_distances()
            distances[cls] = distance_matrix[0].mean()  # x is at th the top of X_cls
        return distances

    def _predict(self, x):
        distances = self._distances(x)
        return min(distances, key=distances.get)

    def predict(self, X):
        return np.apply_along_axis(lclf._predict, 1, X)


In [37]:
config = dict(
    n_iter=16,
    test_sizes=[0.1, 0.5],
    datasets=["digits", "iris", "breast_cancer", "wine", "diabetes"],
    alphas=[0.5, 1, 1.5, 2, 3, 4],
    ks=[3, 10, 30, 100],
    methods=["kmedoids", "random"],
    # n_estimators=[3, 10, 30]   # TODO: LATER
    # max_depths=[3, 6, 9]
)
simple_config = dict(
    n_iter=1,
    test_sizes=[0.1],
    datasets=["digits"],
    alphas=[2],
    ks=[10],
    methods=["kmedoids"],
    # n_estimators=[3, 10, 30]   # TODO: LATER
    # max_depths=[3, 6, 9]
)

In [38]:
from sklearn import datasets

In [39]:
dir(datasets)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_california_housing',
 '_covtype',
 '_kddcup99',
 '_lfw',
 '_olivetti_faces',
 '_openml',
 '_rcv1',
 '_samples_generator',
 '_species_distributions',
 '_svmlight_format_fast',
 '_svmlight_format_io',
 '_twenty_newsgroups',
 'clear_data_home',
 'data',
 'descr',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_kddcup99',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_olivetti_faces',
 'fetch_openml',
 'fetch_rcv1',
 'fetch_species_distributions',
 'get_data_home',
 'load_boston',
 'load_breast_cancer',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_linnerud',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_checkerboard',
 'make_cir

In [42]:
def run(n_iter, test_sizes, datasets, alphas, ks, methods):
    from sklearn import datasets as sk_datasets
    runs = []
    for ds, size in it.product(datasets, test_sizes):
        log = {"ds": ds, "size": size}
        print(log)
        loader = eval(f"sk_datasets.load_{ds}")
        X, y = loader(return_X_y=True)
        for i in range(n_iter):
            log["i"] = i
            print(log)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=size, random_state=i
            )
            try:
                rfclf = RandomForestClassifier(n_estimators=10)
                rfclf.fit(X_train, y_train)
                runs.append({"method": "rf", "score": rfclf.score(X_test, y_test), **log})
            except:
                print(f"Problem with RF for {log}")
            for alpha, k, method in it.product(alphas, ks, methods):
                # New keys must be at the end to replace previous values in the loop
                log = {**log, "alpha": alpha, "k": k, "method": method}
                print(log)
                try:
                    lclf = LandmarksClassifier(alpha=alpha, k=k, method=method)
                    lclf.fit(X_train, y_train)
                    runs.append({"score": lclf.score(X_test, y_test), **log})
                except Exception as e:
                    print(f"Problem with Landmarks for {log}")

    return runs


In [43]:
runs = run(**simple_config)

{'ds': 'digits', 'size': 0.1}
{'ds': 'digits', 'size': 0.1, 'i': 0}
{'ds': 'digits', 'size': 0.1, 'i': 0, 'alpha': 2, 'k': 10, 'method': 'kmedoids'}
Problem with Landmarks for {'ds': 'digits', 'size': 0.1, 'i': 0, 'alpha': 2, 'k': 10, 'method': 'kmedoids'}


In [211]:
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rfclf = RandomForestClassifier(n_estimators=10)
rfclf.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10)

In [212]:
rfclf.score(X_test, y_test)

0.9333333333333333

In [213]:
classes = {}
alpha = 2
seed = 34
X_train, X_test, y_train, y_test = train_test_split(
    X / 16.0, y, test_size=0.2, random_state=seed
)
for cls in np.unique(y_train):
    X = X_train[y_train == cls]
    n = X.shape[0]
    fmt = Fermat(alpha=alpha, path_method="FW", seed=seed + 1)
    fmt.fit(euclidean_distances(X))
    classes[cls] = dict(verts=X, dists=fmt.get_distances())

In [215]:
ix = np.random.randint(0, X_test.shape[0])
x, y = X_test[ix], y_test[ix]
# plt.imshow(1- x.reshape(8,8), cmap="gray")
# plt.suptitle(y), plt.show()

In [217]:
cls = 2
klass = classes[cls]
verts, dists = klass["verts"], klass["dists"]
n = verts.shape[0]
to_verts = euclidean_distances(x.reshape(1, -1), verts)[0] ** alpha
all_dists = fmt.fit(euclidean_distances(np.vstack([x, verts]))).get_distances()


In [218]:
new_dists = [min(to_verts + dists[:, i]) for i in range(n)]
assert np.allclose([0] + new_dists, all_dists[0])

### ÉXITO PAPÁÁÁÁÁÁÁ

In [219]:
from scipy.stats import norm
kern = norm.pdf

In [220]:
mu, sigma = np.mean(new_dists), np.std(new_dists)
hs = np.linspace(mu - 2 * sigma, mu + 2 * sigma, 5)  # bandwiths

In [221]:
h = mu


In [222]:
y_test.shape

(30,)

In [240]:
n_test = 200
preds = []
for x in X_test[:n_test]:
    fhat = {}
    for cls in classes.keys():
        klass = classes[cls]
        verts, dists = klass["verts"], klass["dists"]
        n = verts.shape[0]
        to_verts = euclidean_distances(x.reshape(1, -1), verts)[0] ** alpha
        fmt_dists = [min(to_verts + dists[:, i]) for i in range(n)]
        fhat[cls] = np.mean([(1 / h) * kern(d / h) for d in fmt_dists])
    preds.append(pd.Series(fhat).argmin())

In [241]:
y_test.shape

(30,)

In [242]:
(y_test[:n_test] == preds).mean()

0.0

In [239]:
fhat

{0: 2.312399008103711e-32, 1: 0.0007539552893333712, 2: 2.111223144529959e-05}

In [243]:
pd.Series(y_test[:n_test]).value_counts(), pd.Series(preds).value_counts()

(1    15
 0    11
 2     4
 dtype: int64,
 0    19
 2    11
 dtype: int64)

In [225]:
{i: np.mean(cls["dists"]) for i, cls in classes.items()}

{0: 0.0010472140039448103, 1: 0.001931632653061226, 2: 0.002776391186200444}

In [None]:
    # New keys must be at the end to replace previous values in the loop
    log = {**log, "alpha": alpha, "k": k, "method": method}
    print(log)
    try:
        lclf = LandmarksClassifier(alpha=alpha, k=k, method=method)
        lclf.fit(X_train, y_train)
        runs.append({"score": lclf.score(X_test, y_test), **log})
    except Exception as e:
        print(f"Problem with Landmarks for {log}")
        logger.error(e, exc_info=True)

In [29]:
pd.DataFrame(runs).to_csv("runs.csv", index=False)

In [30]:
df = pd.read_csv("runs.csv")

In [31]:
df[df.alpha.isna() | (df.k == 100)]

Unnamed: 0,method,score,ds,size,i,alpha,k
0,rf,0.950000,digits,0.1,0,,
7,kmedoids,0.916667,digits,0.1,0,0.5,100.0
8,random,0.722222,digits,0.1,0,0.5,100.0
15,kmedoids,0.888889,digits,0.1,0,1.0,100.0
16,random,0.661111,digits,0.1,0,1.0,100.0
...,...,...,...,...,...,...,...
7823,random,0.009050,diabetes,0.5,15,2.0,100.0
7830,kmedoids,0.004525,diabetes,0.5,15,3.0,100.0
7831,random,0.004525,diabetes,0.5,15,3.0,100.0
7838,kmedoids,0.004525,diabetes,0.5,15,4.0,100.0


In [3]:
X, y = load_

NameError: name 'X' is not defined