# 08. Experimentos

In [1]:
%load_ext autoreload
%autoreload 1
%aimport main

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.datasets import (
    load_iris,
    load_digits,
    make_moons,
    fetch_openml,
    make_circles,
)

from main import pilot_h, euclidean, sample_fermat
from main import Bundle, FermatKDE, BaseKDEClassifier, FermatKDEClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

SEED = 1957

## Datasets disponibles

In [123]:
# X, y = make_circles(n_samples=2000, noise=0.15, factor=0.5, random_state=SEED)
# X, y = make_moons(n_samples=2000, noise=0.1)
# X, y = load_digits(return_X_y=True)
# X, y = load_iris(return_X_y=True)
X, y = fetch_openml(
    "mnist_784", version=1, parser="auto", return_X_y=True, as_frame=False
)

In [124]:
n_samples = 1000
sample_indices = np.random.randint(0, len(X), 1000)
X, y = X[sample_indices], y[sample_indices]

In [125]:
# Choosing train/test split by indices is equivalent to choosing rows themselves
idx_train, idx_test = train_test_split(
    np.arange(len(X)), test_size=0.9, random_state=SEED
)
tt_cv = [(idx_train, idx_test)]
ss_cv = ShuffleSplit(n_splits=5, test_size=0.5, random_state=SEED)

In [126]:
len(idx_train), len(idx_test)

(100, 900)

In [127]:
A = euclidean(X[idx_train])
pilot_h(A)

138.9887274927925

In [128]:
logscale = np.logspace(-2, 2, 11)
base_grid = {"bandwidth": pilot_h(A) * logscale}
base_grid

{'bandwidth': array([1.38988727e+00, 3.49123899e+00, 8.76959584e+00, 2.20282288e+01,
        5.53324090e+01, 1.38988727e+02, 3.49123899e+02, 8.76959584e+02,
        2.20282288e+03, 5.53324090e+03, 1.38988727e+04])}

In [129]:
alphas = np.linspace(1, 2, 3)
DQ = {alpha: sample_fermat(X[idx_train], alpha) for alpha in alphas}

In [130]:
fermat_grid = [
    {
        "alpha": [alpha],
        "bandwidth": np.concatenate([[-1], pilot_h(DQ[alpha]) * logscale]),
    }
    for alpha in alphas
]
fermat_grid

[{'alpha': [1.0],
  'bandwidth': array([-1.00000000e+00,  1.38988727e+00,  3.49123899e+00,  8.76959584e+00,
          2.20282288e+01,  5.53324090e+01,  1.38988727e+02,  3.49123899e+02,
          8.76959584e+02,  2.20282288e+03,  5.53324090e+03,  1.38988727e+04])},
 {'alpha': [1.5],
  'bandwidth': array([-1.00000000e+00,  1.07346175e+02,  2.69641400e+02,  6.77308573e+02,
          1.70132221e+03,  4.27352819e+03,  1.07346175e+04,  2.69641400e+04,
          6.77308573e+04,  1.70132221e+05,  4.27352819e+05,  1.07346175e+06])},
 {'alpha': [2.0],
  'bandwidth': array([-1.00000000e+00,  7.37118280e+03,  1.85155741e+04,  4.65090193e+04,
          1.16825374e+05,  2.93452073e+05,  7.37118280e+05,  1.85155741e+06,
          4.65090193e+06,  1.16825374e+07,  2.93452073e+07,  7.37118280e+07])}]

In [131]:
models = {
    "fermat": (FermatKDEClassifier, fermat_grid),
    "base": (BaseKDEClassifier, base_grid),
}

In [132]:
searches = {}
for name, (model, grid) in models.items():
    print(name)
    searches[name] = search = GridSearchCV(
        model(), grid, cv=cv, n_jobs=-1, verbose=1, refit=False
    ).fit(X[idx_train], y[idx_train])
    print(search.best_params_, search.best_score_)

fermat
Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'alpha': 1.0, 'bandwidth': -1.0} 0.11599999999999999
base
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'bandwidth': 1.389887274927925} 0.11599999999999999


In [133]:
for name, search in searches.items():
    print(name)
    print(search.best_params_, search.best_score_)

fermat
{'alpha': 1.0, 'bandwidth': -1.0} 0.11599999999999999
base
{'bandwidth': 1.389887274927925} 0.11599999999999999


In [134]:
from sklearn.model_selection import cross_val_score

test_results = {
    name: cross_val_score(models[name][0](**search.best_params_), X, y, cv=tt_cv)
    for name, search in searches.items()
}

In [135]:
test_results

{'fermat': array([0.10333333]), 'base': array([0.10333333])}

In [136]:
clf = FermatKDEClassifier(**searches["fermat"].best_params_).fit(
    X[idx_train], y[idx_train]
)
clf.bandwidths_, clf.score(X[idx_test], y[idx_test])

(array([ 96.11853658,  81.78409364,  67.02051245, 109.67047799,
         86.81772579,  93.46023483,  85.47128703,  65.21200656,
         68.37264705,  79.18888363]),
 0.10333333333333333)

In [137]:
results = pd.concat(
    [pd.DataFrame(v.cv_results_) for v in searches.values()],
    keys=searches.keys(),
    names=["method", "candidate"],
).reset_index()

In [138]:
results["oos_score"] = results.apply(
    lambda x: models[x.method][0](**x.params)
    .fit(X[idx_train], y[idx_train])
    .score(X[idx_test], y[idx_test]),
    axis=1,
)

In [139]:
results[
    ["method", "params", "mean_test_score", "std_test_score", "oos_score"]
].sort_values("oos_score", ascending=False).head(10).round(3)

Unnamed: 0,method,params,mean_test_score,std_test_score,oos_score
0,fermat,"{'alpha': 1.0, 'bandwidth': -1.0}",0.116,0.023,0.103
35,fermat,"{'alpha': 2.0, 'bandwidth': 73711828.03052434}",0.116,0.023,0.103
26,fermat,"{'alpha': 2.0, 'bandwidth': 18515.574067164158}",0.116,0.023,0.103
27,fermat,"{'alpha': 2.0, 'bandwidth': 46509.01927092032}",0.116,0.023,0.103
28,fermat,"{'alpha': 2.0, 'bandwidth': 116825.37444944237}",0.116,0.023,0.103
29,fermat,"{'alpha': 2.0, 'bandwidth': 293452.0729355802}",0.116,0.023,0.103
30,fermat,"{'alpha': 2.0, 'bandwidth': 737118.2803052434}",0.116,0.023,0.103
31,fermat,"{'alpha': 2.0, 'bandwidth': 1851557.406716418}",0.116,0.023,0.103
32,fermat,"{'alpha': 2.0, 'bandwidth': 4650901.927092034}",0.116,0.023,0.103
33,fermat,"{'alpha': 2.0, 'bandwidth': 11682537.444944236}",0.116,0.023,0.103


In [140]:
gbm_grid = Bundle(
    num_leaves=[3, 5, 7],
    n_estimators=[100, 500],
    learning_rate=np.logspace(-3, -1, 5),
    # reg_lambda=[0.01, 0.1],
    # reg_alpha=[0.01, 0.1],
)
gbm_search = GridSearchCV(LGBMClassifier(objective="multiclass"), gbm_grid, cv=ss_cv)
gbm_search.fit(X[idx_train], y[idx_train])
gbm_clf = LGBMClassifier(objective="multiclass", **gbm_search.best_params_).fit(
    X[idx_train], y[idx_train]
)
gbm_clf.score(X[idx_test], y[idx_test])

0.64

In [141]:
gbm_clf_ = LGBMClassifier(
    objective="multiclass",
    num_leaves=7,
    n_estimators=500,
    learning_rate=0.05,
    reg_lambda=1,
    # reg_alpha=0.1,
)
gbm_clf_.fit(X[idx_train], y[idx_train])


In [142]:
(gbm_clf_.score(X[idx_train], y[idx_train]), gbm_clf_.score(X[idx_test], y[idx_test]))

(1.0, 0.6088888888888889)

In [143]:
from sklearn.naive_bayes import GaussianNB

gnb_grid = Bundle(var_smoothing=np.logspace(-9, -5, 5))
gnb_search = GridSearchCV(GaussianNB(), gnb_grid, cv=ss_cv)
gnb_search.fit(X[idx_train], y[idx_train])
gnb_clf = GaussianNB(**gnb_search.best_params_).fit(
    X[idx_train], y[idx_train]
)
gnb_clf.score(X[idx_test], y[idx_test])

0.61

In [144]:
from sklearn.linear_model import LogisticRegression

lr_grid = Bundle(C=np.logspace(-4, 2, 7), penalty=["l1", "l2", "elasticnet", None])
lr_search = GridSearchCV(LogisticRegression(solver="saga"), lr_grid, cv=ss_cv)
lr_search.fit(X[idx_train], y[idx_train])


35 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gonzalo/opt/anaconda3/envs/jose-luis/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/gonzalo/opt/anaconda3/envs/jose-luis/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  File "/Users/gonzalo/opt/anaconda3/envs/jose-luis/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterabl

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [145]:

lr_clf = LogisticRegression(solver="saga", **lr_search.best_params_).fit(X[idx_train], y[idx_train])
lr_search.best_params_, lr_clf.score(X[idx_test], y[idx_test])



({'C': 0.1, 'penalty': 'l1'}, 0.7244444444444444)

In [146]:
svc_grid = Bundle(
    C=np.logspace(-4, 5, 10),
    kernel=["linear", "poly", "rbf", "sigmoid"],
    gamma=np.logspace(-5, 2, 10),
)
svc_search = GridSearchCV(SVC(), svc_grid, cv=ss_cv)
svc_search.fit(X[idx_train], y[idx_train])
svc_clf = SVC(**svc_search.best_params_).fit(X[idx_train], y[idx_train])
svc_search.best_params_, svc_clf.score(X[idx_test], y[idx_test])

({'C': 0.0001, 'gamma': 1e-05, 'kernel': 'linear'}, 0.7344444444444445)

In [147]:
knn_grid = Bundle(
    n_neighbors=[3, 5, 8, 13, 21, 34], weights=["uniform", "distance"]
)
# TODO: Pass FermatDistance as metric!!!
knn_search = GridSearchCV(KNeighborsClassifier(), knn_grid, cv=ss_cv)
knn_search.fit(X[idx_train], y[idx_train])
knn_clf = KNeighborsClassifier(**knn_search.best_params_).fit(
    X[idx_train], y[idx_train]
)
knn_search.best_params_, knn_clf.score(X[idx_test], y[idx_test])

({'n_neighbors': 3, 'weights': 'distance'}, 0.6644444444444444)