# Kernel ridge regression on housing regression datasets

This notebook uses the sklearn housing (regression) dataset. We perform least-squares KRR WITHOUT cross-validation.

Use this notebook to compare different methods:
1. Full KRR
2. Standard thinning (ST)
3. Kernel thinning (KT)
4. RFM
5. Kernel thinning + Feature matrix
6. FALKON
7. RPCholesky
<!-- 5. FALKON + KT centers -->

In [1]:
# install using `conda install -c conda-forge line_profiler`
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_is_fitted
import numpy as np
import pandas as pd
from datetime import datetime
from copy import deepcopy
# import torch
import pickle

# utils for plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# utils for timing
from goodpoints.tictoc import tic, toc, TicToc
# utils for kernel ridge regression
from goodpoints.krr.util_estimators import get_estimator, get_sigma_heuristic
# utils for evaluating kernels
from goodpoints.krr.thin.util_k_mmd import kernel_eval, to_regression_kernel, get_kernel
# utils for generate samples from the data distribution
from goodpoints.krr.util_sample import get_Xy #, ToyData , get_toy_dataset, logistic
from goodpoints.krr.util_load_data import get_real_dataset
# utils for dataset thinning
from goodpoints.krr.thin.util_thin import sd_thin, kt_thin2
from goodpoints.krr.thin.util_thin_dnc import sd_thin_dnc, kt_thin2_dnc

In [4]:
# add this to be able to render plotly plots in non-vscode notebooks
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [5]:
# helper functions
def sample(arr, n=1000):
    """Draw `n` entries of `arr` uniformly at random, with replacement."""
    picked = np.random.choice(len(arr), size=n, replace=True)
    return arr[picked]
def histogram(arr, height=400, width=600):
    """Build a plotly express histogram of `arr` with the requested figure height and width."""
    fig = px.histogram(arr, height=height, width=width)
    return fig

## Set hyperparameters

In [67]:
dataset = 'housing'
assert dataset in ['housing', 'msd', 'svhn']
# get_real_dataset already normalizes the data

### Regression parameters

kernel = 'gauss'  # ['gauss', 'laplace']
sigma = 10
alpha = 1e-3 # 1.0

### RFM parameters
rfm_iters = 2

### Experiment parameters

k_fold = 5      # k >= 2
n_repeats = 10
use_cross_validation = False

n_jobs = 2 # -1 = use all CPUs
save = False

### Thinning parameters

m = None # Thinned dataset will have size n/2**m

In [68]:
# Determine auxiliary parameters

task = 'regression'
refit = 'neg_mean_squared_error'
postprocess = None
ydim = 1

Kernels:
- RBF:
$$\mathbf{k}(x, y) = \exp(-\gamma ||x-y||_2^2)$$
- Laplacian:
$$\mathbf{k}(x, y) = \exp(-\gamma ||x-y||_1)$$

Median heuristic to choose the bandwidth parameter, i.e., median of squared pairwise distances:
- For Gaussian data, we can compute this exactly. Assume $X\sim \mathcal{N}(0,\sigma^2 I_d)$, so that $X_1-X_2\sim \mathcal{N}(0,2\sigma^2 I_d)$. For the RBF kernel, $||X_1-X_2||_2^2 / (2\sigma^2)$ follows a chi-squared distribution with $d$ degrees of freedom, so $||X_1-X_2||_2^2$ has mean $2\sigma^2 d$ and median roughly $2\sigma^2 d\,(1-\frac{2}{9d})^3$. For the Laplacian kernel, each coordinate $|X_{1,j}-X_{2,j}|$ follows a folded normal distribution (https://en.wikipedia.org/wiki/Folded_normal_distribution) with scale $\sqrt{2}\sigma$ and mean $2\sigma/\sqrt{\pi}$, so $||X_1-X_2||_1$ has mean $2d\sigma/\sqrt{\pi}$, which is also roughly its median when $d$ is large.

Available kernels in sklearn: 
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise

## Get dataset

In [69]:
# Load the dataset selected by `dataset` in the hyperparameter cell.
X, y = get_real_dataset(dataset)
print(X.shape, y.shape)
# remove values corresponding to y>= 5
# NOTE(review): presumably this drops capped/censored targets of the housing data — confirm.
X = X[y < 5]  # must run before `y` is filtered: the mask is built from the unfiltered `y`
y = y[y < 5]

# Keep the first (k_fold-1)/k_fold of the rows for training and the rest for testing;
# shuffle=False preserves the row order, so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=(k_fold-1)/k_fold, shuffle=False)

(20640, 8) (20640,)
normalizing X
(20640, 8) (20640,)


In [70]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(15718, 8) (15718,)
(3930, 8) (3930,)


In [71]:
X[:10]

array([[ 2.34476576,  0.98214266,  0.62855945, -0.15375759, -0.9744286 ,
        -0.04959654,  1.05254828, -1.32783522],
       [ 2.33223796, -0.60701891,  0.32704136, -0.26333577,  0.86143887,
        -0.09251223,  1.04318455, -1.32284391],
       [ 1.7826994 ,  1.85618152,  1.15562047, -0.04901636, -0.82077735,
        -0.02584253,  1.03850269, -1.33282653],
       [ 0.93296751,  1.85618152,  0.15696608, -0.04983292, -0.76602806,
        -0.0503293 ,  1.03850269, -1.33781784],
       [-0.012881  ,  1.85618152,  0.3447108 , -0.03290586, -0.75984669,
        -0.08561576,  1.03850269, -1.33781784],
       [ 0.08744664,  1.85618152, -0.26972966,  0.01466934, -0.89407076,
        -0.08961842,  1.03850269, -1.33781784],
       [-0.11136631,  1.85618152, -0.2009177 , -0.3066332 , -0.29271158,
        -0.0907249 ,  1.03382082, -1.33781784],
       [-0.39513665,  1.85618152, -0.25523193, -0.07354166, -0.23707923,
        -0.12347647,  1.03382082, -1.33781784],
       [-0.94235915,  1.06160074

In [72]:
histogram(np.linalg.norm(X_train, axis=1, ord=2))

In [73]:
heur_sigma, distances = get_sigma_heuristic(X_train, sample_size=200, return_dist=True)
print('heuristic bandwidth:', heur_sigma)

heuristic bandwidth: 2.8288369267089615


In [74]:
histogram(sample(distances, 10000))

In [75]:
if ydim == 1:
    fig = histogram(y_train)
else:
    fig = histogram(np.argmax(y_train, axis=-1))
fig.show()

### Standard Thinning (ST)

In [76]:
# largest power of four
from goodpoints.compress import largest_power_of_four

In [77]:
largest_power_of_four(X_train.shape[0])

4096

In [78]:
%%time
sd_coreset = sd_thin(X_train, m=m)
print('sd coreset:', sd_coreset.shape)
X_train_sd_thin, y_train_sd_thin = X_train[sd_coreset], y_train[sd_coreset]

sd coreset: (192,)
CPU times: user 444 µs, sys: 110 µs, total: 554 µs
Wall time: 512 µs


In [79]:
sd_coresets = sd_thin_dnc(X_train, m=m)
print(len(sd_coresets), sd_coresets[0].shape)

coreset size: 192, m: 6
64 (192,)


### Kernel Thinning (KT)

In [80]:
from functools import partial

# KERNEL THINNING

# Define kernel params
d = X_train.shape[-1]
var_k = sigma**2
params_k_swap = {"name": kernel, "var": var_k, "d": int(d)}
params_k_split = {"name": kernel, "var": var_k, "d": int(d)}

split_kernel = get_kernel(params_k_split)
swap_kernel = get_kernel(params_k_swap)

regression_split_kernel = to_regression_kernel(split_kernel, ydim=ydim)
regression_swap_kernel = to_regression_kernel(swap_kernel, ydim=ydim)

In [81]:
Xy_train = get_Xy(X_train, y_train)
print(Xy_train.shape)


(15718, 9)


In [82]:
# single coreset
kt_coreset = kt_thin2(Xy_train, split_kernel, swap_kernel, m=m, store_K=True)

In [83]:
kt_coreset.shape

(192,)

In [84]:
kt_coresets = kt_thin2_dnc(Xy_train, m=m, split_kernel=split_kernel, store_K=True)

In [85]:
print(len(kt_coresets), kt_coresets[0].shape)

64 (192,)


In [86]:
# %lprun -f kt_thin3 kt_coreset = kt_thin3(X_train, split_kernel, swap_kernel, m=m)

In [87]:
# from goodpoints.compress import compress_gsn_kt
# X_intermediate = compress_gsn_kt(X_train)

In [88]:
# from goodpoints import compress
# %lprun -f compress.compresspp ktr_coreset = kt_thin2(Xy_train, regression_split_kernel, regression_swap_kernel, m=m, store_K=True)

| n | 5,000 | 20,000 |
| -------- | -------- | -------- |
| store_K=True | 7.9s | 46.9s |
| store_K=False | 20.8s | 1m59s |

In [89]:
# X_train_ktr_thin, y_train_ktr_thin = X_train[ktr_coreset], y_train[ktr_coreset]

In [90]:
# X_train_ktr_thin.shape

In [91]:
# print('n:', len(Xy_train))
# log2n = int(np.log2(len(Xy_train)))
# log4n = int(np.log2(len(Xy_train)) / 2)
# print('log2n:', log2n)
# print('log4n:', log4n) 

# print('2^log2n:', 2**log2n)
# print('4^log4n:', 4**log4n)

# for i in range(log2n // 2 + 1):
#     with TicToc():
#         print(i, kt_thin2(Xy_train, regression_split_kernel, regression_swap_kernel, m=i).shape[0])
#         print(i, sd_thin(X_train, m=i).shape[0])

## KRR (Full)

In [138]:
krr_full = get_estimator(
    task, 
    'full', 
    alpha=alpha, 
    # kernel=kernel, 
    kernel='gauss_M',
    sigma=sigma, 
    postprocess=postprocess,
    
)

In [139]:
krr_full

In [140]:
%%time
K_full = krr_full.fit(X_train, y_train)

CPU times: user 31 s, sys: 8.39 s, total: 39.4 s
Wall time: 9.74 s


In [141]:
histogram(sample(K_full.flatten(), n=10000))

In [142]:
pred_full = krr_full.predict(X_test)
train_pred_full = krr_full.predict(X_train)

M_ [[ 0.63127402  0.11107    -0.21064034  0.40641939  0.06186651 -2.29404736
  -1.0031311  -1.00452806]
 [ 0.11107     0.04042652 -0.03686148  0.07669016  0.02020151 -0.40177256
  -0.20123305 -0.1918705 ]
 [-0.21064034 -0.03686148  0.19966391 -0.20013657 -0.01925184  0.51121099
   0.29210928  0.29581516]
 [ 0.40641939  0.07669016 -0.20013657  0.31454826  0.04042747 -1.37281796
  -0.65507995 -0.66615293]
 [ 0.06186651  0.02020151 -0.01925184  0.04042747  0.02128479 -0.25818238
  -0.12344353 -0.11452671]
 [-2.29404736 -0.40177256  0.51121099 -1.37281796 -0.25818238  9.38930007
   3.94210326  3.95838557]
 [-1.0031311  -0.20123305  0.29210928 -0.65507995 -0.12344353  3.94210326
   1.77252983  1.76275424]
 [-1.00452806 -0.1918705   0.29581516 -0.66615293 -0.11452671  3.95838557
   1.76275424  1.80137742]]
M_ [[ 0.63127402  0.11107    -0.21064034  0.40641939  0.06186651 -2.29404736
  -1.0031311  -1.00452806]
 [ 0.11107     0.04042652 -0.03686148  0.07669016  0.02020151 -0.40177256
  -0.20123

In [143]:
fig = make_subplots(rows=2, cols=1, subplot_titles=['train', 'test'])

fig.add_trace(go.Histogram(x=train_pred_full.flatten(), name='train', opacity=0.5), row=1, col=1)
fig.add_trace(go.Histogram(x=y_train.flatten(), name='ground truth', opacity=0.5, legendgroup=1), row=1, col=1)

fig.add_trace(go.Histogram(x=pred_full.flatten(), name='test', opacity=0.5), row=2, col=1)
fig.add_trace(go.Histogram(x=y_test.flatten(), name='ground truth', opacity=0.5, legendgroup=1), row=2, col=1)
fig.show()

In [144]:
%%time
print('Score:', krr_full.score(X_test, y_test))

M_ [[ 0.63127402  0.11107    -0.21064034  0.40641939  0.06186651 -2.29404736
  -1.0031311  -1.00452806]
 [ 0.11107     0.04042652 -0.03686148  0.07669016  0.02020151 -0.40177256
  -0.20123305 -0.1918705 ]
 [-0.21064034 -0.03686148  0.19966391 -0.20013657 -0.01925184  0.51121099
   0.29210928  0.29581516]
 [ 0.40641939  0.07669016 -0.20013657  0.31454826  0.04042747 -1.37281796
  -0.65507995 -0.66615293]
 [ 0.06186651  0.02020151 -0.01925184  0.04042747  0.02128479 -0.25818238
  -0.12344353 -0.11452671]
 [-2.29404736 -0.40177256  0.51121099 -1.37281796 -0.25818238  9.38930007
   3.94210326  3.95838557]
 [-1.0031311  -0.20123305  0.29210928 -0.65507995 -0.12344353  3.94210326
   1.77252983  1.76275424]
 [-1.00452806 -0.1918705   0.29581516 -0.66615293 -0.11452671  3.95838557
   1.76275424  1.80137742]]
Score: 0.6616813142035796
CPU times: user 1.56 s, sys: 1.67 s, total: 3.23 s
Wall time: 489 ms


In [145]:
%%time
# print('Acc:', accuracy_score(y_test, pred_full))
print('Train MSE:', mean_squared_error(y_train, train_pred_full))
print('Test MSE:', mean_squared_error(y_test, pred_full))

Train MSE: 0.2984497767483261
Test MSE: 0.3520991310029902
CPU times: user 632 µs, sys: 349 µs, total: 981 µs
Wall time: 678 µs


In [146]:
histogram(krr_full.sol_)

In [147]:
len(krr_full.sol_)

15718

## KRR + ST

In [104]:
krr_sd_thin = get_estimator(
    task, 
    'st', 
    alpha=alpha, # / np.power(len(X_train), 1/4), 
    kernel=kernel, 
    sigma=sigma, 
    m=m, 
    postprocess=postprocess,
    use_dnc=True,
)

In [105]:
%%time
krr_sd_thin.fit(X_train, y_train)

coreset size: 192, m: 6
CPU times: user 2.11 s, sys: 923 ms, total: 3.03 s
Wall time: 444 ms


In [106]:
krr_sd_thin.M

In [107]:
%%time
pred_sd = krr_sd_thin.predict(X_test)
train_pred_sd = krr_sd_thin.predict(X_train)

print('train MSE:', mean_squared_error(y_train, train_pred_sd))
print('MSE:', mean_squared_error(y_test, pred_sd))

train MSE: 0.35867752890211974
MSE: 0.36831504269828036
CPU times: user 14 s, sys: 10.6 s, total: 24.5 s
Wall time: 2.97 s


## KRR + KT

In [108]:
krr_kt_thin = get_estimator(
    task,
    'kt', 
    kernel=kernel, 
    alpha=alpha, # / np.power(len(X_train), 1/4), 
    sigma=sigma, 
    m=m, 
    postprocess=postprocess,
    ydim=ydim,
    use_dnc=True,
)

In [109]:
%%time
krr_kt_thin.fit(X_train, y_train)

# To run line profiler, uncomment the next line
# %lprun -f krr_kt_thin.fit krr_kt_thin.fit(X_train, y_train)

CPU times: user 2.81 s, sys: 1.46 s, total: 4.27 s
Wall time: 2 s


In [110]:
len(krr_kt_thin.estimators_)

64

In [111]:
%%time
pred_kt = krr_kt_thin.predict(X_test)
train_pred_kt = krr_kt_thin.predict(X_train)
print('train MSE:', mean_squared_error(y_train, train_pred_kt))
print('MSE:', mean_squared_error(y_test, pred_kt))

train MSE: 0.36098450099285195
MSE: 0.37136722391458477
CPU times: user 12.4 s, sys: 11.4 s, total: 23.7 s
Wall time: 2.71 s


## RFM

Note: changing the bandwidth for RFM doesn't make a big difference, since increasing bandwidth will lead to greater weight values. However, there is a big difference in terms of numerical stability. Therefore, it's better to use the default bandwidth $L=10$.

In [47]:
rfm = get_estimator(
    task, 
    'rfm', 
    alpha=alpha, 
    kernel=kernel, 
    sigma=sigma,
    iters=rfm_iters,
    ydim=ydim,
)

In [48]:
# Fit RFM; the three return values are unpacked as Ms, mses and preds
# (Ms are plotted below as one feature-matrix heatmap per iteration).
# NOTE(review): the held-out test split is passed as validation data here, so the
# test MSE reported for RFM may be optimistic if `fit` uses it for model selection — confirm.
Ms, mses, preds = rfm.fit(
    X_train, y_train, 
    val_data=(X_test, y_test),
)

Round 0, Test MSE: 0.4468
Using batch size of 4032


  0%|          | 0/5 [00:00<?, ?it/s]

Round 1, Test MSE: 0.4274
Using batch size of 4032


  0%|          | 0/5 [00:00<?, ?it/s]

Final MSE: 0.4320


In [49]:
# plot correlation matrices Ms as subplots
fig = make_subplots(rows=1, cols=len(Ms), subplot_titles=[f'iter {i}' for i in range(len(Ms))])
for i, M in enumerate(Ms):
    # add image
    fig.add_trace(go.Heatmap(z=M, showlegend=False), row=1, col=i+1)
    fig.update_layout(height=400, width=1000, title_text="Feature matrix per iteration")
fig.show()

In [50]:
histogram(rfm._model.weights)

In [51]:
print('train MSE:', mean_squared_error(y_train, rfm.predict(X_train)))
print('MSE:', mean_squared_error(y_test, rfm.predict(X_test)))

train MSE: 0.378357019876866
MSE: 0.4319936901204163


## KRR + KT + Feature Learning

In [79]:
# 'kf' estimator for the "KRR + KT + Feature Learning" section (`rfm_iters` RFM rounds).
krr_kf_thin = get_estimator(
    task,
    'kf', 
    kernel=kernel, 
    alpha=alpha, # / np.power(len(X_train), 1/4), 
    # Use the shared bandwidth from the hyperparameter cell (previously a hard-coded 10),
    # so this model stays comparable with the others when `sigma` is changed.
    sigma=sigma, 
    m=m, 
    postprocess=postprocess,
    ydim=ydim,
    rfm_iters=rfm_iters,
)

In [80]:
krr_kf_thin

In [81]:
%%time
K = krr_kf_thin.fit(X_train, y_train, val_data=(X_test, y_test))

learning feature matrix...
Round 0, Test MSE: 0.4468
Using batch size of 4032


  0%|          | 0/5 [00:00<?, ?it/s]

Round 1, Test MSE: 0.4274
Using batch size of 4032


  0%|          | 0/5 [00:00<?, ?it/s]

Final MSE: 0.4320
47> M_gauss
CPU times: user 2min 1s, sys: 15.3 s, total: 2min 16s
Wall time: 27.1 s


In [82]:
fig = go.Figure(data=[go.Heatmap(z=krr_kf_thin.M)])
fig.update_layout(height=400, width=400, title_text="Feature matrix")
fig.show()

In [83]:
krr_kf_thin.X_fit_.shape

(128, 8)

In [84]:
K.shape

(128, 128)

In [85]:
histogram(K.flatten())

In [86]:
histogram(krr_kf_thin.sol_)

In [87]:
%%time
pred_kf = krr_kf_thin.predict(X_test)
print('train MSE:', mean_squared_error(y_train, krr_kf_thin.predict(X_train)))
print('MSE:', mean_squared_error(y_test, pred_kf))

76> M_gauss
76> M_gauss
train MSE: 0.4700533740566018
MSE: 0.43435546006226133
CPU times: user 232 ms, sys: 92.8 ms, total: 325 ms
Wall time: 44.1 ms


## RFM-Thin

In [49]:
rfm_thin = get_estimator(
    task, 
    'rfm', 
    alpha=alpha, 
    kernel=kernel, 
    sigma=sigma,
    iters=rfm_iters,
    ydim=ydim,
    use_kt = True,
)

In [50]:
rfm_thin

In [52]:
Ms, mses, preds = rfm_thin.fit(
    X_train, y_train, 
    val_data=(X_test, y_test),
)

Using kernel thinning to select centers...
Round 0, Test MSE: 0.5830
Using batch size of 4032


  0%|          | 0/5 [00:00<?, ?it/s]

Using kernel thinning to select centers...
Round 1, Test MSE: 0.5562
Using batch size of 4032


  0%|          | 0/5 [00:00<?, ?it/s]

Using kernel thinning to select centers...
Final MSE: 0.4950


In [53]:
%%time
pred_rfm_thin = rfm_thin.predict(X_test)
print('train MSE:', mean_squared_error(y_train, rfm_thin.predict(X_train)))
print('MSE:', mean_squared_error(y_test, pred_rfm_thin))

train MSE: 0.5129124751561491
MSE: 0.49498578089770495
CPU times: user 16.4 ms, sys: 4.29 ms, total: 20.7 ms
Wall time: 7.61 ms


## FALKON

In [58]:
krr_falkon = get_estimator(
    task,
    'falkon',
    kernel=kernel,
    sigma=sigma,
    alpha=alpha,
    m=m,
    postprocess=postprocess,
)

No module named 'falkon'


In [59]:
%%time
# `get_estimator` may yield no estimator (e.g. when the optional `falkon` package is
# missing); test for None explicitly rather than relying on the estimator's truthiness,
# matching the `model is None` check used in the experiment loop.
if krr_falkon is not None:
    krr_falkon.fit(X_train, y_train)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs


In [60]:
%%time
# Only evaluate FALKON when an estimator was actually created; test for None explicitly
# rather than relying on the estimator's truthiness.
if krr_falkon is not None:
    pred_falkon = krr_falkon.predict(X_test)
    # print('Acc:', accuracy_score(y_test, pred_falkon))
    print('MSE:', mean_squared_error(y_test, pred_falkon))

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs


## FALKON + KT

In [61]:
# krr_falkon_kt = get_estimator(
#     task,
#     'falkon+kt',
#     kernel=kernel,
#     sigma=sigma,
#     alpha=alpha,
#     m=m,
#     postprocess=postprocess,
#     ydim=ydim,
# )

In [62]:
# %lprun -f krr_falkon_kt.fit krr_falkon_kt.fit(X_train, y_train)

In [63]:
# %%time
# if krr_falkon_kt:
#     pred_falkon_kt = krr_falkon_kt.predict(X_test)
#     print('Score:', accuracy_score(y_test, pred_falkon_kt))
#     print('RMSE:', np.sqrt(mean_squared_error(y_test, pred_falkon_kt)))

## RPCholesky KRR

In [112]:
krr_rpcholesky = get_estimator(
    task,
    'rpcholesky',
    kernel=kernel,
    sigma=sigma,
    alpha=alpha,
    m=m,
    # postprocess=postprocess,
)

In [113]:
krr_rpcholesky.fit(X_train, y_train)

In [114]:
len(krr_rpcholesky.model_.sol)

192

In [115]:
%%time
pred_rpcholesky = krr_rpcholesky.predict(X_test)
print('train MSE:', mean_squared_error(y_train, krr_rpcholesky.predict(X_train)))
print('MSE:', mean_squared_error(y_test, pred_rpcholesky))

train MSE: 0.4476046239217258
MSE: 0.4687686964924562
CPU times: user 165 ms, sys: 215 ms, total: 381 ms
Wall time: 40.7 ms


## Run experiment

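We now evaluate every model and dataset listed in `model_configs`. If `use_cross_validation` is `True`, hyperparameters are chosen by a grid search with k-fold cross-validation; otherwise the fixed `sigma` and `alpha` set above are used.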

Reference: https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_hist_grad_boosting_comparison.html#sphx-glr-auto-examples-ensemble-plot-forest-hist-grad-boosting-comparison-py

In [116]:
from sklearn.model_selection import GridSearchCV, KFold

In [117]:
# NOTE: these will only be applied if `use_cross_validation` is True
# Default param grid to search for each model
default_param_grid = {
    "sigma" :   [1,2,5,10,20], 
    "alpha" :   [1e-3,1e-4,1e-5],
}
# falkon_param_grid = {
#     "sigma" :   [0.05,0.1, 0.2, 0.5], 
#     "alpha" :   [1e-3,1e-4,1e-5], # Falkon requires smaller alpha
# }
falkon_param_grid = default_param_grid

rfm_param_grid = {
    "sigma" :   [10,], 
    "alpha" :   [1e-3,1e-4,1e-5],
    # "iters" :   [1,2,3],
}

In [118]:
# The different values will correspond to different columns in the final plots
varying_variable = 'kernel'
varying_variable_values = ['gauss', 'laplace', 'gauss_M', 'laplace_M']
datasets = ['housing',]

In [119]:
# Model constructors and data size for each model
# We allow for different data sizes to avoid running Full KR on large datasets
model_configs = {
    'full' : {
        'dataset' : datasets,
        'kwargs': {
            'postprocess' : postprocess
        },
        'param_grid' : default_param_grid
    },
}

# for m in [None,]:
# Thinning-based models. Each entry maps a model name passed to `get_estimator` to the
# datasets to run on, the fixed constructor kwargs, and the hyperparameter grid.
# The '-dnc' variants only differ by `use_dnc=True`.
model_configs['st'] = {
    'dataset' : datasets,
    'kwargs' : {
        'm' : m,
        'postprocess' : postprocess
    },
    'param_grid' : default_param_grid
}
model_configs['st-dnc'] = {
    'dataset' : datasets,
    'kwargs' : {
        'm' : m,
        'postprocess' : postprocess,
        'use_dnc' : True,
    },
    'param_grid' : default_param_grid
}

model_configs['kt'] = {
    'dataset' : datasets,
    'kwargs' : {
        'm' : m,
        'postprocess' : postprocess,
        'ydim' : ydim,
    },
    'param_grid' : default_param_grid
}
model_configs['kt-dnc'] = {
    'dataset' : datasets,
    'kwargs' : {
        'm' : m,
        'postprocess' : postprocess,
        'ydim' : ydim,
        'use_dnc' : True,
    },
    'param_grid' : default_param_grid
}

# model_configs[f'falkon'] = {
#     'dataset' : datasets,
#     'kwargs' : {
#         'm' : m,
#         'postprocess' : postprocess,
#     },
#     'param_grid' : falkon_param_grid
# }

# model_configs[f'falkon+kt'] = {
#     'dataset' : datasets,
#     'kwargs' : {
#         'm' : m,
#         'postprocess' : postprocess,
#         'ydim' : ydim,
#     },
#     'param_grid' : falkon_param_grid
# }
# model_configs[f'rfm'] = {
#     'dataset' : datasets,
#     'kwargs' : {
#         'iters' : rfm_iters,
#     },
#     'param_grid' : rfm_param_grid
# }

# model_configs[f'kf'] = {
#     'dataset' : datasets,
#     'kwargs' : {
#         'm' : m,
#         'postprocess' : postprocess,
#         'ydim' : ydim,
#         'rfm_iters' : rfm_iters,
#     },
#     'param_grid' : rfm_param_grid
# }

# model_configs[f'rfm-thin'] = {
#     'dataset' : datasets,
#     'kwargs' : {
#         'iters' : rfm_iters,
#         'use_kt' : True,
#     },
#     'param_grid' : rfm_param_grid
# }

# RPCholesky KRR: same grid as the other models; `postprocess` is intentionally not passed.
model_configs['rpcholesky'] = {
    'dataset' : datasets,
    'kwargs' : {
        'm' : m,
        # 'postprocess' : postprocess,
    },
    'param_grid' : default_param_grid
}

In [120]:
model_configs

{'full': {'dataset': ['housing'],
  'kwargs': {'postprocess': None},
  'param_grid': {'sigma': [1, 2, 5, 10, 20], 'alpha': [0.001, 0.0001, 1e-05]}},
 'st': {'dataset': ['housing'],
  'kwargs': {'m': None, 'postprocess': None},
  'param_grid': {'sigma': [1, 2, 5, 10, 20], 'alpha': [0.001, 0.0001, 1e-05]}},
 'st-dnc': {'dataset': ['housing'],
  'kwargs': {'m': None, 'postprocess': None, 'use_dnc': True},
  'param_grid': {'sigma': [1, 2, 5, 10, 20], 'alpha': [0.001, 0.0001, 1e-05]}},
 'kt': {'dataset': ['housing'],
  'kwargs': {'m': None, 'postprocess': None, 'ydim': 1},
  'param_grid': {'sigma': [1, 2, 5, 10, 20], 'alpha': [0.001, 0.0001, 1e-05]}},
 'kt-dnc': {'dataset': ['housing'],
  'kwargs': {'m': None, 'postprocess': None, 'ydim': 1, 'use_dnc': True},
  'param_grid': {'sigma': [1, 2, 5, 10, 20], 'alpha': [0.001, 0.0001, 1e-05]}},
 'rpcholesky': {'dataset': ['housing'],
  'kwargs': {'m': None},
  'param_grid': {'sigma': [1, 2, 5, 10, 20], 'alpha': [0.001, 0.0001, 1e-05]}}}

In [121]:
use_cross_validation

False

In [122]:
filename = dataset + ('_cv' if use_cross_validation else '') # '_'.join(['toy', housing])
pickle_file = filename + '.p'
print(pickle_file)

housing.p


In [149]:
# Run experiment (depending on experiment_type)
#
# For every model / dataset / varying-variable value: choose hyperparameters (grid search
# if `use_cross_validation`, fixed values otherwise), then fit the model `trials` times on
# the train split and append the train/test errors to `results`.

results = []

count = 0
for name, config in model_configs.items():
    # The loop variable is deliberately not called `dataset`, so the configuration
    # value chosen in the hyperparameter cell is not silently overwritten.
    for dataset_name in config['dataset']:

        for v in varying_variable_values:
            kwargs = deepcopy(config['kwargs'])
            kwargs[varying_variable] = v
            model_name = f"{name}_{v}"
            # NOTE: full and rfm are deterministic, so we only need to run them once
            trials = (1 if name in ['full', 'rfm'] else n_repeats)

            # STEP 1: Get data
            # use X_train, y_train, X_test, y_test from above
            # (the same split is used for every entry of config['dataset'])

            if 'kernel' not in kwargs:
                kwargs['kernel'] = kernel

            model = get_estimator(task, name=name, **kwargs)
            if model is None: continue
            print(f'i={count+1}: dataset={dataset_name}, model={model}')

            # STEP 2: Get optimal parameters through grid search
            # NOTE: we want to get rid of randomness in the Kernel Thinning (or Standard Thinning) routine
            # so we do k-fold cross validation `trials` times using the *same* split.
            # This is different from sklearn's repeated k-fold implementation which uses a 
            # different random split each time.

            if use_cross_validation:
                split = list(KFold(n_splits=k_fold).split(X_train)) * trials
                grid_search = GridSearchCV(
                    estimator=model,
                    param_grid=config['param_grid'],
                    return_train_score=True,
                    cv=split,
                    scoring=refit,
                    refit=False,
                    n_jobs=n_jobs,
                ).fit(X_train, y_train)

                # get validation scores
                cv_results = pd.DataFrame(grid_search.cv_results_)
                best_index = grid_search.best_index_
                best_row = cv_results.iloc[best_index]
                # `split` holds the k_fold folds repeated `trials` times, so repeat t owns the
                # columns split{t*k_fold} ... split{(t+1)*k_fold - 1}. Average the folds of each
                # repeat to get one validation score per trial (previously only the first
                # `trials` columns were read, mixing folds of different repeats).
                val_scores = [
                    float(np.mean([
                        best_row[f'split{t * k_fold + fold}_test_score']
                        for fold in range(k_fold)
                    ]))
                    for t in range(trials)
                ]

                # get optimal parameters
                best_params = grid_search.best_params_

            else:
                # No grid search: nothing to report for cv_results / best index
                cv_results = None
                best_index = 0
                # Dummy values
                val_scores = [1,] * trials

                best_params = {
                    'sigma' : sigma,
                    'alpha' : 1e-5 if name in ['falkon', 'rpcholesky', 'falkon+kt'] else alpha, 
                    # * (len(X_train)**(1/4) if name in ['st', 'kt'] else 1),
                }
            print(f"best params: {best_params}")
            # NOTE: `kwargs` must not itself contain 'sigma' or 'alpha' (i.e. do not use them as
            # the varying variable), otherwise this call receives duplicate keyword arguments.
            best_model = get_estimator(task, name=name, 
                                       sigma=best_params['sigma'],
                                       alpha=best_params['alpha'],
                                       **kwargs)
            print(best_model)

            # STEP: Estimate test score
            train_scores = []
            test_scores = []
            for _ in range(trials):
                best_model.fit(X_train, y_train)

                # compute train score
                train_pred = best_model.predict(X_train).squeeze()
                # compute test score
                test_pred = best_model.predict(X_test).squeeze()

                if refit == 'neg_mean_squared_error':
                    train_score = mean_squared_error(y_train, train_pred)
                    test_score = mean_squared_error(y_test, test_pred)
                elif refit == 'accuracy':
                    # stored as an error rate so that lower is better for both metrics
                    train_score = 1- accuracy_score(y_train, train_pred)
                    test_score = 1- accuracy_score(y_test, test_pred)
                else:
                    raise ValueError(f"invalid refit metric: {refit}")

                train_scores.append( train_score )
                test_scores.append( test_score )

            results.append({
                "dataset": dataset_name, 
                "model": model_name, 
                "cv_results": cv_results,
                "best_index_" : best_index,
                "best_params_" : best_params,
                "val_scores" : val_scores,
                "train_scores" : train_scores,
                "test_scores" : test_scores,
            })

            count += 1

            # Save results with pickle (rewritten after every model, so partial runs are kept)
            if save:
                with open(pickle_file, 'wb') as f:
                    pickle.dump(results, f)

i=1: dataset=housing, model=KernelRidgeRegressor(kernel='gauss')
best params: {'sigma': 10, 'alpha': 0.001}
KernelRidgeRegressor(alpha=0.001, kernel='gauss', sigma=10)


KeyboardInterrupt: 

In [154]:
results

[{'dataset': 'housing',
  'model': 'full_gauss',
  'cv_results': None,
  'best_index_': 0,
  'best_params_': {'sigma': 10, 'alpha': 0.001},
  'val_scores': [1],
  'train_scores': [0.2802697493653728],
  'test_scores': [0.3685139287435234]},
 {'dataset': 'housing',
  'model': 'full_laplace',
  'cv_results': None,
  'best_index_': 0,
  'best_params_': {'sigma': 10, 'alpha': 0.001},
  'val_scores': [1],
  'train_scores': [0.00040123880285507165],
  'test_scores': [0.3526217333776659]},
 {'dataset': 'housing',
  'model': 'st_gauss',
  'cv_results': None,
  'best_index_': 0,
  'best_params_': {'sigma': 10, 'alpha': 0.001},
  'val_scores': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'train_scores': [0.4387731374174301,
   0.43252089758749035,
   0.41704087381052746,
   0.4826575925707216,
   0.40630454833688695,
   0.515094718575916,
   0.4521087459951504,
   0.4022948862833796,
   0.6368826297979556,
   0.39916214949868145],
  'test_scores': [0.41194714366663154,
   0.42366281592342564,
   0.41348910

## Plot Results

In [155]:
import plotly.colors as colors
import seaborn as sns

from functools import reduce
from operator import concat

### Varying variable (e.g., kernel choice)

In [156]:
def plot_results(varying_variable, varying_variable_values, scale='linear'):
    """Box plots of test / validation / train scores, one subplot column per varying value.

    Reads the module-level `results` (list of per-model result dicts) and `model_configs`
    (used to assign every model a stable colour).

    Args:
    - varying_variable: name of the varied setting (shown in the subplot titles)
    - varying_variable_values: values of that setting; one subplot column each
    - scale: 'linear' plots |score|, 'log2' plots log2(|score|)

    Returns:
    - plotly figure with 3 rows (test, val, train) and one column per value

    Raises:
    - ValueError: if a trace has to be drawn and `scale` is neither 'linear' nor 'log2'
    """
    # row_subplot_titles = ["Test score vs n"], #, "Test Neg-MSE vs n"] #, "Train time vs n", "Predict time vs n"]
    row_subplot_titles = ["Test Score", "Val Score", "Train Score"]

    fig = make_subplots(
        rows=len(row_subplot_titles),
        cols=len(varying_variable_values),
        shared_yaxes=True,
        # one title per subplot, row by row
        subplot_titles=[
            f'{varying_variable}={v}'
            for _ in row_subplot_titles
            for v in varying_variable_values
        ],
        vertical_spacing=0.1,
    )
    model_names = [config_name.split('_')[0] for config_name in model_configs.keys()]
    colors_list = colors.qualitative.Plotly * (
        len(model_names) // len(colors.qualitative.Plotly) + 1
    )
    colors_used = set()

    # Longest first, so that e.g. 'gauss_M' is matched before the shorter 'gauss'.
    known_values = sorted((str(v) for v in varying_variable_values), key=len, reverse=True)

    def split_model_name(model_name):
        """Split '<model>[_<m>]_<value>' into (model prefix, varying-variable value).

        The value itself may contain '_' (e.g. 'gauss_M'), so it is matched as a suffix
        against the known values instead of splitting the name on every '_'.
        """
        prefix = model_name.split('_')[0]
        for value in known_values:
            if model_name.endswith('_' + value):
                return prefix, value
        # Unknown value: fall back to the last '_'-separated component.
        return prefix, model_name.rsplit('_', 1)[-1]

    def plot_vs_n(print_name, attr_name, vvv, r, c, is_better='higher', scale='log2'):
        """
        Add one box trace per matching result to subplot (r, c).

        Args:
        - print_name: metric name used in the y-axis title
        - attr_name: key of the score list in each result dict
        - vvv: varying variable value
        - r, c: 1-based subplot row and column
        - is_better: 'higher' or 'lower', used in the y-axis title
        - scale: 'log2' or 'linear'
        """

        for result in results:
            model_name_prefix, vv_name = split_model_name(result["model"])

            if vv_name != vvv:
                continue

            color = colors_list[model_names.index(model_name_prefix)]

            scores = np.abs(result[attr_name])
            if scale == 'log2':
                y = np.log2(scores)
            elif scale == 'linear':
                y = scores
            else:
                raise ValueError(f"invalid scale: {scale!r} (expected 'linear' or 'log2')")

            trace = go.Box(
                x=[result['dataset']]*len(result[attr_name]),
                y=y,
                name=model_name_prefix,
                # opacity=0.5,
                legendgroup=model_name_prefix,
                line_color=color,
                offsetgroup=model_name_prefix,
                # show each model only once in the legend
                showlegend=color not in colors_used,
                boxmean=True,
            )

            fig.add_trace(trace, row=r, col=c)
            colors_used.add(color)

        if c == 1: fig.update_yaxes(title_text=f"{scale}({print_name}) - {is_better} is better", row=r, col=c)
        fig.update_xaxes(title_text="dataset", row=r, col=c)
        fig.update_layout(boxmode='group')

    # (axis label, key in the result dict) for rows 1..3; all are errors, so lower is better
    row_specs = [
        ("Test MSE", "test_scores"),
        ("Val MSE", "val_scores"),
        ("Train MSE", "train_scores"),
    ]
    for c, vvv in enumerate(varying_variable_values, start=1):
        for r, (print_name, attr_name) in enumerate(row_specs, start=1):
            plot_vs_n(print_name, attr_name, str(vvv), r, c, is_better='lower', scale=scale)

    return fig

In [157]:
fig = plot_results(varying_variable, varying_variable_values, scale='linear')
fig.update_layout(
    legend=dict(traceorder="normal", borderwidth=1),
    title=dict(x=0.5, text=f"Evaluation for {varying_variable} in {varying_variable_values}"), # \
            #    f"sigma {param_grid['sigma']} / alpha {param_grid['alpha']}"),
    width=800,
    height=1000,
)
fig.show()
if save:
    fig_file = filename + '.png'
    print(fig_file)
    fig.write_image(fig_file)

In [158]:
fig = plot_results(varying_variable, varying_variable_values, scale='log2')
fig.update_layout(
    legend=dict(traceorder="normal", borderwidth=1),
    title=dict(x=0.5, text=f"Evaluation for {varying_variable} in {varying_variable_values}"), # \
            #    f"sigma {param_grid['sigma']} / alpha {param_grid['alpha']}"),
    width=800,
    height=1000,
)
fig.show()
if save:
    fig_file = filename + '_log2.png'
    print(fig_file)
    fig.write_image(fig_file)

### Overfitting

In [159]:
def plot_results_overfitting(varying_variable, varying_variable_values, scale='linear'):
    """Box plots of test / val / train scores side by side, to spot overfitting.

    The figure has one row per entry of ``varying_variable_values`` and three
    columns (test, validation and train scores). It is built with
    ``shared_yaxes=True`` so the three score types of a row can be compared
    directly.

    Reads the notebook-level names ``results`` (iterable of dicts with the keys
    ``"model"``, ``"dataset"``, ``"test_scores"``, ``"val_scores"`` and
    ``"train_scores"``), ``model_configs`` (its keys provide the model-name
    prefixes used for colour assignment) and ``colors`` (plotly colour
    sequences).

    Args:
        varying_variable: name of the varied quantity; only used in the y-axis
            label of the first column (``"<varying_variable>=<value>"``).
        varying_variable_values: values of the varied quantity. Each one is
            converted with ``str`` and compared with the last ``'_'``-separated
            component of ``result["model"]`` to select the results of a row.
        scale: ``'linear'`` plots ``|score|``; ``'log2'`` plots ``log2(|score|)``.

    Returns:
        The plotly figure holding all box traces.

    Raises:
        ValueError: if ``scale`` is neither ``'linear'`` nor ``'log2'``.
    """
    # Fail early with a clear message; previously an unknown scale only surfaced
    # as an unbound-local error deep inside the plotting loop (or not at all).
    if scale not in ('linear', 'log2'):
        raise ValueError(f"scale must be 'linear' or 'log2', got {scale!r}")

    col_subplot_titles = ["Test Score", "Val Score", "Train Score", ]
    # Result keys plotted in columns 1..3, in the same order as the titles above.
    score_attrs = ["test_scores", "val_scores", "train_scores"]

    fig = make_subplots(
        rows=len(varying_variable_values),
        cols=len(col_subplot_titles),
        shared_yaxes=True,
        subplot_titles=col_subplot_titles + [None,] * len(varying_variable_values),
        vertical_spacing=0.1,
    )
    # Boxes that share an x value (dataset) are drawn next to each other.
    fig.update_layout(boxmode='group')

    model_names = [model_name.split('_')[0] for model_name in model_configs.keys()]
    # Repeat the palette so that every model index maps to a colour.
    colors_list = colors.qualitative.Plotly * (
        len(model_names) // len(colors.qualitative.Plotly) + 1
    )
    # Colours that already have a legend entry; keeps one entry per model.
    colors_used = set()

    def plot(attr_name, vvv, r, c):
        """Add one box per matching result to subplot (r, c).

        Args:
        - attr_name: key of the score list inside each result dict
        - vvv: varying variable value (as a string)
        - r, c: 1-based subplot row / column
        """
        for result in results:
            # E.g., Kernel-Thin_rbf -> Kernel-Thin, rbf
            # (a three-part name carries an extra middle component that is not plotted)
            name_components = result["model"].split('_')
            if len(name_components) == 2:
                model_name_prefix, vv_name = name_components
            else:
                model_name_prefix, _, vv_name = name_components

            if vv_name != vvv:
                continue

            color = colors_list[model_names.index(model_name_prefix)]

            y = np.abs(result[attr_name])
            if scale == 'log2':
                y = np.log2(y)

            trace = go.Box(
                x=[result['dataset']]*len(result[attr_name]),
                y=y,
                name=model_name_prefix,
                legendgroup=model_name_prefix,
                line_color=color,
                offsetgroup=model_name_prefix,
                showlegend=color not in colors_used,
                boxmean=True,
            )

            fig.add_trace(trace, row=r, col=c)
            colors_used.add(color)

        # Only the first column carries the row label.
        if c == 1:
            fig.update_yaxes(title_text=f"{varying_variable}={vvv}", row=r, col=c)
        fig.update_xaxes(title_text="dataset", row=r, col=c)

    for r, vvv in enumerate(varying_variable_values, start=1):
        for c, attr_name in enumerate(score_attrs, start=1):
            plot(attr_name, str(vvv), r, c)

    return fig

In [160]:
# Overfitting comparison (test vs. val vs. train scores) on a linear y-axis.
fig = plot_results_overfitting(varying_variable, varying_variable_values, scale='linear')
fig.update_layout(
    title={
        # Two adjacent literals: headline plus a second line naming the scale.
        "text": (
            f"Evaluation for {varying_variable} in {varying_variable_values}"
            "<br>scale: linear"
        ),
        "x": 0.5,  # centre the title horizontally
    },
    legend={"traceorder": "normal", "borderwidth": 1},
    width=1000,
    height=600,
)
fig.show()

In [161]:
# Overfitting comparison (test vs. val vs. train scores) with log2-scaled scores.
fig = plot_results_overfitting(varying_variable, varying_variable_values, scale='log2')
fig.update_layout(
    title={
        # Two adjacent literals: headline plus a second line naming the scale.
        "text": (
            f"Evaluation for {varying_variable} in {varying_variable_values}"
            "<br>scale: log2"
        ),
        "x": 0.5,  # centre the title horizontally
    },
    legend={"traceorder": "normal", "borderwidth": 1},
    width=1000,
    height=600,
)
fig.show()