In [1]:
from timer import Timer
import numpy as np
import pandas as pd

from highly_adaptive_regression import HighlyAdaptiveLassoCV
from kernel_ridge import HighlyAdaptiveRidgeCV, RadialBasisKernelRidgeCV, MixedSobolevRidgeCV 
from kernel_ridge import kernels

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.dummy import DummyRegressor

import pandas as pd
import glob
import os
import altair as alt

# Empirical

In [2]:
LEARNERS = {
    'Mean':DummyRegressor(strategy="mean"),
    'Ridge Regression':Ridge(alpha=1e-3),
    'Random Forest':RandomForestRegressor(n_estimators=2000, n_jobs=-1),
    'Radial Basis KRR':RadialBasisKernelRidgeCV(
        gammas=[0.001, 0.01, 0.1, 1, 10], 
        eps=1e-10,
        max_alpha_coef_norm = 10,
    ),
    'Mixed Sobolev KRR':MixedSobolevRidgeCV(
        eps=1e-6,
        max_alpha_coef_norm = 100,
    ),
    'HAL':HighlyAdaptiveLassoCV(),
    'HAR':HighlyAdaptiveRidgeCV(eps=1e-10, order=0),
}

In [3]:
FILE = 'data'

HAL_DATASETS = [
    "yacht",
    "energy",
    "boston",
    "concrete",
]

DATASETS = [
    *HAL_DATASETS,
    "wine",
    "power",
    "kin8nm",
    "naval",
    "protein",
    "slice",
    "yearmsd"
]

N_REPS = 5

MAX_ROWS = 2000

results = []
data_timer = Timer(verbose=True)
for data in DATASETS:
    with data_timer.task(f"analyzing {data}"):
        df = pd.read_csv(f"~/Desktop/csv/{data}.csv")
        X_full = df.iloc[:MAX_ROWS, :-1].values
        Y_full = df.iloc[:MAX_ROWS,-1].values
        n, d = X_full.shape

        for rep in range(N_REPS):
            X, X_, Y, Y_ = train_test_split(X_full, Y_full, test_size=0.2)
            learner_timer = Timer()
            for name, learner in LEARNERS.items():
                if name == 'HAL' and data not in HAL_DATASETS:
                    continue
                with learner_timer.task("time fitting"):
                    learner.fit(X,Y)
                with learner_timer.task("time predicting"):
                    mse = np.mean((learner.predict(X_) - Y_)**2)

                results += [{
                    'data': data,
                    'n': n,
                    'd': d,
                    'learner': name,
                    'mse': mse,
                    **learner_timer.durations,
                }]

            pd.DataFrame(results).to_csv(f"results/data/{FILE}_{data}_{rep}.csv")

analyzing yacht...analyzing yacht...



  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(


elapsed time: 82.50523900985718
analyzing energy...elapsed time: 82.50523900985718
analyzing energy...



  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(




  model = cd_fast.enet_coordinate_descent(


elapsed time: 2138.266622066498
analyzing boston...elapsed time: 2138.266622066498
analyzing boston...

  model = cd_fast.enet_coordinate_descent(


In [51]:
folder_path = 'results/data'  
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
df_list = [pd.read_csv(file, index_col=0) for file in csv_files]
results = pd.concat(df_list, ignore_index=True)

results_avg = (
    pd.DataFrame(results)
    .groupby(['data', 'n', 'd', 'learner'], as_index=False)  
    .agg({
        'mse': np.mean,  
        'time fitting': np.mean,
        'time predicting': np.mean,
    })
    # mutate mse to rmse by taking sqrt
    .assign(**{
        'rmse': lambda df: np.sqrt(df['mse']),
    })
    .sort_values(by=['d', 'data', 'rmse'], ascending=[True, True, True])
)

mse_table = (
    results_avg
    .pivot_table(index=['data', 'n', 'd'], columns='learner', values='rmse')  # Pivot by 'learner' for mse
    .reindex(columns=['HAR', 'HAL', 'Mixed Sobolev KRR', 'Radial Basis KRR', 'Random Forest', 'Ridge Regression'])  # Reorder the columns based on the desired order
    .sort_values(by=['d'], ascending=[True])
    .reset_index()
)

# Display the result
print(mse_table.to_latex(index=False, float_format='%.2e'))

\begin{tabular}{lrrrrrrrr}
\toprule
data & n & d & HAR & HAL & Mixed Sobolev KRR & Radial Basis KRR & Random Forest & Ridge Regression \\
\midrule
power & 2000 & 4 & 3.98e+00 & NaN & 4.09e+00 & 4.24e+00 & 4.12e+00 & 4.49e+00 \\
yacht & 308 & 6 & 1.15e+00 & 9.51e-01 & 1.42e+00 & 2.94e+00 & 1.89e+00 & 9.82e+00 \\
concrete & 1030 & 8 & 4.13e+00 & 4.66e+00 & 4.78e+00 & 1.05e+01 & 5.89e+00 & 1.07e+01 \\
energy & 768 & 8 & 4.65e-01 & 4.94e-01 & 4.42e-01 & 8.20e-01 & 5.59e-01 & 2.93e+00 \\
kin8nm & 2000 & 8 & 1.50e-01 & NaN & 1.45e-01 & 1.05e-01 & 1.74e-01 & 2.02e-01 \\
protein & 2000 & 9 & 1.89e+00 & NaN & 1.75e+00 & 5.78e+00 & 1.73e+00 & 2.35e+00 \\
wine & 1599 & 11 & 5.97e-01 & NaN & 6.26e-01 & 6.62e-01 & 5.97e-01 & 6.55e-01 \\
boston & 506 & 13 & 3.40e+00 & 3.88e+00 & 3.82e+00 & 6.08e+00 & 3.74e+00 & 5.09e+00 \\
naval & 2000 & 17 & 1.23e-03 & NaN & 6.67e-04 & 2.15e-03 & 1.02e-03 & 1.37e-03 \\
yearmsd & 2000 & 90 & 1.11e+01 & NaN & 9.68e+00 & 1.15e+01 & 1.02e+01 & 1.07e+01 \\
slice & 2000 



# Simulations

In [209]:
def ramp(x, x0=0.5, eps=0.1):
    return np.clip((x - x0) / eps, 0, 1)

def dgp(n, d):
    eps = 0.05
    x0 = 1 - 2**(-1/5) - eps
    X = np.random.uniform(size=(n, 10))
    Y = np.prod(X[:,0:5], axis=1) - np.prod(ramp(X[:,5:-1], x0=x0, eps=eps), axis=1) + np.random.normal(scale=0.1, size=(n,))
    return X,Y

In [213]:

SIM_LEARNERS = [
    'HAR', 
    # 'Mixed Sobolev KRR', 
    # 'Radial Basis KRR', 
    # 'Random Forest',
    # 'Mean',
]
REPS = 10
N_RANGE = [50, 125, 200, 300, 400, 600]
D_RANGE = [10]

for d in D_RANGE:
    results = []
    for n in N_RANGE:
        for rep in range(REPS):
            X,Y  = dgp(n+1000, d)
            X, X_, Y, Y_ = train_test_split(X, Y, test_size=1000)
            learner_timer = Timer()
            for name, learner in {k: LEARNERS[k] for k in SIM_LEARNERS}.items():
                with learner_timer.task("time fitting"):
                    learner.fit(X,Y)
                with learner_timer.task("time predicting"):
                    mse = np.mean((learner.predict(X_) - Y_)**2)

                results += [{
                    'n': n,
                    'd': d,
                    'learner': name,
                    'mse': mse,
                    **learner_timer.durations,
                }]

    pd.DataFrame(results).to_csv(f"results/sims/{d}.csv")

In [227]:
import pandas as pd
import glob
import os

folder_path = 'results/sims'  
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
df_list = [pd.read_csv(file, index_col=0) for file in csv_files]
results = pd.concat(df_list, ignore_index=True)

results_avg = (
    pd.DataFrame(results)
    .groupby(['n', 'd', 'learner'], as_index=False)  
    .agg({
        'mse': np.mean,  
        'time fitting': np.mean,
        'time predicting': np.mean,
    })
    # mutate mse to rmse by taking sqrt
    .assign(**{
        'rmse': lambda df: np.sqrt(df['mse']),
        'rate': lambda df: df.n**(-1/3) * np.log(df.n)**(2*(df.d-1)/3),
        'relative_rmse': lambda df: df.rmse / df.rate,
    })
    .sort_values(by=['learner', 'd', 'n'], ascending=[True, True, True])
)

import altair as alt
import altair_saver as saver

plot = alt.Chart(results_avg).mark_line().encode(
    x='n:Q',
    y=alt.Y('relative_rmse:Q', scale=alt.Scale(zero=False), title='Rate-Scaled RMSE'),
    # color='learner',
    # row='d'
).resolve_scale(
    y='independent'
).properties(
    width=800,  # Set the chart wider
    height=300  # Set the chart narrower
).configure_axis(
    labelFontSize=14,
    titleFontSize=16
).configure_legend(
    labelFontSize=14,
    titleFontSize=16
).configure_title(
    fontSize=18
)
plot

plot.save('results/plots/convergence.pdf')



# Dimension Experiments

In [33]:
K

HighlyAdaptiveRidge: {'depth': array([[0.88788436, 0.65382304, 0.8205276 , ..., 0.01666537, 0.96007135,
        0.07163804],
       [0.83411064, 0.00130576, 0.90138635, ..., 0.43242943, 0.07302692,
        0.49883272],
       [0.74242175, 0.27839354, 0.07083517, ..., 0.25567212, 0.01823056,
        0.15984818],
       ...,
       [0.18862626, 0.45658028, 0.27621575, ..., 0.45298265, 0.39164189,
        0.78288198],
       [0.17333797, 0.85298253, 0.92603183, ..., 0.5253794 , 0.14810314,
        0.275442  ],
       [0.30020353, 0.29341873, 0.64938508, ..., 0.89014852, 0.52988017,
        0.31619236]]), 'order': 0}

In [51]:
def generate_low_rank_matrix(n, d, rank=5):
    # Generate two random matrices
    A = np.random.rand(n, rank)  # n x rank matrix
    B = np.random.rand(rank, d)  # rank x d matrix
    
    # Matrix multiplication to generate a low-rank matrix
    low_rank_matrix = np.dot(A, B)
    
    return low_rank_matrix

# Example usage
n = 100  # number of rows
d = 50   # number of columns
low_rank_matrix = generate_low_rank_matrix(n, d, rank=5)

In [75]:
n = 350
d = 25
# X = np.random.uniform(size=(n,d))
X = generate_low_rank_matrix(n, d, rank=5)

K = kernels.HighlyAdaptiveRidge()(X)
eigenvalues = np.linalg.eigvals(K)

# Prepare data for plotting
eigenvalues_log = np.log(np.abs(eigenvalues))
data = pd.DataFrame({
    'Eigenvalues (Log Scale)': eigenvalues_log
})

# Create histogram using Altair
hist = alt.Chart(data).mark_bar().encode(
    alt.X("Eigenvalues (Log Scale):Q", bin=True),
    y='count()'
).properties(
    title='Histogram of Eigenvalues (Log Scale)'
)

# Display the plot
hist

In [76]:

K = kernels.RadialBasis(gamma=5)(X)
eigenvalues = np.linalg.eigvals(K)

# Prepare data for plotting
eigenvalues_log = np.log(np.abs(eigenvalues))
data = pd.DataFrame({
    'Eigenvalues (Log Scale)': eigenvalues_log
})

# Create histogram using Altair
hist = alt.Chart(data).mark_bar().encode(
    alt.X("Eigenvalues (Log Scale):Q", bin=True),
    y='count()'
).properties(
    title='Histogram of Eigenvalues (Log Scale)'
)

# Display the plot
hist

# Dimension Experiments

In [29]:
DATASETS = [
    "slice",
]
SIM_LEARNERS = [
    'HAR', 
    # 'Mixed Sobolev KRR', 
    'Radial Basis KRR', 
    # 'Random Forest',
    # 'Mean',
]
N_REPS = 5
TRAIN_SIZE = 400
TEST_SIZE = 1000
MAX_COLS = [5, 15, 20, 30]

results = []
data_timer = Timer(verbose=True)
for data in DATASETS:
    for cols in MAX_COLS:
        with data_timer.task(f"{cols} features..."):
            df = pd.read_csv(f"~/Desktop/csv/{data}.csv")
            X_full = df.iloc[:TRAIN_SIZE+TEST_SIZE, :cols].values
            Y_full = df.iloc[:TRAIN_SIZE+TEST_SIZE,-1].values

            for rep in range(N_REPS):
                X, X_, Y, Y_ = train_test_split(X_full, Y_full, train_size=TRAIN_SIZE)
                n, d = X.shape
                learner_timer = Timer()
                for name, learner in {k: LEARNERS[k] for k in SIM_LEARNERS}.items():
                    with learner_timer.task("time fitting"):
                        learner.fit(X,Y)
                    with learner_timer.task("time predicting"):
                        mse = np.mean((learner.predict(X_) - Y_)**2)

                    results += [{
                        'data': data,
                        'n': n,
                        'd': d,
                        'learner': name,
                        'mse': mse,
                    **learner_timer.durations,
                }]

        pd.DataFrame(results).to_csv(f"results/dim/cols_{cols}.csv")

5 features......5 features......

elapsed time: 8.800706148147583
15 features......elapsed time: 8.800706148147583
15 features......

elapsed time: 22.495224952697754
20 features......elapsed time: 22.495224952697754
20 features......











elapsed time: 30.342422008514404
30 features......elapsed time: 30.342422008514404
30 features......

elapsed time: 47.933465003967285
elapsed time: 47.933465003967285


In [30]:
folder_path = 'results/dim'  
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
df_list = [pd.read_csv(file, index_col=0) for file in csv_files]
results = pd.concat(df_list, ignore_index=True)

results_avg = (
    pd.DataFrame(results)
    .groupby(['n', 'd', 'learner'], as_index=False)  
    .agg({
        'mse': np.mean,  
        'time fitting': np.mean,
        'time predicting': np.mean,
    })
    # mutate mse to rmse by taking sqrt
    .assign(**{
        'rmse': lambda df: np.sqrt(df['mse']),
    })
    .sort_values(by=['learner', 'd', 'n'], ascending=[True, True, True])
)

results_avg



Unnamed: 0,n,d,learner,mse,time fitting,time predicting,rmse
0,400,5,HAR,38.006109,1.085833,0.332322,6.164909
2,400,15,HAR,6.206894,3.190848,0.978896,2.491364
4,400,20,HAR,5.395258,4.371174,1.363748,2.322769
6,400,30,HAR,6.727497,6.989935,2.254316,2.593742
1,400,5,Radial Basis KRR,37.581095,0.195866,0.001225,6.130342
3,400,15,Radial Basis KRR,4.440913,0.19087,0.001463,2.107347
5,400,20,Radial Basis KRR,3.848762,0.19545,0.001622,1.961826
7,400,30,Radial Basis KRR,2.977879,0.205508,0.001964,1.725653


In [19]:
df = pd.read_csv(f"~/Desktop/csv/slice.csv")
X_full = df.iloc[:TRAIN_SIZE+TEST_SIZE, :50].values
Y_full = df.iloc[:TRAIN_SIZE+TEST_SIZE,-1].values
har = HighlyAdaptiveRidgeCV(eps=1e-10, order=0)
rbf = RadialBasisKernelRidgeCV(
        gammas=[0.001, 0.01, 0.1, 1, 10], 
        eps=1e-10,
        max_alpha_coef_norm = 10,
    )

X, X_, Y, Y_ = train_test_split(X_full, Y_full, train_size=TRAIN_SIZE)
rbf.fit(X,Y)

In [20]:
rbf.predict(X_)

array([18.64304367, 34.38899521, 31.73219449, 34.23110234, 10.98618134,
       30.57829447, 17.55049709, 12.1589382 , 21.65061546, 35.04466986,
       39.87703981, 16.74064225, 17.35718694, 18.87893174, 12.72567693,
       18.2195623 , 28.23012272, 27.34032064, 22.583116  , 14.15603029,
       33.21737635, 31.692534  , 34.48597576, 30.00491948,  7.39261807,
       34.64288401, 29.01382499, 36.99844081, 17.62632008, 36.1642801 ,
       18.02346142, 28.30212629, 32.89235282, 10.4888958 , 20.71284501,
       21.22924908, 31.84213375, 18.77199683, 12.53197571, 19.20090674,
       31.23549741,  8.87194838, 33.37896443, 15.87196892, 23.6764229 ,
       17.21822163, 18.04400278,  6.6401228 , 21.49512785, 26.63539455,
       11.27500576, 21.91760738, 32.29785595, 35.05527264, 22.37258631,
       32.96500896, 22.8483879 , 28.91937393, 31.5210281 , 27.16583671,
       13.8993063 , 31.55131676, 33.35021139,  8.94661209, 36.0051029 ,
       19.16950218, 17.99665377, 36.69600392, 15.90615692, 16.40