In [5]:
from timer import Timer
import numpy as np
import pandas as pd

from highly_adaptive_regression import HighlyAdaptiveLassoCV
from kernel_ridge import HighlyAdaptiveRidgeCV, RadialBasisKernelRidgeCV, MixedSobolevRidgeCV 
from kernel_ridge import kernels

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.dummy import DummyRegressor

# Empirical

In [55]:
# Registry of the regressors under comparison, keyed by the display name that
# ends up in the 'learner' column of the results tables.
LEARNERS = {
    # Baselines from scikit-learn.
    'Mean': DummyRegressor(strategy="mean"),
    'Ridge Regression': Ridge(alpha=1e-3),
    'Random Forest': RandomForestRegressor(n_estimators=2000, n_jobs=-1),

    # Kernel ridge variants from the local `kernel_ridge` module.
    'Radial Basis KRR': RadialBasisKernelRidgeCV(
        gammas=[0.001, 0.01, 0.1, 1, 10],
        eps=1e-10,
        max_alpha_coef_norm=10,
    ),
    'Mixed Sobolev KRR': MixedSobolevRidgeCV(
        eps=1e-6,
        max_alpha_coef_norm=100,
    ),

    # Highly adaptive lasso / ridge.
    'HAL': HighlyAdaptiveLassoCV(),
    'HAR': HighlyAdaptiveRidgeCV(eps=1e-10, order=0),
}

In [49]:
# Name (without extension) of the CSV written under results/data/.
FILE = 'HAL_big'

# Dataset names; each is read from ~/Desktop/csv/<name>.csv with the target
# in the last column.
DATASETS = [
    "yacht",
    "energy",
    "boston",
    "concrete",
    "wine",
    "power",
    "kin8nm",
    "naval",
    "protein",
    "slice",
    "yearmsd"
]

# Number of independent random train/test splits per dataset.
N_REPS = 5

# Only the first MAX_ROWS rows of each dataset are used.
MAX_ROWS = 2000

results = []
data_timer = Timer(verbose=True)
for data in DATASETS:
    with data_timer.task(f"analyzing {data}"):
        df = pd.read_csv(f"~/Desktop/csv/{data}.csv")
        X_full = df.iloc[:MAX_ROWS, :-1].values
        Y_full = df.iloc[:MAX_ROWS, -1].values
        # Take the shape from the freshly loaded matrix. The previous
        # `X.shape` read a variable that is only assigned further down, so it
        # failed on a fresh kernel and otherwise recorded the shape of the
        # previous dataset's training split.
        n, d = X_full.shape

        for rep in range(N_REPS):
            # 80/20 split; X_/Y_ are the held-out part.
            X, X_, Y, Y_ = train_test_split(X_full, Y_full, test_size=0.2)
            learner_timer = Timer()
            for name, learner in LEARNERS.items():
                with learner_timer.task("time fitting"):
                    learner.fit(X, Y)
                with learner_timer.task("time predicting"):
                    mse = np.mean((learner.predict(X_) - Y_)**2)

                # One record per (dataset, repetition, learner); the timer's
                # durations are unpacked into the record alongside the MSE.
                results.append({
                    'data': data,
                    'n': n,
                    'd': d,
                    'learner': name,
                    'mse': mse,
                    **learner_timer.durations,
                })

pd.DataFrame(results).to_csv(f"results/data/{FILE}.csv")

analyzing wine...analyzing wine...

  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(


In [22]:
import glob
import os

import pandas as pd

# Collect every CSV found under results/data and stack them into a single
# frame with a fresh 0..N-1 index.
folder_path = 'results/data'
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
df_list = [
    # The first CSV column is read back as the index.
    pd.read_csv(csv_path, index_col=0)
    for csv_path in csv_files
]
results = pd.concat(
    df_list,
    ignore_index=True,
)

In [23]:
# Show the combined results frame (the repr elides the middle rows of a long frame).
results

Unnamed: 0,data,n,d,learner,mse,time fitting,time predicting
0,yacht,308,6,HAR,0.213843,0.504134,0.020669
1,energy,768,8,HAR,0.146393,9.535488,0.291234
2,boston,506,13,HAR,13.932925,4.302512,0.136813
3,concrete,1030,8,HAR,17.974295,24.744194,0.739761
4,wine,1599,11,HAR,0.399823,138.408775,4.189828
...,...,...,...,...,...,...,...
345,yearmsd,2000,90,Mean,148.581420,0.000098,0.000024
346,yearmsd,2000,90,Ridge Regression,140.268486,0.000500,0.000041
347,yearmsd,2000,90,Random Forest,114.232732,6.665474,0.102855
348,yearmsd,2000,90,Radial Basis KRR,148.581170,0.549773,0.001037


In [51]:
# Average the per-repetition metrics for every (dataset, learner) pair.
# pd.DataFrame(...) accepts `results` either as the list of dicts built by the
# benchmark loop or as the DataFrame reloaded from disk.
results_avg = (
    pd.DataFrame(results)
    .groupby(['data', 'n', 'd', 'learner'], as_index=False)
    # String aggregator names instead of np.mean: passing the NumPy callable
    # to groupby.agg triggers a FutureWarning on recent pandas, and "mean" is
    # the spelling that keeps the current behaviour.
    .agg({
        'mse': 'mean',
        'time fitting': 'mean',
        'time predicting': 'mean',
    })
    # RMSE is the square root of the averaged MSE (not an average of
    # per-repetition RMSEs).
    .assign(rmse=lambda df: np.sqrt(df['mse']))
    .sort_values(by=['d', 'data', 'rmse'], ascending=[True, True, True])
)

# One row per dataset, one column per learner. Despite its name the table
# holds RMSE values.
mse_table = (
    results_avg
    .pivot_table(index=['data', 'n', 'd'], columns='learner', values='rmse')
    # Fix the column order; learners not listed here (e.g. 'Mean') are dropped.
    .reindex(columns=['HAR', 'HAL', 'Mixed Sobolev KRR', 'Radial Basis KRR', 'Random Forest', 'Ridge Regression'])
    .sort_values(by=['d'], ascending=[True])
    .reset_index()
)

# Emit the table as LaTeX source for the paper.
print(mse_table.to_latex(index=False, float_format='%.2e'))

\begin{tabular}{lrrrrrrrr}
\toprule
data & n & d & HAR & HAL & Mixed Sobolev KRR & Radial Basis KRR & Random Forest & Ridge Regression \\
\midrule
power & 2000 & 4 & 3.98e+00 & NaN & 4.09e+00 & 4.24e+00 & 4.12e+00 & 4.49e+00 \\
yacht & 308 & 6 & 1.15e+00 & 9.51e-01 & 1.42e+00 & 2.94e+00 & 1.89e+00 & 9.82e+00 \\
concrete & 1030 & 8 & 4.13e+00 & 4.66e+00 & 4.78e+00 & 1.05e+01 & 5.89e+00 & 1.07e+01 \\
energy & 768 & 8 & 4.65e-01 & 4.94e-01 & 4.42e-01 & 8.20e-01 & 5.59e-01 & 2.93e+00 \\
kin8nm & 2000 & 8 & 1.50e-01 & NaN & 1.45e-01 & 1.05e-01 & 1.74e-01 & 2.02e-01 \\
protein & 2000 & 9 & 1.89e+00 & NaN & 1.75e+00 & 5.78e+00 & 1.73e+00 & 2.35e+00 \\
wine & 1599 & 11 & 5.97e-01 & NaN & 6.26e-01 & 6.62e-01 & 5.97e-01 & 6.55e-01 \\
boston & 506 & 13 & 3.40e+00 & 3.88e+00 & 3.82e+00 & 6.08e+00 & 3.74e+00 & 5.09e+00 \\
naval & 2000 & 17 & 1.23e-03 & NaN & 6.67e-04 & 2.15e-03 & 1.02e-03 & 1.37e-03 \\
yearmsd & 2000 & 90 & 1.11e+01 & NaN & 9.68e+00 & 1.15e+01 & 1.02e+01 & 1.07e+01 \\
slice & 2000 



# Simulations

In [209]:
def ramp(x, x0=0.5, eps=0.1):
    """Piecewise-linear ramp from 0 to 1.

    Rescales ``x`` so that ``x0`` maps to 0 and ``x0 + eps`` maps to 1, then
    clamps the result to the interval [0, 1].
    """
    rescaled = (x - x0) / eps
    return np.clip(rescaled, 0, 1)

def dgp(n, d):
    """Draw ``n`` observations from the simulation's data-generating process.

    Parameters
    ----------
    n : int
        Number of rows to draw.
    d : int
        Number of feature columns. This was previously ignored and the
        feature matrix always had 10 columns; for ``d == 10`` the draws are
        unchanged.

    Returns
    -------
    X : ndarray of shape (n, d)
        Features drawn i.i.d. uniform on [0, 1).
    Y : ndarray of shape (n,)
        Product of the first five features, minus the product of the ramped
        features in columns ``5:-1``, plus Gaussian noise with scale 0.1.
    """
    eps = 0.05
    x0 = 1 - 2**(-1/5) - eps
    X = np.random.uniform(size=(n, d))
    smooth_part = np.prod(X[:, 0:5], axis=1)
    # NOTE(review): the slice 5:-1 leaves the last column out of the ramp
    # product, so with d=10 only four features enter it although x0 is built
    # from a fifth root -- confirm whether `5:` was intended.
    ramp_part = np.prod(ramp(X[:, 5:-1], x0=x0, eps=eps), axis=1)
    noise = np.random.normal(scale=0.1, size=(n,))
    Y = smooth_part - ramp_part + noise
    return X, Y

In [213]:

# Keys of LEARNERS that take part in the simulation; comment entries in or
# out to change the line-up.
SIM_LEARNERS = [
    'HAR',
    # 'Mixed Sobolev KRR',
    # 'Radial Basis KRR',
    # 'Random Forest',
    # 'Mean',
]
REPS = 10                                   # repetitions per sample size
N_RANGE = [50, 125, 200, 300, 400, 600]     # training-set sizes
D_RANGE = [10]                              # feature dimensions

for d in D_RANGE:
    # Results are collected and written separately for each dimension.
    results = []
    for n in N_RANGE:
        for rep in range(REPS):
            # Draw n training rows plus 1000 held-out rows in one go, then
            # split them apart; X_/Y_ are the held-out part.
            X, Y = dgp(n + 1000, d)
            X, X_, Y, Y_ = train_test_split(X, Y, test_size=1000)
            learner_timer = Timer()
            selected = {key: LEARNERS[key] for key in SIM_LEARNERS}
            for name, learner in selected.items():
                with learner_timer.task("time fitting"):
                    learner.fit(X, Y)
                with learner_timer.task("time predicting"):
                    squared_errors = (learner.predict(X_) - Y_) ** 2
                    mse = np.mean(squared_errors)

                results.append({
                    'n': n,
                    'd': d,
                    'learner': name,
                    'mse': mse,
                    **learner_timer.durations,
                })

    pd.DataFrame(results).to_csv(f"results/sims/{d}.csv")

In [227]:
import pandas as pd
import glob
import os

# Reload every simulation CSV saved under results/sims and stack them.
folder_path = 'results/sims'
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
df_list = [pd.read_csv(file, index_col=0) for file in csv_files]
results = pd.concat(df_list, ignore_index=True)

results_avg = (
    pd.DataFrame(results)
    .groupby(['n', 'd', 'learner'], as_index=False)
    # String aggregator names instead of np.mean: passing the NumPy callable
    # to groupby.agg triggers a FutureWarning on recent pandas, and "mean" is
    # the spelling that keeps the current behaviour.
    .agg({
        'mse': 'mean',
        'time fitting': 'mean',
        'time predicting': 'mean',
    })
    # Keyword order matters: `relative_rmse` reads the `rmse` and `rate`
    # columns created just before it in the same assign call.
    .assign(
        # RMSE of the averaged MSE.
        rmse=lambda df: np.sqrt(df['mse']),
        # Reference rate n^(-1/3) * log(n)^(2(d-1)/3).
        rate=lambda df: df.n**(-1/3) * np.log(df.n)**(2*(df.d-1)/3),
        # RMSE divided by the reference rate.
        relative_rmse=lambda df: df.rmse / df.rate,
    )
    .sort_values(by=['learner', 'd', 'n'], ascending=[True, True, True])
)

import altair as alt
import altair_saver as saver

# Line chart of the rate-scaled RMSE against the training-set size.
# Encoding by learner (color='learner') or faceting by dimension (row='d')
# is currently switched off.
plot = (
    alt.Chart(results_avg)
    .mark_line()
    .encode(
        x='n:Q',
        y=alt.Y(
            'relative_rmse:Q',
            scale=alt.Scale(zero=False),  # do not force the y axis to include 0
            title='Rate-Scaled RMSE',
        ),
    )
    .resolve_scale(y='independent')
    .properties(
        width=800,   # wide ...
        height=300,  # ... and short
    )
    .configure_axis(labelFontSize=14, titleFontSize=16)
    .configure_legend(labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=18)
)
plot

plot.save('results/plots/convergence.pdf')



# Dimension Experiments

In [229]:
# Datasets used for the dimension experiment; each is read from
# ~/Desktop/csv/<name>.csv with the target in the last column.
DATASETS = [
    "slice",
]
# Keys of LEARNERS that take part in this experiment.
SIM_LEARNERS = [
    'HAR',
    # 'Mixed Sobolev KRR',
    'Radial Basis KRR',
    # 'Random Forest',
    # 'Mean',
]
N_REPS = 5                      # random splits per feature count
TRAIN_SIZE = 250                # rows used for fitting
TEST_SIZE = 1000                # rows held out for evaluation
MAX_COLS = [5, 15, 40, 100]     # number of leading feature columns to keep

results = []
data_timer = Timer(verbose=True)
for data in DATASETS:
    # The file does not depend on `cols`, so read it once per dataset rather
    # than once per feature count.
    df = pd.read_csv(f"~/Desktop/csv/{data}.csv")
    for cols in MAX_COLS:
        with data_timer.task(f"{cols} features..."):
            X_full = df.iloc[:TRAIN_SIZE+TEST_SIZE, :cols].values
            Y_full = df.iloc[:TRAIN_SIZE+TEST_SIZE, -1].values
            # Take the shape from the matrix just built. The previous
            # `X.shape` read whatever `X` an earlier cell or iteration had
            # left behind, so `d` did not track `cols`.
            n, d = X_full.shape

            for rep in range(N_REPS):
                # X_/Y_ are the rows left over after taking TRAIN_SIZE for training.
                X, X_, Y, Y_ = train_test_split(X_full, Y_full, train_size=TRAIN_SIZE)
                learner_timer = Timer()
                # The original line ended in `::`, which is a SyntaxError.
                for name, learner in {k: LEARNERS[k] for k in SIM_LEARNERS}.items():
                    with learner_timer.task("time fitting"):
                        learner.fit(X, Y)
                    with learner_timer.task("time predicting"):
                        mse = np.mean((learner.predict(X_) - Y_)**2)

                    results.append({
                        'data': data,
                        'n': n,
                        'd': d,
                        'learner': name,
                        'mse': mse,
                        **learner_timer.durations,
                    })

        # NOTE(review): the f-string interpolates the whole MAX_COLS list, so
        # every iteration rewrites the same file with the results accumulated
        # so far. Confirm whether a per-`cols` file was intended; if so,
        # `results` would also need resetting per `cols`.
        pd.DataFrame(results).to_csv(f"results/dim/cols_{MAX_COLS}.csv")

5 features......5 features......





  model = cd_fast.enet_coordinate_descent(




elapsed time: 76.08342790603638
15 features......elapsed time: 76.08342790603638
15 features......