In [21]:
from timer import Timer
import numpy as np
import pandas as pd

from highly_adaptive_regression import HighlyAdaptiveRidgeCV as HARCV
from kernel_ridge import KernelRidge, KernelRidgeCV as KRRCV, HighlyAdaptiveRidgeCV as kHARCV, RadialBasisKernelRidgeCV as RBFKRRCV, MixedSobolevRidgeCV as MSKRRCV
from kernel_ridge import kernels

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LR
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline

In [22]:
from sklearn.preprocessing import MinMaxScaler

class ClippedMinMaxScaler(MinMaxScaler):
    def transform(self, X):
        return np.clip(super().transform(X), 0, 1)

In [33]:
LEARNERS = {
    'Mean':DummyRegressor(strategy="mean"),
    'Linear Regression':LR(),
    'Random Forest':RF(n_estimators=2000, n_jobs=-1),
    'RBF Kernel Ridge':RBFKRRCV(gammas=[0.001, 0.01, 0.1, 1, 10], eps=1e-10),
    'Mixed Sobolev Kernel Ridge':Pipeline([
        ('scaler', ClippedMinMaxScaler()),
        ('mskrrcv', MSKRRCV(eps=1e-10))
    ]),
    'HAR':kHARCV(eps=1e-10, order=0),
}

DATASETS = [
    "yacht",
    "energy",
    "boston",
    "concrete",
    "wine",
]

N_REPS = 5

results = []
for data in DATASETS:

    df = pd.read_csv(f"~/Desktop/csv/{data}.csv")
    X = df.iloc[:, :-1].values
    Y = df.iloc[:,-1].values
    n, d = X.shape

    for rep in range(N_REPS):
        X, X_, Y, Y_ = train_test_split(X, Y, test_size=0.2)
        timer = Timer()
        for name, learner in LEARNERS.items():
            with timer.task("time fitting"):
                learner.fit(X,Y)
            with timer.task("time predicting"):
                mse = np.mean((learner.predict(X_) - Y_)**2)

            results += [{
                'data': data,
                'n': n,
                'd': d,
                'learner': name,
                'mse': mse,
                **timer.durations,
            }]



























































In [34]:
pd.DataFrame(results)

Unnamed: 0,data,n,d,learner,mse,time fitting,time predicting
0,yacht,308,6,Mean,223.367895,0.000136,0.000026
1,yacht,308,6,Linear Regression,77.469453,0.000299,0.000036
2,yacht,308,6,Random Forest,1.536505,0.858904,0.078556
3,yacht,308,6,HAR,0.329394,0.487169,0.021109
4,yacht,308,6,Mixed Sobolev Kernel Ridge,0.178893,0.020843,0.000468
...,...,...,...,...,...,...,...
145,wine,1599,11,Linear Regression,0.396323,0.000328,0.000038
146,wine,1599,11,Random Forest,0.427531,0.974879,0.099411
147,wine,1599,11,HAR,0.406538,9.306353,0.291466
148,wine,1599,11,Mixed Sobolev Kernel Ridge,0.410729,0.108649,0.001501


In [49]:
results_avg = (
    pd.DataFrame(results)
    .groupby(['data', 'n', 'd', 'learner'], as_index=False)  
    .agg({
        'mse': np.mean,  
        'time fitting': np.mean,
        'time predicting': np.mean
    })
)

pivot_df = (
    results_avg
    .pivot_table(index=['data', 'n', 'd'], columns='learner', values='mse')  # Pivot by 'learner' for mse
    .reindex(columns=[k for k in LEARNERS.keys()][::-1])  # Reorder the columns based on the desired order
    .sort_values(by='n')  # Sort the rows by 'n'
    .reset_index()  # Reset the index to make 'data', 'n', and 'd' regular columns
)

# Display the result
print(pivot_df)

learner      data     n   d        HAR  Mixed Sobolev Kernel Ridge  RBF Kernel Ridge  Random Forest  Linear Regression        Mean
0           yacht   308   6   3.082063                    0.951421         13.575310       1.841530          84.509518  244.089715
1          boston   506  13  19.055743                   12.682318         35.199414      16.502700          27.819152   88.255263
2          energy   768   8   0.193621                    0.199039          0.585088       0.284699           9.412902  104.016834
3        concrete  1030   8  21.170419                   23.703165        111.582927      36.918705         117.201361  273.251333
4            wine  1599  11   0.395536                    0.399302          0.430979       0.377481           0.445893    0.661504
learner      data     n   d        HAR  Mixed Sobolev Kernel Ridge  RBF Kernel Ridge  Random Forest  Linear Regression        Mean
0           yacht   308   6   3.082063                    0.951421         13.57531

