## Code to generate kernel performance metrics

In [1]:
import re
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import r2_score
from joblib import Parallel, delayed
from itertools import combinations_with_replacement
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, ExpSineSquared, Matern, RBF, RationalQuadratic, WhiteKernel

### Dataset

In [2]:
# Path of the input series. NOTE: `data` is a path string here; a later cell
# rebinds the same name to the reshaped value array.
data = "dataset/dataset-essential/oscillating_series.csv"
df = pd.read_csv(data)

# 'value' column as a NumPy array (not referenced again in the visible cells).
X = df["value"].to_numpy()

In [3]:
# Copy of the 'value' column, plus the same numbers laid out as one column.
data_total = np.array(df["value"].values)
data = data_total.reshape(-1, 1)

# Divide the series by 1000 and flatten it back to one dimension.
CRdata = np.ravel(data / 1000)

In [4]:
# The final 20 observations are held out for testing; the rest is training.
trainL = len(CRdata) - 20

# Time axis 1..N as a single column, one row per observation.
t = np.atleast_2d(np.linspace(1, len(CRdata), len(CRdata)).reshape(-1, 1))

# Split inputs and targets at the same index so they stay aligned.
t_tr, t_test = t[:trainL], t[trainL:]
CR_tr, CR_test = CRdata[:trainL], CRdata[trainL:]

### Functions

In [5]:
def Metrics_to_performance(CR_tr, t_tr, CR_test, t_test, kernel_str):
  """Fit a Gaussian-process regressor with one kernel and score the fit.

  Parameters
  ----------
  CR_tr, CR_test
      Target values for the training and test intervals.
  t_tr, t_test
      Inputs for the training and test intervals. They are passed to
      ``fit``/``score`` unchanged, and are also flattened and re-stacked as a
      single column to score the whole interval (training followed by test).
  kernel_str
      The kernel handed to ``GaussianProcessRegressor``. It is forwarded
      as-is, so despite the name it must already be whatever the regressor
      accepts as ``kernel``; it is also echoed back in the result.

  Returns
  -------
  tuple
      ``(kernel_str, learned_kernel, mse, std, r_training,
      r_whole_interval, r_test)`` where ``learned_kernel`` is the fitted
      ``model.kernel_``, ``mse`` is the mean squared training residual after
      the residuals are multiplied by 1000, ``std`` is ``sqrt(mse)``, and the
      three ``r_*`` values are ``model.score`` on the training, whole and test
      intervals respectively.
  """
  # Whole interval = training followed by test, as one feature column.
  t_all = np.append(t_tr, t_test).reshape(-1, 1)
  CR_all = np.append(CR_tr, CR_test)

  model = GaussianProcessRegressor(kernel=kernel_str, n_restarts_optimizer=20, alpha=10, normalize_y=False)
  model.fit(t_tr, CR_tr)

  r_training = model.score(t_tr, CR_tr)
  r_whole_interval = model.score(t_all, CR_all)
  r_test = model.score(t_test, CR_test)

  # Only the training-set mean prediction feeds the returned metrics, so the
  # predictive standard deviations and the whole/test predictions are not
  # computed: they were never used and cost extra work for every kernel.
  CRpred_tr = model.predict(t_tr)

  # Residuals are multiplied by 1000 before squaring (presumably to undo the
  # /1000 scaling applied to the series earlier in the notebook).
  mse = np.mean(((CRpred_tr - CR_tr) * 1000) ** 2)
  std = np.sqrt(mse)

  return kernel_str, model.kernel_, mse, std, r_training, r_whole_interval, r_test

In [6]:
def evaluate_kernel(kernel_str):
  """Score one kernel on the notebook-level train/test split without raising.

  Calls ``Metrics_to_performance`` with the module-level ``CR_tr``, ``t_tr``,
  ``CR_test`` and ``t_test``. Any ``Exception`` is reported on stdout and
  turned into ``None``, so one failing kernel does not abort a whole batch.

  Returns the metrics tuple from ``Metrics_to_performance``, or ``None`` when
  the evaluation failed.
  """
  try:
    metrics = Metrics_to_performance(CR_tr, t_tr, CR_test, t_test, kernel_str)
  except Exception as err:
    print(f"Error evaluating kernel {kernel_str}: {err}")
    metrics = None
  return metrics

### Metrics to evaluate performance

In [7]:
# CSV listing the candidate kernel expressions.
df_kernels_data = "dataset/dataset_possibilities/oscillating_series_kernel_possibilities.csv"

data_k = pd.read_csv(df_kernels_data)

# The first column of every row holds one kernel expression.
kernels = [row[0] for row in data_k.values]

# NOTE(review): eval() runs whatever code the CSV contains, using the names
# available in this notebook. Only load kernel files from a trusted source.
kernels_data = [eval(expression) for expression in kernels]

In [None]:
batch_size = 1000
# Ceiling division. The previous `len // batch_size + 1` counted one batch too
# many whenever the number of kernels was an exact multiple of batch_size,
# which dispatched an empty batch and made the "i/N" progress label wrong.
num_batches = (len(kernels_data) + batch_size - 1) // batch_size
kernels_metrics = []
num_cores = -1  # passed to joblib as n_jobs; -1 asks for all available CPUs

for i in range(num_batches):
  start_idx = i * batch_size
  end_idx = min(start_idx + batch_size, len(kernels_data))
  kernels_batch = kernels_data[start_idx:end_idx]

  # Evaluate the batch in parallel; each entry is a metrics tuple, or None
  # for a kernel whose evaluation failed.
  kernels_metrics_batch = Parallel(n_jobs=num_cores)(
    delayed(evaluate_kernel)(kernel)
    for kernel in tqdm(kernels_batch, desc=f"Evaluating batch {i+1}/{num_batches}")
  )

  kernels_metrics.extend(kernels_metrics_batch)

In [9]:
# Destination CSV for the per-kernel metrics.
file_name = "dataset/dataset_library/oscillating_series_library.csv"

In [None]:
# Write one CSV row per successfully evaluated kernel.
with open(file_name, 'w', newline='') as file_csv:
    writer = csv.writer(file_csv)
    writer.writerow(['kernel_str', 'learned_kernel', 'mse', 'std', 'R2_tr', 'R2', 'R2_test'])
    for row in kernels_metrics:
        # Kernels whose evaluation failed are stored as None; leave them out.
        if row is None:
            continue
        # Columns 0-1 are the kernels themselves; columns 2-6 are the numeric
        # metrics, written with 15 decimal places.
        writer.writerow([row[0], row[1], *('{:.15f}'.format(value) for value in row[2:7])])

# Report the output path. The file handle (`file_csv`) was interpolated here
# before, which printed a TextIOWrapper repr instead of the file name.
print(f"Data saved in {file_name}")