## Generating Kernel Combinations

In [1]:
import csv
import random
import itertools
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score
from joblib import Parallel, delayed
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, ExpSineSquared, Matern, RBF, RationalQuadratic, WhiteKernel

import warnings

from sklearn.exceptions import ConvergenceWarning
# Suppress these warning categories for the rest of the session.
for _ignored_category in (ConvergenceWarning, RuntimeWarning, UserWarning):
    warnings.filterwarnings(action='ignore', category=_ignored_category)

### Functions

In [None]:
def generate_combinations_with_repetition(kernels, operation):
    """Build kernel expressions that join 2 to 4 kernels with one operator.

    Kernels are picked as unordered selections with repetition allowed, in
    the order produced by ``itertools.combinations_with_replacement``.

    Args:
        kernels: Sequence of kernel expression strings.
        operation: Operator symbol placed, space-padded, between kernels.

    Returns:
        List of expression strings; all 2-kernel expressions come first,
        then the 3-kernel ones, then the 4-kernel ones.
    """
    separator = f" {operation} "
    return [
        separator.join(selection)
        for size in range(2, 5)
        for selection in itertools.combinations_with_replacement(kernels, size)
    ]

In [None]:
def generate_permutations_with_repetition(kernels, operations):
    """Build every ordered kernel expression of 2 to 4 terms.

    For each ordered sequence of kernels (repetition allowed) and each
    assignment of operators to the gaps between them, one expression string
    of the form ``"k1 op1 k2 op2 ... kN"`` is produced.

    Args:
        kernels: Sequence of kernel expression strings.
        operations: Sequence of operator symbols.

    Returns:
        List of expression strings, ordered by number of terms, then by
        kernel sequence, then by operator sequence.
    """
    expressions = []
    for length in range(2, 5):
        for terms in itertools.product(kernels, repeat=length):
            for operators in itertools.product(operations, repeat=length - 1):
                # zip stops after the last operator, so the final term is
                # appended on its own without a trailing operator.
                leading = "".join(f"{term} {op} " for term, op in zip(terms, operators))
                expressions.append(leading + terms[-1])
    return expressions

In [4]:
def cluster_and_sample(data, sample_size, n_clusters=10):
    """Draw a sample of ``data`` spread across K-Means clusters.

    The items are converted to token-count vectors with ``CountVectorizer``,
    clustered with ``KMeans`` (fixed ``random_state=42``), and up to
    ``sample_size // n_clusters`` items are drawn from each cluster with
    ``random.sample`` (which uses the global ``random`` state).

    Args:
        data: Indexable collection of text items.
        sample_size: Overall sample budget, split evenly between clusters.
        n_clusters: Number of K-Means clusters.

    Returns:
        List of sampled items, cluster 0 first. It can hold fewer than
        ``sample_size`` items when the budget does not divide evenly or a
        cluster is smaller than its share.
    """
    counts = CountVectorizer().fit_transform(data).toarray()
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(counts)

    # Bucket the items by cluster label in one pass over the labels.
    members_by_cluster = [[] for _ in range(n_clusters)]
    for position, label in enumerate(labels):
        members_by_cluster[label].append(data[position])

    per_cluster_quota = sample_size // n_clusters
    sampled_data = []
    for members in members_by_cluster:
        if not members:
            continue
        sampled_data.extend(random.sample(members, min(per_cluster_quota, len(members))))

    return sampled_data

### Combinations

In [5]:
# Base kernels, kept as source strings so they can be combined textually.
kernels = [
    'ConstantKernel()',
    'DotProduct()',
    'ExpSineSquared()',
    'Matern()',
    'RBF()',
    'RationalQuadratic()',
    'WhiteKernel()',
]
# Binary operators used to join kernels into composite expressions.
operations = ['*', '+']

In [6]:
# Single-operator expressions: one batch of unordered selections per operator.
combinations = []
for op in operations:
    combinations.extend(generate_combinations_with_repetition(kernels, op))

# Ordered kernel sequences with every mix of operators.
permutations = generate_permutations_with_repetition(kernels, operations)

# Full candidate pool: composite expressions followed by the single kernels.
all_combinations = [*combinations, *permutations, *kernels]

In [7]:
all_combinations_views = set()
duplicates_views = set()

for item in all_combinations:
    # A first sighting goes to the unique set; any repeat is recorded as a duplicate.
    bucket = duplicates_views if item in all_combinations_views else all_combinations_views
    bucket.add(item)

## Code to generate kernel performance metrics

### Data

In [None]:
# Location of the input CSV (placeholder file name).
data = "data/input/your_file.csv"
df = pd.read_csv(data)

# Number of leading rows of the series that are kept for the experiment.
time = 300

In [None]:
# Keep the first `time` values of the chosen column, shaped as a single column.
data_total = np.array(df['Your_Column_Name'].values)
data = data_total[:int(time)].reshape(-1, 1)

# Divide by 1000 and flatten back to one dimension.
CRdata = np.ravel(data / 1000)

In [12]:
# The last 60 points are held out for testing.
# NOTE(review): assumes len(CRdata) > 60; otherwise trainL is <= 0 and the
# slices below no longer separate train from test as intended.
trainL = len(CRdata) - 60

# Time index 1..N as an (N, 1) column.
t = np.linspace(1, len(CRdata), len(CRdata)).reshape(-1, 1)

t_tr, t_test = t[:trainL], t[trainL:]
CR_tr, CR_test = CRdata[:trainL], CRdata[trainL:]

### Functions

In [13]:
def Metrics_to_performance(CR_tr, t_tr, CR_test, t_test, kernel_str):
    """Fit a Gaussian process with the given kernel and report its metrics.

    Args:
        CR_tr: Training targets.
        t_tr: Training inputs, passed to ``fit`` as-is.
        CR_test: Held-out targets.
        t_test: Held-out inputs.
        kernel_str: Kernel handed to ``GaussianProcessRegressor``; it is
            returned unchanged as the first element of the result.

    Returns:
        Tuple ``(kernel_str, learned_kernel, mse, lml, std, R2_tr, R2,
        R2_test)``: the input kernel, the fitted ``kernel_``, the mean
        squared training residual (residuals multiplied by 1000 first), the
        fitted log marginal likelihood, ``sqrt(mse)``, and ``model.score`` on
        the training set, on train+test concatenated, and on the test set.
    """
    # Full series (train followed by test) for the overall score.
    t_all = np.append(t_tr, t_test).reshape(-1, 1)
    CR_all = np.append(CR_tr, CR_test)

    model = GaussianProcessRegressor(kernel=kernel_str, n_restarts_optimizer=10, alpha=0.1, normalize_y=False)
    model.fit(t_tr, CR_tr)

    R2_tr = model.score(t_tr, CR_tr)
    R2 = model.score(t_all, CR_all)
    R2_test = model.score(t_test, CR_test)

    # Only the training-set predictive mean feeds the metrics; predictive
    # standard deviations and test/full-series predictions are not needed.
    CRpred_tr = model.predict(t_tr)

    learned_kernel = model.kernel_

    # NOTE(review): the factor 1000 presumably undoes an upstream /1000
    # scaling of the targets — confirm against the caller.
    mse = np.mean(((CRpred_tr - CR_tr) * 1000) ** 2)
    lml = model.log_marginal_likelihood_value_
    std = np.sqrt(mse)

    return kernel_str, learned_kernel, mse, lml, std, R2_tr, R2, R2_test

In [14]:
def evaluate_kernel(kernel_str):
    """Score one kernel against the module-level train/test split.

    Any exception raised during evaluation is reported on stdout and turned
    into a ``None`` result, so a single failing kernel does not abort the
    caller's loop.

    Args:
        kernel_str: Kernel forwarded to ``Metrics_to_performance``.

    Returns:
        The tuple returned by ``Metrics_to_performance``, or ``None`` if the
        evaluation raised.
    """
    try:
        return Metrics_to_performance(CR_tr, t_tr, CR_test, t_test, kernel_str)
    except Exception as err:
        print(f"Error evaluating kernel {kernel_str}: {err}")
    return None

### Metrics to evaluate performance

In [None]:
# Turn the kernel source strings into kernel objects.
# eval() is only acceptable here because the strings are generated by this
# notebook itself; never feed it externally supplied text.
# Sorting fixes the evaluation order (and therefore the output row order):
# iteration order of a set of strings can differ between interpreter runs.
kernels_data = [eval(k) for k in sorted(all_combinations_views)]

In [None]:
batch_size = 100
# Ceiling division, so a kernel count that is an exact multiple of
# batch_size does not schedule a trailing empty batch.
num_batches = (len(kernels_data) + batch_size - 1) // batch_size
kernels_metrics = []
num_cores = -1

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(kernels_data))
    kernels_batch = kernels_data[start_idx:end_idx]

    # Evaluate the batch in parallel; failed kernels come back as None.
    kernels_metrics_batch = Parallel(n_jobs=num_cores)(
        delayed(evaluate_kernel)(kernel_str)
        for kernel_str in tqdm(kernels_batch, desc=f"Evaluating batch {i+1}/{num_batches}")
    )

    kernels_metrics.extend(kernels_metrics_batch)

## Generate file with the metrics of all kernel combinations

In [None]:
# Destination CSV for the per-kernel metrics (placeholder path).
file_name = "data/samples/your_samples_file.csv"

In [None]:
from pathlib import Path

# Create the output directory up front so the evaluation results are not
# lost to a FileNotFoundError at write time.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

with open(file_name, 'w', newline='', encoding='utf-8') as file_csv:
    writer = csv.writer(file_csv)
    writer.writerow(['kernel_str', 'learned_kernel', 'mse', 'lml', 'std', 'R2_tr', 'R2', 'R2_test'])
    for row in kernels_metrics:
        # Kernels whose evaluation failed are stored as None; skip them.
        if row is None:
            continue
        # First two fields are written as-is; the six metrics get 4 decimals.
        formatted_scores = [f'{score:.4f}' for score in row[2:8]]
        writer.writerow([row[0], row[1], *formatted_scores])

print(f"Data saved in: {file_name}")