# Script that creates a table for the paper

In [5]:
# classic imports
import os
import pickle
import pandas as pd

# Workaround for loading the pickled results on Windows: pathlib cannot
# instantiate PosixPath on a Windows system, so unpickling fails if the pickles
# contain PosixPath objects (presumably they were written on a POSIX machine --
# TODO confirm). Aliasing PosixPath to WindowsPath lets the load go through.
# The alias is only applied on Windows; on any other system it would make
# PosixPath itself unusable.
import pathlib
temp = pathlib.PosixPath # original class, kept so that the alias can be undone
if os.name == "nt" :
    pathlib.PosixPath = pathlib.WindowsPath

In [6]:
# root folder with the results
results_folder = "../results/2024-06-07-full-results"
pk_file_name = "symbolic_regression_cp.pk"

# data structures: maps the name of a folder containing a pickle file
# (used as data set name) to the path of that pickle file
datasets_to_files = {}

# recursively walk through all the folders, looking for files
for folder, subfolders, files in os.walk(results_folder) :
    # os.walk visits sub-folders in whatever order the file system lists them;
    # sorting the list in place (case-insensitive) makes the traversal order,
    # and therefore the insertion order of the dictionary used to lay out the
    # table, identical on every platform
    subfolders.sort(key=lambda name : (name.lower(), name))
    if pk_file_name in files :
        datasets_to_files[os.path.basename(folder)] = os.path.join(folder, pk_file_name)

print("Number of files: %d" % len(datasets_to_files))
print(datasets_to_files)

Number of files: 22
{'abalone': '../results/2024-06-07-full-results\\abalone\\symbolic_regression_cp.pk', 'airfoil_self_noise': '../results/2024-06-07-full-results\\airfoil_self_noise\\symbolic_regression_cp.pk', 'brazilian_houses': '../results/2024-06-07-full-results\\brazilian_houses\\symbolic_regression_cp.pk', 'california_housing': '../results/2024-06-07-full-results\\california_housing\\symbolic_regression_cp.pk', 'cars': '../results/2024-06-07-full-results\\cars\\symbolic_regression_cp.pk', 'concrete_compressive_strength': '../results/2024-06-07-full-results\\concrete_compressive_strength\\symbolic_regression_cp.pk', 'fifa': '../results/2024-06-07-full-results\\fifa\\symbolic_regression_cp.pk', 'grid_stability': '../results/2024-06-07-full-results\\grid_stability\\symbolic_regression_cp.pk', 'health_insurance': '../results/2024-06-07-full-results\\health_insurance\\symbolic_regression_cp.pk', 'kin8nm': '../results/2024-06-07-full-results\\kin8nm\\symbolic_regression_cp.pk', 'king

In [7]:
# The equations are stored as sympy objects, which can be converted to latex
# expressions. Rather than building a dictionary of sympy Symbols, the variable
# names are replaced directly inside the latex string, using the two
# dictionaries below.

# first five variables: prediction and difficulty estimates
dict_replacement = {
    r"x_{0}" : r"\hat{y}",      # x_0 is actually the predicted (normalized) value
    r"x_{1}" : r"\sigma_d",     # x_1 is the sigmas, based on distance
    r"x_{2}" : r"\sigma_{std}", # x_2 is the sigmas, based on standard deviation
    r"x_{3}" : r"\sigma_{oob}", # x_3 is the sigmas, based on oob residuals
    r"x_{4}" : r"\sigma_{var}", # x_4 is the sigmas, based on variance of predictors in ensemble
}

# every other variable is a feature, so x_{n} becomes x_{n-5};
# 116 is the maximum number of features appearing among the considered data sets.
# Keys are inserted in ascending order: a replaced name x_{n-5} is therefore never
# picked up again by a later key when the replacements are applied in sequence.
n_max_features = 116
dict_replacement_features = {
    "x_{%d}" % (feature_index + 5) : "x_{%d}" % feature_index
    for feature_index in range(n_max_features)
}

In [8]:
# Build a latex table with two (data set name, best equation) column pairs per row.
# The data sets are split in two halves: the first half fills the columns to the
# left, the second half the columns to the right. The number of rows is computed
# from the number of data sets (11 rows for 22 data sets); with an odd number of
# data sets, the last row has empty cells to the right.

def _dataset_to_latex_cells(dataset_name) :
    """Return the two latex cells "name & equation" for one data set.

    The pickle file listed for `dataset_name` in `datasets_to_files` is loaded,
    the object's `latex()` output is taken as the equation, and the variable
    names are replaced using `dict_replacement` and `dict_replacement_features`.
    """
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted result files
    with open(datasets_to_files[dataset_name], "rb") as pk_file :
        sr_model = pickle.load(pk_file)
    equation = sr_model.latex()
    # replacement of variables in two steps, to avoid issues with x_%d
    for key, value in dict_replacement.items() :
        equation = equation.replace(key, value)
    for key, value in dict_replacement_features.items() :
        equation = equation.replace(key, value)
    # for the data set name, we need to escape all "_" with "\_"
    return r"\texttt{" + dataset_name.replace("_", r"\_") + r"} & $" + equation + r"$"

latex_table = r"\begin{tabular}{l|l|l|l}" + "\n" 
latex_table += r"\textbf{Data set name} & \textbf{Best equation} & \textbf{Data set name} & \textbf{Best equation}\\ \hline \hline" + "\n"

# list of keys in the dictionary
datasets = [key for key in datasets_to_files]

# number of rows: half of the data sets, rounded up
n_rows = (len(datasets) + 1) // 2

for i in range(0, n_rows) :
    # columns to the left
    latex_table += _dataset_to_latex_cells(datasets[i]) + r" & "

    # columns to the right (left empty if there is no data set to pair with)
    if i + n_rows < len(datasets) :
        latex_table += _dataset_to_latex_cells(datasets[i + n_rows])
    else :
        latex_table += r" & "
    latex_table += r" \\ \hline" + "\n"

latex_table += r"\end{tabular}" + "\n"

print(latex_table)

\begin{tabular}{l|l|l|l}
\textbf{Data set name} & \textbf{Best equation} & \textbf{Data set name} & \textbf{Best equation}\\ \hline \hline
\texttt{abalone} & $e^{\sigma_{oob}}$ & \texttt{miami\_housing} & $\sigma_d + 5.22 \sigma_{var}$ \\ \hline
\texttt{airfoil\_self\_noise} & $\sigma_{var} + 0.364$ & \texttt{Moneyball} & $\sin{\left(\sin{\left(\sin{\left(\sin{\left(\cos{\left(\cos{\left(x_{0} \left(x_{7} - 0.861\right) \right)} \right)} \right)} \right)} \right)} \right)}$ \\ \hline
\texttt{brazilian\_houses} & $\log{\left(\hat{y} + 1.23 \right)}$ & \texttt{physiochemical\_protein} & $\sigma_{var} + 0.923$ \\ \hline
\texttt{california\_housing} & $3.00 \sigma_{var} + 0.451$ & \texttt{pumadyn32nh} & $- 0.0338 x_{12} + 0.0338 x_{1} + 1.16$ \\ \hline
\texttt{cars} & $\log{\left(\sigma_{oob} + 1.11 \right)}$ & \texttt{QSAR\_fish\_toxicity} & $e^{\sin{\left(\frac{- x_{5} - 0.901}{\cos{\left(0.592 \sigma_d + 0.592 x_{4} \right)}} \right)} \cos{\left(\hat{y} \right)}} + 0.386$ \\ \hline
\tex

In [15]:
# just a test: load the pickle file of the first data set and print its raw
# latex expression, before any replacement of the variable names
sr_path = datasets_to_files[datasets[0]]
# the context manager closes the file handle once the object has been loaded
with open(sr_path, "rb") as pk_file :
    sr = pickle.load(pk_file)

print(sr.latex())

e^{x_{3}}
