In [None]:
import os
import json 

import numpy as np
import torch

import matplotlib.pyplot as plt

from sympy import lambdify
import sympy as sp

from pysr import PySRRegressor

import nesymres
from nesymres.architectures.model import Model
from nesymres.utils import load_metadata_hdf5
from nesymres.dclasses import FitParams, NNEquation, BFGSParams
import omegaconf

from pathlib import Path
from functools import partial

In [None]:
# Benchmark configuration shared by the experiment cells below.

# Name of the input sampling distribution.
distribution_type = "uniform"
# Two numbers describing that distribution:
#   - uniform: [min, max] of the sampled values
#   - normal:  [mean, standard deviation]
distribution_range = [0, 10.0]

# Number of sample points drawn per trial.
number_points = 500
# Number of trials per equation; the trial index is also used as the random seed.
number_trials = 100


In [None]:
# Nguyen benchmark target expressions in the single variable x0, written as
# strings that SymPy can parse.
nguyen_1 = "x0**3 + x0**2 + x0"
nguyen_2 = "x0**4 + x0**3 + x0**2 + x0"
nguyen_3 = "x0**5 + x0**4 + x0**3 + x0**2 + x0"
nguyen_4 = "x0**6 + x0**5 + x0**4 + x0**3 + x0**2 + x0"
nguyen_5 = "sin(x0**2) * cos(x0) - 1"
nguyen_6 = "sin(x0) + sin(x0+x0**2)"
nguyen_7 = "log(x0+1) + log(x0**2 + 1) "
nguyen_8 = "x0**(1/2)"

# Ordered collection iterated by the benchmark cells (index 0 is Nguyen-1).
nguyen_benchmarks = [
    nguyen_1,
    nguyen_2,
    nguyen_3,
    nguyen_4,
    nguyen_5,
    nguyen_6,
    nguyen_7,
    nguyen_8,
]


In [None]:
# visualize equations

# All benchmarks are drawn on the same grid: 100 evenly spaced points that
# start at the lower sampling bound (np.arange excludes the upper bound).
# The grid does not depend on the equation, so it is built once.
step_size = (distribution_range[1] - distribution_range[0]) / 100
x = np.arange(distribution_range[0], distribution_range[1], step_size)

plt.figure()
# The benchmarks are named Nguyen-1 ... Nguyen-8, so number the legend from 1.
for number, eqn in enumerate(nguyen_benchmarks, start=1):
    # Turn the expression string into a NumPy-callable function of x0.
    my_fn = sp.lambdify("x0", expr=eqn)
    plt.plot(x, my_fn(x), label=f"Nguyen-{number}")

plt.legend()
plt.title("Nguyen benchmark equations")
plt.xlabel("x0")
plt.ylabel("f(x0)")
plt.show()

In [None]:
# PySR benchmark: for every Nguyen equation, run `number_trials` independent
# fits (seeded by the trial index) and record the fraction of trials whose best
# equation is symbolically identical to the target.
accuracies = []

# The regressor below declares a custom Julia operator "inv(x) = 1/x". SymPy
# does not know that name, so give the parser a definition; otherwise any
# predicted equation containing inv(...) is parsed as an undefined function and
# can never compare equal to the target.
pysr_sympy_locals = {"inv": lambda arg: 1 / arg}

for hh, eqn in enumerate(nguyen_benchmarks):

    # These only depend on the equation, not on the trial.
    my_fn = sp.lambdify("x0", expr=eqn)
    tgt_eqn = sp.simplify(eqn)

    equivalents = []

    for trial in range(number_trials):

        np.random.seed(trial)
        torch.manual_seed(trial)

        model = PySRRegressor(
            niterations=10,
            binary_operators=["+", "*"],
            unary_operators=[
                "cos",
                "exp",
                "sin",
                "inv(x) = 1/x"  # Custom operator (julia syntax)
            ],
            model_selection="best",
            deterministic = True,
            procs = 0,
            multithreading = False,
            random_state = trial,
            verbosity=0,
            loss="loss(x, y) = (x - y)^2",  # Custom loss function (julia syntax)
        )

        # Uniform samples in [min, max): scale the unit interval by the width
        # and shift it UP by the lower bound (the shift was previously
        # subtracted, which is only correct when the lower bound is 0).
        x = np.random.rand(number_points, 1) \
                * (distribution_range[1] - distribution_range[0]) \
                + distribution_range[0]
        y = my_fn(x)
        model.fit(x, y)

        best_eqn = sp.simplify(
            sp.sympify(model.get_best()["equation"], locals=pysr_sympy_locals)
        )
        # Exact symbolic check: the difference must simplify to zero.
        is_correct = sp.simplify(best_eqn - tgt_eqn) == 0
        equivalents.append(is_correct)

        verdict = "correct" if is_correct else "incorrect"
        try:
            msg = f"eqn {hh}, trial {trial} predicted {verdict} equation {best_eqn} for target {tgt_eqn}"
        except Exception as err:
            # Stay best-effort (a formatting problem must not abort the
            # benchmark), but report what went wrong instead of printing an
            # empty line.
            msg = f"eqn {hh}, trial {trial}: could not format result ({err!r})"
        print(msg)

    accuracy = np.mean(equivalents)
    msg = f"accuracy for equation {eqn}: {accuracy}"
    print(msg, equivalents)
    accuracies.append(accuracy)

In [None]:
my_method = "PySR"

# Assemble the report piecewise: a header line, then for each benchmark its
# accuracy followed by the expanded form of the target expression.
report_parts = [f"{my_method} accuracies\n"]

for ii, eqn in enumerate(nguyen_benchmarks):
    report_parts.append(f"\n  Nguyen-{ii+1}  accuracy: {accuracies[ii]:4f}\n")
    report_parts.append(f"  {sp.expand(eqn)} \n")

msg = "".join(report_parts)
print(msg)

In [None]:
# Directory that needs to contain
#   weights/100M.ckpt, jupyter/100M/eq_setting.json and jupyter/100M/config.yaml
nsrts_dir = "../../nsrts"
pretrained_dir = os.path.join(nsrts_dir, "jupyter", "100M")

# Equation settings (vocabulary, operators, variables, ...) stored as JSON.
json_filepath = os.path.join(pretrained_dir, "eq_setting.json")
with open(json_filepath, 'r') as json_file:
    eq_setting = json.load(json_file)

# Configuration (architecture and inference sections are read below).
cfg_filepath = os.path.join(pretrained_dir, "config.yaml")
cfg = omegaconf.OmegaConf.load(cfg_filepath)

weights_path = os.path.join(nsrts_dir, "weights", "100M.ckpt")

## Set up BFGS, loaded from the hydra config yaml
bfgs_cfg = cfg.inference.bfgs
bfgs = BFGSParams(
    activated=bfgs_cfg.activated,
    n_restarts=bfgs_cfg.n_restarts,
    add_coefficients_if_not_existing=bfgs_cfg.add_coefficients_if_not_existing,
    normalization_o=bfgs_cfg.normalization_o,
    idx_remove=bfgs_cfg.idx_remove,
    normalization_type=bfgs_cfg.normalization_type,
    stop_time=bfgs_cfg.stop_time,
)

# adjust this parameter up for greater accuracy and longer runtime
cfg.inference.beam_size = 1

# JSON object keys are always strings, so the id -> word table is rebuilt with
# integer keys.
id2word = {int(token_id): word for token_id, word in eq_setting["id2word"].items()}

params_fit = FitParams(
    word2id=eq_setting["word2id"],
    id2word=id2word,
    una_ops=eq_setting["una_ops"],
    bin_ops=eq_setting["bin_ops"],
    total_variables=list(eq_setting["total_variables"]),
    total_coefficients=list(eq_setting["total_coefficients"]),
    rewrite_functions=list(eq_setting["rewrite_functions"]),
    bfgs=bfgs,
    # This parameter is a tradeoff between accuracy and fitting time
    beam_size=cfg.inference.beam_size,
)

## NSRTS benchmark: for every Nguyen equation, run `number_trials` independent
## fits (seeded by the trial index) and record the fraction of trials whose
## first BFGS prediction is symbolically identical to the target.
accuracies = []

## Load architecture, set into eval mode, and pass the config parameters
# The checkpoint does not depend on the equation or the trial, so it is loaded
# once here instead of once per trial.
model = Model.load_from_checkpoint(weights_path, cfg=cfg.architecture)
model.eval()
if torch.cuda.is_available():
    # Keep the original preference for the second GPU, but fall back to the
    # first one on single-GPU machines where "cuda:1" does not exist.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    model.to(torch.device(f"cuda:{gpu_index}"))

fitfunc = partial(model.fitfunc, cfg_params=params_fit)

for hh, eqn in enumerate(nguyen_benchmarks):

    # These only depend on the equation, not on the trial.
    my_fn = sp.lambdify("x0", expr=eqn)
    tgt_eqn = sp.simplify(eqn)

    equivalents = []

    for trial in range(number_trials):

        np.random.seed(trial)
        torch.manual_seed(trial)

        # work-around for occasional catastrophic failure: when the fit
        # returns no BFGS prediction, draw a fresh sample and try again.
        output = {"best_bfgs_preds": []}

        while len(output["best_bfgs_preds"]) == 0:

            # Uniform samples in [min, max): scale the unit interval by the
            # width and shift it UP by the lower bound (the shift was
            # previously subtracted, which is only correct when the lower
            # bound is 0).
            x = np.random.rand(number_points, 1) \
                * (distribution_range[1] - distribution_range[0]) \
                + distribution_range[0]
            y = my_fn(x)

            x = torch.tensor(x)
            y = torch.tensor(y)

            output = fitfunc(x, y.squeeze())

        # The prediction names its variable "x_1"; the targets use "x0".
        best_eqn = sp.simplify(output["best_bfgs_preds"][0].replace("x_1", "x0"))

        # Exact symbolic check: the difference must simplify to zero.
        is_correct = sp.simplify(best_eqn - tgt_eqn) == 0
        equivalents.append(is_correct)

        verdict = "correct" if is_correct else "incorrect"
        msg = f"eqn {hh}, trial {trial} predicted {verdict} equation {best_eqn} for target {tgt_eqn}"
        print(msg)

    accuracy = np.mean(equivalents)
    msg = f"accuracy for equation {eqn}: {accuracy}"
    print(msg, equivalents)
    accuracies.append(accuracy)

In [None]:
my_method = "NSRTS"

# Assemble the report piecewise: a header line, then for each benchmark its
# accuracy followed by the expanded form of the target expression.
report_parts = [f"{my_method} accuracies\n"]

for ii, eqn in enumerate(nguyen_benchmarks):
    report_parts.append(f"\n  Nguyen-{ii+1}  accuracy: {accuracies[ii]:4f}\n")
    report_parts.append(f"  {sp.expand(eqn)} \n")

msg = "".join(report_parts)
print(msg)

In [None]:

# Recorded results: 500 sample points per equation and 100 trials (one random
# seed per trial), default settings, inputs drawn from a uniform distribution
# over 0 to 10.0.

"""
PySR accuracies

  Nguyen-1  accuracy: 0.890000
  x0**3 + x0**2 + x0 

  Nguyen-2  accuracy: 0.150000
  x0**4 + x0**3 + x0**2 + x0 

  Nguyen-3  accuracy: 0.160000
  x0**5 + x0**4 + x0**3 + x0**2 + x0 

  Nguyen-4  accuracy: 0.000000
  x0**6 + x0**5 + x0**4 + x0**3 + x0**2 + x0 

  Nguyen-5  accuracy: 0.110000
  sin(x0**2)*cos(x0) - 1 

  Nguyen-6  accuracy: 0.880000
  sin(x0) + sin(x0**2 + x0) 

  Nguyen-7  accuracy: 0.000000
  log(x0 + 1) + log(x0**2 + 1) 

  Nguyen-8  accuracy: 0.000000
  sqrt(x0) 

"""