In [None]:
import os
import json 
import time

import numpy as np
import torch

import matplotlib.pyplot as plt

from sympy import lambdify
import sympy as sp

from pysr import PySRRegressor

import nesymres
from nesymres.architectures.model import Model
from nesymres.utils import load_metadata_hdf5
from nesymres.dclasses import FitParams, NNEquation, BFGSParams
import omegaconf

from symbolicgpt.models import GPT, GPTConfig, PointNetConfig
from symbolicgpt.utils import processDataFiles, CharDataset,\
        sample_from_model

from pathlib import Path
from functools import partial

#
import glob

In [None]:
# Benchmark run configuration.

# Input sampling distribution. With "uniform", distribution_range holds the
# [min, max] bounds; with a normal distribution it would hold
# [mean, standard deviation] instead.
distribution_type = "uniform"
distribution_range = [-0.99, 1.0]

number_points = 20
number_trials = 100  # the trial index is also used as the random seed
logging = True

In [None]:
# Target expressions of the "complex" benchmark, written as strings in the
# single variable x0. The list order defines the equation numbering used when
# results are reported.
benchmark_eqns = [
    "1.0*x0/(1.0*x0*1.0*exp(1.0*x0)*1.0*exp(1.0*sin(1.0*x0+1.0))+1.0*exp(1.0*x0)*1.0*exp(1.0*sin(1.0*x0+1.0)))+1.0",
    "1.0*sqrt(1.0*abs(1.0*sqrt(1.0*abs(1.0*x0+1.0))))*1.0*sin(1.0*x0/(1.0*x0+1.0)+1.0/(1.0*x0+1.0))+1.0",
    "1.0*sqrt(1.0*abs(1.0*x0/(1.0*x0+1.0)+1.0/(1.0*x0+1.0)))+1.0*sqrt(1.0*abs(1.0*exp(1.0*x0)))+1.0",
    "1.0*sqrt(1.0*abs(1.0*x0))+1.0/(x0*1.0*sqrt(1.0*abs(1.0*x0))*1.0*sqrt(1.0*abs(1.0*x0)))+1.0",
    "1.0*x0/(1.0*x0*1.0*sqrt(1.0*abs(1.0*log(1.0*x0)))+1.0*sqrt(1.0*abs(1.0*log(1.0*x0))))+1.0",
    "1.0*x0**2*1.0*sqrt(1.0*abs(1.0*x0+1.0))+1.0*x0+1.0*x0/(1.0*x0)+1.0*log(1.0*x0+1.0)+1.0",
    "1.0*sqrt(1.0*abs(1.0*x0*1.0*sqrt(1.0*abs(1.0*x0))/(1.0*x0+1.0)+1.0/(1.0*x0+1.0)))+1.0",
    "1.0*x0**2+1.0*x0+1.0*exp(1.0*x0)/1.0*sqrt(1.0*abs(1.0*sqrt(1.0*abs(1.0*x0+1.0))))+1.0",
]

# Individual names for each expression, in list order.
(complex_1, complex_2, complex_3, complex_4,
 complex_5, complex_6, complex_7, complex_8) = benchmark_eqns

# Per-expression sampling settings: (lower bound, upper bound, sample count).
# Expressions 4, 5 and 6 use narrower intervals than the default (-3, 3).
sample_meta = dict(zip(
    benchmark_eqns,
    [
        (-3, 3., 20),
        (-3, 3., 20),
        (-3, 3., 20),
        (0.1, 4., 20),
        (1.01, 4, 20),
        (-.9, 3., 20),
        (-3, 3., 20),
        (-3, 3., 20),
    ],
))

In [None]:
# Visualize every benchmark target expression over its own sampling interval.
# For each expression the simplified and expanded sympy forms and the raw
# string are printed alongside the curve.

plt.figure()
for number, fn in enumerate(benchmark_eqns):

    my_fn = sp.lambdify("x0", expr=fn)

    # Only the interval bounds matter for plotting; the sample count is unused.
    (bottom, top, _) = sample_meta[fn]

    # linspace (rather than arange with a float step) guarantees exactly
    # 10000 points; endpoint=False keeps the half-open interval [bottom, top).
    x = np.linspace(bottom, top, 10000, endpoint=False)
    # Evaluate once and reuse for both the plot and the shape printout.
    y = my_fn(x)

    # The expressions plotted here are the complex_N set, so label them so.
    plt.plot(x, y, label = f"complex-{1+number}")
    print(x.shape, y.shape)
    print(f" {1+number} {sp.simplify(fn)} \n")
    print(f"  {sp.expand(fn)} \n")
    print(fn)

plt.legend()
plt.title("complex benchmark equations")
plt.show()

In [None]:


# Point-embedding settings, taken from symbolicGPT.py in Valipour.
embeddingSize = 512
numVars, numYs = 1, 1
numPoints = [20, 21]
method = "EMB_SUM"
variableEmbedding = "NOT_VAR"

# Build the point-network configuration. numPoints[1] - 1 is what gets passed
# as the number of points.
_pointnet_kwargs = dict(
    embeddingSize=embeddingSize,
    numberofPoints=numPoints[1] - 1,
    numberofVars=numVars,
    numberofYs=numYs,
    method=method,
    variableEmbedding=variableEmbedding,
)
pconf = PointNetConfig(**_pointnet_kwargs)


In [None]:
# Rebuild the character vocabulary from the training files, construct the GPT
# model with the matching configuration and load the pretrained weights.
blockSize = 64
maxNumFiles = 100
const_range = [-2.1, 2.1]
decimals = 8
trainRange = [-3.0,3.0]

target = "Skeleton"
addVars = variableEmbedding == 'STR_VAR'
path = os.path.join("./symbolicgpt", "datasets", "exp_test_temp", "Train", "*.json")
my_device = torch.device("cpu")


# glob returns paths in an unspecified order; sort before truncating so the
# same files are selected on every run and on every machine.
files = sorted(glob.glob(path))[:maxNumFiles]
if not files:
    # Without training files the vocabulary below would contain only the five
    # special characters and decoding model output would be meaningless.
    raise FileNotFoundError(
        f"no training files matched {path!r}; they are required to rebuild the character vocabulary")
text = processDataFiles(files)
chars = sorted(list(set(text))+['_','T','<','>',':']) # extract unique characters from the text before converting the text to a list, # T is for the test data
text = text.split('\n') # convert the raw text to a set of examples
trainText = text[:-1] if len(text[-1]) == 0 else text
# NOTE(review): hard-coded rather than derived from `chars`; presumably the
# vocabulary size the checkpoint was trained with - confirm.
vocab_size = 49

# NOTE(review): the dataset is built from `text`, not `trainText`; in this cell
# it is only used for its paddingID.
train_dataset = CharDataset(text, blockSize, chars, numVars=numVars,
                numYs=numYs, numPoints=numPoints, target=target, addVars=addVars,
                const_range=const_range, xRange=trainRange, decimals=decimals, augment=False)


mconf = GPTConfig(vocab_size, blockSize,
                  n_layer=8, n_head=8, n_embd=embeddingSize,
                  padding_idx=train_dataset.paddingID)

model = GPT(mconf, pconf)

# Load the pretrained weights.
# NOTE(review): the file name mentions "30-31Points" while numPoints is
# [20, 21] in this notebook - confirm this is the intended checkpoint.
model_name = "XYE_1Var_30-31Points_512EmbeddingSize_SymbolicGPT_GPT_PT_EMB_SUM_Skeleton_Padding_NOT_VAR_MINIMIZE.pt"
model_path = os.path.join("symbolicgpt", "Models", model_name)
# map_location remaps tensors saved from a GPU onto my_device, so the
# checkpoint also loads on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=my_device))
model = model.eval().to(my_device)

In [None]:
# Lookup from token index to character, in the order of `chars`.
char_dict = dict(enumerate(chars))

In [None]:
# Run the pretrained model on every benchmark expression.
#
# For each expression, number_trials independent point sets are sampled (the
# trial index is the seed), the model predicts a skeleton, every constant
# placeholder "C" is replaced by 1., and the result is compared with the
# target both symbolically (accuracy) and numerically (mse). Because the
# comparison is also made against the prediction minus 1.0, a prediction that
# is off by exactly that constant still counts as a match.
variables = torch.tensor([1])
temperature = 1.0
top_k = 0.0
top_p = 0.7
do_sample = False
inputs = torch.tensor([[23]]) # assume 23 is start token '<' 
model.to(torch.device("cpu"));

accuracies = []
all_mses = []
all_mse_sds = []

catastrophic_failure_count = 0
for hh, eqn in enumerate(benchmark_eqns):
    equivalents = []
    mses = []

    # Per-equation quantities that do not change between trials.
    my_fn = sp.lambdify("x0", expr=eqn)
    (bottom, top, number_samples) = sample_meta[eqn]
    # Simplified target, computed lazily inside the guarded section so that a
    # failure there is still counted per trial instead of aborting the run.
    tgt_eqn = None

    for trial in range(number_trials):

        np.random.seed(trial)
        torch.manual_seed(trial)

        # Uniform samples in [bottom, top), shape (number_samples, 1).
        x = np.random.rand(number_samples, 1) \
                * (top-bottom) \
                + bottom

        y = my_fn(x)

        # Reshape to (1, 1, number_samples) and stack x over y along dim 1.
        x = torch.tensor(x.transpose(1,0)[None,:,:])
        y = torch.tensor(y.transpose(1,0)[None,:,:])

        points = torch.cat([x,y], dim=1).float()


        pred_outputs = sample_from_model(model, inputs,
            blockSize, points=points,\
            variables=variables, temperature=temperature,\
            sample=do_sample, top_k=top_k, top_p=top_p)

        string_output = [char_dict[elem.item()] for elem in pred_outputs[0]]
        # Decode the tokens, map the skeleton's "s"/"q" characters to "x"/"s",
        # keep the text before the first '>' and drop the leading character.
        pred_skeleton = "".join(string_output).replace("s","x").replace("q","s").split(">")[0][1:]

        # Replace every constant placeholder with 1. and rename x1 to x0.
        pred_expression = pred_skeleton.replace("C","1.").replace("x1","x0")
        print(pred_expression)

        try:
            if tgt_eqn is None:
                tgt_eqn = sp.simplify(eqn)
            best_eqn = sp.simplify(pred_expression)
            print(best_eqn)

            best_fn = sp.lambdify("x0", expr=pred_expression)

            is_correct = 1.0 * (sp.simplify(best_eqn - tgt_eqn) == 0)
            is_correct += 1.0 * (sp.simplify(sp.simplify(best_eqn-1.0) - tgt_eqn) == 0)

            # Evaluate target and prediction once each on the sampled inputs.
            x_np = x.numpy()
            tgt_y = my_fn(x_np)
            best_y = best_fn(x_np)

            my_mse_1 = np.mean((tgt_y - best_y)**2)
            my_mse_0 = np.mean((tgt_y - (best_y-1.0))**2)

            mses.append(min([my_mse_1, my_mse_0]))

        except Exception as err:
            # Parsing or evaluating the predicted expression failed. Count the
            # trial as incorrect and report the failure itself, rather than
            # re-printing values left over from an earlier trial.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            catastrophic_failure_count += 1
            is_correct = 0

            error_msg = f"evaluation failed with predicted expression {pred_skeleton}."
            msg = f"eqn {hh+1}, trial {trial}: {error_msg} ({type(err).__name__}: {err})\n"

        else:
            wright = "correct" if is_correct else "incorrect"

            msg = f"eqn {hh+1}, trial {trial} predicted {wright} equation: \n    predicted: {best_eqn}\n"
            msg +=f"    target   : {tgt_eqn}"
            msg += f" with mse {mses[-1]:.3}\n"

        equivalents.append(is_correct)
        print(msg)

    # mses is empty when every trial failed; report nan without a warning.
    mean_mse = np.mean(mses) if mses else float("nan")
    sd_mse = np.std(mses) if mses else float("nan")

    msg = f"accuracy for equation {hh+1}: {np.mean(equivalents)}"\
            f" with mean mse: {mean_mse:.3}, running total failure count: {catastrophic_failure_count}\n"
    print(msg)
    accuracies.append(np.mean(equivalents))
    all_mses.append(mean_mse)
    all_mse_sds.append(sd_mse)

total_trials = len(benchmark_eqns)*number_trials
# Guard against an empty benchmark or zero trials.
failure_rate = catastrophic_failure_count / total_trials if total_trials else 0.0
failure_msg = f"\nTotal failure count: {catastrophic_failure_count}, of {total_trials}"
failure_msg += f" = {failure_rate}"
print(failure_msg)

In [None]:
# Assemble and print the final per-equation report: accuracy, mean mse and its
# standard deviation, followed by the expanded form of each target expression.
my_method = "Valipour"
msg = f"{my_method} accuracies\n"

per_equation = zip(benchmark_eqns, accuracies, all_mses, all_mse_sds)
for ii, (eqn, accuracy, mean_mse, sd_mse) in enumerate(per_equation):

    # The benchmark evaluated here is the complex_N set, so label rows so.
    msg += f"\n  complex-{ii+1},  accuracy: {accuracy:5f}, mse: {mean_mse:.5} +/- {sd_mse:.5}\n"

    msg += f"  {sp.expand(eqn)} \n"

print(failure_msg)
print(msg)

In [None]:
# Recorded output of an earlier run, kept for reference only; this string is
# never used by the code.
# NOTE(review): the equations listed below (x0**3 + x0**2 + x0, sqrt(x0), ...)
# are not the complex_1..complex_8 expressions defined in this notebook, so
# these numbers were presumably pasted from a run on a different benchmark
# set - confirm, and regenerate from this notebook if so.
"""
Total failure count: 231, of 800 = 0.28875
Valipour accuracies

  Nguyen-1,  accuracy: 0.000000, mse: 0.1345 +/- 0.090686
  x0**3 + x0**2 + x0 

  Nguyen-2,  accuracy: 0.050000, mse: 93.834 +/- 908.36
  x0**4 + x0**3 + x0**2 + x0 

  Nguyen-3,  accuracy: 0.190000, mse: 0.07634 +/- 0.051586
  x0**5 + x0**4 + x0**3 + x0**2 + x0 

  Nguyen-4,  accuracy: 0.000000, mse: 0.89156 +/- 4.9355
  x0**6 + x0**5 + x0**4 + x0**3 + x0**2 + x0 

  Nguyen-5,  accuracy: 0.000000, mse: 61.043 +/- 495.85
  sin(x0**2)*cos(x0) - 1 

  Nguyen-6,  accuracy: 0.000000, mse: 0.34162 +/- 0.18094
  sin(x0) + sin(x0**2 + x0) 

  Nguyen-7,  accuracy: 0.000000, mse: nan +/- nan
  log(x0 + 1) + log(x0**2 + 1) 

  Nguyen-8,  accuracy: 0.000000, mse: 0.39772 +/- 0.018983
  sqrt(x0) 
"""