In [5]:
import sympy
from sympy import Symbol, simplify, factor
from sympy.parsing.sympy_parser import parse_expr
from yaml import load, Loader
from glob import glob
import pandas as pd
import json

# Algorithm names that this notebook treats as "symbolic" methods.
# NOTE(review): the only visible use is a commented-out filter that keeps
# result files whose path contains one of these names -- confirm intent.
symbolic_algs = [
    'AFPRegressor',
    'AIFeynman',
    'BSRRegressor',
    'DSRRegressor',
    'FFXRegressor',
    'FEATRegressor',
    'FE_AFPRegressor',
    'EPLEXRegressor',
    'GPGOMEA',
    'gplearn',
    'ITEARegressor',
    'MRGPRegressor',
    'OperonRegressor',
    'sembackpropgp'
]

def get_sym_model(dataset):
    """Return the ground-truth model of a dataset as a sympy expression.

    The model is read from the ``description`` field of the ``metadata.yaml``
    file that sits next to ``dataset``: the first description line containing
    ``=`` is taken, and everything after its last ``=`` is parsed.

    Parameters
    ----------
    dataset : str
        Path to a tab-separated data file (read with ``pd.read_csv``). Every
        column except ``target`` is exposed to the parser as a sympy Symbol.

    Returns
    -------
    sympy expression
        The parsed right-hand side of the model equation.

    Raises
    ------
    ValueError
        If no line of the description contains ``=``.
    """
    import os  # local import: only this function needs it

    # dirname/join keep relative paths relative; the old '/'.join(...) turned a
    # bare filename into '/metadata.yaml' at the filesystem root.
    metadata_path = os.path.join(os.path.dirname(dataset), 'metadata.yaml')
    # NOTE(review): yaml's `Loader` can construct arbitrary Python objects.
    # Acceptable for trusted local metadata; use a safe loader otherwise.
    with open(metadata_path, 'r') as fh:
        metadata = load(fh, Loader=Loader)

    # Only the header is needed, so do not read any data rows.
    columns = pd.read_csv(dataset, sep='\t', nrows=0).columns
    features = [c for c in columns if c != 'target']

    equations = [ln for ln in metadata['description'].split('\n') if '=' in ln]
    if not equations:
        raise ValueError(
            'no model equation (a line containing "=") found in ' + metadata_path
        )
    model_str = equations[0].split('=')[-1]
    return parse_expr(model_str, local_dict={k: Symbol(k) for k in features})

In [6]:
# dataset = '/home/bill/projects/pmlb/datasets/feynman_III_10_19/feynman_III_10_19.tsv.gz'
# dataset = '/home/bill/projects/pmlb/datasets/strogatz_bacres1/strogatz_bacres1.tsv.gz'
# dataset = '/home/bill/projects/pmlb/datasets/strogatz_barmag1/strogatz_barmag1.tsv.gz'
# for dataset in glob('/home/bill/projects/pmlb/datasets/feynman_*/*.tsv.gz'):
#     get_sym_model(dataset)

In [7]:
# for dataset in glob('/home/bill/projects/pmlb/datasets/strogatz_*/*.tsv.gz'):
#     print(dataset.split('/')[-1])
#     get_sym_model(dataset)

# compare symbolic model to generated ones

In [11]:
import re
def process_mrgp(best_data):
    """Convert an MRGP best-model record into a weighted model string.

    Parameters
    ----------
    best_data : str or sequence of str
        The record fields in the order
        ``0 mintarget, 1 maxtarget, 2 weights, 3 intercept, 4 model_str``.
        A raw comma-separated record string is accepted and split here, since
        that is what ``get_pred_model`` passes in.

    Returns
    -------
    str
        The prefix-notation model with each space-separated token prefixed by
        its weight (``weight*token``). Tokens and weights are paired
        positionally with ``zip``, so surplus entries on either side are
        dropped. The intercept field is not applied.

    Raises
    ------
    IndexError
        If the record has fewer than five fields.
    """
    if isinstance(best_data, str):
        # maxsplit keeps the model expression in one piece.
        best_data = best_data.split(',', 4)

    internal_weights = best_data[2]
    model_form = best_data[4]

    # rename functions to python operator names
    model_form = model_form.replace('mydivide', 'div')
    model_form = model_form.replace('*', 'mul')
    model_form = model_form.replace('-', 'sub')

    # move starting paren to other side of functions: "(op arg" -> "op( arg"
    model_form = re.sub(
        pattern=r'\((.+?(?= ))',
        repl=r'\1(',
        string=model_form
    )

    # TODO: replace square, cube, quart with pow(); a plain non-greedy regex
    # does not capture matching parentheses in nested calls.

    # The original built this string but never returned it, so callers got None.
    return ' '.join(
        weight + '*' + token
        for weight, token in zip(internal_weights.split(' '),
                                 model_form.split(' '))
    )
def _rename_placeholders(model_str, features):
    """Swap positional placeholders (x0, x_0, X_0, X0, x[:,0], x[0]) for feature names."""
    def _lookup(match):
        idx = int(next(g for g in match.groups() if g is not None))
        # Leave indices beyond the feature list untouched.
        return features[idx] if idx < len(features) else match.group(0)

    # A single pass with greedy digit matching means 'x1' can no longer eat the
    # front of 'x10', and an inserted feature name (e.g. 'x1') is never
    # rewritten again by a later index.
    return re.sub(r'x\[:,(\d+)\]|x\[(\d+)\]|[xX]_?(\d+)', _lookup, model_str)


def get_pred_model(model_str, dataset, mrgp=False):
    """Parse a predicted model string into sympy form.

    Parameters
    ----------
    model_str : str
        Model expression as emitted by an algorithm.
    dataset : str
        Path to the tab-separated data file; every column except ``target``
        is a feature name and is exposed to the parser as a sympy Symbol.
    mrgp : bool, default False
        If True, ``model_str`` is first passed through ``process_mrgp``.

    Returns
    -------
    (model_sym, simp)
        The parsed expression and the result of ``simplify(model_sym, ratio=1)``.
    """
    if mrgp:
        model_str = process_mrgp(model_str)

    # Only the header is needed, so do not read any data rows.
    columns = pd.read_csv(dataset, sep='\t', nrows=0).columns
    features = [c for c in columns if c != 'target']
    local_dict = {k: Symbol(k) for k in features}
    # ITEA: resolve sqrtAbs(...) as a callable during parsing. The earlier
    # non-greedy regex rewrite closed abs() at the first ')' and so misplaced
    # it whenever the argument contained parentheses.
    local_dict.setdefault('sqrtAbs', lambda arg: sympy.sqrt(sympy.Abs(arg)))

    new_model_str = model_str
    # Operator rewrites run before feature names are inserted, so they cannot
    # alter a feature name that happens to contain one of these substrings.
    # operators
    new_model_str = new_model_str.replace('^', '**')
    # GP-GOMEA
    new_model_str = new_model_str.replace('p/', '/')
    new_model_str = new_model_str.replace('plog', 'log')
    new_model_str = new_model_str.replace('aq', '/')
    new_model_str = new_model_str.replace('np.', '')

    new_model_str = _rename_placeholders(new_model_str, features)

    print('parsing',new_model_str)
    model_sym = parse_expr(new_model_str, local_dict=local_dict)
    simp = simplify(model_sym, ratio=1)
    return model_sym, simp

In [12]:
# Hand-picked result files (JSON), one per dataset/algorithm/seed combination.
# NOTE(review): `examples` is not referenced by any other visible cell;
# presumably kept for spot-checking individual models -- confirm.
examples = [
'../results_pmlb_r1/feynman_III_10_19/feynman_III_10_19_AFPRegressor_22118.json',
'../results_pmlb_r1/feynman_III_10_19/feynman_III_10_19_OperonRegressor_11284.json',
'../results_pmlb_r1/feynman_III_10_19/feynman_III_10_19_DSRRegressor_22118.json',
'../results_pmlb_r1/feynman_III_10_19/feynman_III_10_19_FFXRegressor_6265.json',
'../results_pmlb_r1/feynman_III_10_19/feynman_III_10_19_gplearn_29802.json',
'../results_pmlb_r1/strogatz_bacres1/strogatz_bacres1_GPGOMEARegressor_21575.json',
'../results_pmlb_r1/strogatz_bacres1/strogatz_bacres1_ITEARegressor_15795.json',
'../results_pmlb_r1/strogatz_glider1/strogatz_glider1_OperonRegressor_23654.json'
]
# Root directory of the result files; a later cell globs rdir + '/*/*.json'.
rdir = '../results_pmlb_r1'
# for ex in glob(rdir + '/strogatz_*/*.json'):
#     if not any([sa in ex for sa in symbolic_algs]): continue
def assess(ex, data_dir='/home/bill/projects/pmlb/datasets/'):
    """Print a side-by-side comparison of a true model and a predicted one.

    Loads the result JSON at ``ex``, looks up the ground-truth model of the
    dataset it names, converts the stored ``symbolic_model`` into sympy form
    and prints the true model, the raw / converted / simplified prediction and
    the difference ``true_model - simplified``.

    Parameters
    ----------
    ex : str
        Path to a result JSON file with at least the keys ``symbolic_model``,
        ``dataset`` and ``algorithm``. If the path contains ``'MRGP'`` the
        model is pre-processed as an MRGP record.
    data_dir : str, optional
        Directory holding ``<dataset>/<dataset>.tsv.gz``. Defaults to the
        location that was previously hard-coded.

    Returns
    -------
    None
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(ex, 'r') as fh:
        r = json.load(fh)

    if isinstance(r['symbolic_model'], list):
        # A list of terms becomes a linear combination with symbolic
        # coefficients C0, C1, ...
        sm = ['C'+str(i)+'*'+ri for i, ri in enumerate(r['symbolic_model'])]
        r['symbolic_model'] = '+'.join(sm)

    dataset = r['dataset']
    print(80*'=')
    print(r['algorithm'],'model of',dataset)
    datafile = data_dir.rstrip('/') + '/' + dataset + '/' + dataset + '.tsv.gz'
    true_model = get_sym_model(datafile)
    print('true_model:',true_model)
    print('raw pred_model:',r['symbolic_model'])
    pred_model, simp = get_pred_model(r['symbolic_model'], datafile, 'MRGP' in ex)
    print('converted pred_model:',pred_model)
    print('simplified pred_model:',simp)
    print('difference:',true_model-simp)
    print(80*'=')
    
    

In [16]:
# ex = '../results_sym_data/strogatz_vdp2/strogatz_vdp2_tuned.AFPRegressor_23654.json'
# Single result file to inspect with assess().
ex = '../results_sym_data/strogatz_vdp2/strogatz_vdp2_tuned.AIFeynman_23654.json'
# NOTE(review): the saved output of this cell is
# "JSONDecodeError: Expecting value: line 1 column 1 (char 0)";
# presumably this result file is empty or not valid JSON -- confirm.
assess(ex)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Scratch walk-through of the MRGP conversion on one sample record; the steps
# below are the same field extraction and renaming done in process_mrgp().
model_str = '-0.5193796813204485,4.130491970555556,-0.7334227722845027 -0.4420849780293733 0.11011374002722484 0.0 -0.05253410777465981 0.0 -2.4521794519812034E-5 -1.477740092046933E-5 4.6562008138897446E-5 0.0 -0.021331899129844074 0.0 0.0 0.008196015389612293 0.0 0.0 -0.002695666304331192 0.0 0.0 -3.7379378890512117E-4 -0.02052241989325296 0.0 0.0 0.0 -8.272653031457096E-5 0.0 0.0 -0.011342590566753909 -0.00154153743023166 0.0 -0.005081943057842678 -0.0013633105435522624 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.8296524089695643E-10,0.03617020314319655,(mydivide (* (cube (exp (mylog (* (sqrt X1) (* (exp (mylog X2)) (cube (exp (mylog (* (exp X2) (cube X1)))))))))) (* (cube (square (sqrt X1))) (mydivide (square X1) (quart (square X1))))) (- (quart (* X2 X1)) (quart (sqrt (cube X1)))))'
# Comma-separated fields; the space-separated weights stay together in field 2.
best_data = model_str.split(',')
# file structure:
# mintarget, maxtarget, weights, intercept, model_str
internal_weights=best_data[2]
intercept = best_data[3]
model_form = best_data[4]
# weighted_model =
# model_ = intercept '+' scale '*( ' + weighted_model + ' )'

# Rename the MRGP operators to word names; only the model expression (field 4)
# is touched, so minus signs in the weights are unaffected.
model_form = model_form.replace('mydivide','div').replace('*','mul').replace('-','sub')

In [None]:
# Move each opening parenthesis to the other side of the token that follows
# it, i.e. "(op arg" becomes "op( arg".
model_form = re.sub(r'\((.+?(?= ))', r'\1(', model_form)

In [None]:
def _wrap_power(expr, op, power):
    """Rewrite every ``op(<arg>)`` in ``expr`` as ``pow(<arg>,power)``.

    The closing parenthesis is found by counting nesting depth, so arguments
    that themselves contain calls are captured whole. A non-greedy regex such
    as ``op\\((.*?)\\)`` stops at the first ')' and breaks nested expressions.
    If an ``op(`` has no matching ')', the rest of the string is left as is.
    """
    token = op + '('
    start = expr.find(token)
    while start != -1:
        open_idx = start + len(op)  # index of the '(' that belongs to op
        depth = 0
        for close_idx in range(open_idx, len(expr)):
            if expr[close_idx] == '(':
                depth += 1
            elif expr[close_idx] == ')':
                depth -= 1
                if depth == 0:
                    break
        else:
            # Unbalanced parentheses: stop rather than guess.
            break
        inner = expr[open_idx + 1:close_idx]
        expr = (expr[:start] + 'pow(' + inner + ',' + str(power) + ')'
                + expr[close_idx + 1:])
        # Resume at the rewritten call so nested occurrences are handled too.
        start = expr.find(token, start)
    return expr


# Replace square / cube / quart calls with the equivalent pow(arg, n).
for op,i in [['square',2],['cube',3],['quart',4]]:
    model_form = _wrap_power(model_form, op, i)
model_form

In [None]:
# Inspect the model string as tokens, splitting on single spaces.
model_form.split(' ')


In [None]:
# Compare the number of weights with the number of space-separated tokens and
# of '('-separated pieces in the model string. Splitting a string on a
# one-character separator always yields (occurrences + 1) pieces.
print(
    internal_weights.count(' ') + 1,
    model_form.count(' ') + 1,
    model_form.count('(') + 1,
)

In [None]:
# ' '.join(['('+b+'*'+ m+')'  for b,m in zip(internal_weights.split(' '),model_form.split(' '))])
# Pair each weight with its token; when a token starts with '(' the weight is
# placed just inside that parenthesis.
' '.join(
    ('(' + weight + '*' + token[1:]) if token.startswith('(') else (weight + '*' + token)
    for weight, token in zip(internal_weights.split(' '), model_form.split(' '))
)

In [None]:
# Collect the comparison fields of every result JSON under `rdir`.
# `tqdm` is used below but is not imported in any visible cell; import it here
# and fall back to a plain pass-through so the cell runs on a fresh kernel.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Progress-bar stand-in: return the iterable unchanged."""
        return iterable

frames = []
# Keys copied from each result record into `frames`.
comparison_cols = [
    'dataset',
    'algorithm',
    'random_state',
    'time_time',
    'model_size',
    'symbolic_model',
    'r2_test',
    'mse_test',
    'mae_test'
]
# [path, exception] pairs for files that could not be processed.
fails = []
# NOTE(review): pdb is not used in the visible cells; kept in case a later
# cell relies on it.
import pdb
for f in tqdm(glob(rdir + '/*/*.json')):
    if 'cv_results' in f:
        continue
    try:
        # Context manager so each handle is closed; the loop may visit many files.
        with open(f, 'r') as fh:
            r = json.load(fh)
        if isinstance(r['symbolic_model'], list):
            # A list of terms becomes a linear combination with symbolic
            # coefficients B0, B1, ...
            sm = ['B'+str(i)+'*'+ri for i, ri in enumerate(r['symbolic_model'])]
            r['symbolic_model'] = '+'.join(sm)

        sub_r = {k: v for k, v in r.items() if k in comparison_cols}
        frames.append(sub_r)
    except Exception as e:
        # Best effort: record the failure and carry on with the next file.
        fails.append([f, e])