# saved tuned models

Go through the PMLB results and, for each algorithm, pick the most common parameter setting among the best estimators.
These parameter settings are then used for the Feynman and Strogatz models.

In [7]:
import pandas as pd
import json
import numpy as np
from glob import glob
from tqdm import tqdm
import os

# Root directory of the PMLB experiment results; the loading cell globs
# `<rdir>/*/*.json` underneath it.
rdir = '../results_pmlb_r1/'

# Algorithm names retained when filtering the loaded results (compared against
# each record's 'algorithm' field).
symbolic_algs = [
    'AFPRegressor', 'AIFeynman', 'BSRRegressor', 'DSRRegressor',
    'FFXRegressor', 'FEATRegressor', 'FE_AFPRegressor', 'EPLEXRegressor',
    'GPGOMEA', 'gplearn', 'ITEARegressor', 'MRGPRegressor',
    'OperonRegressor', 'sembackpropgp',
]

In [8]:
# Load every result JSON under `rdir`, keep only the fields needed to compare
# parameter settings, and build `df_results` restricted to the symbolic
# regressors, with a string form of each params dict in `params_str`.
frames = []
# Only these top-level keys of each result file are retained.
comparison_cols = [
    'algorithm',
    'dataset',
    'params'
]
# [path, exception] pairs for files that could not be read or parsed.
fails = []
import pdb  # NOTE(review): unused in this cell; kept in case other cells rely on it
for f in tqdm(glob(rdir + '/*/*.json')):
    # skip any path that contains 'cv_results'
    if 'cv_results' in f:
        continue
    try:
        # Use a context manager so each handle is closed right away instead of
        # leaking one open file per result.
        with open(f, 'r') as fh:
            r = json.load(fh)

        sub_r = {k: v for k, v in r.items() if k in comparison_cols}
        frames.append(sub_r)
    except Exception as e:
        # Best effort: record the failing file and carry on with the rest.
        fails.append([f, e])

print('fails:',len(fails),'out of ',len(frames)+len(fails))
df_results = pd.DataFrame.from_records(frames)

# keep only symbolic regressors; take an explicit copy so the column
# assignment below writes to an independent frame (no SettingWithCopyWarning)
df_results = df_results.loc[df_results.algorithm.isin(symbolic_algs)].copy()
# turn params into a string so identical settings can be counted
df_results['params_str'] = df_results['params'].apply(str)

# save so we don't have to load again
# df_results[['algorithm','params_str']].to_feather(rdir.replace('.','').replace('/','')+'_params.feather')

100%|██████████| 56654/56654 [00:10<00:00, 5537.54it/s]


fails: 385 out of  34856


## find the mode of each algorithm's params

In [9]:
# For each algorithm, tally how often every stringified parameter setting
# occurs and keep the most frequent one. `value_counts()` orders its result by
# descending count, so the first index entry is the mode.
best_params = [
    [alg_name, alg_rows['params_str'].value_counts().index[0]]
    for alg_name, alg_rows in df_results.groupby('algorithm')
]

best_params

[['AFPRegressor',
  "{'AR': False, 'AR_lookahead': False, 'AR_na': 1, 'AR_nb': 1, 'AR_nka': 1, 'AR_nkb': 0, 'ERC': True, 'ERCints': False, 'EstimateFitness': False, 'FE_ind_size': 0, 'FE_pop_size': 0, 'FE_rank': False, 'FE_train_gens': 0, 'FE_train_size': 0, 'G_sel': 1, 'G_shuffle': False, 'PS_sel': 1, 'SGD': False, 'align_dev': False, 'class_bool': False, 'class_m4gp': False, 'class_prune': False, 'classification': False, 'complex_measure': 1, 'cross': 3, 'cross_ar': 0.025, 'eHC_init': 0.5, 'eHC_its': 1, 'eHC_mut': False, 'eHC_on': False, 'eHC_prob': 0.1, 'eHC_slim': False, 'elitism': True, 'estimate_generality': False, 'fit_type': 'MSE', 'g': 250, 'init_trees': True, 'init_validate_on': False, 'island_gens': 100, 'islands': False, 'learning_rate': 1.0, 'lex_class': False, 'lex_eps_dynamic': False, 'lex_eps_dynamic_madcap': False, 'lex_eps_dynamic_rand': False, 'lex_eps_error': False, 'lex_eps_error_mad': False, 'lex_eps_global': False, 'lex_eps_std': False, 'lex_eps_target': False, '

In [10]:
# Inspect the DSRRegressor records (the rows shown in the saved output all
# carry an empty params dict).
is_dsr = df_results['algorithm'] == 'DSRRegressor'
df_results[is_dsr]

Unnamed: 0,dataset,algorithm,params,params_str
14,1027_ESL,DSRRegressor,{},{}
95,1027_ESL,DSRRegressor,{},{}
96,1027_ESL,DSRRegressor,{},{}
97,1027_ESL,DSRRegressor,{},{}
98,1027_ESL,DSRRegressor,{},{}
...,...,...,...,...
34349,titanic,DSRRegressor,{},{}
34350,titanic,DSRRegressor,{},{}
34434,titanic,DSRRegressor,{},{}
34435,titanic,DSRRegressor,{},{}


In [11]:
# write tuned model scripts for each algorithm

In [24]:
import os

# Write one small Python module per algorithm containing its most common
# parameter setting as `params = {...}`.
tuned_params_dir = '../experiment/methods/tuned/params'
# Create the target directory on demand so a fresh checkout does not fail with
# FileNotFoundError on the first open().
os.makedirs(tuned_params_dir, exist_ok=True)

for alg, bp in best_params:
    # `bp` is the stringified params dict chosen for `alg`; the file name is the
    # lower-cased algorithm name prefixed with an underscore.
    out_path = os.path.join(tuned_params_dir, '_' + alg.lower() + '.py')
    # `out_file` avoids reusing `f`, which the loading cell uses for file paths.
    with open(out_path, 'w') as out_file:
        out_file.write('params = {}'.format(bp))
        