In [16]:
import numpy as np
import pandas as pd
from genetic_programming.genetics import SymbolicRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import  Ridge
import warnings

# Install a blanket "ignore" filter: with no category/module arguments this
# suppresses every warning raised after this point in the session.
# NOTE(review): presumably done to keep cell output clean - it also hides
# deprecation and convergence warnings, so consider narrowing it.
warnings.filterwarnings("ignore")

In [17]:
# Read the dataset and use its 'compound' column as the row index.
x = pd.read_csv('./data/70_dataset.csv').set_index('compound')
# Discard every column for which nunique() reports exactly one distinct value.
x = x.drop(columns=x.columns[x.nunique() == 1])
# Split the 'target' column off as y; whatever remains in x is the feature table.
y = x.pop('target')
# Column labels of the feature table, passed later to the regressor as feature names.
feature_name = x.columns

In [18]:
def assign(X_list):
    """Stack a sequence of arrays side by side into one 2-D array.

    Every entry is combined with ``np.c_`` semantics: 1-D inputs are turned
    into single columns and 2-D inputs contribute their columns as-is, joined
    along the last axis in the order given.

    Parameters
    ----------
    X_list : sequence of array-like
        Arrays to place next to each other. They must agree in their number
        of rows, otherwise ``np.c_`` raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        The column-wise concatenation of all entries. The result is always at
        least 2-D, including when ``X_list`` holds a single 1-D array.

    Raises
    ------
    IndexError
        If ``X_list`` is empty.
    """
    if len(X_list) == 0:
        raise IndexError("assign() needs at least one array to stack")
    # One np.c_ call allocates the result once instead of re-copying the
    # growing matrix for every additional column block, and it normalises a
    # single-entry list to a 2-D array just like the multi-entry case.
    return np.c_[tuple(X_list)]

def LR(x_tr, x_ts, y_tr, y_ts, alpha, dimension):
    """Fit a ridge regression on the training split and score both splits.

    Parameters
    ----------
    x_tr, x_ts : array-like
        Training and test feature matrices.
    y_tr, y_ts : array-like
        Training and test targets.
    alpha : float
        Regularisation strength forwarded to ``Ridge``.
    dimension : int
        Accepted for call compatibility; not used inside this function.

    Returns
    -------
    tuple
        ``(pred_tr, pred_ts, tr_result, ts_result)``: the predictions for the
        training and test features, followed by the ``r2_score`` of each
        prediction against its corresponding target.
    """
    fitted = Ridge(alpha=alpha).fit(x_tr, y_tr)

    # Predict both splits first, then score them, training split before test.
    predictions = [fitted.predict(features) for features in (x_tr, x_ts)]
    scores = [
        r2_score(target, predicted)
        for target, predicted in zip((y_tr, y_ts), predictions)
    ]

    return predictions[0], predictions[1], scores[0], scores[1]

In [12]:
# Hold out 25% of the rows for testing; the fixed random_state pins the split.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    x, y, test_size=0.25, shuffle=True, random_state=352
)

list_of_iter_result = []
list_of_iter_data = []

# Slot 0 of each x list keeps the raw feature table; one evolved descriptor
# column is appended per pass. Slot 0 of each y list keeps the original
# target; the residual left after each pass is appended behind it.
x_tr_data = [x_tr]
x_ts_data = [x_ts]
y_tr_data = [y_tr]
y_ts_data = [y_ts]

for d in range(10):
    ## You can change hyperparameters in here (population, generation, etc)
    gp = SymbolicRegressor(
        population_size=4000,
        generations=15,
        init_depth=(1, 2),
        metric='pearson',
        feature_names=feature_name,
        parsimony_coefficient=0.02,
        tournament_size=100,
        # The stopping threshold is lowered by 0.01 on every pass.
        stopping_criteria=0.75 - d * 0.01,
        low_memory=True,
        n_jobs=16,
    )

    # Always search over the raw features, but against the most recent
    # residual (which is the original target on the first pass).
    gp.fit(x_tr_data[0], y_tr_data[-1])

    # Evaluate the evolved program on both splits as a single column each.
    x_tr_descriptor = gp.predict(x_tr_data[0]).reshape(-1, 1)
    x_ts_descriptor = gp.predict(x_ts_data[0]).reshape(-1, 1)
    x_tr_data.append(x_tr_descriptor)
    x_ts_data.append(x_ts_descriptor)

    program = str(gp._program)

    # Stack every descriptor found so far (everything after the raw features).
    X_tr = assign(x_tr_data[1:])
    X_ts = assign(x_ts_data[1:])

    # Ridge fit of the original target on all descriptors collected so far.
    pred_tr, pred_ts, tr_result, ts_result = LR(
        X_tr, X_ts, y_tr_data[0], y_ts_data[0], 1, d + 1
    )

    # Residuals of the original target become the fitting target of the next pass.
    res_tr = y_tr_data[0] - pred_tr
    res_ts = y_ts_data[0] - pred_ts
    y_tr_data.append(res_tr)
    y_ts_data.append(res_ts)

    print(f'{d+1} Dimension - {program}')
    print(f'{d+1} Dimension - Training : {tr_result} | Test : {ts_result}')

list_of_data = [x_tr_data, x_ts_data, y_tr_data, y_ts_data]

list_of_iter_data.append(list_of_data)

1 Dimension - div(add(add(DVEtm, Rx), Rx), Vtmx)
1 Dimension - Training : 0.4697259353836266 | Test : 0.45896404039867267
2 Dimension - LEs
2 Dimension - Training : 0.5033195688367174 | Test : 0.5258371305605807
3 Dimension - Rtm
3 Dimension - Training : 0.516418527612539 | Test : 0.5580810378111847
4 Dimension - pow3(pow3(pow3(div(Rtm, BEtmx))))
4 Dimension - Training : 0.6330390337884809 | Test : 0.8762437503284182
5 Dimension - pow3(div(DVEx, DVEtm))
5 Dimension - Training : 0.6799852158103572 | Test : 0.8285923469867899
6 Dimension - pow3(pow3(pow3(pow3(pow3(Etm)))))
6 Dimension - Training : 0.6981948064183023 | Test : 0.8130522910040505
7 Dimension - abs(log(sub(Vtmx, Ntmf)))
7 Dimension - Training : 0.7552447163259655 | Test : 0.8301417588185398
8 Dimension - log(LEf)
8 Dimension - Training : 0.7739460913060383 | Test : 0.8235066472408219
9 Dimension - sqrt(inv(sub(Qtm, Ctm)))
9 Dimension - Training : 0.7940680398351522 | Test : 0.7725288604965468
10 Dimension - log(sub(Nxs, Qtm)