In [1]:
import numpy as np
import pandas as pd
from genetic_programming.genetics import SymbolicRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import  Ridge
import warnings

# Silence every warning for the rest of the session: no category, module or
# message filter is given, so this applies to all warnings from all libraries.
warnings.filterwarnings("ignore")

In [2]:
# Read the dataset and use the 'compound' column as the row index.
x = pd.read_csv('./data/70_dataset.csv').set_index('compound')
# Discard columns with exactly one distinct value (nunique ignores NaN by default).
x = x.drop(columns=x.columns[x.nunique() == 1])
# Remove the 'target' column from the frame and keep it as the regression target;
# the columns that remain are the features.
y = x.pop('target')
feature_name = x.columns

In [3]:
def assign(X_list):
    """Stack a sequence of arrays side by side into one 2-D matrix.

    Parameters
    ----------
    X_list : sequence of array-like
        Arrays to place next to each other as columns. 1-D inputs are turned
        into single columns by ``np.c_``; 2-D inputs are joined along their
        last axis.

    Returns
    -------
    The column-wise concatenation of all items. When ``X_list`` holds exactly
    one item, that item is returned unchanged (no copy, no conversion).

    Raises
    ------
    IndexError
        If ``X_list`` is empty.
    """
    # Indexing first keeps the IndexError on empty input.
    first = X_list[0]
    if len(X_list) == 1:
        return first
    # One concatenation over all items instead of growing the matrix pairwise,
    # which re-copied the accumulated columns on every step.
    return np.c_[tuple(X_list)]

def LR(x_tr, x_ts, y_tr, y_ts, alpha, dimension):
    """Fit a ridge regression on the training split and score both splits.

    Parameters
    ----------
    x_tr, x_ts : feature matrices for the training and test splits.
    y_tr, y_ts : targets for the training and test splits.
    alpha : value passed to ``Ridge(alpha=...)``.
    dimension : not used inside the function; kept so existing calls still work.

    Returns
    -------
    tuple
        ``(pred_tr, pred_ts, tr_result, ts_result)``: the predictions on each
        split followed by the R^2 score on each split.
    """
    ridge = Ridge(alpha=alpha).fit(x_tr, y_tr)

    train_pred = ridge.predict(x_tr)
    test_pred = ridge.predict(x_ts)

    train_r2 = r2_score(y_tr, train_pred)
    test_r2 = r2_score(y_ts, test_pred)
    return train_pred, test_pred, train_r2, test_r2

In [15]:
# Hold out 25% of the rows as a test split; the fixed seed makes the split repeatable.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    x, y, test_size=0.25, shuffle=True, random_state=389
)

list_of_iter_result = []
list_of_iter_data = []

# Slot 0 of each x-list is the raw feature matrix; every generated descriptor
# column is appended after it. Slot 0 of each y-list is the original target;
# the residual left after each round is appended after it.
x_tr_data = [x_tr]
x_ts_data = [x_ts]
y_tr_data = [y_tr]
y_ts_data = [y_ts]

for d in range(10):
    # Hyperparameters (population size, generations, ...) can be tuned here.
    gp = SymbolicRegressor(
        population_size=7000,
        generations=15,
        init_depth=(1, 2),
        metric='pearson',
        feature_names=feature_name,
        parsimony_coefficient=0.03,
        tournament_size=100,
        stopping_criteria=0.8,
        low_memory=True,
        n_jobs=16,
        random_state=1,
    )

    # Always evolve on the raw features, against the latest entry of
    # y_tr_data: the original target in round 1, the last residual afterwards.
    gp.fit(x_tr_data[0], y_tr_data[-1])

    # Evaluate the fitted program on both splits as a single new column.
    x_tr_descriptor = gp.predict(x_tr_data[0]).reshape(-1, 1)
    x_ts_descriptor = gp.predict(x_ts_data[0]).reshape(-1, 1)
    x_tr_data.append(x_tr_descriptor)
    x_ts_data.append(x_ts_descriptor)

    # Stack every descriptor generated so far (slot 0, the raw data, is skipped).
    X_tr = assign(x_tr_data[1:])
    X_ts = assign(x_ts_data[1:])

    # Regress the original target on the stacked descriptors.
    pred_tr, pred_ts, tr_result, ts_result = LR(
        X_tr, X_ts, y_tr_data[0], y_ts_data[0], 1, d + 1
    )

    # The part of the original target the current model misses becomes the
    # fitting target for the next round.
    res_tr = y_tr_data[0] - pred_tr
    res_ts = y_ts_data[0] - pred_ts
    y_tr_data.append(res_tr)
    y_ts_data.append(res_ts)

    program = str(gp._program)
    print(f'{d+1} Dimension - {program}')
    print(f'{d+1} Dimension - Training : {tr_result} | Test : {ts_result}')

# Keep the accumulated descriptors and residuals of this run together.
list_of_data = [x_tr_data, x_ts_data, y_tr_data, y_ts_data]
list_of_iter_data.append(list_of_data)

1 Dimension - inv(div(BEtmx, Rtm))
1 Dimension - Training : 0.40293383731005594 | Test : 0.6880644463221829
2 Dimension - inv(sub(DVEtm, Vtmx))
2 Dimension - Training : 0.5739180184004931 | Test : 0.7412927065069477
3 Dimension - add(Nxs, Vtm)
3 Dimension - Training : 0.6567109889904947 | Test : 0.7201962538300211
4 Dimension - log(DVEtm)
4 Dimension - Training : 0.6569751749283519 | Test : 0.6991531489311533
5 Dimension - pow2(sub(Wx, Wtm))
5 Dimension - Training : 0.6852712858857054 | Test : 0.7181720471970856
6 Dimension - Qtm
6 Dimension - Training : 0.6917103954984354 | Test : 0.7250837270135865
7 Dimension - inv(log(sub(Qtm, Vtmx)))
7 Dimension - Training : 0.7443748743410221 | Test : 0.7367170943685689
8 Dimension - Ctm
8 Dimension - Training : 0.753078787039407 | Test : 0.7342095015442306
9 Dimension - sqrt(sqrt(sqrt(log(sub(Nxs, Vtmx)))))
9 Dimension - Training : 0.7837341798702921 | Test : 0.7292474913848324
10 Dimension - div(LEf, Ctm)
10 Dimension - Training : 0.78752394782