In [1]:
import numpy as np
import pandas as pd
from genetic_programming.genetics import SymbolicRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the libraries used
# below; consider narrowing the filter to specific categories — confirm intent.
warnings.filterwarnings("ignore")

In [2]:
# Read the dataset and use the 'compound' column as the row index.
dataset = pd.read_csv('./data/70_dataset.csv').set_index('compound')

# Columns holding a single distinct value carry no information; drop them.
constant_columns = dataset.columns[dataset.nunique() == 1]
x = dataset.drop(constant_columns, axis=1)

# Split off the regression target; every remaining column is a feature.
y = x.pop('target')
feature_name = x.columns

In [3]:
def assign(X_list):
    """Concatenate the arrays in ``X_list`` column-wise, left to right.

    Parameters
    ----------
    X_list : sequence of array-like
        Blocks to join with ``np.c_``. Must contain at least one element.

    Returns
    -------
    The first element unchanged when ``X_list`` has a single entry, otherwise
    the array produced by successively applying ``np.c_`` to each further entry.
    """
    stacked = X_list[0]
    for block in X_list[1:]:
        stacked = np.c_[stacked, block]
    return stacked

def LR(x_tr, x_ts, y_tr, y_ts, alpha, dimension):
    """Fit a ridge regression on the training split and score both splits.

    Parameters
    ----------
    x_tr, x_ts : array-like
        Training and test feature matrices.
    y_tr, y_ts : array-like
        Training and test targets.
    alpha : float
        Value forwarded to ``Ridge(alpha=...)``.
    dimension : int
        Not used inside the function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(pred_tr, pred_ts, tr_result, ts_result)``: predictions on the
        training and test data followed by their respective R^2 scores.
    """
    ridge = Ridge(alpha=alpha).fit(x_tr, y_tr)

    pred_tr, pred_ts = ridge.predict(x_tr), ridge.predict(x_ts)

    return pred_tr, pred_ts, r2_score(y_tr, pred_tr), r2_score(y_ts, pred_ts)

In [4]:
# Hold out 25% of the rows for testing; the split is seeded with random_state=0.
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=0)

list_of_iter_result = []
list_of_iter_data = []

# Index 0 of each list holds the original split. Every loop iteration appends
# one new descriptor column to the x lists and one residual to the y lists.
x_tr_data = [x_tr]
x_ts_data = [x_ts]
y_tr_data = [y_tr]
y_ts_data = [y_ts]

## You can change hyperparameters in here (population, generation, etc)
gp_params = dict(
    population_size=1000,
    generations=20,
    init_depth=(1, 2),
    metric='pearson',
    feature_names=feature_name,
    parsimony_coefficient=0.02,
    tournament_size=100,
    stopping_criteria=0.85,
    low_memory=True,
    n_jobs=6,
)

for d in range(10):
    # Fit a fresh symbolic regressor on the original training features,
    # targeting the most recent entry of y_tr_data (the raw target on the
    # first pass, the latest training residual afterwards).
    gp = SymbolicRegressor(**gp_params)
    gp.fit(x_tr_data[0], y_tr_data[-1])

    # The fitted program's output becomes a new single-column descriptor.
    x_tr_descriptor = gp.predict(x_tr_data[0]).reshape(-1, 1)
    x_ts_descriptor = gp.predict(x_ts_data[0]).reshape(-1, 1)
    x_tr_data.append(x_tr_descriptor)
    x_ts_data.append(x_ts_descriptor)

    program = str(gp._program)

    # Stack every descriptor found so far (skipping the raw features at index 0).
    X_tr = assign(x_tr_data[1:])
    X_ts = assign(x_ts_data[1:])

    # Regress the original targets on the accumulated descriptors.
    pred_tr, pred_ts, tr_result, ts_result = LR(X_tr, X_ts, y_tr_data[0], y_ts_data[0], 1, d + 1)

    # Residuals against the original targets drive the next iteration's fit.
    res_tr = y_tr_data[0] - pred_tr
    res_ts = y_ts_data[0] - pred_ts
    y_tr_data.append(res_tr)
    y_ts_data.append(res_ts)

    print(f'{d+1} Dimension - Training : {tr_result} | Test : {ts_result}')

# Keep the full history of this run.
list_of_data = [x_tr_data, x_ts_data, y_tr_data, y_ts_data]
list_of_iter_data.append(list_of_data)

1 Dimension - Training : 0.3930903772050588 | Test : 0.624309541113151
2 Dimension - Training : 0.43860766083099756 | Test : 0.6342259462168646
3 Dimension - Training : 0.5335112978871552 | Test : 0.6373142487103456
4 Dimension - Training : 0.5397992799670179 | Test : 0.6402755666470916
5 Dimension - Training : 0.5447959901016661 | Test : 0.6426255596488246
6 Dimension - Training : 0.5659931126309901 | Test : 0.6469524167510123
7 Dimension - Training : 0.584989584537525 | Test : 0.6641793341548861
8 Dimension - Training : 0.6053994173902437 | Test : 0.6501632147426832
9 Dimension - Training : 0.6054045375126402 | Test : 0.6501639386246535
10 Dimension - Training : 0.6054096573932853 | Test : 0.6501646624782218
