In [7]:
%pylab inline
from gp.genetics import SymbolicRegressor
from sklearn.utils.random import check_random_state
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


In [8]:
df = pd.read_csv('dataset/high.csv')
df.set_index('Material', inplace=True)
y = df.pop('Heat of formation')
X = df
x_tr, x_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.2, random_state=42)

sc = StandardScaler()
x_tr = sc.fit_transform(x_tr)
x_ts = sc.fit_transform(x_ts)

In [9]:
print(len(x_tr), len(y_tr))
n_sample, n_feature = x_tr.shape
print(n_sample, n_feature)

1092 1092
1092 85


In [11]:
gp = SymbolicRegressor(population_size=5000, generations=20, stopping_criteria=0.01,
                       p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05,
                       p_point_mutation=0.1, max_samples=0.9, verbose=1,
                       parsimony_coefficient=0.01, random_state=42     
                        )
gp.fit(x_tr, y_tr)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    25.43          5146.19        3         0.467756         0.450665     53.42s
   1     3.51          1.50625        3         0.455547         0.559657     38.81s
   2     3.15           1.4176        3          0.45042         0.605428     37.63s
   3     2.91         0.996109        3         0.448346         0.623941     34.52s
   4     3.26          73.3247        3         0.444151         0.661393     31.85s
   5     3.16          0.96215        5         0.441367          0.40033     30.19s
   6     3.20          0.96353        5         0.431917          0.48469     27.62s
   7     3.20          6.77462        5          0.43127         0.490467     25.51s
   8     3.22          1.13984        5         0.430558         0.496826  

SymbolicRegressor(max_samples=0.9, p_crossover=0.7, p_hoist_mutation=0.05,
                  p_point_mutation=0.1, p_subtree_mutation=0.1,
                  parsimony_coefficient=0.01, population_size=5000,
                  random_state=42, stopping_criteria=0.01, verbose=1)

In [12]:
print(gp._program)

mul(mul(X8, -0.685), X8)


In [35]:
init_depth = (2, 10)
gp = SymbolicRegressor(population_size=5000, generations=20, stopping_criteria=0.01, init_depth=init_depth,
                       p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05,
                       p_point_mutation=0.1, max_samples=0.9, verbose=1, metric='rmse',
                       parsimony_coefficient=0.01, random_state=42     
                        )
gp.fit(x_tr, y_tr)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   221.78       2.8386e+18        3         0.683159         0.602846      3.79m
   1     3.57          691.824        3         0.642922         0.916382     46.03s
   2     3.41          5.84126        3         0.594911         0.669017     47.18s
   3     2.79          168.603        3         0.595149         0.667121     43.49s
   4     1.66           4482.1        3         0.591169         0.697996     38.24s
   5     2.23          76.9772        3         0.585307         0.740786     35.43s
   6     3.30          6.58109        3         0.582184         0.762444     35.24s
   7     3.33          83.7065        3         0.582691         0.758974     29.96s
   8     3.49          122.771        3         0.576934         0.797271  

SymbolicRegressor(init_depth=(2, 10), max_samples=0.9, metric='rmse',
                  p_crossover=0.7, p_hoist_mutation=0.05, p_point_mutation=0.1,
                  p_subtree_mutation=0.1, parsimony_coefficient=0.01,
                  population_size=5000, random_state=42, stopping_criteria=0.01,
                  verbose=1)

In [15]:
# Best program after final generation
print(gp._program)

mul(X52, X51)


In [16]:
# Best program's fitness result on training dataset
gp._program.raw_fitness_

0.5824967699992059

In [17]:
# Best Program's result on training dataset
# 추후에 target에서 빼줘서 residual을 새로운 target으로 생성

gp._program.execute(x_tr)

array([-0.5945983 , -1.59203403, -0.5945983 , ..., -1.59203403,
       -1.59203403, -0.5945983 ])

In [25]:
# Get program's depth

gp._program._depth()

1

In [28]:
gp._program.fitness()

0.6124967699992059

In [29]:
# Best program's result on training dataset
# Above code 'gp._prgoram.execute(x_tr) 과 동일한 결과
gp.predict(x_tr)

array([-0.5945983 , -1.59203403, -0.5945983 , ..., -1.59203403,
       -1.59203403, -0.5945983 ])

In [33]:
# R square 값 구하기
# sklearn.metrics.r2_score과 동일
tr_score = gp.score(x_tr, y_tr)
ts_score = gp.score(x_ts, y_ts)
print(f'Training : {tr_score}, Test : {ts_score}')

from sklearn.metrics import r2_score

tr_score = r2_score(y_tr, gp.predict(x_tr))
ts_score = r2_score(y_ts, gp.predict(x_ts))
print(f'Training : {tr_score}, Test : {ts_score}')

Training : 0.17025521006087074, Test : 0.0856154934767056


In [32]:
# Best program 이 생성된 history
gp._program.parents

{'method': 'Crossover',
 'parent_idx': 4581,
 'parent_nodes': range(0, 3),
 'donor_idx': 1148,
 'donor_nodes': []}