In [1]:
%pylab inline
from gp.genetics import SymbolicRegressor
from sklearn.utils.random import check_random_state
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('dataset/high.csv')
df.set_index('Material', inplace=True)
y = df.pop('Heat of formation')
X = df
x_tr, x_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.2, random_state=42)

sc = StandardScaler()
x_tr = sc.fit_transform(x_tr)
x_ts = sc.fit_transform(x_ts)

In [3]:
print(len(x_tr), len(y_tr))
n_sample, n_feature = x_tr.shape
print(n_sample, n_feature)

1092 1092
1092 85


In [4]:
init_depth = (5, 8)
gp = SymbolicRegressor(population_size=5000, generations=20, stopping_criteria=0.01,
                       p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05,
                       p_point_mutation=0.1, max_samples=0.9, verbose=1,
                       parsimony_coefficient=0.01, random_state=42     
                        )
gp.fit(x_tr, y_tr)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     5.95      2.52912e+52        4         0.493791         0.557858     42.04s
   1     4.30       1.5866e+10        7         0.444629          0.46779     42.80s
   2     4.35      6.53161e+13        4         0.433749         0.497823     40.06s
   3     3.13           930634        4          0.42533          0.41283     37.05s
   4     3.15          134.316        4         0.414417         0.510247     33.85s
   5     3.85           343.03        7         0.411905         0.490707     31.88s
   6     3.93      1.11321e+08        3         0.410937         0.541316     31.20s
   7     3.55          1.43956        3         0.408756          0.56079     28.29s
   8     3.08          4.64096        3         0.409915          0.55044  

SymbolicRegressor(max_samples=0.9, p_crossover=0.7, p_hoist_mutation=0.05,
                  p_point_mutation=0.1, p_subtree_mutation=0.1,
                  parsimony_coefficient=0.01, population_size=5000,
                  random_state=42, stopping_criteria=0.01, verbose=1)

In [5]:
print(gp._program)

log(cos(X51))


In [6]:
init_depth = (3, 10)
gp = SymbolicRegressor(population_size=5000, generations=30, stopping_criteria=0.01, init_depth=init_depth,
                       p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05,
                       p_point_mutation=0.1, max_samples=0.9, verbose=1, metric='rmse',
                       parsimony_coefficient=0.01, random_state=42     
                        )
gp.fit(x_tr, y_tr)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.46     5.44599e+107       10         0.642958          0.63106      1.38m
   1     5.46      4.26411e+28        4         0.612599         0.626963      1.20m
   2     5.07      5.33755e+46        8         0.569606         0.630061      1.14m
   3     2.97      3.13002e+09        5         0.555984          0.53948      1.01m
   4     2.98       3.7335e+21        5          0.54508         0.631043     56.64s
   5     3.52          2212.73        5         0.544508         0.635435     54.30s
   6     3.34          872.806        5         0.540494         0.530439     51.64s
   7     3.24      3.66353e+43        5           0.5318         0.603816     50.43s
   8     3.18           49.914        5         0.528805         0.626858  

SymbolicRegressor(generations=30, init_depth=(3, 10), max_samples=0.9,
                  metric='rmse', p_crossover=0.7, p_hoist_mutation=0.05,
                  p_point_mutation=0.1, p_subtree_mutation=0.1,
                  parsimony_coefficient=0.01, population_size=5000,
                  random_state=42, stopping_criteria=0.01, verbose=1)

In [7]:
# Best program after final generation
print(gp._program)

neg(sqrt(log(cos(X8))))


In [8]:
# Best program's fitness result on training dataset
gp._program.raw_fitness_

0.5209919841758972

In [9]:
# Best Program's result on training dataset
# 추후에 target에서 빼줘서 residual을 새로운 target으로 생성

gp._program.execute(x_tr)

array([-0.54406812, -1.22226961, -0.54406812, ..., -1.22226961,
       -0.54406812, -0.54406812])

In [10]:
# Get program's depth

gp._program._depth()

4

In [11]:
gp._program.fitness()

0.5709919841758972

In [12]:
# Best program's result on training dataset
# Above code 'gp._prgoram.execute(x_tr) 과 동일한 결과
gp.predict(x_tr)

array([-0.54406812, -1.22226961, -0.54406812, ..., -1.22226961,
       -0.54406812, -0.54406812])

In [13]:
# R square 값 구하기
# sklearn.metrics.r2_score과 동일
tr_score = gp.score(x_tr, y_tr)
ts_score = gp.score(x_ts, y_ts)
print(f'Training : {tr_score}, Test : {ts_score}')

from sklearn.metrics import r2_score

tr_score = r2_score(y_tr, gp.predict(x_tr))
ts_score = r2_score(y_ts, gp.predict(x_ts))
print(f'Training : {tr_score}, Test : {ts_score}')

Training : 0.3353671779562626, Test : 0.27172870472316923
Training : 0.3353671779562626, Test : 0.27172870472316923


In [14]:
# Best program 이 생성된 history
gp._program.parents

{'method': 'Crossover',
 'parent_idx': 853,
 'parent_nodes': range(1, 5),
 'donor_idx': 2195,
 'donor_nodes': [0]}

In [22]:
y_tr = y_tr - gp.predict(x_tr)
y_ts = y_ts - gp.predict(x_ts)

init_depth = (3, 10)
gp = SymbolicRegressor(population_size=5000, generations=30, stopping_criteria=0.01, init_depth=init_depth,
                       p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05,
                       p_point_mutation=0.1, max_samples=0.9, verbose=1, metric='rmse',
                       parsimony_coefficient=0.01, random_state=42     
                        )
gp.fit(x_tr, y_tr)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.46     5.44599e+107        6         0.541839         0.557822      1.40m
   1     4.38      3.03365e+73        6         0.532801         0.630768      1.12m
   2     3.03      5.33755e+46        7         0.531597         0.587107      1.02m
   3     3.04          40.4044        6         0.528107         0.614583      1.03m
   4     3.25       3.7335e+21        3         0.530139         0.674328     57.66s
   5     3.19          1108.31        4         0.528582         0.635148     55.77s
   6     3.21          46.6257        3         0.527236         0.694348     57.65s
   7     3.20          63643.5        3         0.525786         0.675393     54.47s
   8     3.16          6.22832        3         0.527718         0.661802  

SymbolicRegressor(generations=30, init_depth=(3, 10), max_samples=0.9,
                  metric='rmse', p_crossover=0.7, p_hoist_mutation=0.05,
                  p_point_mutation=0.1, p_subtree_mutation=0.1,
                  parsimony_coefficient=0.01, population_size=5000,
                  random_state=42, stopping_criteria=0.01, verbose=1)

In [23]:
# Best program after final generation
print(gp._program)

sin(pow2(X8))
