In [1]:
import os
import pandas as pd
import numpy as np
from algorithms.gp import GpLearnConfig, GpLearnRegressor
from pprint import pprint

In [3]:
# Build the GP configuration with its default arguments and pretty-print it so
# the hyper-parameters used by the rest of the notebook are visible.
config = GpLearnConfig()
pprint(config)

GpLearnConfig(population_size=10000,
              generations=10,
              stopping_criteria=0.01,
              p_crossover=0.7,
              p_subtree_mutation=0.1,
              p_hoist_mutation=0.1,
              p_point_mutation=0.05,
              max_samples=0.9,
              verbose=0,
              parsimony_coefficient=0.01,
              function_set=['add',
                            'sub',
                            'mul',
                            'div',
                            'sqrt',
                            'log',
                            'neg',
                            'inv',
                            'sin',
                            'cos',
                            'tan'],
              random_state=42)


In [4]:
# Index of the available datasets; the first CSV column is used as the row index.
train_df = pd.read_csv("dataset/train_df.csv", index_col=0)

# Keep one row per distinct `number`. The groups are iterated explicitly
# rather than through `groupby(...).apply(...)`: pandas 2.2 deprecated `apply`
# operating on the grouping column and newer releases exclude it from the
# result, which would silently drop `number` from `sampled_df`.
# The seed is re-applied for every group, so each draw is independent of the
# order in which groups are visited.
# NOTE(review): the preview of `sampled_df` shows the same `data_num` on every
# row, which is what equal-sized groups give with a shared seed — confirm this
# is intended.
sampled_df = pd.concat(
    [group.sample(1, random_state=42) for _, group in train_df.groupby('number')]
).reset_index(drop=True)

# Relative location of each sampled array: "<filename>/<data_num>.npy".
sampled_df["path"] = [
    os.path.join(filename, f"{data_num}.npy")
    for filename, data_num in zip(sampled_df["filename"], sampled_df["data_num"])
]

In [5]:
# Preview the first rows of the one-row-per-`number` sample, including the derived `path` column.
sampled_df.head()

Unnamed: 0,filename,data_num,number,path
0,I.6.2a,83,1,I.6.2a/83.npy
1,I.6.2,83,2,I.6.2/83.npy
2,I.6.2b,83,3,I.6.2b/83.npy
3,I.8.14,83,4,I.8.14/83.npy
4,I.9.18,83,5,I.9.18/83.npy


In [9]:
# Accumulator for the result obtained on each sampled dataset.
models = []

In [10]:
# Reset the accumulator in the same cell that fills it, so that re-running
# this cell cannot append a second batch to an existing list and break the
# row-for-row correspondence between `models` and `sampled_df`.
models = []

# Process every sampled dataset in `sampled_df` order.
for index, row in sampled_df.iterrows():
    data = np.load(os.path.join("dataset", row["path"]))
    # All columns but the last are passed as X; the last column is passed as y.
    X = data[:, :-1]
    y = data[:, -1]
    # A fresh regressor is built from the shared `config` for each dataset.
    regressor = GpLearnRegressor(config)
    # NOTE(review): despite its name, `predict_single` receives both X and y
    # and its return value is stored as the model — confirm it performs the fit.
    model = regressor.predict_single(X, y)
    print(index)  # progress marker: index of the row just processed
    models.append(model)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [11]:
# Show the results collected by the loop, in the same order as the rows of `sampled_df`.
models

[0.13/X0,
 0.111000000000000,
 0.376/X0,
 (-X0 + X1)**0.5 + (-X2 + X3)**0.5,
 1/(X5 + 0.371),
 X0 + 0.082,
 X0*X4 + X1*X3 + X2*X5 - 0.640228994290611,
 X0*X1,
 X0**0.5*X1/X3 + X0*(X1**1.5 + X2*X3),
 0.086/X2,
 0.056/(X1*(X2 - 0.876)),
 X0*X1,
 X0*(X1 + X2*X3*sin(X4)),
 (X2 - X3)*(X4 + log(X1*(X0 - (0.028 - X0)**0.5)/X2**0.5)),
 X0*X1*X2,
 X1**2*(-X0 - sin(0.378*X0**0.5 - X0))**0.5,
 X0 - X1*X3 + 1/X2,
 X3,
 X0*X1 + 1/X2,
 X0,
 (X2*X3)**0.5,
 X0*X1*sin(X2),
 X0*X1*X2*sin(X3),
 X0*(-sin((X1 - 0.877)**0.5) + cos(X3))*(-X1*X2 - X2*X3 + cos(X1 - X2)),
 X0/X1,
 X0*sin(X1),
 X1/(X2 + X1/X0),
 X0/X1,
 (0.524*X0 + (-X1)**0.5)*(-X2 + X3)**0.5,
 (2*X0 + cos(X1))/X1,
 X0/(X1*X2),
 0.0480000000000000,
 0.65*X0*X1*X3**2*X4*(X4 - log(-X2 + X5))*log(X4)/log(-X2 + X5),
 X0*X1*X2/X3,
 X2 + X1*(X2 + X1*(X2 + X1**2*X2/X0)/X0)/X0,
 X2 + X1*(X2 + X1*X2/(X0*((0.056 - log(X0))*log(X0 - X1)**0.5)**0.5))/X0,
 0.157662*X0*X1,
 log((X0 + X1)*tan(tan(sin(X2)**0.5))/log(X2)) - sin(cos(tan(sin(X2)**0.5))),
 X2*X3*lo

In [14]:
import pickle

# Bundle the sampled rows, the per-row results and the configuration that
# produced them, so all three can be restored together from one file.
data = {
    "df": sampled_df,
    "models": models,
    "config": config,
}

# Serialise the bundle with the highest pickle protocol this interpreter supports.
with open('gplearn_test.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)