In [3]:
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import importlib
from time import time

import sys
sys.path.append("../script/")

import Node
import Engine
import BGP
import Functions

# test script

In [4]:
# Dataset selection and loading (dataset list taken from the 2segp GitHub project).
# Pick one of the classification datasets below by its position in the list.
CLASS_DATASET_NAMES = ['bcw', 'heart', 'iono', 'parks', 'sonar']
dataset_name = CLASS_DATASET_NAMES[1]  # position 1 -> 'heart'

# Each dataset is a comma-separated file under test_data/.
dataset_path = 'test_data/' + dataset_name + '.csv'
Xy = np.genfromtxt(dataset_path, delimiter=',')

# Every column except the last is a feature; the last column is the label.
X, y = Xy[:, :-1], Xy[:, -1]

# Number of boosting rounds. Note that the experiment cell further down
# assigns boost_num again before it is used there.
boost_num = 1000

In [7]:
# Experiment driver: repeat the boosted-GP training on fresh random train/test
# splits and collect accuracy, timing and model-size statistics for every run.

# Pick up edits to the project modules without restarting the kernel.
# NOTE(review): if these modules pull names from each other via
# `from X import Y`, a dependency has to be reloaded before its dependents.
# The modules' own imports are not visible here - confirm this order.
importlib.reload(BGP)
importlib.reload(Engine)
importlib.reload(Node)
importlib.reload(Functions)

N_RUNS = 40             # number of independent train/test repetitions
TEST_SIZE = 0.3         # fraction of the samples held out for testing
PURE_RATE_STOP = 0.99   # stop boosting once bgp.X has shrunk by more than this
                        # fraction relative to bgp.fixed_X
boost_num = 10          # maximum number of boosting rounds per run

# Per-run results; entry k of every list belongs to run k.
his_train_acc = []   # list of per-round train accuracies
his_test_acc = []    # list of per-round test accuracies
bgps = []            # the trained BGP object
times = []           # wall-clock seconds spent in the boosting loop
num_trees = []       # len(bgp.trees) after training
depths = []          # average tree depth
nodecounts = []      # total node count over all trees
split_seeds = []     # random_state handed to train_test_split
model_seeds = []     # value handed to np.random.seed before building the model

for _ in range(N_RUNS):
    # Draw and record both seeds so a single run can be replayed later.
    # (Assumes BGP only draws from NumPy's global RNG - TODO confirm.)
    seed = np.random.randint(9999999)
    split_seeds.append(seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=seed)

    model_seed = np.random.randint(9999999)
    model_seeds.append(model_seed)
    np.random.seed(model_seed)
    bgp = BGP.BGP(Functions.simple_opset, X_train, y_train)

    train_acc = []
    test_acc = []

    # One settings tuple per boosting round. Rebuilt for every run so that each
    # run receives its own fresh `beta` lists.
    parameters = [
        (30,   # generation
         100,  # population
         100,  # population batch
         30,   # elites
         2,    # bins
         1,    # p-value
         [0, 0, 0]  # beta
        ) for i in range(boost_num)
    ]

    start = time()
    for i in range(boost_num):
        # p_value is unpacked to match the tuple layout but is not passed on.
        # `bins` stays a notebook-level name; later cells read it.
        generation, total_size, batch_size, elite_size, bins, p_value, beta = parameters[i]
        # The trailing 0 is kept from the original call; its meaning is not visible here.
        bgp.evolve(generation, total_size, batch_size, elite_size, bins, beta, 0)

        pure_rate = 1 - bgp.X.shape[0] / bgp.fixed_X.shape[0]
        train_score = accuracy_score(y_train, bgp.predict(X_train))
        test_score = accuracy_score(y_test, bgp.predict(X_test))

        train_acc.append(train_score)
        test_acc.append(test_score)

        print(pure_rate, train_score, test_score)
        if pure_rate > PURE_RATE_STOP:
            break
    times.append(time() - start)

    # Model-size statistics for this run.
    tree_depths = [tree.depth for tree in bgp.trees]
    tree_nodes = [tree.numNode for tree in bgp.trees]
    depths.append(np.average(tree_depths))
    nodecounts.append(np.sum(tree_nodes))
    num_trees.append(len(bgp.trees))
    bgps.append(bgp)

    his_train_acc.append(train_acc)
    his_test_acc.append(test_acc)
    print(train_acc)
    print(test_acc)

0.08994708994709 0.6507936507936508 0.6049382716049383
0.22751322751322756 0.7883597883597884 0.691358024691358
0.29100529100529104 0.8518518518518519 0.7777777777777778
0.4656084656084656 0.8518518518518519 0.7777777777777778
0.5079365079365079 0.8941798941798942 0.7530864197530864
0.6349206349206349 0.8941798941798942 0.7530864197530864
0.7037037037037037 0.8941798941798942 0.7530864197530864
0.7354497354497355 0.9259259259259259 0.7407407407407407
0.8148148148148149 0.9259259259259259 0.7407407407407407
0.8306878306878307 0.9417989417989417 0.7530864197530864
[0.6507936507936508, 0.7883597883597884, 0.8518518518518519, 0.8518518518518519, 0.8941798941798942, 0.8941798941798942, 0.8941798941798942, 0.9259259259259259, 0.9259259259259259, 0.9417989417989417]
[0.6049382716049383, 0.691358024691358, 0.7777777777777778, 0.7777777777777778, 0.7530864197530864, 0.7530864197530864, 0.7530864197530864, 0.7407407407407407, 0.7407407407407407, 0.7530864197530864]
0.21164021164021163 0.73544973

In [56]:
# Per-class histograms of the first tree's output over bgp.fixed_X.
# NOTE(review): `bins` is not defined in this cell; it has to exist in the
# kernel already (e.g. left over from an earlier cell) - confirm before
# running on a fresh kernel.

# Single-row matrix holding the first tree's prediction for every sample.
vals = bgp.trees[0].predict(bgp.fixed_X).reshape(1, -1)

# Rows of [label, count], cast to int. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
count_label = pd.Series(bgp.fixed_Y).value_counts().reset_index().values.astype('int')

# One array of unit weights per class, with one weight per sample of that class.
weights = [np.ones(num) for label, num in count_label]

# For each class, keep only the columns of `vals` whose sample has that label.
val_by_classes = [vals[:, bgp.fixed_Y == label] for label, num in count_label]

# Histogram range for each row of `vals`.
val_max = np.max(vals, axis=1)
val_min = np.min(vals, axis=1)

# hists_by_classes[c][i] is the histogram of row i for class c, divided by the
# number of samples in that class.
hists_by_classes = np.array([
    np.stack([
        np.histogram(
            val_by_classes[c][i],
            range=(val_min[i], val_max[i]),
            bins=bins,
            weights=weights[c],
        )[0] / num
        for i in range(vals.shape[0])
    ])
    for c, (label, num) in enumerate(count_label)
])

In [57]:
# Unpack the five-element record stored at bgp.engine.best[2]. This rebinds the
# notebook-level names val_max, val_min and bins.
table, width, val_max, val_min, bins = bgp.engine.best[2]

# Divide every row of the table by its row sum; rows whose sum is zero are
# left as all zeros instead of producing a division by zero.
total = table.sum(axis=1).to_numpy()
counts_T = table.to_numpy().T
prob = np.divide(counts_T, total,
                 out=np.zeros(counts_T.shape, dtype='float'),
                 where=total != 0).T

# Bundle the converted table together with the binning parameters.
meg = Engine.Engine.table_to_meg(table)
hist = (meg, width, val_max, val_min, bins)

# Turn each entry of `vals` (computed in an earlier cell) into a bin number,
# then force the result into the range [0, bins - 1].
raw_index = ((vals - val_min) // width).astype('int32')
capped = np.where(raw_index >= bins, bins - 1, raw_index)
index = np.where(capped < 0, 0, capped)

In [58]:
# Per-class histograms of the first tree's output over bgp.fixed_X.
# NOTE(review): `bins` is not defined in this cell; it has to exist in the
# kernel already (set by an earlier cell) - confirm before running on a
# fresh kernel.

# Single-row matrix holding the first tree's prediction for every sample.
vals = bgp.trees[0].predict(bgp.fixed_X).reshape(1, -1)

# Rows of [label, count], cast to int. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
count_label = pd.Series(bgp.fixed_Y).value_counts().reset_index().values.astype('int')

# Build the unit weights here rather than reusing whatever `weights` an earlier
# cell left behind, so their lengths always match the current class counts.
weights = [np.ones(num) for label, num in count_label]

# For each class, keep only the columns of `vals` whose sample has that label.
val_by_classes = [vals[:, bgp.fixed_Y == label] for label, num in count_label]

# Histogram range for each row of `vals`.
val_max = np.max(vals, axis=1)
val_min = np.min(vals, axis=1)

# hists_by_classes[c][i] is the histogram of row i for class c, divided by the
# number of samples in that class.
hists_by_classes = np.array([
    np.stack([
        np.histogram(
            val_by_classes[c][i],
            range=(val_min[i], val_max[i]),
            bins=bins,
            weights=weights[c],
        )[0] / num
        for i in range(vals.shape[0])
    ])
    for c, (label, num) in enumerate(count_label)
])

In [6]:
# Summary over all runs: mean last-round train accuracy, mean last-round test
# accuracy, mean of the per-run average tree depth, mean of the per-run node total.
final_train = [run[-1] for run in his_train_acc]
final_test = [run[-1] for run in his_test_acc]
(
    np.average(final_train),
    np.average(final_test),
    np.average(depths),
    np.average(nodecounts),
)

(0.9419312169312171, 0.7811728395061728, 8.0775, 1037.95)

In [8]:
# Summary over all runs: mean last-round train accuracy, mean last-round test
# accuracy, mean of the per-run average tree depth, mean of the per-run node total.
final_train = [run[-1] for run in his_train_acc]
final_test = [run[-1] for run in his_test_acc]
(
    np.average(final_train),
    np.average(final_test),
    np.average(depths),
    np.average(nodecounts),
)

(0.9481481481481481, 0.7935185185185185, 6.475, 694.6)

In [74]:
# Summary over all runs: mean last-round train accuracy, mean last-round test
# accuracy, mean of the per-run average tree depth, mean of the per-run node total.
final_train = [run[-1] for run in his_train_acc]
final_test = [run[-1] for run in his_test_acc]
(
    np.average(final_train),
    np.average(final_test),
    np.average(depths),
    np.average(nodecounts),
)

(0.9999371859296483, 0.9555555555555555, 6.23311507936508, 426.2)

In [76]:
# Summary over all runs: mean last-round train accuracy, mean last-round test
# accuracy, mean of the per-run average tree depth, mean of the per-run node total.
final_train = [run[-1] for run in his_train_acc]
final_test = [run[-1] for run in his_test_acc]
(
    np.average(final_train),
    np.average(final_test),
    np.average(depths),
    np.average(nodecounts),
)

(0.9955402010050252, 0.9567251461988304, 6.483898809523811, 361.375)