In [1]:
import sys
sys.path.append("../script/")

from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import importlib
from time import time
import Functions
from matplotlib import pyplot as plt
import xgboost

In [2]:
import numpy as np
import pandas as pd
from time import time
from Node import Node

# Engine

In [3]:
class Engine:
    def __init__(self,opset,X,Y,log_odds,p,learning_rate):
        self.generation = 0
#         self.X_train,self.X_valid,self.y_train,self.y_valid = train_test_split(X,Y,train_size=0.9)
        X = X.astype('float64')
        self.opset = opset
        
        self.num_class = len(pd.unique(Y))
        self.feature_space = X.shape[1]
        
        self.vals = X.T
        self.X = X
        self.Y = Y

        self.log_odds = log_odds
        self.p = p
        self.residual = self.Y - p
        self.learning_rate = learning_rate

        self.count_label = pd.value_counts(Y).reset_index().values
        
        self.best = (np.inf,None)
        self.nodes = [Node(True,index=i) for i in range(self.feature_space)]
    
    def loss(self,vals,bins,beta):
        fitness = []
        
        for val in vals:
            val_max = np.max(val)
            val_min = np.min(val)
            width = ((val_max - val_min)/bins)

            index = ((val - val_min)//width).astype('int32') if width != 0 else np.zeros(val.shape[0])
            index = np.where(index >= bins,bins-1,index)
            index = np.where(index < 0,0,index)

            p_bin = [sum(self.p[index==i]*(1-self.p[index==i]))  for i in range(bins)]
            residual_bin = [sum(self.residual[index==i])  for i in range(bins)]

            grad_bin = [residual_bin[i]/p_bin[i] if p_bin[i] > 0 else 0 for i in range(bins)]

            grads = np.zeros(index.shape[0])
            for i in range(bins):
                grads[index==i] = grad_bin[i]

            log_odds_1 = self.log_odds + self.learning_rate * grads
            p_1 = np.exp(log_odds_1)
            p_1 = p_1/(1+p_1)

            fitness.append(sum((self.Y-p_1)**2))

        return fitness
    
    def dloss(self,vals):
        return [0 for i in range(len(vals))]
    
    def evolve(self,total_size,batch_size,elite_size,bins,beta,verbose):
        self.generation += 1
        
        if verbose:
            print("\tgeneration:",self.generation)
            t = time()
        
        num_batches = total_size//batch_size
        pool = self.nodes

        elites_funcs = []
        elite_sons = []
        elite_vals = []

        elites_fitness = []
        for j in range(num_batches):

            funcs = np.random.choice(list(self.opset.keys()),size=batch_size)
            arg_count = [self.opset[func] for func in funcs]
            sons = np.random.choice(pool,size = sum(arg_count))
            it = iter(sons)
            sons = [[next(it) for _ in range(arg_count[i])] for i in range(batch_size)]
            vals = [funcs[i]([self.vals[s.index] for s in sons[i]]) for i in range(batch_size)]

            vals = np.stack(vals)
            # fitness = self.loss(vals,bins,beta)
            fitness = self.dloss(vals)
            
            elites_funcs.extend(funcs)
            elite_sons.extend(sons)
            elite_vals.extend(vals)
            elites_fitness.extend(fitness)

            rank = np.argsort(elites_fitness)

            elites_funcs = [elites_funcs[index] for index in rank[:elite_size]]
            elite_sons = [elite_sons[index] for index in rank[:elite_size]]
            elite_vals = [elite_vals[index] for index in rank[:elite_size]]
            elites_fitness = [elites_fitness[index] for index in rank[:elite_size]]

        for index in range(elite_size):
            node = Node(False,
                func=elites_funcs[index],
                sons=elite_sons[index],
                index=len(self.nodes),
                fit=elites_fitness[index] 
            )
            
            self.nodes.append(node)
            self.vals = np.append(self.vals,[elite_vals[index]],axis=0)
            
            if index == 0:
                if self.best[0] > node.fitness:
                    val = elite_vals[index]
                    val_max = np.max(val)
                    val_min = np.min(val)
                    width = ((val_max - val_min)/bins)
                                      
                    self.best = (node.fitness,node,(val_max,val_min,width,bins))
            # self.test_param_same(node)
      

        if verbose:
            print("\t",np.min(elites_fitness))
            print("\ttime",time()-t)
        return None

    def test_param_same(self,node):
        v1 = node.predict(self.X)
        v2 = self.vals[node.index]
        if np.any(v1!=v2):
            print(node.index,v1==v2)

In [4]:
# ref from 2segp github
# Classification dataset names - choose from following datasets 

CLASS_DATASET_NAMES = ['bcw','heart','iono','parks','sonar']
dataset_name = CLASS_DATASET_NAMES[1]


# Load the dataset
Xy = np.genfromtxt('test_data/'+dataset_name+'.csv', delimiter=',')
X = Xy[:, :-1]
y = Xy[:, -1]   # last column is the label

# simple operators

boost_num = 1000

seed = np.random.randint(9999999)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

init_log_odds = sum(y==1)/y_train.shape[0]    
init_p = np.exp(init_log_odds)
init_p = init_p/(1+init_p)

In [5]:
df = pd.read_csv('../data/sleep.tsv',delimiter='\t')
X = df.iloc[:,:-1].to_numpy()
y = df.iloc[:,-1].to_numpy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

FileNotFoundError: [Errno 2] No such file or directory: '../data/sleep.tsv'

In [6]:
from sklearn.tree import DecisionTreeClassifier

num_feature = 1500

eg = Engine(Functions.simple_opset,X_train,y_train,init_log_odds,init_p,0)
for i in range(3):
    eg.evolve(500,500,500,2,[0,0,0],0)
nodes = eg.nodes[60:60+num_feature]
# nodes = eg.nodes[0:60]

In [7]:
num_estimator = 1
max_depth = 100
ratio = 1

vals = np.stack([n.predict(X_train) for n in nodes]).T
clfs = [DecisionTreeClassifier(max_depth=max_depth) for i in range(num_estimator)]
all_index = [i for i in range(len(nodes))]
indexes = [np.random.choice(all_index,size=int(len(all_index)*ratio),replace=False) for i in range(num_estimator)]

for i in range(num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    clfs[i].fit(feature_batch,y_train)

vals = np.stack([n.predict(X_train) for n in nodes]).T

feature_index = indexes[0]
feature_batch = vals[:,feature_index]
prob = clfs[0].predict_proba(feature_batch)

for i in range(1,num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    p = clfs[i].predict_proba(feature_batch)
    prob += p
    
pred = np.argmax(prob,axis=1)
print(classification_report(y_train,pred))


vals = np.stack([n.predict(X_test) for n in nodes]).T

feature_index = indexes[0]
feature_batch = vals[:,feature_index]
prob = clfs[0].predict_proba(feature_batch)

for i in range(1,num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    p = clfs[i].predict_proba(feature_batch)
    prob += p
    
pred = np.argmax(prob,axis=1)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       111
         1.0       1.00      1.00      1.00        78

    accuracy                           1.00       189
   macro avg       1.00      1.00      1.00       189
weighted avg       1.00      1.00      1.00       189

              precision    recall  f1-score   support

         0.0       0.62      0.72      0.67        39
         1.0       0.69      0.60      0.64        42

    accuracy                           0.65        81
   macro avg       0.66      0.66      0.65        81
weighted avg       0.66      0.65      0.65        81



In [7]:
from sklearn.tree import DecisionTreeClassifier

num_feature = 1000
num_estimator = 100
max_depth = 40
ratio = 0.1

eg = Engine(Functions.simple_opset,X_train,y_train,0,0,0)
for i in range(3):
    eg.evolve(1000,1000,1000,2,[0,0,0],0)
nodes = eg.nodes[60:60+num_feature]
# nodes = eg.nodes[0:60]
vals = np.stack([n.predict(X_train) for n in nodes]).T

clfs = [DecisionTreeClassifier(max_depth=max_depth) for i in range(num_estimator)]
all_index = [i for i in range(len(nodes))]
indexes = [np.random.choice(all_index,size=int(len(all_index)*ratio),replace=False) for i in range(num_estimator)]

for i in range(num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    clfs[i].fit(feature_batch,y_train)
    
vals = np.stack([n.predict(X_train) for n in nodes]).T

feature_index = indexes[0]
feature_batch = vals[:,feature_index]
prob = clfs[0].predict_proba(feature_batch)

for i in range(1,num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    p = clfs[i].predict_proba(feature_batch)
    prob += p
    
pred = np.argmax(prob,axis=1)
print(classification_report(y_train,pred))


vals = np.stack([n.predict(X_test) for n in nodes]).T

feature_index = indexes[0]
feature_batch = vals[:,feature_index]
prob = clfs[0].predict_proba(feature_batch)

for i in range(1,num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    p = clfs[i].predict_proba(feature_batch)
    prob += p
    
pred = np.argmax(prob,axis=1)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       108
         1.0       1.00      1.00      1.00        81

    accuracy                           1.00       189
   macro avg       1.00      1.00      1.00       189
weighted avg       1.00      1.00      1.00       189

              precision    recall  f1-score   support

         0.0       0.79      0.90      0.84        42
         1.0       0.88      0.74      0.81        39

    accuracy                           0.83        81
   macro avg       0.84      0.82      0.82        81
weighted avg       0.83      0.83      0.83        81



In [6]:
from sklearn.tree import DecisionTreeClassifier

num_feature = 3000
num_estimator = 1000
max_depth = 5
ratio = 0.05

eg = Engine(Functions.simple_opset,X_train,y_train,0,0,0)
for i in range(3):
    eg.evolve(1000,1000,1000,2,[0,0,0],0)
nodes = eg.nodes[60:60+num_feature]
vals = np.stack([n.predict(X_train) for n in nodes]).T

clfs = [DecisionTreeClassifier(max_depth=max_depth) for i in range(num_estimator)]
all_index = [i for i in range(len(nodes))]
indexes = [np.random.choice(all_index,size=int(len(all_index)*ratio),replace=False) for i in range(num_estimator)]

for i in range(num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    clfs[i].fit(feature_batch,y_train)
    
vals = np.stack([n.predict(X_train) for n in nodes]).T

feature_index = indexes[0]
feature_batch = vals[:,feature_index]
prob = clfs[0].predict_proba(feature_batch)

for i in range(1,num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    p = clfs[i].predict_proba(feature_batch)
    prob += p
    
pred = np.argmax(prob,axis=1)
print(classification_report(y_train,pred))


vals = np.stack([n.predict(X_test) for n in nodes]).T

feature_index = indexes[0]
feature_batch = vals[:,feature_index]
prob = clfs[0].predict_proba(feature_batch)

for i in range(1,num_estimator):
    feature_index = indexes[i]
    feature_batch = vals[:,feature_index]
    p = clfs[i].predict_proba(feature_batch)
    prob += p
    
pred = np.argmax(prob,axis=1)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       108
         1.0       0.99      0.99      0.99        81

    accuracy                           0.99       189
   macro avg       0.99      0.99      0.99       189
weighted avg       0.99      0.99      0.99       189

              precision    recall  f1-score   support

         0.0       0.79      0.90      0.84        42
         1.0       0.88      0.74      0.81        39

    accuracy                           0.83        81
   macro avg       0.84      0.82      0.82        81
weighted avg       0.83      0.83      0.83        81



# Small datasets

In [8]:
# ref from 2segp github
# Classification dataset names - choose from following datasets 

CLASS_DATASET_NAMES = ['bcw','heart','iono','parks','sonar']
dataset_name = CLASS_DATASET_NAMES[1]


# Load the dataset
Xy = np.genfromtxt('test_data/'+dataset_name+'.csv', delimiter=',')
X = Xy[:, :-1]
y = Xy[:, -1]   # last column is the label

# simple operators

boost_num = 1000

seed = np.random.randint(9999999)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

# Higgs Boson

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import shuffle
def plot_roc_curve(true_y, y_prob):
    """
    plots the roc curve based of the probabilities
    """
    
    fpr, tpr, thresholds = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    print(roc_auc_score(true_y,prob[:,1]))

df = pd.read_csv('../data/HIGGS.csv',header=None)
X = df.iloc[:,1:].to_numpy()
y = df.iloc[:,0].to_numpy().astype('int')


In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=500000)
Xs,ys = shuffle(X_train,y_train)

In [None]:
X_train = Xs[:1050000]
y_train = ys[:1050000]

X_test = Xs[1050000:1210000]
y_test = ys[1050000:1210000]

In [None]:
train_acc = []
test_acc = []

gsgp = GStackGP(X_train,y_train,1)
for i in range(1000):
    print("train start")
    gsgp.evolve()
    print("train complete")
    
    if i%10 == 0:
        pred = gsgp.predict(X_train)
        train_acc.append(roc_auc_score(y_train,pred))
        pred = gsgp.predict(X_test)
        test_acc.append(roc_auc_score(y_test,pred))
        print("train auc:",train_acc[-1],"test auc",test_acc[-1])
        
        

train start
Loss: 271919.0944547627
train complete
train auc: 0.5258654810848914 test auc 0.5246813939060924
train start
Loss: 260271.01078482388
train complete
train start
Loss: 250622.0223460063
train complete
train start
Loss: 249377.3030609782
train complete
train start
Loss: 247674.13267773198
train complete
train start
Loss: 246092.75683943022
train complete
train start
Loss: 244864.77042022234
train complete
train start
Loss: 244109.4138296209
train complete
train start
Loss: 243205.71141343552
train complete
train start
Loss: 242986.57455942972
train complete
train start
Loss: 240457.04838524185
train complete
train auc: 0.6132988188301514 test auc 0.6120390153127323
train start
Loss: 239727.16239474286
train complete
train start
Loss: 239158.3263821821
train complete
train start
Loss: 238698.26702003824
train complete
train start
Loss: 234226.41463685918
train complete
train start
Loss: 233400.83277542077
train complete
train start
Loss: 232759.91196994
train complete
train st

In [None]:
train_acc,test_acc

([0.5258654810848914,
  0.6132988188301514,
  0.6436750800551445,
  0.6493980630859355,
  0.6538173147351156,
  0.6565614302154482,
  0.6582152004133821,
  0.6613213626863199,
  0.6635383899160323,
  0.666353789575044,
  0.668103252695043,
  0.6701625265444092,
  0.6712143528187084,
  0.6733713747050745,
  0.6746181955322843,
  0.6752672766485888,
  0.6766407141634677,
  0.6773970382984317,
  0.6779983184430831,
  0.6790438206285722,
  0.6801295273961986,
  0.6814559134172302,
  0.6821680827768668,
  0.6826158873088717,
  0.6833402715090398,
  0.6840135841631556,
  0.6850787003093606,
  0.6858402956774915,
  0.68620554784639,
  0.6870865464354777,
  0.6873719000281152,
  0.6887273207264919,
  0.6894809947705653,
  0.6904194369508995,
  0.6906725440629418,
  0.6909134399393745,
  0.691144665303587,
  0.6920069471387315,
  0.6926548970014472,
  0.6932927787363518,
  0.69436308464085,
  0.6945680994562209,
  0.695180881395156,
  0.695675791622282,
  0.6959153969002998,
  0.696364392604063