#  Imports

In [1]:
import sys
sys.path.append("../script/")

import Functions
from Engine import Engine
from GTGP import GTGP
import os

from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
import importlib
from time import time
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import numpy as np
import pandas as pd
from time import time

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import shuffle

def plot_roc_curve(true_y, y_prob):
    """
    Plot the ROC curve for a set of predicted scores and print the ROC AUC.

    Parameters
    ----------
    true_y : array-like
        Ground-truth labels, passed unchanged to ``roc_curve`` and
        ``roc_auc_score``.
    y_prob : array-like
        Predicted scores, passed unchanged to ``roc_curve`` and
        ``roc_auc_score``.
        # NOTE(review): presumably the positive-class probability column
        # (e.g. ``prob[:, 1]``) - confirm against the callers.

    Returns
    -------
    None
        The curve is drawn on the current matplotlib axes and the AUC is
        printed.
    """
    fpr, tpr, _thresholds = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Score the same values that were plotted. The previous version read an
    # unrelated global ``prob`` here, which is undefined inside this function.
    print(roc_auc_score(true_y, y_prob))

def print_stats(dataset,header):
    """
    Summarise the saved benchmark runs of every classifier for one dataset.

    For each known benchmark directory the file ``<dir><dataset>.csv`` is
    read, when it exists, as a header-less CSV whose columns are named by
    ``header``. The per-column mean and standard deviation over its rows are
    then formatted as ``"<mean> +- <std>"``.

    Parameters
    ----------
    dataset : str
        Base name of the benchmark CSV files (no directory, no ``.csv``).
    header : list of str
        Names assigned to the CSV columns, in file order.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``header`` and one column per classifier whose
        CSV was found. Means are rounded to 5 decimals and standard
        deviations to 4, except the ``'num nodes'`` and ``'num trees'`` rows
        (when present), which are rounded to 2 decimals.

    Raises
    ------
    FileNotFoundError
        If none of the benchmark directories holds a CSV for ``dataset``.
    """
    titles = ['./benchmark/','./benchmark_DC/','./benchmark_xgb/','./benchmark_GBDT/','./benchmark_RF/']
    clfs = ['GP','Decision Tree','Xgboost','GBDT','Random Forest']
    means = []
    stds = []

    available_clfs = []
    for clf, title in zip(clfs, titles):
        path = title+dataset+".csv"
        if not os.path.isfile(path):
            # This classifier has no saved runs for the dataset: leave it out.
            continue

        df = pd.read_csv(path,names=header)
        available_clfs.append(clf)
        means.append(df.mean())
        stds.append(df.std())

    if not available_clfs:
        # Fail with an explicit message instead of the KeyError the row
        # lookups below would raise on an empty table.
        raise FileNotFoundError(
            "no benchmark results found for dataset " + repr(dataset)
            + " in " + ", ".join(titles)
        )

    # Transpose so that metrics are rows and classifiers are columns.
    means = pd.DataFrame(means,index=available_clfs).T
    stds = pd.DataFrame(stds,index=available_clfs).T

    # Tree-size rows are shown with 2 decimals; skip rows the header lacks.
    for row in ('num nodes', 'num trees'):
        if row in means.index:
            means.loc[row] = means.loc[row].round(2)
            stds.loc[row] = stds.loc[row].round(2)

    result = means.round(5).astype("string") +" +- "+ stds.round(4).astype("string")
    return result

def print_stats_roc(dataset):
    """
    Return the ``print_stats`` summary for ``dataset`` with the CSV columns
    labelled as accuracy / ROC scores followed by the tree-size statistics.
    """
    roc_columns = [
        'train acc', 'test acc',
        'train roc', 'test roc',
        'num trees', 'average depth', 'num nodes',
    ]
    return print_stats(dataset, roc_columns)

def print_stats_f1(dataset):
    """
    Return the ``print_stats`` summary for ``dataset`` with the CSV columns
    labelled as accuracy / F1 scores followed by the tree-size statistics.
    """
    f1_columns = [
        'train acc', 'test acc',
        'train f1', 'test f1',
        'num trees', 'average depth', 'num nodes',
    ]
    return print_stats(dataset, f1_columns)

# small dataset dimension

In [2]:
# Build a table with the dimensions of each dataset: train/test sizes,
# number of features, number of classes and class-imbalance ratio.
dataset = ['adult','soybean','confidence','vowel','prnn_synth','parity5','lupus','haberman','molecular_biology_promoters','labor','cars']

rows = []
for name in dataset:
    df = pd.read_csv("../data/"+name+".tsv",delimiter='\t')
    # The last column is taken as the target, every other column as a feature.
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    class_count = y.value_counts()
    # Ratio between the most and the least frequent class (1.0 == balanced).
    imbalance_ratio = class_count.max()/class_count.min()

    # 70% of the rows (rounded down) count as train, the remainder as test.
    # NOTE(review): presumably mirrors the split used by the benchmark runs - confirm.
    n_samples = X.shape[0]
    train_size = int(n_samples*0.7)

    rows.append([train_size,n_samples - train_size,X.shape[1],class_count.shape[0],imbalance_ratio])

# Trailing bare expression: the notebook displays this table.
pd.DataFrame(rows,index=dataset,columns=['train size','test size','features','classes','imbalance'])

Unnamed: 0,train size,test size,features,classes,imbalance
adult,34189,14653,14,2,3.179173
soybean,472,203,35,18,6.571429
confidence,50,22,3,6,1.0
vowel,693,297,13,11,1.0
prnn_synth,175,75,2,2,1.0
parity5,22,10,5,2,1.0
lupus,60,27,3,2,1.485714
haberman,214,92,3,2,2.777778
molecular_biology_promoters,74,32,57,2,1.0
labor,39,18,16,2,1.85


# large dataset dimension

In [None]:
# Build a table with the dimensions of each dataset: total size, number of
# features, number of classes and class-imbalance ratio.
dataset = ['confidence','vowel','prnn_synth','parity5','lupus','haberman','molecular_biology_promoters','labor','cars']

rows = []
for name in dataset:
    df = pd.read_csv("../data/"+name+".tsv",delimiter='\t')
    # The last column is taken as the target, every other column as a feature.
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    class_count = y.value_counts()
    # Ratio between the most and the least frequent class (1.0 == balanced).
    imbalance_ratio = class_count.max()/class_count.min()

    rows.append([X.shape[0],X.shape[1],class_count.shape[0],imbalance_ratio])

# Trailing bare expression: the notebook displays this table.
pd.DataFrame(rows,index=dataset,columns=['size','features','classes','imbalance'])

# adult

In [32]:
# Summarise the saved benchmark CSVs for 'adult' using the accuracy/ROC
# column labels; the trailing bare `result` is what the notebook displays.
dataset = 'adult'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.88458 +- 0.0024,0.99993 +- 0.0,0.90374 +- 0.0015,0.86853 +- 0.0011,0.99991 +- 0.0
test acc,0.87296 +- 0.0022,0.81273 +- 0.0026,0.87209 +- 0.0022,0.86652 +- 0.0027,0.8575 +- 0.0026
train roc,0.94189 +- 0.0023,0.9999 +- 0.0,0.95974 +- 0.001,0.92435 +- 0.001,1.0 +- 0.0
test roc,0.92862 +- 0.0021,0.74506 +- 0.0037,0.92698 +- 0.002,0.92124 +- 0.0023,0.90653 +- 0.0023
num trees,44.27 +- 1.93,1.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0
average depth,1.0 +- 0.0,45.3 +- 4.5649,6.0 +- 0.0,3.0 +- 0.0,45.51533 +- 0.5477
num nodes,132.8 +- 5.79,10036.13 +- 88.66,6829.27 +- 131.12,1498.0 +- 2.92,998296.93 +- 5793.53


In [31]:
# Same summary call for 'adult' as the previous cell (accuracy/ROC labels).
# NOTE(review): the recorded GP column differs from the cell above, so the
# GP benchmark CSV presumably changed between the two executions - confirm.
dataset = 'adult'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.90248 +- 0.0015,0.99993 +- 0.0,0.90374 +- 0.0015,0.86853 +- 0.0011,0.99991 +- 0.0
test acc,0.87328 +- 0.0024,0.81273 +- 0.0026,0.87209 +- 0.0022,0.86652 +- 0.0027,0.8575 +- 0.0026
train roc,0.95849 +- 0.0011,0.9999 +- 0.0,0.95974 +- 0.001,0.92435 +- 0.001,1.0 +- 0.0
test roc,0.92917 +- 0.0019,0.74506 +- 0.0037,0.92698 +- 0.002,0.92124 +- 0.0023,0.90653 +- 0.0023
num trees,625.37 +- 13.08,1.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0
average depth,1.57467 +- 0.0106,45.3 +- 4.5649,6.0 +- 0.0,3.0 +- 0.0,45.51533 +- 0.5477
num nodes,2778.03 +- 71.21,10036.13 +- 88.66,6829.27 +- 131.12,1498.0 +- 2.92,998296.93 +- 5793.53


# Mnist_8_8_1000

In [3]:
# Summarise the saved benchmark CSVs for 'mnist_8_8_1000' using the
# accuracy/F1 column labels; the bare `result` displays the table.
dataset = 'mnist_8_8_1000'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.97224 +- 0.0046,0.83789 +- 0.0139,0.96064 +- 0.0075,0.95646 +- 0.0066,0.97114 +- 0.0058
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test f1,0.97222 +- 0.0046,0.83775 +- 0.0142,0.96064 +- 0.0075,0.95651 +- 0.0066,0.97104 +- 0.0058
num trees,1905.94 +- 246.33,1.0 +- 0.0,3000.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,1.14734 +- 0.0177,13.0 +- 0.9469,1.33008 +- 0.0122,3.0 +- 0.0,12.64433 +- 0.1604
num nodes,6346.56 +- 911.84,224.0 +- 11.44,9478.53 +- 197.94,14768.4 +- 72.51,26707.67 +- 478.78


# Mnist_8_8_10

In [115]:
# Summarise the saved benchmark CSVs for 'mnist_8_8_10' using the
# accuracy/F1 column labels; the bare `result` displays the table.
dataset = 'mnist_8_8_10'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.589 +- 0.0519,0.2487 +- 0.0573,0.2095 +- 0.0427,0.3134 +- 0.0515,0.5263 +- 0.0561
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test f1,0.5758 +- 0.0579,0.2353 +- 0.0517,0.1838 +- 0.0395,0.3016 +- 0.0468,0.5134 +- 0.0588
num trees,237.1 +- 7.86,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,1.0794 +- 0.0219,6.3667 +- 1.0334,1.05 +- 0.0777,1.048 +- 0.0746,4.8757 +- 0.0752
num nodes,753.57 +- 28.49,19.0 +- 0.0,3100.0 +- 155.36,3016.0 +- 149.15,1207.0 +- 18.28


# Sleep

In [21]:
# Summarise the saved benchmark CSVs for 'sleep' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'sleep'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.8114 +- 0.0009,0.9997 +- 0.0,0.8147 +- 0.0011,0.7585 +- 0.0007,0.9997 +- 0.0
test acc,0.7649 +- 0.0014,0.6644 +- 0.0021,0.775 +- 0.0014,0.7543 +- 0.0012,0.7707 +- 0.0014
train f1,0.7305 +- 0.0015,0.9996 +- 0.0001,0.7346 +- 0.0017,0.6476 +- 0.0013,0.9996 +- 0.0001
test f1,0.6612 +- 0.0018,0.587 +- 0.0021,0.6728 +- 0.0026,0.641 +- 0.0022,0.665 +- 0.0026
num trees,2522.33 +- 42.99,1.0 +- 0.0,500.0 +- 0.0,500.0 +- 0.0,100.0 +- 0.0
average depth,1.9456 +- 0.0094,39.5667 +- 1.5013,6.0 +- 0.0,3.0 +- 0.0,38.395 +- 0.3344
num nodes,13638.07 +- 258.6,39577.13 +- 123.8,47875.8 +- 604.16,7499.33 +- 1.6,3611695.27 +- 4454.49


# gisette

In [26]:
# Summarise the saved benchmark CSVs for 'gisette' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'gisette'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.9982 +- 0.0008,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.977 +- 0.0018,0.9282 +- 0.004,0.9771 +- 0.0034,0.9779 +- 0.0036,0.9695 +- 0.0025
train roc,0.9999 +- 0.0001,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test roc,0.9976 +- 0.0004,0.9282 +- 0.004,0.9967 +- 0.001,0.9968 +- 0.0009,0.9946 +- 0.0004
num trees,4214.44 +- 44.66,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,1.0 +- 0.0,32.0 +- 0.0,1.8252 +- 0.0306,3.0 +- 0.0,29.4823 +- 0.5601
num nodes,12643.32 +- 133.97,343.73 +- 9.52,6322.13 +- 95.49,12254.43 +- 241.6,49255.0 +- 392.56


# fars

In [28]:
# Summarise the saved benchmark CSVs for 'fars' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'fars'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.83946 +- 0.0044,0.97425 +- 0.0004,0.83921 +- 0.0012,0.8051 +- 0.0019,0.97423 +- 0.0004
test acc,0.80295 +- 0.0015,0.75757 +- 0.0017,0.80308 +- 0.0012,0.80034 +- 0.0021,0.78025 +- 0.0013
train f1,0.63277 +- 0.018,0.95936 +- 0.0012,0.77272 +- 0.0066,0.64494 +- 0.049,0.95969 +- 0.0012
test f1,0.55695 +- 0.006,0.52072 +- 0.0063,0.55734 +- 0.0048,0.52853 +- 0.0181,0.54253 +- 0.006
num trees,2583.0 +- 95.23,1.0 +- 0.0,800.0 +- 0.0,800.0 +- 0.0,100.0 +- 0.0
average depth,1.96006 +- 0.0099,52.03333 +- 3.8906,5.56083 +- 0.0728,3.0 +- 0.0,52.35167 +- 0.4323
num nodes,14001.87 +- 546.25,31381.6 +- 108.71,40801.8 +- 591.71,11919.07 +- 25.51,2652551.8 +- 4937.95


: 

In [59]:
# Same summary call for 'fars' as the previous cell (accuracy/F1 labels).
# NOTE(review): the recorded GP column differs slightly from the cell above,
# so the GP benchmark CSV presumably changed between executions - confirm.
dataset = 'fars'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.8395 +- 0.0041,0.9742 +- 0.0004,0.8392 +- 0.0012,0.8051 +- 0.0019,0.9742 +- 0.0004
test acc,0.8028 +- 0.0014,0.7576 +- 0.0017,0.8031 +- 0.0012,0.8003 +- 0.0021,0.7803 +- 0.0013
train f1,0.6324 +- 0.0165,0.9594 +- 0.0012,0.7727 +- 0.0066,0.6449 +- 0.049,0.9597 +- 0.0012
test f1,0.5572 +- 0.0056,0.5207 +- 0.0063,0.5573 +- 0.0048,0.5285 +- 0.0181,0.5425 +- 0.006
num trees,2577.5 +- 105.14,1.0 +- 0.0,800.0 +- 0.0,800.0 +- 0.0,100.0 +- 0.0
average depth,1.9595 +- 0.0101,52.0333 +- 3.8906,5.5608 +- 0.0728,3.0 +- 0.0,52.3517 +- 0.4323
num nodes,13977.94 +- 595.75,31381.6 +- 108.71,40801.8 +- 591.71,11919.07 +- 25.51,2652551.8 +- 4937.95


# confidence

In [290]:
# Summarise the saved benchmark CSVs for 'confidence' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'confidence'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.99867 +- 0.0051,1.0 +- 0.0,0.98867 +- 0.0125,1.0 +- 0.0,1.0 +- 0.0
test acc,0.83788 +- 0.0672,0.75303 +- 0.0833,0.78636 +- 0.061,0.76061 +- 0.0716,0.80909 +- 0.0636
train f1,0.99868 +- 0.005,1.0 +- 0.0,0.98848 +- 0.0128,1.0 +- 0.0,1.0 +- 0.0
test f1,0.82744 +- 0.0801,0.74135 +- 0.0917,0.77482 +- 0.0679,0.75414 +- 0.0768,0.80368 +- 0.0726
num trees,16.07 +- 0.45,1.0 +- 0.0,60.0 +- 0.0,6000.0 +- 0.0,100.0 +- 0.0
average depth,1.0 +- 0.0,6.2 +- 0.9248,2.02389 +- 0.1308,1.26153 +- 0.025,6.059 +- 0.1989
num nodes,48.2 +- 1.35,20.4 +- 2.58,310.6 +- 16.33,14479.93 +- 1025.99,2004.93 +- 102.99


# connect-4

In [22]:
# Summarise the saved benchmark CSVs for 'connect-4' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'connect-4'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.86595 +- 0.0015,1.0 +- 0.0,0.86228 +- 0.0016,0.81741 +- 0.001,1.0 +- 0.0
test acc,0.83762 +- 0.0023,0.72811 +- 0.0039,0.83113 +- 0.0019,0.8029 +- 0.0018,0.81077 +- 0.002
train f1,0.7261 +- 0.0033,1.0 +- 0.0,0.70446 +- 0.0037,0.62551 +- 0.0028,1.0 +- 0.0
test f1,0.66795 +- 0.0046,0.56871 +- 0.0049,0.64053 +- 0.0047,0.59325 +- 0.0038,0.60434 +- 0.0039
num trees,3398.28 +- 75.23,1.0 +- 0.0,300.0 +- 0.0,3000.0 +- 0.0,100.0 +- 0.0
average depth,1.60751 +- 0.0097,32.5 +- 1.7956,6.0 +- 0.0,3.0 +- 0.0,34.69633 +- 0.1697
num nodes,14873.48 +- 378.07,26447.2 +- 261.04,30083.2 +- 317.47,44897.2 +- 37.32,2875735.2 +- 7283.49


# Shuttle

In [20]:
# Summarise the saved benchmark CSVs for 'shuttle' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'shuttle'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,0.99957 +- 0.001,1.0 +- 0.0
test acc,0.99981 +- 0.0001,0.99969 +- 0.0002,0.9998 +- 0.0002,0.99934 +- 0.0009,0.9998 +- 0.0001
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,0.97358 +- 0.0655,1.0 +- 0.0
test f1,0.98751 +- 0.0118,0.96182 +- 0.0337,0.96247 +- 0.0322,0.9479 +- 0.0681,0.94929 +- 0.0458
num trees,202.8 +- 15.16,1.0 +- 0.0,700.0 +- 0.0,700.0 +- 0.0,100.0 +- 0.0
average depth,1.72903 +- 0.0368,8.96667 +- 1.2452,1.73052 +- 0.0394,2.81448 +- 0.1693,11.296 +- 0.2402
num nodes,1005.4 +- 95.4,62.0 +- 5.55,3108.67 +- 94.85,8728.0 +- 704.14,9854.53 +- 281.48


# madelon

In [18]:
# Summarise the saved benchmark CSVs for 'madelon' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'madelon'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.99635 +- 0.0014,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.79133 +- 0.0114,0.75031 +- 0.0215,0.80389 +- 0.0186,0.73778 +- 0.0085,0.70117 +- 0.0234
train roc,0.99989 +- 0.0001,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test roc,0.87057 +- 0.0095,0.75031 +- 0.0215,0.87814 +- 0.0161,0.81693 +- 0.0085,0.77496 +- 0.0212
num trees,1109.0 +- 37.4,1.0 +- 0.0,100.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,2.28994 +- 0.0876,15.45 +- 2.4106,5.90933 +- 0.0427,3.0 +- 0.0,20.71767 +- 0.3722
num nodes,7470.6 +- 510.07,310.87 +- 14.35,4771.0 +- 64.6,14216.07 +- 137.14,44370.27 +- 242.67


# higgs

In [14]:



# Summarise the saved benchmark CSVs for 'higgs' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'higgs'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.74824 +- 0.0014,1.0 +- 0.0,0.79297 +- 0.002,0.75717 +- 0.0012,1.0 +- 0.0
test acc,0.72545 +- 0.0034,0.63113 +- 0.0041,0.72594 +- 0.0048,0.72635 +- 0.004,0.72222 +- 0.0042
train roc,0.82934 +- 0.0012,1.0 +- 0.0,0.87651 +- 0.0019,0.83905 +- 0.001,1.0 +- 0.0
test roc,0.80349 +- 0.0035,0.62987 +- 0.0041,0.80501 +- 0.0039,0.80435 +- 0.0037,0.79944 +- 0.0044
num trees,1973.3 +- 33.44,1.0 +- 0.0,100.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,1.75121 +- 0.0077,42.36667 +- 2.6585,6.0 +- 0.0,3.0 +- 0.0,39.37 +- 0.3267
num nodes,9177.6 +- 167.31,32250.47 +- 125.22,9956.65 +- 218.11,14844.0 +- 37.59,3072347.47 +- 5955.26


# kdd99

In [9]:
# no bug version
# NOTE(review): "no bug version" presumably means these CSVs were regenerated
# after a fix in the benchmark code - confirm which bug is meant.
# Summarise the saved benchmark CSVs for 'kdd99' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'kdd99'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,0.90664 +- 0.2646,1.0 +- 0.0
test acc,0.99981 +- 0.0,0.9995 +- 0.0001,0.9998 +- 0.0,0.90642 +- 0.2644,0.99975 +- 0.0
train f1,0.99989 +- 0.0001,0.9999 +- 0.0001,0.9999 +- 0.0001,0.56389 +- 0.1945,0.9999 +- 0.0001
test f1,0.79271 +- 0.0404,0.74277 +- 0.0351,0.78061 +- 0.0315,0.4816 +- 0.1599,0.78279 +- 0.0307
num trees,2305.07 +- 42.95,1.0 +- 0.0,2300.0 +- 0.0,2300.0 +- 0.0,100.0 +- 0.0
average depth,1.50323 +- 0.0107,33.1 +- 2.6044,2.12628 +- 0.0396,2.89752 +- 0.0621,25.11233 +- 0.5136
num nodes,9448.3 +- 214.0,461.47 +- 18.93,15125.0 +- 386.94,27929.5 +- 1367.3,62650.33 +- 917.36


# yeast

In [126]:
# Summarise the saved benchmark CSVs for 'yeast' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'yeast'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.62359 +- 0.012,1.0 +- 0.0,0.82377 +- 0.0099,0.87401 +- 0.0082,1.0 +- 0.0
test acc,0.58744 +- 0.02,0.5012 +- 0.0175,0.60255 +- 0.017,0.58626 +- 0.0161,0.61096 +- 0.0173
train f1,0.55028 +- 0.0269,1.0 +- 0.0,0.83335 +- 0.016,0.93812 +- 0.0048,1.0 +- 0.0
test f1,0.49494 +- 0.0314,0.4329 +- 0.0265,0.54244 +- 0.028,0.51301 +- 0.0321,0.54235 +- 0.0343
num trees,91.0 +- 9.94,1.0 +- 0.0,90.0 +- 0.0,900.0 +- 0.0,100.0 +- 0.0
average depth,1.29567 +- 0.0559,22.96667 +- 2.2203,5.80556 +- 0.0613,3.0 +- 0.0,22.829 +- 0.4026
num nodes,353.59 +- 53.16,720.4 +- 17.8,3188.0 +- 67.33,13067.2 +- 83.2,64954.4 +- 683.22


# vowel

In [4]:
# Summarise the saved benchmark CSVs for 'vowel' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'vowel'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.95309 +- 0.0129,0.77576 +- 0.0189,0.89776 +- 0.024,0.87015 +- 0.026,0.95219 +- 0.0147
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test f1,0.95296 +- 0.0131,0.77554 +- 0.0194,0.89735 +- 0.024,0.87041 +- 0.0258,0.95197 +- 0.0149
num trees,1579.47 +- 19.71,1.0 +- 0.0,1100.0 +- 0.0,1100.0 +- 0.0,100.0 +- 0.0
average depth,1.96673 +- 0.0102,14.13333 +- 0.9732,2.87821 +- 0.0274,3.0 +- 0.0,14.44 +- 0.1835
num nodes,8893.6 +- 151.78,241.87 +- 12.05,10097.93 +- 117.26,15908.27 +- 106.17,27109.87 +- 280.15


# prnn_synth

In [240]:
# Summarise the saved benchmark CSVs for 'prnn_synth' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'prnn_synth'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.85822 +- 0.0328,0.80252 +- 0.0336,0.84311 +- 0.0362,0.83956 +- 0.0411,0.84533 +- 0.0275
train roc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test roc,0.93561 +- 0.0197,0.80208 +- 0.0336,0.91484 +- 0.0258,0.91406 +- 0.0292,0.9351 +- 0.0206
num trees,151.87 +- 7.09,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0
average depth,2.31348 +- 0.0494,10.18889 +- 1.6208,1.96077 +- 0.1436,3.0 +- 0.0,9.4054 +- 0.4078
num nodes,1177.0 +- 72.1,59.18 +- 7.65,5389.4 +- 369.7,13817.33 +- 298.92,45291.07 +- 3036.94


# parity5

In [114]:
# Summarise the saved benchmark CSVs for 'parity5' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'parity5'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,0.72933 +- 0.074,0.98933 +- 0.0584,1.0 +- 0.0
test acc,0.92381 +- 0.0898,0.05714 +- 0.0712,0.17619 +- 0.1226,0.11905 +- 0.1067,0.00952 +- 0.0362
train f1,1.0 +- 0.0,1.0 +- 0.0,0.72709 +- 0.0754,0.98932 +- 0.0585,1.0 +- 0.0
test f1,0.92257 +- 0.0915,0.05 +- 0.0623,0.15919 +- 0.1168,0.10701 +- 0.1005,0.00833 +- 0.0317
num trees,3.07 +- 1.41,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,1.85389 +- 0.309,5.0 +- 0.0,1.78203 +- 0.2357,3.0 +- 0.0,4.975 +- 0.0155
num nodes,15.2 +- 4.24,46.07 +- 1.64,5059.0 +- 1013.02,14936.4 +- 340.51,2739.73 +- 59.66


# lupus

In [148]:
# Summarise the saved benchmark CSVs for 'lupus' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'lupus'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.90889 +- 0.0419,0.99056 +- 0.0084,0.88889 +- 0.0295,0.87833 +- 0.0301,0.99056 +- 0.0084
test acc,0.70741 +- 0.064,0.64691 +- 0.0665,0.68272 +- 0.0713,0.68889 +- 0.0738,0.67284 +- 0.0702
train roc,0.99738 +- 0.0045,0.99009 +- 0.0088,0.96318 +- 0.0134,0.96084 +- 0.0183,0.99967 +- 0.0003
test roc,0.74962 +- 0.0723,0.63231 +- 0.0671,0.75095 +- 0.0744,0.74271 +- 0.0709,0.69915 +- 0.0618
num trees,10.57 +- 0.94,1.0 +- 0.0,10.0 +- 0.0,10.0 +- 0.0,100.0 +- 0.0
average depth,1.0 +- 0.0,10.4 +- 1.9405,4.42 +- 0.3547,3.0 +- 0.0,8.18867 +- 0.6084
num nodes,31.7 +- 2.81,37.67 +- 5.88,121.2 +- 8.26,126.27 +- 17.25,3090.8 +- 261.02


In [92]:
# Same summary call for 'lupus' as the previous cell (accuracy/ROC labels).
# NOTE(review): the recorded GP and Decision Tree columns differ from the
# cell above, so those CSVs presumably changed between executions - confirm.
dataset = 'lupus'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.90833 +- 0.0396,0.90111 +- 0.0262,0.88889 +- 0.0295,0.87833 +- 0.0301,0.99056 +- 0.0084
test acc,0.70864 +- 0.0581,0.66543 +- 0.0711,0.68272 +- 0.0713,0.68889 +- 0.0738,0.67284 +- 0.0702
train roc,0.99747 +- 0.0041,0.89166 +- 0.0311,0.96318 +- 0.0134,0.96084 +- 0.0183,0.99967 +- 0.0003
test roc,0.75095 +- 0.0702,0.64315 +- 0.0718,0.75095 +- 0.0744,0.74271 +- 0.0709,0.69915 +- 0.0618
num trees,10.53 +- 0.97,1.0 +- 0.0,10.0 +- 0.0,10.0 +- 0.0,100.0 +- 0.0
average depth,1.0 +- 0.0,5.0 +- 0.0,4.42 +- 0.3547,3.0 +- 0.0,8.18867 +- 0.6084
num nodes,31.6 +- 2.92,20.47 +- 4.2,121.2 +- 8.26,126.27 +- 17.25,3090.8 +- 261.02


# madelon gp selection

In [52]:
# Summarise the saved benchmark CSVs for 'madelon_gp_selection' using the
# accuracy/F1 column labels; the bare `result` displays the table.
# NOTE(review): the plain 'madelon' cell uses print_stats_roc, and the
# recorded "test f1" values here exceed "test acc" - these columns may
# actually hold ROC-AUC. Confirm against the script that wrote the CSVs.
dataset = 'madelon_gp_selection'
result = print_stats_f1(dataset)
result

Unnamed: 0,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,0.8835 +- 0.0,1.0 +- 0.0
test acc,0.803 +- 0.0068,0.845 +- 0.0,0.8024 +- 0.001,0.8711 +- 0.0053
train f1,1.0 +- 0.0,1.0 +- 0.0,0.9543 +- 0.0,1.0 +- 0.0
test f1,0.803 +- 0.0068,0.9214 +- 0.0,0.8732 +- 0.0007,0.947 +- 0.0015
num trees,1.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0
average depth,17.0 +- 0.0,6.0 +- 0.0,3.0 +- 0.0,18.1937 +- 0.2321
num nodes,465.13 +- 1.17,4716.0 +- 0.0,1474.0 +- 0.0,46690.0 +- 255.26


In [51]:
# Summarise the saved benchmark CSVs for 'madelon_chi_square_selection'
# using the accuracy/F1 column labels; the bare `result` displays the table.
# NOTE(review): the plain 'madelon' cell uses print_stats_roc, and the
# recorded "test f1" values here exceed "test acc" - these columns may
# actually hold ROC-AUC. Confirm against the script that wrote the CSVs.
dataset = 'madelon_chi_square_selection'
result = print_stats_f1(dataset)
result

Unnamed: 0,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,0.877 +- 0.0,1.0 +- 0.0
test acc,0.7861 +- 0.0059,0.845 +- 0.0,0.7816 +- 0.0019,0.8648 +- 0.0075
train f1,1.0 +- 0.0,1.0 +- 0.0,0.9548 +- 0.0,1.0 +- 0.0
test f1,0.7861 +- 0.0059,0.9202 +- 0.0,0.8549 +- 0.0009,0.942 +- 0.0027
num trees,1.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0
average depth,19.0 +- 0.0,6.0 +- 0.0,3.0 +- 0.0,18.1683 +- 0.1593
num nodes,482.47 +- 2.4,4932.0 +- 0.0,1486.0 +- 0.0,47553.67 +- 265.06


# gisette selection

In [10]:
# Summarise the saved benchmark CSVs for 'gisette_gp_selection' using the
# accuracy/F1 column labels; the bare `result` displays the table.
# NOTE(review): the plain 'gisette' cell uses print_stats_roc, and the
# recorded "test f1" values here are close to that cell's "test roc" row -
# these columns may actually hold ROC-AUC. Confirm against the CSV writer.
dataset = 'gisette_gp_selection'
result = print_stats_f1(dataset)
result

Unnamed: 0,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.92342 +- 0.004,0.97903 +- 0.0002,0.97467 +- 0.0006,0.97133 +- 0.0012
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test f1,0.92342 +- 0.004,0.99685 +- 0.0002,0.99642 +- 0.0009,0.99511 +- 0.0001
num trees,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,31.5 +- 1.5127,2.22394 +- 0.0338,3.0 +- 0.0,28.9 +- 0.6582
num nodes,356.8 +- 14.37,7514.58 +- 85.85,13418.67 +- 199.76,48673.33 +- 1508.04


In [11]:
# Summarise the saved benchmark CSVs for 'gisette_chi_square_selection'
# using the accuracy/F1 column labels; the bare `result` displays the table.
# NOTE(review): the plain 'gisette' cell uses print_stats_roc, and the
# recorded "test f1" values here are close to that cell's "test roc" row -
# these columns may actually hold ROC-AUC. Confirm against the CSV writer.
dataset = 'gisette_chi_square_selection'
result = print_stats_f1(dataset)
result

Unnamed: 0,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.93657 +- 0.003,0.978 +- 0.0,0.977 +- 0.0,0.968 +- 0.0
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test f1,0.93657 +- 0.003,0.99661 +- 0.0,0.99604 +- 0.0,0.99526 +- 0.0
num trees,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0
average depth,31.0 +- 0.0,2.178 +- 0.0,3.0 +- 0.0,29.68 +- 0.0
num nodes,352.73 +- 15.44,7326.0 +- 0.0,13116.0 +- 0.0,47010.0 +- 0.0


# haberman

In [74]:
# Summarise the saved benchmark CSVs for 'haberman' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'haberman'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.77368 +- 0.0116,0.77336 +- 0.0147,0.76526 +- 0.0159,0.76963 +- 0.0214,0.78349 +- 0.0188
test acc,0.75652 +- 0.0276,0.74203 +- 0.0303,0.74819 +- 0.0272,0.73913 +- 0.0154,0.74203 +- 0.0228
train roc,0.76135 +- 0.0231,0.67816 +- 0.0507,0.74172 +- 0.0159,0.7766 +- 0.02,0.81969 +- 0.0174
test roc,0.71271 +- 0.0601,0.62956 +- 0.0788,0.699 +- 0.0501,0.68295 +- 0.0481,0.69859 +- 0.0469
num trees,10.33 +- 1.37,1.0 +- 0.0,10.0 +- 0.0,10.0 +- 0.0,10.0 +- 0.0
average depth,1.0 +- 0.0,2.0 +- 0.0,1.0 +- 0.0,2.0 +- 0.0,3.0 +- 0.0
num nodes,31.0 +- 4.12,7.0 +- 0.0,30.0 +- 0.0,70.0 +- 0.0,131.47 +- 6.12


# molecular

In [377]:
# Summarise the saved benchmark CSVs for 'molecular' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'molecular'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.88125 +- 0.0785,0.73333 +- 0.0942,0.84375 +- 0.0895,0.81562 +- 0.1023,0.87292 +- 0.0544
train roc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test roc,0.95013 +- 0.0516,0.73042 +- 0.0958,0.91602 +- 0.0642,0.89701 +- 0.0778,0.94876 +- 0.0371
num trees,151.53 +- 12.39,1.0 +- 0.0,1000.0 +- 0.0,100.0 +- 0.0,100.0 +- 0.0
average depth,1.0867 +- 0.0308,4.86667 +- 0.8193,1.0432 +- 0.0064,3.0 +- 0.0,5.55867 +- 0.1091
num nodes,483.2 +- 41.18,19.0 +- 2.92,1372.2 +- 100.55,1483.53 +- 15.01,2424.2 +- 75.38


# labor

In [375]:
# Summarise the saved benchmark CSVs for 'labor' using the accuracy/ROC
# column labels; the bare `result` displays the table.
dataset = 'labor'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,0.99829 +- 0.0065,1.0 +- 0.0,1.0 +- 0.0
test acc,0.94259 +- 0.0494,0.82315 +- 0.0712,0.86296 +- 0.0696,0.83519 +- 0.0819,0.91481 +- 0.0614
train roc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test roc,0.98981 +- 0.0154,0.79955 +- 0.0856,0.9463 +- 0.0514,0.91644 +- 0.0781,0.97685 +- 0.0364
num trees,434.6 +- 8.23,1.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0,1000.0 +- 0.0
average depth,1.0 +- 0.0,3.8 +- 0.6587,1.01683 +- 0.004,1.34803 +- 0.0072,4.0125 +- 0.2347
num nodes,1303.8 +- 24.7,11.87 +- 2.19,1422.67 +- 568.17,2888.53 +- 372.43,11676.0 +- 929.77


# soybean

In [224]:
# Summarise the saved benchmark CSVs for 'soybean' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'soybean'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.99831 +- 0.0013,0.75085 +- 0.0011,0.99831 +- 0.0013,0.99873 +- 0.0011,0.99873 +- 0.0011
test acc,0.93892 +- 0.0138,0.67291 +- 0.0206,0.93005 +- 0.0146,0.9353 +- 0.015,0.93284 +- 0.015
train f1,0.99931 +- 0.0005,0.68372 +- 0.0004,0.99931 +- 0.0005,0.99948 +- 0.0004,0.99948 +- 0.0004
test f1,0.96087 +- 0.0113,0.64513 +- 0.0115,0.94804 +- 0.0127,0.95523 +- 0.0132,0.9603 +- 0.0123
num trees,1208.63 +- 35.28,1.0 +- 0.0,1800.0 +- 0.0,1800.0 +- 0.0,100.0 +- 0.0
average depth,1.53325 +- 0.0177,17.5 +- 2.474,1.55711 +- 0.0114,2.99496 +- 0.0007,17.031 +- 0.4091
num nodes,5170.9 +- 172.7,126.4 +- 5.59,6803.4 +- 94.49,26342.07 +- 106.17,16361.73 +- 331.91


In [207]:
# Summarise the saved benchmark CSVs for 'soybean' again, this time with the
# accuracy/ROC column labels; only the row labels differ from the F1 helper,
# because both helpers read the same files.
# NOTE(review): confirm which metric the third and fourth CSV columns really
# hold for this dataset before quoting either table.
dataset = 'soybean'
result = print_stats_roc(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,0.99831 +- 0.0013,0.75085 +- 0.0011,,0.99873 +- 0.0011,0.99873 +- 0.0011
test acc,0.93892 +- 0.0138,0.67291 +- 0.0206,,0.93514 +- 0.0149,0.93284 +- 0.015
train roc,0.99931 +- 0.0005,0.68372 +- 0.0004,,0.99948 +- 0.0004,0.99948 +- 0.0004
test roc,0.96087 +- 0.0113,0.64513 +- 0.0115,,0.95489 +- 0.0131,0.9603 +- 0.0123
num trees,1208.63 +- 35.28,1.0 +- 0.0,,1800.0 +- 0.0,100.0 +- 0.0
average depth,1.53325 +- 0.0177,17.5 +- 2.474,,2.99507 +- 0.0006,17.031 +- 0.4091
num nodes,5170.9 +- 172.7,126.4 +- 5.59,,26341.6 +- 106.97,16361.73 +- 331.91


# cars

In [141]:
# Summarise the saved benchmark CSVs for 'cars' using the accuracy/F1
# column labels; the bare `result` displays the table.
dataset = 'cars'
result = print_stats_f1(dataset)
result

Unnamed: 0,GP,Decision Tree,Xgboost,GBDT,Random Forest
train acc,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test acc,0.99379 +- 0.0086,0.89284 +- 0.0365,0.97345 +- 0.0172,0.97203 +- 0.0137,0.91554 +- 0.0264
train f1,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0,1.0 +- 0.0
test f1,0.99161 +- 0.0117,0.85264 +- 0.0509,0.96677 +- 0.0218,0.96264 +- 0.0189,0.87863 +- 0.0368
num trees,20.17 +- 2.44,1.0 +- 0.0,3000.0 +- 0.0,3000.0 +- 0.0,1000.0 +- 0.0
average depth,1.0 +- 0.0,9.96667 +- 0.9994,1.0 +- 0.0,1.0 +- 0.0,10.92697 +- 0.271
num nodes,60.5 +- 7.31,59.53 +- 7.97,9000.0 +- 0.0,9000.0 +- 0.0,76136.67 +- 2187.63
