In [4]:
sys.path.append('../')
import sys
import matplotlib.patches as mpatches
import pandas as pd
import seaborn as sns
import time
import numpy as np

from sklearn.model_selection import train_test_split
from pmlb import fetch_data
from pmlb import classification_dataset_names, regression_dataset_names
from sklearn.feature_selection import  RFE
from itertools import repeat
from sklearn.ensemble import RandomForestClassifier


from ControlBurn.ControlBurnExperiment import build_trees_bag_experiment
from ControlBurn.ControlBurnExperiment import solve_step_experiment

from ControlBurn.ControlBurnExperiment import plot_tradeoff_curve
from ControlBurn.RandomForestBaseline import RandomForestBaseline


# Run Experiment Comparing Runtimes

In [6]:
names = ['Hill_Valley_with_noise', 'Hill_Valley_without_noise',
       'analcatdata_bankruptcy', 'analcatdata_boxing2',
       'analcatdata_cyyoung8092', 'analcatdata_cyyoung9302',
       'analcatdata_japansolvent', 'analcatdata_lawsuit', 'appendicitis',
       'australian', 'biomed', 'breast_cancer_wisconsin', 'buggyCrx',
       'bupa', 'chess', 'churn', 'clean1', 'cleve', 'colic', 'credit_a',
       'crx', 'diabetes', 'dis', 'glass2', 'haberman', 'heart_c',
       'horse_colic', 'hypothyroid', 'ionosphere', 'lupus', 'phoneme',
       'pima', 'prnn_crabs', 'prnn_synth', 'ring', 'sonar', 'spambase',
       'spectf', 'tokyo1', 'twonorm', 'wdbc']

data_details = pd.read_csv('../Data/pmlb_meta.csv')
pmlb = data_details[data_details['dataset'].isin(names)]


results = pd.DataFrame()
n = 0
while n < 10:
    print(n)
    try:
        nfeat = []
        NAME_time = []
        baseline_time = []
        rfe_time = []

        for name in names:
            data = fetch_data(name)
            data = data.sample(1000,replace = True)
            y = data['target']
            X = data.drop('target',axis = 1)

            nfeat.append(len(X.columns))
            max_depth= 10
            problem_type = 'Classification'
            loss_type = 'logistic'
            optimization_type = 'penalized'
            lambd=  0.014
            
            if name in ['Hill_Valley_with_noise','Hill_Valley_without_noise']:
                lambd = 0.004
            if name == 'clean1':
                lambd = 0.005

            threshold= 10**-3
            ntrials = 10
            features_to_find = min(len(X.columns),10)

            xTrain,xTest,yTrain,yTest= train_test_split(  X, y, test_size=0.33)
            arg = [xTrain,yTrain,xTest,yTest, max_depth,problem_type,loss_type,lambd,threshold,optimization_type]

            ts = time.time()
            
            tree = build_trees_bag_experiment(arg)
            res = solve_step_experiment(arg,tree)
            nfeatures = res[2]
            
            te = time.time()
            NAME_time.append(te-ts)

            ts = time.time()
            model = RandomForestClassifier(n_estimators = 100)
            rf = model.fit(xTrain,yTrain)
            imp = pd.DataFrame(np.column_stack((xTrain.columns,rf.feature_importances_)),columns = ['features','scores']).sort_values('scores',ascending = False)
            to_use = imp.head(nfeatures)['features'].values
            rf1 = model.fit(xTrain[to_use],yTrain)
            pred = rf1.predict_proba(xTest[to_use])[:,1]    
            te = time.time()

            baseline_time.append(te-ts)
            ts = time.time()
            selector = RFE(RandomForestClassifier(n_estimators = 100), n_features_to_select=nfeatures, step=1)
            selector = selector.fit(xTrain, yTrain)
            selector.support_
            te = time.time()
            rfe_time.append(te-ts)
            temp = np.column_stack((nfeat,NAME_time,baseline_time,rfe_time))
            temp = pd.DataFrame(temp,columns = ['features','name','baseline','RFE'])
            results = results.append(temp)
            
    except:
        print('error')
    n = n+1

#Store Results
results = results[results['features']<10]
rfe_to_plot = []
control_burn_to_plot = []
for i in np.sort(results['features'].unique()):
        control_burn_to_plot.append(results[results['features'] == i]['name'])
        rfe_to_plot.append(results[results['features'] == i]['RFE'])

0
error
1
error
2
error
3
error
4
error
5
error
6
error
7
error
8
error
9
error


In [15]:
    nfeat = []
    NAME_time = []
    baseline_time = []
    rfe_time = []

    for name in names:
        print(name)
        data = fetch_data(name)
        data = data.sample(1000,replace = True)
        y = data['target']
        X = data.drop('target',axis = 1)

        nfeat.append(len(X.columns))
        max_depth= 10
        problem_type = 'Classification'
        loss_type = 'logistic'
        optimization_type = 'penalized'
        lambd=  0.014
            
        if name in ['Hill_Valley_with_noise','Hill_Valley_without_noise']:
            lambd = 0.004
        if name == 'clean1':
            lambd = 0.005

        threshold= 10**-3
        ntrials = 10
        features_to_find = min(len(X.columns),10)

        xTrain,xTest,yTrain,yTest= train_test_split(  X, y, test_size=0.33)
        arg = [xTrain,yTrain,xTest,yTest, max_depth,problem_type,loss_type,lambd,threshold,optimization_type]

        ts = time.time()

        tree = build_trees_bag_experiment(arg)
        res = solve_step_experiment(arg,tree)
        nfeatures = res[2]
            
        te = time.time()
        NAME_time.append(te-ts)

        ts = time.time()
        model = RandomForestClassifier(n_estimators = 100)
        rf = model.fit(xTrain,yTrain)
        imp = pd.DataFrame(np.column_stack((xTrain.columns,rf.feature_importances_)),columns = ['features','scores']).sort_values('scores',ascending = False)
        to_use = imp.head(nfeatures)['features'].values
        rf1 = model.fit(xTrain[to_use],yTrain)
        pred = rf1.predict_proba(xTest[to_use])[:,1]    
        te = time.time()

        baseline_time.append(te-ts)
        ts = time.time()
        selector = RFE(RandomForestClassifier(n_estimators = 100), n_features_to_select=nfeatures, step=1)
        selector = selector.fit(xTrain, yTrain)
        selector.support_
        te = time.time()
        rfe_time.append(te-ts)
        temp = np.column_stack((nfeat,NAME_time,baseline_time,rfe_time))
        temp = pd.DataFrame(temp,columns = ['features','name','baseline','RFE'])
        results = results.append(temp)
            

Hill_Valley_with_noise
Hill_Valley_without_noise
analcatdata_bankruptcy
analcatdata_boxing2
analcatdata_cyyoung8092
analcatdata_cyyoung9302
analcatdata_japansolvent
analcatdata_lawsuit
appendicitis
australian
biomed
breast_cancer_wisconsin
buggyCrx
bupa
chess
churn


ValueError: at least one array or dtype is required

In [14]:
res

[[], 0.5, 0, 0.5]

# Plot

In [None]:

fig = plt.figure(figsize = (10,8))
ax  = fig.add_subplot(111)
plt.violinplot(rfe_to_plot, positions = np.sqrt(results['features'].unique()),showextrema=False)
plt.violinplot(control_burn_to_plot , positions =  np.sqrt(results['features'].unique()),showextrema= False)
plt.ylabel('Computation Time in Seconds')
plt.xlabel('Number of Features')
plt.xticks([3,4,5,6,7,8,9,10],[9,16,25,36,49,65,81,100])
labels = []
def add_label(violin, label,hatch = None):
    color = violin["bodies"][0].get_facecolor().flatten()
    labels.append((mpatches.Patch(color=color,hatch = hatch), label))

positions = np.arange(3,13,3)
p1 = ax.violinplot(rfe_to_plot, positions = np.sqrt(results['features'].unique()),showextrema=False )

for pc in p1['bodies']:
    pc.set_facecolor('grey')
    pc.set_edgecolor('black')
    pc.set_hatch('/')
    pc.set_alpha(.99)
    
add_label(p1, "RFE",'/')  
positions = np.arange(1, 10, 2)
p2 = ax.violinplot(control_burn_to_plot , positions =  np.sqrt(results['features'].unique()),showextrema= False)


for pc in p2['bodies']:
    pc.set_facecolor('orangered')
    pc.set_edgecolor('black')

add_label(p2, r'\textsc{ControlBurn}')  