In [None]:
import pandas as pd
import os
from autorank import autorank, plot_stats, create_report, latex_table
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 25)
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
def parse_gct(filename, out_csv):
    """Summarise one result CSV as a single comma-separated line.

    The run configuration (pruning method label, leaf algorithm, dataset) is
    decoded from substrings of the path / file name; the metric values come
    from the last row of the CSV. The resulting line is written to ``out_csv``.

    Parameters
    ----------
    filename : str
        Path of the result CSV, with '/' as the path separator.
    out_csv : writable text file object
        Receives one line: dataset, prune method, leaf algorithm, followed by
        the last-row value of each wanted metric column.
    """
    file = filename.split('/')[-1]

    # Base label: LRU / LFU runs get a fixed name (with "STR" appended when the
    # path contains it); any other run uses the first '-' separated token.
    str_suffix = 'STR' if 'STR' in filename else ''
    if 'LRU' in file:
        prunemethod = ('Oza' if 'Oza' in file else '') + 'LRU' + str_suffix
    elif 'LFU' in file:
        prunemethod = 'LFU' + str_suffix
    else:
        prunemethod = file.split('-')[0]

    # Path markers for grace period 50 ('g50'), split confidence 0.05 ('c005')
    # and tie threshold 0.1 ('t01') each append a one-letter tag to the label.
    for marker, tag in (('g50', 'g'), ('c005', 'c'), ('t01', 't')):
        if marker in filename:
            prunemethod += tag

    leafAlg = 'MC' if 'MC' in filename else 'NBA'

    # Dataset: last '-' token of the extension-less file name, or the token
    # before it when the last one is 'MC' or contains 'iter'.
    tokens = file.rpartition('.')[0].split('-')
    dataset = tokens[-1]
    if dataset == 'MC' or 'iter' in dataset:
        dataset = tokens[-2]

    df = pd.read_csv(filename)

    columns = []
    for name in ('evaluation time (cpu seconds)', 'evaluationTime', 'classifications correct (percent)',
                 'Precision (percent)', 'Recall (percent)', 'model serialized size (bytes)',
                 'tree size (nodes)', 'tree size (leaves)', 'tree depth', 'INTERVAL',
                 'THRESHOLD (depth 0)', 'PRUNED', 'PRUNE-SPLIT', 'PRUNE-LEARN', 'AVG-NODESIZE', 'MAX-NODESIZE'):
        if name not in df.columns:
            if name in ('AVG-NODESIZE', 'MAX-NODESIZE'):
                # The node-size metrics may be stored under an "[avg] " prefix.
                name = f'[avg] {name}'
                if name not in df.columns:
                    df[name] = 0
            else:
                # Metric absent from this result file: report it as 0.
                df[name] = 0
        columns.append(name)

    last_row = df[columns].tail(1).values.flatten().tolist()
    fields = [dataset, prunemethod, leafAlg] + last_row
    out_csv.write(','.join(f"{field}" for field in fields) + '\n')


In [None]:
def print_res_gct(RESDIR):
    """Collect every result file in ``RESDIR`` into one sorted DataFrame.

    Each regular file whose name does not contain 'term' is summarised by
    ``parse_gct`` into one line of 'output.csv' (written to the current
    working directory); that file is then read back, reduced to the columns
    used by the analysis and sorted.

    Parameters
    ----------
    RESDIR : str
        Directory holding the per-run result CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per parsed result file, sorted by dataset, prune method,
        interval and pruning threshold.
    """
    out_csv = 'output.csv'
    with open(out_csv, "w") as of:
        of.write('Dataset,prune_method,leafAlg,time,evaluationTime,acc,prec,recall,model_size_(bytes),tree_nodes,tree_leaves,tree_depth,interval,pruning_threshold,total_pruned,split_pruned,learn_pruned,avg_size,max_size\n')
        # os.listdir returns entries in arbitrary order; sorting makes the
        # intermediate CSV reproducible between runs and machines.
        for f in sorted(os.listdir(RESDIR)):
            path = f"{RESDIR}/{f}"
            # Skip 'term' files and anything that is not a regular file
            # (e.g. a .ipynb_checkpoints directory), which cannot be parsed.
            if 'term' in f or not os.path.isfile(path):
                continue
            parse_gct(path, of)
    df = pd.read_csv(out_csv)
    print("lines:", len(df))
    df = df[['Dataset','prune_method','leafAlg','interval','pruning_threshold','time','evaluationTime','acc','prec','recall','tree_depth','total_pruned','split_pruned','learn_pruned','avg_size','max_size']]
    df = df.sort_values(by=['Dataset','prune_method','interval','pruning_threshold'])
    return df

In [None]:
RESDIR='ICDE/ICDE-results-synthethic/'
df = print_res_gct(RESDIR)

In [None]:
print(df.Dataset.unique(), len(df.Dataset.unique()))

In [None]:
df.prune_method = df.prune_method.str.replace("HRAPTr", "RAP-HT")
df.prune_method.unique()

In [None]:
drift_data = ['AGR_a', 'AGR_g', 'HPlane_f', 'HPlane_m', 'HPlane_s', 'LED_a', 'LED_g', 'RBF_f', 'RBF_m', 'RBF_s', 'Wform_d5n']
non_drift_data = ['AGR', 'HPlane', 'LED', 'RBF', 'RTG', 'Wave']
real_data = ['GMSC', 'airlines', 'covtypeNorm', 'elecNormNew', 'nomao']

In [None]:
# Per-dataset sanity check: accuracy range plus the number of result rows
# recorded for the 2000 / 10000 / 500 intervals.
for ds in df.Dataset.unique():
    adf = df[df.Dataset == ds]
    # Count inside the current dataset (adf); counting on the full df printed
    # the same three totals on every line.
    print(f"{ds:10} {adf.acc.min():0.6f} {adf.acc.max():0.6f}   - {len(adf[adf.interval == 2000])}   {len(adf[adf.interval == 10000])}  {len(adf[adf.interval == 500])}")


In [None]:
megadf = df[(df.prune_method == 'RAP-HT') & (df.leafAlg == 'NBA')].groupby(by=['Dataset','prune_method','interval','pruning_threshold'], as_index=False)
allHR = megadf[['time','acc','avg_size','max_size','total_pruned']].agg(['mean','std'])
allHR

In [None]:
# Work on a derived frame so `allHR` (displayed by the previous cell) is left
# untouched and this cell can be re-run; the in-place reset/drop version
# failed on a second execution.
allHR_flat = allHR.reset_index()
allHR_flat['new_id'] = allHR_flat['prune_method'] + '-' + allHR_flat['pruning_threshold'].astype(str) + '-' + allHR_flat['interval'].astype(str)
allHR_flat = allHR_flat.set_index('new_id', drop=False)
allHR_flat = allHR_flat.drop(['prune_method', 'interval', 'pruning_threshold'], axis=1)

# One pivot serves both tables: rows are datasets, columns are configurations.
allHR_pivot = allHR_flat.pivot(index='Dataset', columns='new_id')

print('total pruned')
pivoted = allHR_pivot[('total_pruned', 'mean')]
display(pivoted)

print('runtime')
pivoted = allHR_pivot[('time', 'mean')]
display(pivoted)
display(pivoted.describe())

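Input layout expected by `autorank`: each column is a group (one method configuration) and each row is an observation (one dataset).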

In [None]:
df.prune_method.unique()

In [None]:
df.pruning_threshold = df.pruning_threshold.astype(int)
df.pruning_threshold.unique()

In [None]:
# Turn the numeric interval into short labels ('2000' -> '2k', '10000' -> '10k').
# Guarded so the cell can be re-run: once the column holds labels such as '2k',
# astype(int) would raise.
if pd.api.types.is_numeric_dtype(df.interval):
    df.interval = (
        df.interval.astype(int).astype(str)
        # regex=False: these are literal substrings, independent of the
        # pandas-version-specific default of `regex`.
        .str.replace('10000', '10k', regex=False)
        .str.replace('000', 'k', regex=False)
    )
df.interval.unique()

# Getting the best RAP-HT configuration based on ranking

In [None]:
def create_report_measure(df, measure='acc', using_autorank=True, ascending=True, display_measure_tables=False, display_ranks_tables=False, display_static=False, display_drift=False, filename=None):
    """Compare all configurations in ``df`` on one measure across datasets.

    For every dataset (rows with ``leafAlg == 'NBA'`` only) the results are
    grouped by (prune_method, pruning_threshold, interval); the per-group mean
    of ``measure`` and its rank are collected into tables with one row per
    configuration id ("method-threshold-interval") and one column per dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Result table with the columns Dataset, leafAlg, prune_method,
        pruning_threshold, interval and the metric columns.
    measure : str, default 'acc'
        Metric column to report. The default keeps calls without an explicit
        measure working, matching ``create_report_paper``.
    using_autorank : bool
        Run ``autorank`` on the per-dataset means, plot and report the result.
    ascending : bool
        Passed to ``Series.rank`` and mapped to autorank's ``order``
        ('ascending' / 'descending').
    display_measure_tables, display_ranks_tables : bool
        Display the table of means / ranks together with ``describe()``.
    display_static, display_drift : bool
        Additionally display the static / drift dataset subsets of the tables
        requested above.
    filename : str or None
        When given, the autorank plot is saved as ``<filename>.png``.

    Returns
    -------
    None
    """
    static_sets = ['AGR', 'HPlane', 'LED', 'RBF', 'RTG', 'Wave']
    drift_sets = ['AGR_a', 'AGR_g', 'HPlane_f', 'HPlane_m', 'HPlane_s', 'LED_a', 'LED_g', 'RBF_f', 'RBF_m', 'RBF_s', 'Wform_d5n']
    group_keys = ['prune_method', 'pruning_threshold', 'interval']
    metrics = ['time', 'evaluationTime', 'acc', 'avg_size', 'max_size', 'total_pruned']

    def _config_id(frame):
        # Configuration label "method-threshold-interval".
        return frame['prune_method'] + '-' + frame['pruning_threshold'].astype(str) + '-' + frame['interval'].astype(str)

    def _show_subset(table, label, datasets):
        # Built only on request, so a dataset missing from df cannot break
        # reports that never display this subset.
        subset = table[datasets].sort_index(level=1, axis=1).transpose()
        print(label)
        display(subset)
        display(subset.describe())

    print("reports for measure:", measure)
    resultdf = pd.DataFrame()
    autorankdf = pd.DataFrame()
    ranksdf = pd.DataFrame()
    for ds in df.Dataset.unique():
        adf = df[(df.Dataset == ds) & (df.leafAlg == 'NBA')].copy()
        rdf = adf.groupby(by=group_keys, as_index=False)[metrics].agg(['mean', 'std'])
        ardf = adf.groupby(by=group_keys, as_index=False)[metrics].mean()
        ranksrdf = ardf.copy()
        ranksrdf['ranks'] = ranksrdf[measure].rank(ascending=ascending, method='first')
        rdf = rdf.reset_index()
        ardf = ardf.reset_index()
        rdf['new_id'] = _config_id(rdf)
        ardf['new_id'] = _config_id(ardf)
        ranksrdf['new_id'] = _config_id(ranksrdf)
        rdf['Dataset'] = ds
        ardf['Dataset'] = ds
        ranksrdf['Dataset'] = ds
        rdf = rdf.set_index('new_id', drop=False)
        ardf = ardf.set_index('new_id', drop=False)
        ranksrdf = ranksrdf.set_index('new_id', drop=False)
        drdf = rdf.pivot(index='new_id', columns='Dataset')[(measure, 'mean')]
        dardf = ardf.pivot(index='new_id', columns='Dataset')[measure]
        dranksrdf = ranksrdf.pivot(index='new_id', columns='Dataset')['ranks']
        resultdf = pd.concat([resultdf, drdf], axis=1)
        autorankdf = pd.concat([autorankdf, dardf], axis=1)
        ranksdf = pd.concat([ranksdf, dranksrdf], axis=1)

    if display_measure_tables:
        full = resultdf.transpose()
        print("full")
        display(full)
        display(full.describe())
        if display_static:
            _show_subset(resultdf, 'static', static_sets)
        if display_drift:
            _show_subset(resultdf, 'drift', drift_sets)

    if display_ranks_tables:
        print("full")
        display(ranksdf.transpose())
        display(ranksdf.transpose().describe())
        if display_static:
            _show_subset(ranksdf, 'static', static_sets)
        if display_drift:
            _show_subset(ranksdf, 'drift', drift_sets)

    if using_autorank:
        order = 'ascending' if ascending else 'descending'
        print(order)
        results = autorank(autorankdf.transpose(), alpha=0.05, verbose=True, order=order, force_mode='nonparametric')
        display(results[0])
        fig, maxi = plt.subplots()
        plot_stats(results, allow_insignificant=True, ax=maxi)
        if filename is not None:
            # Save through the figure that owns `maxi`, as create_report_mine does.
            fig.savefig(f'{filename}.png')
        create_report(results)


def create_report_mine(df, using_autorank=True, filename=None):
    """Display per-dataset accuracy ranks of every configuration in ``df``.

    For each dataset (rows with ``leafAlg == 'NBA'``) the results are grouped
    by (prune_method, pruning_threshold, interval) and the mean accuracy of
    each group is ranked. The ranks are shown for all datasets, for the static
    subset and for the drift subset; optionally they are fed to ``autorank``.

    Parameters
    ----------
    df : pandas.DataFrame
        Result table with Dataset, leafAlg, the grouping columns and metrics.
    using_autorank : bool
        Also used as the ``ascending`` flag of the ranking; when true the rank
        table is additionally analysed, plotted and reported with autorank.
    filename : str or None
        When given, the autorank plot is saved as ``<filename>.png`` (300 dpi).

    Returns
    -------
    None
    """
    group_keys = ['prune_method', 'pruning_threshold', 'interval']
    metrics = ['time', 'evaluationTime', 'acc', 'avg_size']

    resultdf = pd.DataFrame()
    for ds in df.Dataset.unique():
        subset = df[(df.Dataset == ds) & (df.leafAlg == 'NBA')].copy()
        stats = subset.groupby(by=group_keys, as_index=False)[metrics].agg(['mean', 'std'])
        stats['rank'] = stats[('acc', 'mean')].rank(ascending=using_autorank, method='average')
        stats = stats.reset_index()
        stats['new_id'] = stats['prune_method'] + '-' + stats['pruning_threshold'].astype(str) + '-' + stats['interval'].astype(str)
        stats['Dataset'] = ds
        stats = stats.set_index('new_id', drop=False)
        ranks = stats.pivot(index='new_id', columns='Dataset')['rank']
        resultdf = pd.concat([resultdf, ranks], axis=1)

    # Both subsets are selected before anything is printed, so a missing
    # dataset column raises before any output is produced.
    views = (
        ("full", resultdf),
        ('static', resultdf[['AGR', 'HPlane', 'LED', 'RBF', 'RTG', 'Wave']]),
        ('drift', resultdf[['AGR_a', 'AGR_g' , 'HPlane_f', 'HPlane_m', 'HPlane_s', 'LED_a', 'LED_g', 'RBF_f', 'RBF_m', 'RBF_s', 'Wform_d5n']]),
    )
    for label, view in views:
        print(label)
        display(view.transpose())
        display(view.transpose().describe())

    if using_autorank:
        results = autorank(resultdf.transpose(), alpha=0.05, verbose=True)
        display(results[0])
        fig, maxi = plt.subplots()
        plot_stats(results, allow_insignificant=True, ax=maxi)
        if filename is not None:
            fig.savefig(f'{filename}.png', dpi=300)
        create_report(results)

    # No axes object is produced; the function always returns None.
    return None



# Standard RAP-HT

# Rank the remaining configurations after excluding the listed methods
# (the gct variants, EFDT, HAT and HT).
hdf = df[(~df.prune_method.isin(['HRAPTrgct', 'RAP-HTgct', 'RAPHTgct', 'EFDT', 'EFDTgct', 'HTgct', 'HAT','HT']))]
print(hdf.prune_method.unique())


# Called once: the duplicated second call only repeated the same tables.
create_report_mine(hdf, using_autorank=False)

In [None]:
hdf = df[(~df.prune_method.isin(['HTtie', 'HTconf', 'HTct', 'HRAPTrgct', 'RAP-HTgct', 'RAPHTgct', 'HRAPTrgct', 'EFDT', 'EFDTgct', 'HTgct', 'HAT','HT']))]
print(hdf.prune_method.unique())


create_report_mine(hdf, using_autorank=True, filename='RAP-HT_parameter')

	

In [None]:
create_report_measure(hdf, 'acc', ascending=False, using_autorank=True, filename='RAP-HT-parameter-acc')

In [None]:
create_report_measure(hdf, 'time', ascending=True, using_autorank=True)

In [None]:
create_report_measure(hdf, 'avg_size', ascending=True, using_autorank=True)

In [None]:
create_report_measure(hdf, 'max_size', ascending=True, using_autorank=True)

In [None]:
create_report_measure(hdf, 'total_pruned', ascending=False, using_autorank=True, filename='RAP-HT-parameter-pruned')

# gct version

In [None]:
hdf = df[(~df.prune_method.isin(['RAP-HT', 'RAPHT', 'HRAPTr', 'EFDT', 'EFDTgct', 'HTgct', 'HAT','HT', 'HTtie', 'HTconf', 'HTct']))]
print(hdf.prune_method.unique())

In [None]:
# Accuracy: larger is better, so rank descending like the other 'acc' reports.
create_report_measure(hdf, 'acc', ascending=False, using_autorank=True)

# both RAP-HT and RAP-HT gct

In [None]:
hdf = df[(~df.prune_method.isin(['EFDT', 'EFDTgct', 'HTgct', 'HAT','HT', 'HTconf', 'HTct', 'HTtie']))]
print(hdf.prune_method.unique())


In [None]:
# `measure` is required by create_report_measure; report accuracy (descending,
# larger is better) as in the other comparison cells.
create_report_measure(hdf, 'acc', ascending=False, using_autorank=True)

---
# all methods
### filter bad RAP-HT out

In [1]:
hdf = df[~(df.prune_method.isin(['HTtie', 'HTconf', 'HTct', 'RAP-HTgct', 'EFDTgct', 'HTgct', 'HTct']))]
hdf = hdf[(hdf.interval == '0') | ((hdf.pruning_threshold == 5) & (hdf.interval == '2k'))]
# hdf = df[(~df.prune_method.isin(['EFDT', 'EFDTgct', 'HTgct', 'HAT','HT']))]
print(hdf.prune_method.unique())
measure_tables = False
drift_tables = False
static_tables = False

NameError: name 'df' is not defined

# create report by measures

In [None]:
create_report_measure(hdf, 'acc', using_autorank=True, ascending=False, display_measure_tables=measure_tables, display_drift=drift_tables, display_static=static_tables, filename='RAP-HT_literature_acc')

In [None]:
create_report_measure(hdf, 'time', using_autorank=True, ascending=True, display_measure_tables=measure_tables, display_drift=drift_tables, display_static=static_tables, filename='RAP-HT_literature_time')

In [None]:
create_report_measure(hdf, 'avg_size', using_autorank=True, ascending=True, display_measure_tables=True, display_drift=drift_tables, display_static=static_tables, filename='RAP-HT_literature_avg_size')

In [None]:
create_report_measure(hdf, 'max_size', using_autorank=True, ascending=True, display_measure_tables=measure_tables, display_drift=drift_tables, display_static=static_tables, filename='RAP-HT_literature_max_size')

# Tables for paper


In [None]:
def custom_function(row, ds, two_decimals=True):
    """Format the mean and std of dataset ``ds`` as "mean ± std".

    ``row`` is indexed with the keys ``('mean', ds)`` and ``('std', ds)``.
    Values are printed with two decimals when ``two_decimals`` is truthy and
    with one decimal otherwise.
    """
    precision = 2 if two_decimals else 1
    mean_text = format(row[('mean', ds)], f"0.{precision}f")
    std_text = format(row[('std', ds)], f"0.{precision}f")
    return mean_text + " \u00B1 " + std_text

In [None]:
def create_report_paper(df, measure='acc', using_autorank=True, two_decimals=True):
    """Print a LaTeX table of "mean ± std" of ``measure`` per dataset.

    For each dataset (rows with ``leafAlg == 'NBA'``) the results are grouped
    by (prune_method, pruning_threshold, interval); mean and std of
    ``measure`` are collected with one row per configuration id
    ("method-threshold-interval"). A formatted 'table' column is then added
    for every known dataset that has results, and the formatted table is
    printed as LaTeX.

    Parameters
    ----------
    df : pandas.DataFrame
        Result table with Dataset, leafAlg, the grouping columns and metrics.
    measure : str
        Metric column to tabulate.
    using_autorank : bool
        Unused; kept so existing calls keep working.
    two_decimals : bool
        Forwarded to ``custom_function`` (two decimals vs. one).

    Returns
    -------
    None
    """
    drift_data = ['AGR_a', 'AGR_g', 'HPlane_f', 'HPlane_m', 'HPlane_s', 'LED_a', 'LED_g', 'RBF_f', 'RBF_m', 'RBF_s', 'Wform_d5n']
    non_drift_data = ['AGR', 'HPlane', 'LED', 'RBF', 'RTG', 'Wave']
    datasets = drift_data + non_drift_data
    resultdf = pd.DataFrame()
    for ds in df.Dataset.unique():
        adf = df[(df.Dataset == ds) & (df.leafAlg == 'NBA')].copy()
        rdf = adf.groupby(by=['prune_method','pruning_threshold','interval'],as_index=False)[['time', 'evaluationTime', 'acc','avg_size','max_size', 'total_pruned']].agg(['mean','std'])
        rdf = rdf.reset_index()
        rdf['new_id'] = rdf['prune_method'] + '-' + rdf['pruning_threshold'].astype(str) + '-' + rdf['interval'].astype(str)
        rdf['Dataset'] = ds
        rdf = rdf.set_index('new_id', drop=False)
        drdf = rdf.pivot(index='new_id', columns='Dataset')[measure]
        resultdf = pd.concat([resultdf, drdf], axis=1)

    print("full")
    display(resultdf)
    mydf = resultdf.copy()

    # Only format datasets that actually have results; a dataset from the
    # fixed list that is absent from df would otherwise raise KeyError.
    available = [ds for ds in datasets if ('mean', ds) in resultdf.columns]
    missing = [ds for ds in datasets if ds not in available]
    if missing:
        print("datasets without results (skipped):", missing)

    for ds in available:
        mydf[('table', ds)] = mydf.apply(lambda x: custom_function(x, ds, two_decimals=two_decimals), axis=1)
    display(mydf)
    # Keep only the formatted 'table' columns: datasets as rows, configurations as columns.
    latexdf = mydf.drop(columns=['mean', 'std']).transpose().droplevel(0)
    print(latexdf.columns)
    print(latexdf.to_latex())

In [None]:
hdf = df[(~df.prune_method.isin(['HTtie', 'HTconf', 'HTct', 'HRAPTrgct', 'EFDT', 'EFDTgct', 'HTgct', 'HAT','HT']))]
print(hdf.prune_method.unique())


In [None]:
create_report_paper(hdf,using_autorank=False)
# mdf

In [None]:
hdf = df[(~df.prune_method.isin(['HTtie', 'HTconf', 'HTct', 'HRAPTrgct', 'EFDTgct', 'HTgct']))]
hdf = hdf[(hdf.interval == '0') | ((hdf.pruning_threshold == 5) & (hdf.interval == '2k'))]
print(hdf.prune_method.unique())


In [None]:
create_report_paper(hdf,using_autorank=False)
# mdf

In [None]:
create_report_paper(hdf,using_autorank=False,measure='time')
# mdf

In [None]:
create_report_paper(hdf,using_autorank=False,measure='avg_size', two_decimals=False)
# mdf

In [None]:
create_report_paper(hdf,using_autorank=False,measure='max_size',two_decimals=False)
# mdf

In [None]:
hdf = df[(~df.prune_method.isin(['HTtie', 'HTconf', 'HTct', 'HRAPTrgct', 'EFDT', 'EFDTgct', 'HTgct', 'HAT','HT']))]
print(hdf.prune_method.unique())


In [None]:
create_report_paper(hdf,using_autorank=False,measure='total_pruned',two_decimals=False)

In [None]:
create_report_paper(hdf,using_autorank=False,measure='avg_size',two_decimals=False)

In [None]:
import random

In [None]:

def get_list_and_count(seed, size):
    """Return the indices in ``range(size)`` selected by a seeded random draw.

    The global ``random`` generator is seeded with ``seed``; one uniform
    number in [0, 1] is drawn per index, and the index is kept when the draw
    is <= 0.1. Only the list is returned.
    """
    random.seed(seed)
    return [index for index in range(size) if random.uniform(0, 1) <= 0.1]

In [None]:
lst1 = get_list_and_count(1,10000)
print(lst1,'\n',len(lst1))

In [None]:
lst2 = get_list_and_count(2,10000)
print(lst2,'\n',len(lst2))

In [None]:
# Number of indices selected under both seeds. Each list holds unique indices,
# so a set intersection gives the same count without the quadratic list scan.
len(set(lst1) & set(lst2))