# Benchmark of Active Learning

### tools

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
from scipy import stats

In [3]:
def align_idx(idxArr_list):
    """Return the indices shared by every non-missing array in ``idxArr_list``.

    Parameters
    ----------
    idxArr_list : sequence of (array-like or None)
        One index array per query strategy. ``None`` marks a strategy with
        no usable results and is ignored, wherever it appears in the list
        (including the first position).

    Returns
    -------
    numpy.ndarray
        Intersection of all non-``None`` arrays. With a single non-``None``
        array that array is returned as-is; with two or more the result is
        sorted and unique (``np.intersect1d`` semantics). An empty integer
        array is returned when there is nothing to intersect.
    """
    res = None
    for idxArr in idxArr_list:
        if idxArr is None:
            continue

        if res is None:
            # First usable entry seeds the intersection.
            res = np.asarray(idxArr)
        else:
            res = np.intersect1d(res, idxArr)

    if res is None:
        # Empty input or every entry missing: no common index exists.
        return np.array([], dtype=int)
    return res

In [4]:
def tinterval_check(mean_poy, std_poy, n_poy, mean_XZ2021):
    """Check whether a reference mean lies inside our t confidence intervals.

    Parameters
    ----------
    mean_poy : float
        Sample mean of our results.
    std_poy : float
        Sample standard deviation of our results.
    n_poy : int
        Number of samples behind ``mean_poy`` / ``std_poy``.
    mean_XZ2021 : float
        Reference mean to compare against.

    Returns
    -------
    tuple of (int, int)
        ``(decision_95, decision_99)``. Each decision is 0 when
        ``mean_XZ2021`` falls inside the corresponding confidence interval
        (not significantly different) and 1 otherwise.
    """
    se = std_poy / np.sqrt(n_poy)

    decisions = []
    for confidence in (0.95, 0.99):
        # The confidence level is passed positionally: SciPy renamed the
        # keyword from ``alpha`` to ``confidence``, so a positional argument
        # is the only spelling accepted by both old and new releases.
        lower, upper = stats.t.interval(confidence, df=n_poy - 1, loc=mean_poy, scale=se)
        # 0: not significantly different at this level, 1: significantly different
        decisions.append(0 if lower <= mean_XZ2021 <= upper else 1)

    decision_95, decision_99 = decisions
    return decision_95, decision_99

### Download all results

Link: [https://drive.google.com/file/d/1qzezDD_fe43ctNBHC4H5W0w6skJcBlxB/view?usp=share_link](https://drive.google.com/file/d/1qzezDD_fe43ctNBHC4H5W0w6skJcBlxB/view?usp=share_link)

## Align results

We align all results on
- small datasets $n < 2000$ : more than 100 indices
- large datasets $n \geq 2000$ : more than 10 indices.

In [5]:
# Query strategies, grouped by the library they come from.
qs_list = [
    # libact
    'uniform', 'us', 'qbc', 'hintsvm', 'quire', 'albl', 'dwus', 'vr', 'kcenter',
    # google
    'margin', 'graph', 'hier', 'infodiv', 'mcm',
    # alipy
    'eer', 'bmdr', 'spal', 'lal',
    'bsoDtst',
]

# Datasets aligned on 100 experiment indices.
small_data_list = [
    "appendicitis", "sonar", "parkinsons", "ex8b", "heart",
    "haberman", "ionosphere", "clean1", "breast", "wdbc",
    "australian", "diabetes", "mammographic", "ex8a", "tic",
    "german", "splice", "gcloudb", "gcloudub", "checkerboard",
]

# Datasets aligned on 10 experiment indices.
large_data_list = [
    "spambase", "banana", "phoneme",
    "ringnorm", "twonorm", "phishing",
]

# Small datasets first, then large ones.
data_list = [*small_data_list, *large_data_list]

In [6]:
# Every file in the results folder; non-CSV entries are filtered where the list is consumed.
names = os.listdir('./aubc/')
# Query strategy name -> its position in qs_list.
qs_map_pos = dict(zip(qs_list, range(len(qs_list))))
# Per dataset: one slot per query strategy, None until that strategy's indices are loaded.
table3_idx = {k: [None] * len(qs_list) for k in data_list}

In [7]:
# Collect, per (dataset, query strategy), the experiment numbers that have results.
for name in names:
    if not name.endswith('.csv'):
        continue

    # File names are '-'-separated: dataset first, query strategy second.
    terms = name.split('-')
    data = terms[0]
    # 'look' files get the last four characters of the final '_' chunk of
    # field 6 appended to the strategy name.
    qs = terms[1] + terms[6].split('_')[-1][-4:] if 'look' in name else terms[1]

    res = pd.read_csv(os.path.join('./aubc/', name))
    idx = res['res_expno'].unique()

    # Skip combinations with too few runs: 10 for large datasets, 100 otherwise.
    is_large = data in large_data_list
    if is_large and len(idx) < 10:
        print(f'{data}-{qs}: {len(idx)} < 10 times')
        continue
    if not is_large and len(idx) < 100:
        print(f'{data}-{qs}: {len(idx)} < 100 times')
        continue

    table3_idx[data][qs_map_pos[qs]] = idx

clean1-vr: 8 < 100 times
phoneme-vr: 6 < 10 times
checkerboard-spal: 96 < 100 times
spambase-quire: 1 < 10 times


In [8]:
# For every dataset, intersect the experiment numbers across query strategies
# and keep the first 10 (large datasets) or 100 (small datasets) of them.
rows = []
for data, idx_per_qs in table3_idx.items():
    align_idx_arr = align_idx(idx_per_qs)
    n_exp = len(align_idx_arr)  # size of the intersection before truncation

    n_keep = 10 if data in large_data_list else 100
    align_idx_arr = align_idx_arr[:n_keep]
    assert align_idx_arr.shape[0] == n_keep, f'Size of {data} is not correct. {(align_idx_arr.shape[0])}'

    # The index list is stored as its string representation.
    rows.append([data, n_exp, f'{align_idx_arr.tolist()}'])

aligned_idx = pd.DataFrame(rows)
# aligned_idx.to_csv('output/aligned_idx.csv')

In [9]:
# Preview: column 0 = dataset, 1 = size of the common index set, 2 = stringified kept indices.
aligned_idx.head()

Unnamed: 0,0,1,2
0,appendicitis,150,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,sonar,140,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,parkinsons,150,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,ex8b,148,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14..."
4,heart,150,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


## Mean AUBCs

- small datasets $n < 2000$ : only use first 100 indices
- large datasets $n \geq 2000$ : only use first 10 indices.

Calculate average (mean) and standard deviation of AUBCs.

In [10]:
import ast

# Map each dataset (column 0) to its aligned experiment numbers. Column 2
# holds the list as a string; ast.literal_eval parses it back while accepting
# only Python literals, unlike eval which would execute arbitrary code.
aligned_idx_dict = {}
for data, idx in zip(aligned_idx[0], aligned_idx[2]):
    aligned_idx_dict[data] = ast.literal_eval(idx)

In [11]:
# Mean and standard deviation of the test AUBC for every (dataset, query
# strategy) result file, restricted to the aligned experiment numbers.
table3_mean = []
table3_std = []
index = []  # (dataset, query strategy) pairs seen so far
for name in names:
    if not name.endswith('.csv'):
        continue

    # File names are '-'-separated: dataset first, query strategy second.
    terms = name.split('-')
    if 'look' in name:
        qs = terms[1] + terms[6].split('_')[-1][-4:]
    else:
        qs = terms[1]

    data = terms[0]

    # Two files for the same pair would make the later pivot ambiguous; fail
    # loudly instead of dropping into the debugger (which hangs when the
    # notebook is executed non-interactively).
    if (data, qs) in index:
        raise ValueError(
            f'Duplicate result file for dataset {data!r} and query strategy {qs!r}: {name}'
        )

    index.append((data, qs))
    res = pd.read_csv(os.path.join('./aubc/', name))

    # aligned index
    if aligned_idx_dict is not None:
        res = res[res['res_expno'].isin(aligned_idx_dict[data])]

    # Skip combinations that do not reach the required number of runs
    # (10 for large datasets, 100 otherwise) after alignment.
    cnt_aubc = res['res_tst_score'].count()
    if data in large_data_list:
        if cnt_aubc < 10:
            continue
    else:
        if cnt_aubc < 100:
            continue

    mean_aubc = res['res_tst_score'].mean()
    std_aubc = res['res_tst_score'].std()
    mean_aubc = round(mean_aubc, 4)
    std_aubc = round(std_aubc, 4)

    table3_mean.append([data, qs, mean_aubc])
    table3_std.append([data, qs, std_aubc])

In [12]:
# Long format: one row per (dataset, query strategy).
table3_mean = pd.DataFrame(table3_mean)
table3_mean.columns = ['data', 'qs', 'aubc_mean']
table3_std = pd.DataFrame(table3_std)
table3_std.columns = ['data', 'qs', 'aubc_std']

# Wide format: query strategies as rows, datasets as columns.
table3_mean = table3_mean.pivot(index=['qs'], columns=['data'], values='aubc_mean')
table3_std = table3_std.pivot(index=['qs'], columns=['data'], values='aubc_std')

In [13]:
# Put rows in qs_list order and columns in data_list order; combinations
# without a result become NaN.
table3_mean = table3_mean.reindex(index=qs_list, columns=data_list)
table3_std = table3_std.reindex(index=qs_list, columns=data_list)

Export to LaTeX.

In [14]:
# Query strategies without the first ('uniform') and last ('bsoDtst') entries of qs_list.
al_methods = qs_list[1:-1]

In [15]:
# Highlight, per dataset, the three largest mean AUBCs and the three smallest
# standard deviations among the AL methods.
# Braces and the backslash are built with chr() so they never clash with
# f-string syntax.
lbracebracket = chr(123)  # '{'
rbracebracket = chr(125)  # '}'
tbf = chr(92) + 'textbf'
tit = chr(92) + 'textit'
udl = chr(92) + 'underline'

# (prefix, suffix) wrapped around the 1st / 2nd / 3rd ranked mean:
# bold-italic, bold, italic.
mean_styles = [
    (f'{tbf}{lbracebracket}{tit}{lbracebracket}', f'{rbracebracket}{rbracebracket}'),
    (f'{tbf}{lbracebracket}', f'{rbracebracket}'),
    (f'{tit}{lbracebracket}', f'{rbracebracket}'),
]
# All three ranked standard deviations are rendered bold-italic.
std_styles = [mean_styles[0]] * 3


def _pick_three(candidates):
    """Reduce the ranked candidates to three labels, dropping 'infodiv' if present."""
    if 'infodiv' in candidates:
        return candidates.drop('infodiv')
    return candidates[:3]


# str of mean and std AUBCs
report1_mean = table3_mean.astype(str)
report1_std = table3_std.astype(str)
for d in table3_mean.columns:
    # Take one extra candidate, as margin == infodiv in the current setting.
    bst_q = _pick_three(table3_mean.loc[al_methods, d].nlargest(3+1).index)
    for rank, (prefix, suffix) in enumerate(mean_styles):
        q = bst_q[rank]
        report1_mean.loc[q, d] = f'{prefix}{report1_mean.loc[q, d]}{suffix}'

    bst_q_std = _pick_three(table3_std.loc[al_methods, d].nsmallest(3+1).index)
    for rank, (prefix, suffix) in enumerate(std_styles):
        q = bst_q_std[rank]
        report1_std.loc[q, d] = f'{prefix}{report1_std.loc[q, d]}{suffix}'

In [16]:
# Row order of the report (one row per query strategy).
report_index = ['uniform', 'us', 'qbc', 'hintsvm', 'quire', 'albl', 'dwus', 'vr',
                'kcenter', 'margin', 'graph', 'hier', 'infodiv', 'mcm', 'eer', 'bmdr',
                'spal', 'lal', 'bsoDtst']

# Each cell becomes 'mean(std)'.
report1 = (report1_mean + '(' + report1_std + ')').loc[report_index, :]

# Same labels, except the last one is shown as 'bso' instead of 'bsoDtst'.
report1.index = report_index[:-1] + ['bso']

# Datasets as rows, query strategies as columns.
report1 = report1.transpose()

In [17]:
# Cells whose mean and std are both NaN render as 'nan(nan)'; label them instead.
report1 = report1.replace(to_replace='nan(nan)', value='too long (time)')

In [18]:
# These two combinations were reported with too few runs when the indices
# were collected, so they are marked 'error' rather than 'too long (time)':
# checkerboard-spal: 96 < 100 times
# spambase-quire: 1 < 10 times
report1.loc['checkerboard', 'spal'] = 'error'
report1.loc['spambase', 'quire'] = 'error'

In [19]:
# Load the XZ2021 reference table, indexed by its 'XZ2021' column.
xz2021_table3 = pd.read_csv('table3-xz2021.csv').set_index('XZ2021')

In [20]:
# For every dataset in the XZ2021 table, underline four cells of our report:
# XZ2021's best method, XZ2021's worst method, 'uniform' and 'bso'.
xz2021_table3_data = xz2021_table3.index
for d in xz2021_table3_data:
    targets = (
        xz2021_table3.loc[d, 'BEST_mhd'],
        xz2021_table3.loc[d, 'WORST_mhd'],
        'uniform',
        'bso',
    )
    for qs in targets:
        report1.loc[d, qs] = f'{udl}{lbracebracket}{report1.loc[d, qs]}{rbracebracket}'

In [21]:
# Write the report as LaTeX tables of six query-strategy columns each.
# None disables column-width truncation; the former -1 sentinel is deprecated
# and rejected by recent pandas releases.
with pd.option_context("display.max_colwidth", None):
    # report1 has one column per query strategy (19), so the four slices
    # starting at 0, 6, 12 and 18 cover all of them.
    for i, col_i in enumerate(range(0, 24, 6)):
        # escape=False keeps the LaTeX markup already embedded in the cells.
        report1.iloc[:, col_i:col_i+6].to_latex(f'./aubc/output/table1-intermediate-{i}.tex',
                                                position='h',
                                                caption='Intermediate table of mean(std) AUBCs',
                                                escape=False)

## Reproducibility of Table 3

We check whether the mean AUBCs reported in [Zhan et al., 2021] lie within our
- confidence interval at the $\alpha=0.05$ significance level.
- confidence interval at the $\alpha=0.01$ significance level.

We assume that both experiments use the same settings,
so that they generate independent and identically distributed (i.i.d.) results.

*ChatGPT*
> If you have the mean of one sample and you want to compare it to the median of another sample, you can use the confidence interval for the mean of the first sample to see if the median of the second sample falls within the interval. This will give you an idea of whether the median of the second sample is significantly different from the mean of the first sample, but it will not be the same as the Mann-Whitney U test, which compares the medians of two independent samples.
> This will calculate the 95% confidence interval for the mean of the first sample. You can then compare the median of the second sample to this interval to see if it falls within the interval. If the median falls within the interval, it suggests that the median is not significantly different from the mean of the first sample. If the median falls outside the interval, it suggests that the median is significantly different from the mean of the first sample.
> Keep in mind that this approach will give you an idea of whether the median of the second sample is significantly different from the mean of the first sample, but it will not provide a formal hypothesis test or p-value like the Mann-Whitney U test.

In [22]:
# Rename the columns positionally so that the first two match the names used
# in qs_list ('uniform', 'bsoDtst').
# NOTE(review): assumes the CSV columns already come in this order — confirm against the file.
xz2021_table3.columns = ['uniform', 'bsoDtst', 'Avg', 'BEST_val', 'BEST_mhd', 'WORST_val', 'WORST_mhd']
xz2021_table3.head()

Unnamed: 0_level_0,uniform,bsoDtst,Avg,BEST_val,BEST_mhd,WORST_val,WORST_mhd
XZ2021,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
appendicitis,0.836,0.881,0.844,0.859,eer,0.826,dwus
sonar,0.617,0.83,0.755,0.775,lal,0.732,hintsvm
parkinsons,0.84,0.865,0.845,0.858,qbc,0.829,hintsvm
ex8b,0.866,0.924,0.89,0.909,spal,0.864,hintsvm
heart,0.808,0.848,0.787,0.83,infodiv,0.718,dwus


In [23]:
# Spot check: our (mean, std) AUBC of 'uniform' on 'sonar'.
table3_mean.loc['uniform', 'sonar'], table3_std.loc['uniform', 'sonar']

(0.7463, 0.0379)

In [24]:
# Spot check of tinterval_check for 'uniform' on 'ionosphere'
# (a small dataset, hence n = 100 runs).
res = tinterval_check(table3_mean.loc['uniform', 'ionosphere'], table3_std.loc['uniform', 'ionosphere'], 100,
                      xz2021_table3.loc['ionosphere', 'uniform'])
res

(1, 1)

In [25]:
# Frame with the same index and columns as xz2021_table3, every cell set to None.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of DataFrame.map.
report2 = xz2021_table3.copy().applymap(lambda x: None)

In [26]:
# 1. uniform and bso: compare our mean with the value reported in XZ2021
for qs_name in ['uniform', 'bsoDtst']:
    for data_name in report2.index:
        # number of runs behind our mean / std
        n_samples = 10 if data_name in large_data_list else 100

        # nothing to compare against when XZ2021 has no value
        xz_value = xz2021_table3.loc[data_name, qs_name]
        if np.isnan(xz_value):
            continue

        d_95, d_99 = tinterval_check(
            table3_mean.loc[qs_name, data_name],
            table3_std.loc[qs_name, data_name],
            n_samples,
            xz_value,
        )

        # Report Poy's mean, with one '*' appended per confidence level
        # (95%, 99%) at which the XZ2021 value is significantly different.
        cell = table3_mean.loc[qs_name, data_name]
        if d_95 == 1:
            cell = f'{cell}*'
        if d_99 == 1:
            cell = f'{cell}*'
        report2.loc[data_name, qs_name] = cell

In [27]:
# 2. Avg: per-dataset statistics of the mean AUBCs over the AL methods
al_means = table3_mean.loc[al_methods, :]
report2_avg = al_means.mean().round(4)
report2_avg_std = al_means.std().round(6)
report2_avg_cnt = len(al_methods)

In [28]:
# Compare our per-dataset average over the AL methods with XZ2021's 'Avg'.
col = 'Avg'
for data_name in report2.index:
    # nothing to compare against when XZ2021 has no value
    if np.isnan(xz2021_table3.loc[data_name, col]):
        continue

    # pandas mean()/std() skip NaN, so the sample size behind the average is
    # the number of AL methods that actually have a result for this dataset,
    # not len(al_methods).
    n_valid = int(table3_mean.loc[al_methods, data_name].count())

    d_95_avg, d_99_avg = tinterval_check(
        report2_avg.loc[data_name],
        report2_avg_std.loc[data_name],
        n_valid,
        xz2021_table3.loc[data_name, col]
    )

    # update value with Poy's results
    report2.loc[data_name, col] = report2_avg.loc[data_name]

    # show results: one '*' per confidence level (95%, 99%) at which the
    # XZ2021 value is significantly different
    if d_95_avg == 1:
        report2.loc[data_name, col] = f'{report2.loc[data_name, col]}*'

    if d_99_avg == 1:
        report2.loc[data_name, col] = f'{report2.loc[data_name, col]}*'

In [29]:
# BEST: our best AL method per dataset and its mean AUBC.
# A trailing '*' marks a method that differs from the one listed by XZ2021.
for data_name in report2.index:
    qs_name = table3_mean.loc[al_methods, data_name].idxmax()
    report2.loc[data_name, 'BEST_val'] = table3_mean.loc[qs_name, data_name]

    same_method = xz2021_table3.loc[data_name, 'BEST_mhd'] == qs_name
    report2.loc[data_name, 'BEST_mhd'] = qs_name if same_method else f'{qs_name}*'

In [30]:
# WORST: our worst AL method per dataset and its mean AUBC.
# A trailing '*' marks a method that differs from the one listed by XZ2021.
for data_name in report2.index:
    qs_name = table3_mean.loc[al_methods, data_name].idxmin()
    report2.loc[data_name, 'WORST_val'] = table3_mean.loc[qs_name, data_name]

    same_method = xz2021_table3.loc[data_name, 'WORST_mhd'] == qs_name
    report2.loc[data_name, 'WORST_mhd'] = qs_name if same_method else f'{qs_name}*'

In [31]:
# Use the display names of the paper's table ('RS', 'BSO') for the first two
# columns, then write the comparison table as LaTeX.
report2.columns = ['RS', 'BSO', 'Avg', 'BEST_val', 'BEST_mhd', 'WORST_val', 'WORST_mhd']
report2.to_latex('./aubc/output/table2-table3.tex',
                 position='h',
                 label='tab2:tab3',
                 caption='Reproduction of Table 3~\\citep{XZ2021}')