# Wilcoxon SIGNED test-BIAS
By: Sam<br>
Updated at: 16/10/2022<br>
Compare performance of discretizers <br>
Matched pairs settings
Sample: 270 ML models after discretization (as at 16/10/2022)
Purpose: pair-wise comparison metrics of the models using different discretization method

===

Input data: instrinsic properties and model performance metrics
!!! **NB: Please update the data for metrics and export to csv before running this script!

In [3]:
# Import library
import pandas as pd
import numpy as np
from scipy import stats
import math
import random

In [4]:
# Import evaluation data (updated at 16/10/2022)
data = pd.read_csv("all_evaluation_161022.csv")

In [5]:
data.head()

Unnamed: 0,dataset,disc,param,inconsistency,models,accuracy,con_features,time_disc,time_train,bias,variance
0,iris,EWD,4,0.06666667,ID3,0.84,4,0.016412,0.008698225,0.158,0.055
1,iris,EWD,7,0.02,ID3,0.79,4,0.015692,0.010634899,0.158,0.054
2,iris,EWD,10,0.006666667,ID3,0.95,4,0.01638,0.010643005,0.053,0.014
3,iris,EFD,4,0.04,ID3,0.84,4,0.016688,0.009439945,0.158,0.049
4,iris,EFD,7,0.04,ID3,0.95,4,0.023941,0.010675907,0.053,0.07


In [6]:
data['disc'].unique() # get list of discretizers

array(['EWD', 'EFD', 'FFD', 'ChiMerge', 'DT'], dtype=object)

In [7]:
data.columns

Index(['dataset', 'disc', 'param', 'inconsistency', 'models', 'accuracy',
       'con_features', 'time_disc', 'time_train', 'bias', 'variance'],
      dtype='object')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   dataset        270 non-null    object 
 1   disc           270 non-null    object 
 2   param          270 non-null    int64  
 3   inconsistency  270 non-null    object 
 4   models         270 non-null    object 
 5   accuracy       270 non-null    object 
 6   con_features   270 non-null    int64  
 7   time_disc      270 non-null    float64
 8   time_train     269 non-null    object 
 9   bias           234 non-null    object 
 10  variance       234 non-null    object 
dtypes: float64(1), int64(2), object(8)
memory usage: 23.3+ KB


In [9]:
data['models'].unique()

array(['ID3', 'CNB', 'Knn-VDM'], dtype=object)

# Wilcoxon_Complete pipeline

## 1.  Wilcoxon signed t test, bias
Ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html
Implement 4 replications: (DONE)
- Regardless algorithms
- Filter for each algorithm: CNB, ID3, Knn

## 1. 1 Bias, no filter in algorithm

In [10]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias = pd.to_numeric(data[data['disc']=="EWD"]['bias'],errors='coerce').tolist()
efd_bias = pd.to_numeric(data[data['disc']=="EFD"]['bias'],errors='coerce').tolist()
ffd_bias = pd.to_numeric(data[data['disc']=="FFD"]['bias'],errors='coerce').tolist()
cm_bias = pd.to_numeric(data[data['disc']=="ChiMerge"]['bias'],errors='coerce').tolist()
dt_bias = pd.to_numeric(data[data['disc']=="DT"]['bias'],errors='coerce').tolist()
# Check number of metrics available for each discretizer
# print(len(ewd_bias))
# print(len(efd_bias))
# print(len(ffd_bias))
# print(len(cm_bias))
# print(len(dt_bias))

# Step 2: filter numeric values
raw_list = [ewd_bias, efd_bias, ffd_bias, cm_bias, dt_bias]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

In [11]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            # print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)

In [12]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [13]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [14]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,251.5,0.81517
1,ewd vs ffd,201.0,0.238783
2,ewd vs cm,267.0,0.80938
3,ewd vs dt,218.5,0.267936
5,efd vs ffd,240.0,0.653594
6,efd vs cm,248.5,0.771929
7,efd vs dt,187.0,0.231913
10,ffd vs cm,236.0,0.600577
11,ffd vs dt,124.0,0.008847
15,cm vs dt,211.0,0.214305


## 1.2 Bias, only CNB

In [15]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain bias for each discretization, convert into numeric, string values will be return as NaN
# Filter for CNB
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias_cnb = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="CNB")]['bias'],errors='coerce').tolist()
efd_bias_cnb = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="CNB")]['bias'],errors='coerce').tolist()
ffd_bias_cnb = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="CNB")]['bias'],errors='coerce').tolist()
cm_bias_cnb = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="CNB")]['bias'],errors='coerce').tolist()
dt_bias_cnb = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="CNB")]['bias'],errors='coerce').tolist()
# Check number of metrics available for each discretizer
print(len(ewd_bias_cnb))
print(len(efd_bias_cnb))
print(len(ffd_bias_cnb))
print(len(cm_bias_cnb))
print(len(dt_bias_cnb))

# Step 2: filter numeric values
raw_list = [ewd_bias_cnb,efd_bias_cnb, ffd_bias_cnb, cm_bias_cnb, dt_bias_cnb]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

15
15
20
20
20


In [16]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            # print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                # diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)

In [17]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [18]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [19]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,0.0,0.000488


## 1.2 Bias, only ID3

In [20]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain bias for each discretization, convert into numeric, string values will be return as NaN
# Filter for ID3
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias_ID3 = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="ID3")]['bias'],errors='coerce').tolist()
efd_bias_ID3 = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="ID3")]['bias'],errors='coerce').tolist()
ffd_bias_ID3 = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="ID3")]['bias'],errors='coerce').tolist()
cm_bias_ID3 = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="ID3")]['bias'],errors='coerce').tolist()
dt_bias_ID3 = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="ID3")]['bias'],errors='coerce').tolist()
# Check number of metrics available for each discretizer
print(len(ewd_bias_ID3))
print(len(efd_bias_ID3))
print(len(ffd_bias_ID3))
print(len(cm_bias_ID3))
print(len(dt_bias_ID3))

# Step 2: filter numeric values
raw_list = [ewd_bias_ID3,efd_bias_ID3, ffd_bias_ID3, cm_bias_ID3, dt_bias_ID3]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

15
15
20
20
20


In [21]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            # print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)



In [22]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [23]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [24]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,12.0,0.03417
1,ewd vs ffd,36.5,0.207764
2,ewd vs cm,33.0,0.220899
3,ewd vs dt,27.0,0.063721
5,efd vs ffd,43.5,0.389404
6,efd vs cm,40.0,0.432626
7,efd vs dt,29.0,0.083252
8,ffd vs ewd,36.5,0.187622
9,ffd vs efd,43.5,0.359131
10,ffd vs cm,22.0,0.18217


## 1.3 Bias, only KNN-VDM


In [25]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain bias for each discretization, convert into numeric, string values will be return as NaN
# Filter for Knn-VDM
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias_knn = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="Knn-VDM")]['bias'],errors='coerce').tolist()
efd_bias_knn = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="Knn-VDM")]['bias'],errors='coerce').tolist()
ffd_bias_knn = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="Knn-VDM")]['bias'],errors='coerce').tolist()
cm_bias_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['bias'],errors='coerce').tolist()
dt_bias_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['bias'],errors='coerce').tolist()

# Check number of metrics available for each discretizer
print(len(ewd_bias_knn))
print(len(efd_bias_knn))
print(len(ffd_bias_knn))
print(len(cm_bias_knn))
print(len(dt_bias_knn))

# Step 2: filter numeric values
raw_list = [ewd_bias_knn,efd_bias_knn, ffd_bias_knn, cm_bias_knn, dt_bias_knn]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

15
15
20
20
20


In [26]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

In [27]:
# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            #print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)



In [28]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [29]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [30]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,4.0,0.21875
1,ewd vs ffd,6.0,0.4375
2,ewd vs cm,7.0,0.5625
5,efd vs ffd,8.5,0.84375
6,efd vs cm,6.0,0.68583
9,ffd vs efd,8.5,0.6875
10,ffd vs cm,10.0,1.0
15,cm vs dt,,
