# Wilcoxon SIGNED t-test-Variance
By: Sam<br>
Updated at: 16/10/2022<br>
Compare performance of discretizers <br>
Matched pairs settings
Sample: 270 ML models after discretization (as at 16/10)
Purpose: pair-wise comparison metrics of the models using different discretization method

===

Input data: instrinsic properties and model performance metrics
!!! **NB: Please update the data for metrics and export to csv before running this script!

In [1]:
# Import library
import pandas as pd
import numpy as np
from scipy import stats
import math
import random

In [2]:
# Import evaluation data (updated at 16/10/2022)
data = pd.read_csv("all_evaluation_161022.csv")

In [3]:
data.head()

Unnamed: 0,dataset,disc,param,inconsistency,models,accuracy,con_features,time_disc,time_train,bias,variance
0,iris,EWD,4,0.06666667,ID3,0.84,4,0.016412,0.008698225,0.158,0.055
1,iris,EWD,7,0.02,ID3,0.79,4,0.015692,0.010634899,0.158,0.054
2,iris,EWD,10,0.006666667,ID3,0.95,4,0.01638,0.010643005,0.053,0.014
3,iris,EFD,4,0.04,ID3,0.84,4,0.016688,0.009439945,0.158,0.049
4,iris,EFD,7,0.04,ID3,0.95,4,0.023941,0.010675907,0.053,0.07


In [4]:
data['disc'].unique() # get list of discretizers

array(['EWD', 'EFD', 'FFD', 'ChiMerge', 'DT'], dtype=object)

In [5]:
data.columns

Index(['dataset', 'disc', 'param', 'inconsistency', 'models', 'accuracy',
       'con_features', 'time_disc', 'time_train', 'bias', 'variance'],
      dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   dataset        270 non-null    object 
 1   disc           270 non-null    object 
 2   param          270 non-null    int64  
 3   inconsistency  270 non-null    object 
 4   models         270 non-null    object 
 5   accuracy       270 non-null    object 
 6   con_features   270 non-null    int64  
 7   time_disc      270 non-null    float64
 8   time_train     269 non-null    object 
 9   bias           234 non-null    object 
 10  variance       234 non-null    object 
dtypes: float64(1), int64(2), object(8)
memory usage: 23.3+ KB


In [7]:
data['models'].unique()

array(['ID3', 'CNB', 'Knn-VDM'], dtype=object)

# Wilcoxon_Complete pipeline

##.  Wilcoxon signed t test, Variance
Ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html
Implement 4 replications: (DONE)
- Regardless algorithms
- Filter for each algorithm: CNB, ID3, Knn

## 1. Variance, no filter in algorithm

In [8]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_variance = pd.to_numeric(data[data['disc']=="EWD"]['variance'],errors='coerce').tolist()
efd_variance = pd.to_numeric(data[data['disc']=="EFD"]['variance'],errors='coerce').tolist()
ffd_variance = pd.to_numeric(data[data['disc']=="FFD"]['variance'],errors='coerce').tolist()
cm_variance = pd.to_numeric(data[data['disc']=="ChiMerge"]['variance'],errors='coerce').tolist()
dt_variance = pd.to_numeric(data[data['disc']=="DT"]['variance'],errors='coerce').tolist()
# Check number of metrics available for each discretizer
# print(len(ewd_variance))
# print(len(efd_variance))
# print(len(ffd_variance))
# print(len(cm_variance))
# print(len(dt_variance))

# Step 2: filter numeric values
raw_list = [ewd_variance, efd_variance, ffd_variance, cm_variance, dt_variance]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

In [9]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
           # print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)

In [10]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [11]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [12]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,252.5,0.616843
1,ewd vs ffd,194.0,0.122209
2,ewd vs cm,256.5,0.888461
3,ewd vs dt,260.5,0.72082
5,efd vs ffd,217.0,0.256521
6,efd vs cm,235.5,0.421357
7,efd vs dt,277.5,0.957249
10,ffd vs cm,182.0,0.078399
11,ffd vs dt,192.5,0.115836
15,cm vs dt,245.5,0.531717


## 1.2 Variance, only CNB

In [13]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain variance for each discretization, convert into numeric, string values will be return as NaN
# Filter for CNB
# Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_variance_cnb = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="CNB")]['variance'],errors='coerce').tolist()
efd_variance_cnb = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="CNB")]['variance'],errors='coerce').tolist()
ffd_variance_cnb = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="CNB")]['variance'],errors='coerce').tolist()
cm_variance_cnb = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="CNB")]['variance'],errors='coerce').tolist()
dt_variance_cnb = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="CNB")]['variance'],errors='coerce').tolist()
# Check number of metrics available for each discretizer
print(len(ewd_variance_cnb))
print(len(efd_variance_cnb))
print(len(ffd_variance_cnb))
print(len(cm_variance_cnb))
print(len(dt_variance_cnb))

# Step 2: filter numeric values
raw_list = [ewd_variance_cnb,efd_variance_cnb, ffd_variance_cnb, cm_variance_cnb, dt_variance_cnb]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

15
15
20
20
20


In [14]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            #print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)



In [15]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [16]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [17]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,31.0,0.569336
1,ewd vs ffd,12.0,0.03418
2,ewd vs cm,30.0,0.789675
3,ewd vs dt,32.0,0.62207
5,efd vs ffd,24.0,0.266113
6,efd vs cm,13.5,0.052246
7,efd vs dt,27.0,0.380371
10,ffd vs cm,10.0,0.020996
11,ffd vs dt,5.0,0.004883
13,cm vs efd,13.5,0.04248


## 2 Variance, only ID3

In [18]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain variance for each discretization, convert into numeric, string values will be return as NaN
# Filter for ID3
# Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_variance_ID3 = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="ID3")]['variance'],errors='coerce').tolist()
efd_variance_ID3 = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="ID3")]['variance'],errors='coerce').tolist()
ffd_variance_ID3 = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="ID3")]['variance'],errors='coerce').tolist()
cm_variance_ID3 = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="ID3")]['variance'],errors='coerce').tolist()
dt_variance_ID3 = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="ID3")]['variance'],errors='coerce').tolist()
# Check number of metrics available for each discretizer
print(len(ewd_variance_ID3))
print(len(efd_variance_ID3))
print(len(ffd_variance_ID3))
print(len(cm_variance_ID3))
print(len(dt_variance_ID3))

# Step 2: filter numeric values
raw_list = [ewd_variance_ID3,efd_variance_ID3, ffd_variance_ID3, cm_variance_ID3, dt_variance_ID3]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

15
15
20
20
20


In [19]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            #print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)



In [20]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [21]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [22]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,12.5,0.012016
1,ewd vs ffd,56.0,0.846924
2,ewd vs cm,46.5,0.488708
3,ewd vs dt,49.0,0.826091
5,efd vs ffd,52.0,0.678772
6,efd vs cm,33.0,0.135376
7,efd vs dt,40.5,0.302795
10,ffd vs cm,1.0,0.002863
12,cm vs ewd,46.5,0.454285
17,dt vs efd,40.5,0.276855


## 3 Variance, only KNN-VDM


In [23]:
# Preparation: Prepare list of metrics for each discretization (test_list)
# Step 1: Obtain variance for each discretization, convert into numeric, string values will be return as NaN
# Filter for Knn-VDM
# Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_variance_knn = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="Knn-VDM")]['variance'],errors='coerce').tolist()
efd_variance_knn = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="Knn-VDM")]['variance'],errors='coerce').tolist()
ffd_variance_knn = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="Knn-VDM")]['variance'],errors='coerce').tolist()
cm_variance_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['variance'],errors='coerce').tolist()
dt_variance_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['variance'],errors='coerce').tolist()

# Check number of metrics available for each discretizer
print(len(ewd_variance_knn))
print(len(efd_variance_knn))
print(len(ffd_variance_knn))
print(len(cm_variance_knn))
print(len(dt_variance_knn))

# Step 2: filter numeric values
raw_list = [ewd_variance_knn,efd_variance_knn, ffd_variance_knn, cm_variance_knn, dt_variance_knn]
num_list = [] # filter numeric values only
for metric in raw_list:
#     metric_new = []
#     for x in metric:
#         if math.isnan(x) == False:
#             metric_new.append(x)
    metric_new = [x for x in metric if (math.isnan(x) == False)] # using list comprehension
    #print(metric_new)
    #print(len(metric_new))
    num_list.append(metric_new)
    
# Step 3: random sample
# For discretization methods with different value, randomly select so that the final sample size are equal
# Reference: https://docs.python.org/3/library/random.html
k = min(len(metric) for metric in num_list)
test_list = []
for metric in num_list:
    random.seed(20)
    if len(metric) > k:
        metric = random.sample(metric, k=k)
    else: metric = metric
    #print(metric)
    test_list.append(metric)

15
15
20
20
20


In [24]:
# Initialiation
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = test_list # list of metrics for each discretization after preparation
test_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  

In [25]:
# Create loop for Wilcoxon test (two sided) - handle diff = 0
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            #print(f'{i} vs {j}')
            # Compute difference list
            diff_list = []
            for m in range(0, len(disc[i])):
                diff = disc[i][m] - disc[j][m]
                #print(disc[i][m])
                #print(disc[j][m])
#                 print('diff = ', diff)
#                 print('-------------')
                diff_list.append(diff)
                # print(diff_list)
                
            if all(item == 0 for item in diff_list) == False: # if the diff list does not contain all 0
                test_stat.append(stats.wilcoxon(diff_list).statistic)
                p_value.append(stats.wilcoxon(diff_list).pvalue)
            else: # if the diff list contain only0, cannot do Wilcoxon test
                error = 'N/A'
                test_stat.append(error)
                p_value.append(error)
# print(test_stat)
# print(p_value)

In [26]:
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
test_stat = pd.DataFrame(test_stat, columns=['wtest_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [27]:
# Result table
wt_result = pd.concat([disc_compare, test_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [28]:
wt_result

Unnamed: 0,disc_compare,wtest_stat,p_value
0,ewd vs efd,8.0,0.6875
1,ewd vs ffd,9.0,0.84375
6,efd vs cm,10.0,1.0
15,cm vs dt,,
