In [15]:
import pandas as pd
import math
import numpy as np
import ipynb

from scipy import stats

import matplotlib.pyplot as plt

In [16]:
from ipynb.fs.full.Nemenyi import draw_cd_diagram

In [17]:
# Base names of the three datasets; each model directory holds one
# "<name>_metrics.csv" file per dataset.
filename1 = "online_shoppers_intentions"
filename2 = "marketing_campaign"
filename3 = "heart"

# The first CSV column is used as the row index of every metrics table.
df1_selftraining, df2_selftraining, df3_selftraining = (
    pd.read_csv("../pred/selftraining/" + dataset + "_metrics.csv", index_col=0)
    for dataset in (filename1, filename2, filename3)
)

df1_semiboost, df2_semiboost, df3_semiboost = (
    pd.read_csv("../pred/semiboost/" + dataset + "_metrics.csv", index_col=0)
    for dataset in (filename1, filename2, filename3)
)

df1_labelpropagation, df2_labelpropagation, df3_labelpropagation = (
    pd.read_csv("../pred/labelpropagation/" + dataset + "_metrics.csv", index_col=0)
    for dataset in (filename1, filename2, filename3)
)

In [18]:
# Labelling levels, kept as strings because they are concatenated into
# printed headings and dictionary keys further down.
unlabelled = [f"{level}" for level in (0, 10, 20, 50, 90, 95)]

In [4]:
def returnDataframe(accuracy_level, sampling):
    """Gather one metrics cell from each dataset table of each model.

    Args:
        accuracy_level: Row label looked up with ``.loc`` in every metrics
            table (the notebook passes labels such as ``'accuracy_0'``).
        sampling: Column label looked up with ``.loc`` in every metrics
            table (the notebook passes ``'noresampling'``,
            ``'undersampling'`` or ``'oversampling'``).

    Returns:
        A DataFrame built from a model -> {dataset -> value} mapping, i.e.
        one column per model and one row per dataset.
    """
    dataset_labels = ('online_shopping_intention', 'marketing_campaign', 'heart')
    tables_by_model = {
        'selftraining': (df1_selftraining, df2_selftraining, df3_selftraining),
        'semiboost': (df1_semiboost, df2_semiboost, df3_semiboost),
        'labelpropagation': (df1_labelpropagation, df2_labelpropagation, df3_labelpropagation),
    }

    data = {}
    for model, tables in tables_by_model.items():
        # Tables are listed in the same order as dataset_labels.
        data[model] = {
            label: table.loc[accuracy_level, sampling]
            for label, table in zip(dataset_labels, tables)
        }

    return pd.DataFrame(data)

In [5]:
# Suffixes of the 'accuracy_<level>' rows, in the order used everywhere below.
labelling_levels = ('0', '10', '20', '50', '90', '95')

# One table per labelling level for each resampling strategy. The individual
# tables are also unpacked into their own names for direct inspection.
noresampling_dfs = [
    returnDataframe('accuracy_' + level, 'noresampling') for level in labelling_levels
]
(noresampling_0_df, noresampling_10_df, noresampling_20_df,
 noresampling_50_df, noresampling_90_df, noresampling_95_df) = noresampling_dfs

undersampling_dfs = [
    returnDataframe('accuracy_' + level, 'undersampling') for level in labelling_levels
]
(undersampling_0_df, undersampling_10_df, undersampling_20_df,
 undersampling_50_df, undersampling_90_df, undersampling_95_df) = undersampling_dfs

oversampling_dfs = [
    returnDataframe('accuracy_' + level, 'oversampling') for level in labelling_levels
]
(oversampling_0_df, oversampling_10_df, oversampling_20_df,
 oversampling_50_df, oversampling_90_df, oversampling_95_df) = oversampling_dfs

# Column names of the tables above, in the order passed to the Friedman test.
model_names = ['selftraining', 'semiboost', 'labelpropagation']

In [6]:
# Sanity check: display the table built from the 'accuracy_0' row and the
# 'noresampling' column (one row per dataset, one column per model).

noresampling_0_df

Unnamed: 0,selftraining,semiboost,labelpropagation
online_shopping_intention,88.24,90.187,84.536
marketing_campaign,91.369,89.881,74.554
heart,65.934,60.44,60.44


# Friedman Test for All Labelling Levels and Resampling Combinations

In [7]:
# Results keyed by '<resampling>-<level>'. Values are stored as strings because
# a later cell concatenates them into LaTeX table rows.
f_stats = {}
p_values = {}
reject = {}

# (printed heading, dictionary key prefix, per-level tables) for each strategy.
friedman_runs = [
    ('No-resampling', 'None', noresampling_dfs),
    ('Undersampling', 'Undersampling', undersampling_dfs),
    ('Oversampling', 'Oversampling', oversampling_dfs),
]

for heading, key_prefix, level_dfs in friedman_runs:
    for i in range(6):
        level = unlabelled[i]
        scores = level_dfs[i]
        result_key = key_prefix + '-' + level

        print('______________________________________' + heading + '-' + level + '%:')
        # Each model column is one "treatment"; the datasets are the blocks.
        f_test = stats.friedmanchisquare(*(scores[name] for name in model_names))
        statistic, p_value = f_test[0], f_test[1]
        rejected = p_value < 0.05

        print(f"F-stat: {statistic}")
        f_stats[result_key] = f"{round(statistic, 3)}"
        print(f"p-value: {p_value}")
        p_values[result_key] = f"{round(p_value, 3)}"
        print(f"Reject: {rejected}")
        reject[result_key] = f"{rejected}"

______________________________________No-resampling-0%:
F-stat: 3.8181818181818183
p-value: 0.14821506633752016
Reject: False
______________________________________No-resampling-10%:
F-stat: 0.6666666666666643
p-value: 0.71653131057379
Reject: False
______________________________________No-resampling-20%:
F-stat: 1.2727272727272703
p-value: 0.529213341500051
Reject: False
______________________________________No-resampling-50%:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________No-resampling-90%:
F-stat: 3.8181818181818183
p-value: 0.14821506633752016
Reject: False
______________________________________No-resampling-95%:
F-stat: 1.2727272727272703
p-value: 0.529213341500051
Reject: False
______________________________________Undersampling-0%:
F-stat: 2.0
p-value: 0.36787944117144245
Reject: False
______________________________________Undersampling-10%:
F-stat: 0.5454545454545455
p-value: 0.7613003866968736
Reject: False
__________

In [11]:
resampling = ['None', 'Undersampling', 'Oversampling']
# Levels that follow the leading 0% row of each group. They get their own name
# so the six-element `unlabelled` list indexed by other cells is not overwritten.
remaining_levels = ['10', '20', '50', '90', '95']

# Rows are collected in a list and joined once. The accumulator is deliberately
# not called `str`: rebinding that name would break every later `str(...)` call.
latex_rows = []

for r in resampling:
    # First row of a group: the \multirow cell spans the six level rows.
    key = r + '-0'
    latex_rows.append(
        ' & '.join(['\\multirow{6}{*}{' + r + '}', '0',
                    f_stats[key], p_values[key], reject[key]])
        + ' \\\\  \n'
    )
    for u in remaining_levels:
        key = r + '-' + u
        # The last level closes the group with a horizontal rule.
        row_end = ' \\\\ \\hline \n' if u == '95' else ' \\\\ \n'
        latex_rows.append(
            ' & '.join(['{}', u, f_stats[key], p_values[key], reject[key]]) + row_end
        )

print(''.join(latex_rows))

\multirow{6}{*}{None} & 0 & 3.818 & 0.148 & False \\  
{} & 10 & 0.667 & 0.717 & False \\ 
{} & 20 & 1.273 & 0.529 & False \\ 
{} & 50 & 4.667 & 0.097 & False \\ 
{} & 90 & 3.818 & 0.148 & False \\ 
{} & 95 & 1.273 & 0.529 & False \\ \hline 
\multirow{6}{*}{Undersampling} & 0 & 2.0 & 0.368 & False \\  
{} & 10 & 0.545 & 0.761 & False \\ 
{} & 20 & 0.545 & 0.761 & False \\ 
{} & 50 & 1.636 & 0.441 & False \\ 
{} & 90 & 5.6 & 0.061 & False \\ 
{} & 95 & 3.714 & 0.156 & False \\ \hline 
\multirow{6}{*}{Oversampling} & 0 & 2.0 & 0.368 & False \\  
{} & 10 & 1.636 & 0.441 & False \\ 
{} & 20 & 3.818 & 0.148 & False \\ 
{} & 50 & 0.182 & 0.913 & False \\ 
{} & 90 & 2.0 & 0.368 & False \\ 
{} & 95 & 3.2 & 0.202 & False \\ \hline 



# Friedman Test for All Resampling Combinations (Mean for all labelling levels)

In [8]:
def returnDataframe_means(sampling):
    """Average one sampling column over the six accuracy rows of every table.

    Args:
        sampling: Column label looked up with ``.loc`` in each metrics table
            (the notebook passes ``'noresampling'``, ``'undersampling'`` or
            ``'oversampling'``).

    Returns:
        A DataFrame built from a model -> {dataset -> mean} mapping, where
        each mean is ``np.mean`` over the rows ``accuracy_0`` ...
        ``accuracy_95`` of the requested column.
    """
    accuracy_rows = ['accuracy_0', 'accuracy_10', 'accuracy_20',
                     'accuracy_50', 'accuracy_90', 'accuracy_95']
    dataset_labels = ('online_shopping_intention', 'marketing_campaign', 'heart')
    tables_by_model = {
        'selftraining': (df1_selftraining, df2_selftraining, df3_selftraining),
        'semiboost': (df1_semiboost, df2_semiboost, df3_semiboost),
        'labelpropagation': (df1_labelpropagation, df2_labelpropagation, df3_labelpropagation),
    }

    data = {
        model: {
            label: np.mean(table.loc[accuracy_rows, sampling])
            for label, table in zip(dataset_labels, tables)
        }
        for model, tables in tables_by_model.items()
    }

    return pd.DataFrame(data)

# One table of level-averaged values per resampling strategy, in the order
# noresampling, undersampling, oversampling.
means_dfs = [
    returnDataframe_means(method)
    for method in ('noresampling', 'undersampling', 'oversampling')
]
noresampling_df, undersampling_df, oversampling_df = means_dfs

means_dfs

[                           selftraining  semiboost  labelpropagation
 online_shopping_intention     84.995833  27.917833         84.536000
 marketing_campaign            55.456333  55.282667         52.455333
 heart                         58.791167  55.860833         56.959833,
                            selftraining  semiboost  labelpropagation
 online_shopping_intention     38.330333  27.715000         22.934500
 marketing_campaign            55.828333  56.398667         54.141833
 heart                         51.648333  52.380833         48.168500,
                            selftraining  semiboost  labelpropagation
 online_shopping_intention     28.638500  27.850167         25.178333
 marketing_campaign            57.043500  55.233000         52.653667
 heart                         52.197833  53.663000         49.633833]

In [10]:
resampling_methods = ['noresampling', 'undersampling', 'oversampling']

# One Friedman test per resampling method, comparing the three model columns.
for method, scores in zip(resampling_methods, means_dfs):
    print('______________________________________' + method + ':')
    f_test = stats.friedmanchisquare(*(scores[name] for name in model_names))
    statistic, p_value = f_test[0], f_test[1]
    print(f"F-stat: {statistic}")
    print(f"p-value: {p_value}")
    print(f"Reject: {p_value < 0.05}")

______________________________________noresampling:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________undersampling:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________oversampling:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False


# Friedman Test for All Labelling Levels (Mean for all Resampling Methods)

In [11]:
def returnDataframe_means2(accuracies):
    """Average one accuracy row over the three sampling columns of every table.

    Args:
        accuracies: Row label looked up with ``.loc`` in each metrics table
            (the notebook passes a single label such as ``'accuracy_0'``).

    Returns:
        A DataFrame built from a model -> {dataset -> mean} mapping, where
        each mean is ``np.mean`` over the ``noresampling``, ``undersampling``
        and ``oversampling`` columns of the requested row.
    """
    sampling = ['noresampling', 'undersampling', 'oversampling']
    dataset_labels = ('online_shopping_intention', 'marketing_campaign', 'heart')
    tables_by_model = {
        'selftraining': (df1_selftraining, df2_selftraining, df3_selftraining),
        'semiboost': (df1_semiboost, df2_semiboost, df3_semiboost),
        'labelpropagation': (df1_labelpropagation, df2_labelpropagation, df3_labelpropagation),
    }

    data = {
        model: {
            label: np.mean(table.loc[accuracies, sampling])
            for label, table in zip(dataset_labels, tables)
        }
        for model, tables in tables_by_model.items()
    }

    return pd.DataFrame(data)

# One table of resampling-averaged values per labelling level. Note that this
# rebinds `means_dfs`, which previously held the per-resampling tables.
means_dfs = [
    returnDataframe_means2('accuracy_' + level)
    for level in ('0', '10', '20', '50', '90', '95')
]
df0, df10, df20, df50, df90, df95 = means_dfs

means_dfs

[                           selftraining  semiboost  labelpropagation
 online_shopping_intention     87.014333  89.646000         72.596333
 marketing_campaign            91.567333  92.013667         74.901000
 heart                         50.549333  49.084333         52.747333,
                            selftraining  semiboost  labelpropagation
 online_shopping_intention     46.886667     15.464         38.758333
 marketing_campaign            50.892667     48.363         47.867000
 heart                         41.758000     54.945         49.084000,
                            selftraining  semiboost  labelpropagation
 online_shopping_intention     41.696000     15.464         38.479000
 marketing_campaign            50.793333     48.363         48.908667
 heart                         61.905000     54.945         52.381000,
                            selftraining  semiboost  labelpropagation
 online_shopping_intention     39.947667     15.464         38.488000
 marketing_campai

In [12]:
# One Friedman test per labelling level, comparing the three model columns.
for i in range(6):
    level, scores = unlabelled[i], means_dfs[i]
    print('______________________________________' + level + '%:')
    f_test = stats.friedmanchisquare(*(scores[name] for name in model_names))
    statistic, p_value = f_test[0], f_test[1]
    print(f"F-stat: {statistic}")
    print(f"p-value: {p_value}")
    print(f"Reject: {p_value < 0.05}")

______________________________________0%:
F-stat: 0.6666666666666643
p-value: 0.71653131057379
Reject: False
______________________________________10%:
F-stat: 0.6666666666666643
p-value: 0.71653131057379
Reject: False
______________________________________20%:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________50%:
F-stat: 0.6666666666666643
p-value: 0.71653131057379
Reject: False
______________________________________90%:
F-stat: 0.5454545454545455
p-value: 0.7613003866968736
Reject: False
______________________________________95%:
F-stat: 2.6666666666666643
p-value: 0.26359713811572705
Reject: False


# Friedman Test (Average of all Unlabelled and Resampling)

In [32]:
def returnFullDataframe():
    """Flatten every dataset/sampling/level value into one table per model.

    Returns:
        A DataFrame with one column per model. Its row labels have the form
        ``'<dataset>_<sampling>_<level>'`` where ``<dataset>`` is ``osi``,
        ``mc`` or ``h``, ``<sampling>`` is one of the three sampling column
        names and ``<level>`` is one of ``0, 10, 20, 50, 90, 95``.
    """
    levels = ['0', '10', '20', '50', '90', '95']
    resampling = ['noresampling', 'undersampling', 'oversampling']

    # Short dataset prefixes mapped to the matching metrics table, per model.
    tables_by_model = {
        'selftraining': {'osi': df1_selftraining, 'mc': df2_selftraining, 'h': df3_selftraining},
        'semiboost': {'osi': df1_semiboost, 'mc': df2_semiboost, 'h': df3_semiboost},
        'labelpropagation': {'osi': df1_labelpropagation, 'mc': df2_labelpropagation,
                             'h': df3_labelpropagation},
    }

    data = {model: {} for model in tables_by_model}

    # Insertion order (level, then sampling, then dataset) fixes the row order.
    for level in levels:
        row_label = 'accuracy_' + level
        for sampling in resampling:
            for model, tables in tables_by_model.items():
                for prefix, table in tables.items():
                    key = prefix + '_' + sampling + '_' + level
                    data[model][key] = table.loc[row_label, sampling]

    return pd.DataFrame(data)

# Combined table of all values: one column per model.
total_df = returnFullDataframe()
# Optional export of the combined table (disabled).
#total_df.to_csv("total.csv")

In [30]:
# Single Friedman test over all rows of the combined table.
f_test = stats.friedmanchisquare(
    total_df['selftraining'],
    total_df['semiboost'],
    total_df['labelpropagation'],
)
statistic, p_value = f_test[0], f_test[1]
print(f"F-stat: {statistic}")
print(f"p-value: {p_value}")
print(f"Reject: {p_value < 0.05}")

F-stat: 7.238095238095151
p-value: 0.02680819595378016
Reject: True


# Nemenyi Diagram

In [33]:
# Load the accuracies used for the diagram; index_col=False keeps the first
# CSV column as data instead of using it as the index.
# NOTE(review): the expected column layout is defined by draw_cd_diagram in the
# Nemenyi notebook — confirm the CSV matches it.
df_perf = pd.read_csv('../pred/NemenyiAccuracies.csv',index_col=False)

# draw_cd_diagram is imported from the Nemenyi notebook via ipynb.fs.full.
draw_cd_diagram(df_perf=df_perf, title='Accuracy', labels=True)

['selftraining' 'semiboost' 'labelpropagation']
labelpropagation    11.0
selftraining        23.0
semiboost           14.0
dtype: float64
labelpropagation    2.166667
semiboost           2.111111
selftraining        1.722222
dtype: float64
('labelpropagation', 'selftraining', 0.0029901159311325485, True)
('selftraining', 'semiboost', 0.016645787143917238, True)
('labelpropagation', 'semiboost', 0.8756503429323933, False)
Index(['labelpropagation', 'semiboost', 'selftraining'], dtype='object')
[0, 1]
