In [82]:
import pandas as pd
import math
import numpy as np

from scipy import stats

In [52]:
# Base names of the three datasets; "<name>_metrics.csv" is read for each of
# them from every method directory below.
filename1 = "online_shoppers_intentions"
filename2 = "marketing_campaign"
filename3 = "heart"

# One metrics table per (method, dataset). The first CSV column becomes the
# index, so rows can be addressed by label with .loc later on.
df1_selftraining, df2_selftraining, df3_selftraining = (
    pd.read_csv(f"../pred/selftraining/{name}_metrics.csv", index_col=0)
    for name in (filename1, filename2, filename3)
)

df1_semiboost, df2_semiboost, df3_semiboost = (
    pd.read_csv(f"../pred/semiboost/{name}_metrics.csv", index_col=0)
    for name in (filename1, filename2, filename3)
)

df1_labelpropagation, df2_labelpropagation, df3_labelpropagation = (
    pd.read_csv(f"../pred/labelpropagation/{name}_metrics.csv", index_col=0)
    for name in (filename1, filename2, filename3)
)

In [75]:
# Percentage labels (as strings) appended to the printed section headers.
# Presumably the share of unlabelled training data per experiment - confirm.
unlabelled = [str(pct) for pct in (0, 10, 20, 50, 90, 95)]

In [78]:
def returnDataframe(accuracy_level, sampling):
    """Collect one metric value per dataset and model into a single table.

    Reads ``.loc[accuracy_level, sampling]`` from each of the nine
    module-level metrics frames (three models x three datasets).

    Parameters
    ----------
    accuracy_level
        Row label to look up in every metrics frame (e.g. ``'accuracy_0'``).
    sampling
        Column label to look up in every metrics frame
        (e.g. ``'noresampling'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset label, with the columns ``selftraining``,
        ``semiboost`` and ``labelpropagation`` in that order.
    """
    dataset_labels = ('online_shopping_intention', 'marketing_campaign', 'heart')
    # Frames are listed in the same order as dataset_labels.
    frames_by_model = {
        'selftraining': (df1_selftraining, df2_selftraining, df3_selftraining),
        'semiboost': (df1_semiboost, df2_semiboost, df3_semiboost),
        'labelpropagation': (df1_labelpropagation, df2_labelpropagation,
                             df3_labelpropagation),
    }

    table = {
        model: {
            label: frame.loc[accuracy_level, sampling]
            for label, frame in zip(dataset_labels, frames)
        }
        for model, frames in frames_by_model.items()
    }
    return pd.DataFrame(table)

In [79]:
# One model-comparison table per (resampling column, accuracy row label).
# The three lists are built in the order noresampling, undersampling,
# oversampling, each covering the row labels 0, 10, 20, 50, 90, 95.
noresampling_dfs, undersampling_dfs, oversampling_dfs = (
    [
        returnDataframe(level, strategy)
        for level in ('accuracy_0', 'accuracy_10', 'accuracy_20',
                      'accuracy_50', 'accuracy_90', 'accuracy_95')
    ]
    for strategy in ('noresampling', 'undersampling', 'oversampling')
)

# Individually named handles onto the same frames (the sanity-check cell
# displays noresampling_0_df, for instance).
(noresampling_0_df, noresampling_10_df, noresampling_20_df,
 noresampling_50_df, noresampling_90_df, noresampling_95_df) = noresampling_dfs

(undersampling_0_df, undersampling_10_df, undersampling_20_df,
 undersampling_50_df, undersampling_90_df, undersampling_95_df) = undersampling_dfs

(oversampling_0_df, oversampling_10_df, oversampling_20_df,
 oversampling_50_df, oversampling_90_df, oversampling_95_df) = oversampling_dfs

# Column names of the tables above, in the order they are passed to the tests.
model_names = ['selftraining', 'semiboost', 'labelpropagation']

In [94]:
# Sanity check: display the table built from the 'accuracy_0' row and the
# 'noresampling' column; expect one row per dataset and one column per model.

noresampling_0_df

Unnamed: 0,selftraining,semiboost,labelpropagation
online_shopping_intention,88.24,90.187,84.536
marketing_campaign,91.369,89.881,74.554
heart,65.934,60.44,60.44


# Friedman Test for All Labelling Levels and Resampling Combinations

In [97]:
# Run one Friedman test per (resampling strategy, labelling level) table,
# comparing the three model columns across the dataset rows. The value printed
# as "F-stat" is element 0 of the result, i.e. the test statistic.
# NOTE(review): SciPy documents the chi-square approximation behind this
# p-value as reliable only for n > 10 and more than 6 repeated samples; each
# table here has 3 rows and 3 model columns, so read "Reject" with caution.
for banner, tables in (
    ('______________________________________No-resampling-', noresampling_dfs),
    ('______________________________________Undersampling-', undersampling_dfs),
    ('______________________________________Oversampling-', oversampling_dfs),
):
    for i, table in enumerate(tables):
        print(f"{banner}{unlabelled[i]}%:")
        f_test = stats.friedmanchisquare(*[table[model] for model in model_names])
        print(f"F-stat: {f_test[0]!s}")
        print(f"p-value: {f_test[1]!s}")
        print(f"Reject: {f_test[1]<0.05!s}")

______________________________________No-resampling-0%:
F-stat: 3.8181818181818183
p-value: 0.14821506633752016
Reject: False
______________________________________No-resampling-10%:
F-stat: 0.6666666666666643
p-value: 0.71653131057379
Reject: False
______________________________________No-resampling-20%:
F-stat: 1.2727272727272703
p-value: 0.529213341500051
Reject: False
______________________________________No-resampling-50%:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________No-resampling-90%:
F-stat: 3.8181818181818183
p-value: 0.14821506633752016
Reject: False
______________________________________No-resampling-95%:
F-stat: 1.2727272727272703
p-value: 0.529213341500051
Reject: False
______________________________________Undersampling-0%:
F-stat: 2.0
p-value: 0.36787944117144245
Reject: False
______________________________________Undersampling-10%:
F-stat: 0.5454545454545455
p-value: 0.7613003866968736
Reject: False
__________

# Friedman Test for All Resampling Combinations (Mean for all labelling levels)

In [104]:
def returnDataframe_means(sampling):
    """Average the six accuracy rows of one resampling column.

    For each of the nine module-level metrics frames (three models x three
    datasets) the values at rows ``accuracy_0`` ... ``accuracy_95`` of column
    ``sampling`` are averaged with ``np.mean``.

    Parameters
    ----------
    sampling
        Column label to average over (e.g. ``'noresampling'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset label, with the columns ``selftraining``,
        ``semiboost`` and ``labelpropagation`` holding the mean values.
    """
    # Must stay a list: .loc treats a list as "select these row labels".
    row_labels = ['accuracy_0', 'accuracy_10', 'accuracy_20',
                  'accuracy_50', 'accuracy_90', 'accuracy_95']
    dataset_labels = ('online_shopping_intention', 'marketing_campaign', 'heart')
    # Frames are listed in the same order as dataset_labels.
    frames_by_model = {
        'selftraining': (df1_selftraining, df2_selftraining, df3_selftraining),
        'semiboost': (df1_semiboost, df2_semiboost, df3_semiboost),
        'labelpropagation': (df1_labelpropagation, df2_labelpropagation,
                             df3_labelpropagation),
    }

    averaged = {
        model: {
            label: np.mean(frame.loc[row_labels, sampling])
            for label, frame in zip(dataset_labels, frames)
        }
        for model, frames in frames_by_model.items()
    }
    return pd.DataFrame(averaged)

# Mean-accuracy tables, one per resampling column, built in list order.
noresampling_df, undersampling_df, oversampling_df = (
    returnDataframe_means(strategy)
    for strategy in ('noresampling', 'undersampling', 'oversampling')
)

means_dfs = [noresampling_df, undersampling_df, oversampling_df]
resampling_methods = ['noresampling', 'undersampling', 'oversampling']

# One Friedman test per resampling method, comparing the three model columns.
for i, table in enumerate(means_dfs):
    print(f"______________________________________{resampling_methods[i]}:")
    f_test = stats.friedmanchisquare(*[table[model] for model in model_names])
    print(f"F-stat: {f_test[0]!s}")
    print(f"p-value: {f_test[1]!s}")
    print(f"Reject: {f_test[1]<0.05!s}")

______________________________________noresampling:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________undersampling:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False
______________________________________oversampling:
F-stat: 4.666666666666664
p-value: 0.09697196786440515
Reject: False


# Friedman Test for All Labelling Levels (Mean for all Resampling Methods)

In [108]:
def returnDataframe_means2(accuracies):
    """Average the three resampling columns for the given accuracy row(s).

    For each of the nine module-level metrics frames (three models x three
    datasets) the values at ``.loc[accuracies, [noresampling, undersampling,
    oversampling]]`` are averaged with ``np.mean``.

    Parameters
    ----------
    accuracies
        Row selector passed straight to ``.loc``; the calls in this notebook
        pass a single row label such as ``'accuracy_0'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset label, with the columns ``selftraining``,
        ``semiboost`` and ``labelpropagation`` holding the mean values.
    """
    # Must stay a list: .loc treats a list as "select these column labels".
    column_labels = ['noresampling', 'undersampling', 'oversampling']
    dataset_labels = ('online_shopping_intention', 'marketing_campaign', 'heart')
    # Frames are listed in the same order as dataset_labels.
    frames_by_model = {
        'selftraining': (df1_selftraining, df2_selftraining, df3_selftraining),
        'semiboost': (df1_semiboost, df2_semiboost, df3_semiboost),
        'labelpropagation': (df1_labelpropagation, df2_labelpropagation,
                             df3_labelpropagation),
    }

    averaged = {
        model: {
            label: np.mean(frame.loc[accuracies, column_labels])
            for label, frame in zip(dataset_labels, frames)
        }
        for model, frames in frames_by_model.items()
    }
    return pd.DataFrame(averaged)

# Tables of accuracies averaged over the three resampling columns, one per
# accuracy row label, built in the order 0, 10, 20, 50, 90, 95.
df0, df10, df20, df50, df90, df95 = (
    returnDataframe_means2(level)
    for level in ('accuracy_0', 'accuracy_10', 'accuracy_20',
                  'accuracy_50', 'accuracy_90', 'accuracy_95')
)

# NOTE: this rebinds means_dfs, replacing the per-resampling-method list that
# the previous section stored under the same name.
means_dfs = [df0, df10, df20, df50, df90, df95]

# One Friedman test per labelling level, comparing the three model columns.
for i, table in enumerate(means_dfs):
    print(f"______________________________________{unlabelled[i]}%:")
    f_test = stats.friedmanchisquare(*[table[model] for model in model_names])
    print(f"F-stat: {f_test[0]!s}")
    print(f"p-value: {f_test[1]!s}")
    print(f"Reject: {f_test[1]<0.05!s}")

KeyError: 'resampling'