# Statistical tests on the evaluations

In [1]:
import pandas as pd
from scipy.stats import kruskal, ks_2samp
from statsmodels.stats.multitest import multipletests

In [2]:
res_dir = '../experiments/results/'
measures = ['l_measure', 'l_precision', 'l_recall']
use_cols = measures + ['track_id']

# Every evaluation table is read from the same folder; the two annotators' files
# differ only by a suffix, so each pair is loaded in one statement (annotator one first).
_eval_dir = 'evaluations/'
_orig_suffixes = ('one', 'two')
_ext_suffixes = ('a1', 'a2')

# --- evaluations of the algorithms on the non-extended (original) reference hierarchies
inter_anno = pd.read_csv(_eval_dir + 'reference_eval.csv', index_col=0)

# NOTE(review): the groups below are read with different options (index_col=0, defaults,
# usecols=use_cols) -- presumably matching how each CSV was written; confirm.
lsd_ao, lsd_at = [
    pd.read_csv(_eval_dir + 'LSD_annotator_{}.csv'.format(sfx), index_col=0)
    for sfx in _orig_suffixes
]
lsdm_ao, lsdm_at = [
    pd.read_csv(_eval_dir + 'LSD_truncated_annotator_{}.csv'.format(sfx))
    for sfx in _orig_suffixes
]
olda_ao, olda_at = [
    pd.read_csv(_eval_dir + 'OLDA_annotator_{}.csv'.format(sfx), usecols=use_cols)
    for sfx in _orig_suffixes
]
mscom_ao, mscom_at = [
    pd.read_csv(_eval_dir + 'mscom_annotator_{}.csv'.format(sfx), usecols=use_cols)
    for sfx in _orig_suffixes
]
dmscom_ao, dmscom_at = [
    pd.read_csv(_eval_dir + 'dmscom_annotator_{}.csv'.format(sfx), usecols=use_cols)
    for sfx in _orig_suffixes
]

# --- evaluations of the algorithms on the extended reference hierarchies
inter_anno_ext = pd.read_csv(_eval_dir + 'SALAMI-interanno-expanded.csv')
lsd_ext_ao, lsd_ext_at = [
    pd.read_csv(_eval_dir + 'LSD-{}-expanded.csv'.format(sfx)) for sfx in _ext_suffixes
]
lsdm_ext_ao, lsdm_ext_at = [
    pd.read_csv(_eval_dir + 'LSD-mono-{}-expanded.csv'.format(sfx)) for sfx in _ext_suffixes
]
olda_ext_ao, olda_ext_at = [
    pd.read_csv(_eval_dir + 'OLDA-{}-expanded.csv'.format(sfx)) for sfx in _ext_suffixes
]
mscom_ext_ao, mscom_ext_at = [
    pd.read_csv(_eval_dir + 'MSCOM-{}-expanded.csv'.format(sfx)) for sfx in _ext_suffixes
]
dmscom_ext_ao, dmscom_ext_at = [
    pd.read_csv(_eval_dir + 'DMSCOM-{}-expanded.csv'.format(sfx)) for sfx in _ext_suffixes
]


In [3]:
# Mean and standard deviation of every measure, per algorithm, for both annotators
# (`*_ao_d` = annotator one, `*_at_d` = annotator two).
def _mean_std(frame, label):
    """Return the 'mean' and 'std' rows of `frame.describe()` for `measures`.

    The two rows are relabelled to `label` and `label + '-std'`.
    """
    # Rows 1 and 2 of describe() are the ones renamed from 'mean' and 'std' below.
    summary = frame.describe()[measures].iloc[1:3]
    return summary.rename(index={'mean': label, 'std': label + '-std'})


lsd_ao_d = _mean_std(lsd_ao, 'LSD')
lsd_at_d = _mean_std(lsd_at, 'LSD')

lsdm_ao_d = _mean_std(lsdm_ao, 'LSDM')
lsdm_at_d = _mean_std(lsdm_at, 'LSDM')

olda_ao_d = _mean_std(olda_ao, 'OLDA')
olda_at_d = _mean_std(olda_at, 'OLDA')

mscom_ao_d = _mean_std(mscom_ao, 'MSCOM')
mscom_at_d = _mean_std(mscom_at, 'MSCOM')

dmscom_ao_d = _mean_std(dmscom_ao, 'DMSCOM')
dmscom_at_d = _mean_std(dmscom_at, 'DMSCOM')


## Pre and post analysis

### Kruskal-Wallis H-test (pre-analysis)

In [4]:
# Kruskal-Wallis H-test over the five algorithms' L-measure scores (annotator-one frames):
# first on the original evaluations, then on the extended (`*_ext_*`) ones.
_kw_col = 'l_measure'
_kw_msg = ('[{}] - Kruskal-Wallis H statistic for \033[1mL-MEASURE\033[0m: {},'
           '\nwith associated p-value: {}')

ht_lm, pk_lm = kruskal(
    *[scores[_kw_col] for scores in (mscom_ao, dmscom_ao, lsdm_ao, lsd_ao, olda_ao)])

ht_lm_e, pk_lm_e = kruskal(
    *[scores[_kw_col] for scores in (mscom_ext_ao, dmscom_ext_ao, lsdm_ext_ao,
                                     lsd_ext_ao, olda_ext_ao)])

print(_kw_msg.format('ANNOTATOR 1', ht_lm, pk_lm))

print(_kw_msg.format('ANNOTATOR 1 - EXT', ht_lm_e, pk_lm_e))

[ANNOTATOR 1] - Kruskal-Wallis H statistic for [1mL-MEASURE[0m: 718.3773302607678,
with associated p-value: 3.6548942312306653e-154
[ANNOTATOR 1 - EXT] - Kruskal-Wallis H statistic for [1mL-MEASURE[0m: 770.1742584466683,
with associated p-value: 2.2154913072715296e-165


In [5]:
# Kruskal-Wallis H-test over the five algorithms' L-measure scores (annotator-two frames):
# first on the original evaluations, then on the extended (`*_ext_*`) ones.
_kw_col = 'l_measure'
_kw_msg = ('[{}] - Kruskal-Wallis H statistic for \033[1mL-MEASURE\033[0m: {},'
           '\nwith associated p-value: {}')

ht_lm_t, pk_lm_t = kruskal(
    *[scores[_kw_col] for scores in (mscom_at, dmscom_at, lsdm_at, lsd_at, olda_at)])

ht_lm_t_e, pk_lm_t_e = kruskal(
    *[scores[_kw_col] for scores in (mscom_ext_at, dmscom_ext_at, lsdm_ext_at,
                                     lsd_ext_at, olda_ext_at)])

print(_kw_msg.format('ANNOTATOR 2', ht_lm_t, pk_lm_t))

print(_kw_msg.format('ANNOTATOR 2 - EXT', ht_lm_t_e, pk_lm_t_e))

[ANNOTATOR 2] - Kruskal-Wallis H statistic for [1mL-MEASURE[0m: 439.8033220268499,
with associated p-value: 6.952180544048487e-94
[ANNOTATOR 2 - EXT] - Kruskal-Wallis H statistic for [1mL-MEASURE[0m: 503.7734459809353,
with associated p-value: 1.0230860646964689e-107


In [6]:
# Kruskal-Wallis H-test over the five algorithms' L-precision scores (annotator-one frames):
# first on the original evaluations, then on the extended (`*_ext_*`) ones.
_kw_col = 'l_precision'
_kw_msg = ('[{}] - Kruskal-Wallis H statistic for \033[1mL-PRECISION\033[0m: {},'
           '\nwith associated p-value: {}')

ht_lp, pk_lp = kruskal(
    *[scores[_kw_col] for scores in (mscom_ao, dmscom_ao, lsdm_ao, lsd_ao, olda_ao)])

ht_lp_e, pk_lp_e = kruskal(
    *[scores[_kw_col] for scores in (mscom_ext_ao, dmscom_ext_ao, lsdm_ext_ao,
                                     lsd_ext_ao, olda_ext_ao)])

print(_kw_msg.format('ANNOTATOR 1', ht_lp, pk_lp))

print(_kw_msg.format('ANNOTATOR 1 - EXT', ht_lp_e, pk_lp_e))

[ANNOTATOR 1] - Kruskal-Wallis H statistic for [1mL-PRECISION[0m: 264.0762798923042,
with associated p-value: 6.033100846542327e-56
[ANNOTATOR 1 - EXT] - Kruskal-Wallis H statistic for [1mL-PRECISION[0m: 281.1314243769848,
with associated p-value: 1.2707055590757207e-59


In [7]:
# Kruskal-Wallis H-test over the five algorithms' L-precision scores (annotator-two frames):
# first on the original evaluations, then on the extended (`*_ext_*`) ones.
_kw_col = 'l_precision'
_kw_msg = ('[{}] - Kruskal-Wallis H statistic for \033[1mL-PRECISION\033[0m: {},'
           '\nwith associated p-value: {}')

ht_lp_t, pk_lp_t = kruskal(
    *[scores[_kw_col] for scores in (mscom_at, dmscom_at, lsdm_at, lsd_at, olda_at)])

ht_lp_t_e, pk_lp_t_e = kruskal(
    *[scores[_kw_col] for scores in (mscom_ext_at, dmscom_ext_at, lsdm_ext_at,
                                     lsd_ext_at, olda_ext_at)])

print(_kw_msg.format('ANNOTATOR 2', ht_lp_t, pk_lp_t))

print(_kw_msg.format('ANNOTATOR 2 - EXT', ht_lp_t_e, pk_lp_t_e))

[ANNOTATOR 2] - Kruskal-Wallis H statistic for [1mL-PRECISION[0m: 151.9207133014596,
with associated p-value: 7.890478391543194e-32
[ANNOTATOR 2 - EXT] - Kruskal-Wallis H statistic for [1mL-PRECISION[0m: 164.6072007392131,
with associated p-value: 1.5019804873620439e-34


In [8]:
# Kruskal-Wallis H-test over the five algorithms' L-recall scores (annotator-one frames):
# first on the original evaluations, then on the extended (`*_ext_*`) ones.
_kw_col = 'l_recall'
_kw_msg = ('[{}] - Kruskal-Wallis H statistic for \033[1mL-RECALL\033[0m: {},'
           '\nwith associated p-value: {}')

ht_lr, pk_lr = kruskal(
    *[scores[_kw_col] for scores in (mscom_ao, dmscom_ao, lsdm_ao, lsd_ao, olda_ao)])

ht_lr_e, pk_lr_e = kruskal(
    *[scores[_kw_col] for scores in (mscom_ext_ao, dmscom_ext_ao, lsdm_ext_ao,
                                     lsd_ext_ao, olda_ext_ao)])

print(_kw_msg.format('ANNOTATOR 1', ht_lr, pk_lr))

print(_kw_msg.format('ANNOTATOR 1 - EXT', ht_lr_e, pk_lr_e))

[ANNOTATOR 1] - Kruskal-Wallis H statistic for [1mL-RECALL[0m: 1087.5925796904635,
with associated p-value: 3.702594758942016e-234
[ANNOTATOR 1 - EXT] - Kruskal-Wallis H statistic for [1mL-RECALL[0m: 1144.4839992251946,
with associated p-value: 1.7250244587805762e-246


In [9]:
# Kruskal-Wallis H-test over the five algorithms' L-recall scores (annotator-two frames):
# first on the original evaluations, then on the extended (`*_ext_*`) ones.
_kw_col = 'l_recall'
_kw_msg = ('[{}] - Kruskal-Wallis H statistic for \033[1mL-RECALL\033[0m: {},'
           '\nwith associated p-value: {}')

ht_lr_t, pk_lr_t = kruskal(
    *[scores[_kw_col] for scores in (mscom_at, dmscom_at, lsdm_at, lsd_at, olda_at)])

ht_lr_t_e, pk_lr_t_e = kruskal(
    *[scores[_kw_col] for scores in (mscom_ext_at, dmscom_ext_at, lsdm_ext_at,
                                     lsd_ext_at, olda_ext_at)])

print(_kw_msg.format('ANNOTATOR 2', ht_lr_t, pk_lr_t))

print(_kw_msg.format('ANNOTATOR 2 - EXT', ht_lr_t_e, pk_lr_t_e))

[ANNOTATOR 2] - Kruskal-Wallis H statistic for [1mL-RECALL[0m: 682.1947892488666,
with associated p-value: 2.497085942036146e-146
[ANNOTATOR 2 - EXT] - Kruskal-Wallis H statistic for [1mL-RECALL[0m: 769.022600038672,
with associated p-value: 3.934606315142392e-165


In [10]:
# Bonferroni correction of the four L-measure Kruskal-Wallis p-values
# (annotator 1, annotator 1 extended, annotator 2, annotator 2 extended).
_kw_pvalues = [pk_lm, pk_lm_e, pk_lm_t, pk_lm_t_e]
print(multipletests(_kw_pvalues, alpha=0.001, method='bonferroni'))

(array([ True,  True,  True,  True]), array([1.46195769e-153, 8.86196523e-165, 2.78087222e-093, 4.09234426e-107]), 0.00025009380472507114, 0.00025)


In [11]:
# Bonferroni correction of the four L-precision Kruskal-Wallis p-values
# (annotator 1, annotator 1 extended, annotator 2, annotator 2 extended).
_kw_pvalues = [pk_lp, pk_lp_e, pk_lp_t, pk_lp_t_e]
print(multipletests(_kw_pvalues, alpha=0.001, method='bonferroni'))

(array([ True,  True,  True,  True]), array([2.41324034e-55, 5.08282224e-59, 3.15619136e-31, 6.00792195e-34]), 0.00025009380472507114, 0.00025)


In [12]:
# Bonferroni correction of the four L-recall Kruskal-Wallis p-values
# (annotator 1, annotator 1 extended, annotator 2, annotator 2 extended).
_kw_pvalues = [pk_lr, pk_lr_e, pk_lr_t, pk_lr_t_e]
print(multipletests(_kw_pvalues, alpha=0.001, method='bonferroni'))

(array([ True,  True,  True,  True]), array([1.48103790e-233, 6.90009784e-246, 9.98834377e-146, 1.57384253e-164]), 0.00025009380472507114, 0.00025)


In [13]:
# Bonferroni correction across the three measures (L-measure, L-precision, L-recall),
# using the annotator-two Kruskal-Wallis p-values on the original evaluations.
_kw_pvalues = [pk_lm_t, pk_lp_t, pk_lr_t]
print(multipletests(_kw_pvalues, alpha=0.001, method='bonferroni'))

(array([ True,  True,  True]), array([2.08565416e-093, 2.36714352e-031, 7.49125783e-146]), 0.0003334445062139757, 0.0003333333333333333)


### Kolmogorov–Smirnov tests (post-hoc analysis)

In [14]:
# Post-hoc pairwise two-sample Kolmogorov-Smirnov tests on the L-measure scores.
# Only the p-value of each test is kept. Naming: a `_t` suffix marks the annotator-two
# frames, a `pks_eh_` prefix marks results computed on the extended (`*_ext_*`) frames.
# `ks_2samp` is imported in the notebook's import cell.
_meas = 'l_measure'


def _ks_p_lm(first, second, column=_meas):
    """Return the p-value of the two-sample KS test between `column` of two frames."""
    return ks_2samp(first[column], second[column])[1]


# mscom baseline vs all
pks_lm_mVdm = _ks_p_lm(mscom_ao, dmscom_ao)
pks_lm_mVdm_t = _ks_p_lm(mscom_at, dmscom_at)

pks_lm_mVlsd = _ks_p_lm(mscom_ao, lsd_ao)
pks_lm_mVlsd_t = _ks_p_lm(mscom_at, lsd_at)

pks_lm_mVlsdm = _ks_p_lm(mscom_ao, lsdm_ao)
pks_lm_mVlsdm_t = _ks_p_lm(mscom_at, lsdm_at)

pks_lm_mVol = _ks_p_lm(mscom_ao, olda_ao)
pks_lm_mVol_t = _ks_p_lm(mscom_at, olda_at)


# dmscom vs the rest
pks_lm_dmVlsd = _ks_p_lm(dmscom_ao, lsd_ao)
pks_lm_dmVlsd_t = _ks_p_lm(dmscom_at, lsd_at)

pks_lm_dmVlsdm = _ks_p_lm(dmscom_ao, lsdm_ao)
pks_lm_dmVlsdm_t = _ks_p_lm(dmscom_at, lsdm_at)

pks_lm_dmVol = _ks_p_lm(dmscom_ao, olda_ao)
pks_lm_dmVol_t = _ks_p_lm(dmscom_at, olda_at)

# --- extended ------------------------------------------------

# mscom baseline vs all
pks_eh_lm_mVdm = _ks_p_lm(mscom_ext_ao, dmscom_ext_ao)
pks_eh_lm_mVdm_t = _ks_p_lm(mscom_ext_at, dmscom_ext_at)

pks_eh_lm_mVlsd = _ks_p_lm(mscom_ext_ao, lsd_ext_ao)
pks_eh_lm_mVlsd_t = _ks_p_lm(mscom_ext_at, lsd_ext_at)

pks_eh_lm_mVlsdm = _ks_p_lm(mscom_ext_ao, lsdm_ext_ao)
pks_eh_lm_mVlsdm_t = _ks_p_lm(mscom_ext_at, lsdm_ext_at)

pks_eh_lm_mVol = _ks_p_lm(mscom_ext_ao, olda_ext_ao)
pks_eh_lm_mVol_t = _ks_p_lm(mscom_ext_at, olda_ext_at)


# dmscom vs the rest
pks_eh_lm_dmVlsd = _ks_p_lm(dmscom_ext_ao, lsd_ext_ao)
pks_eh_lm_dmVlsd_t = _ks_p_lm(dmscom_ext_at, lsd_ext_at)

pks_eh_lm_dmVlsdm = _ks_p_lm(dmscom_ext_ao, lsdm_ext_ao)
pks_eh_lm_dmVlsdm_t = _ks_p_lm(dmscom_ext_at, lsdm_ext_at)

pks_eh_lm_dmVol = _ks_p_lm(dmscom_ext_ao, olda_ext_ao)
pks_eh_lm_dmVol_t = _ks_p_lm(dmscom_ext_at, olda_ext_at)


In [15]:
# Report the raw (uncorrected) KS p-values for the L-measure, one line per compared
# algorithm with the annotator-one value first and the annotator-two value second.
_pair_row = '\n\t{}: {{}} (annotator 1) ; {{}} (annotator 2)'
_vs_mscom = 'MSCOM vs ... ' + ''.join(
    _pair_row.format(algo) for algo in ('DMSCOM', 'LSD', 'OLDA', 'LSDM'))
_vs_dmscom = 'DMSCOM vs ... ' + ''.join(
    _pair_row.format(algo) for algo in ('LSD', 'OLDA', 'LSDM'))

print('... L-MEASURE ORI ...\n')
print(_vs_mscom.format(
    pks_lm_mVdm, pks_lm_mVdm_t,
    pks_lm_mVlsd, pks_lm_mVlsd_t,
    pks_lm_mVol, pks_lm_mVol_t,
    pks_lm_mVlsdm, pks_lm_mVlsdm_t))

print(_vs_dmscom.format(
    pks_lm_dmVlsd, pks_lm_dmVlsd_t,
    pks_lm_dmVol, pks_lm_dmVol_t,
    pks_lm_dmVlsdm, pks_lm_dmVlsdm_t))


print('\n\n... L-MEASURE EXT ...\n')
print(_vs_mscom.format(
    pks_eh_lm_mVdm, pks_eh_lm_mVdm_t,
    pks_eh_lm_mVlsd, pks_eh_lm_mVlsd_t,
    pks_eh_lm_mVol, pks_eh_lm_mVol_t,
    pks_eh_lm_mVlsdm, pks_eh_lm_mVlsdm_t))

print(_vs_dmscom.format(
    pks_eh_lm_dmVlsd, pks_eh_lm_dmVlsd_t,
    pks_eh_lm_dmVol, pks_eh_lm_dmVol_t,
    pks_eh_lm_dmVlsdm, pks_eh_lm_dmVlsdm_t))

... L-MEASURE ORI ...

MSCOM vs ... 
	DMSCOM: 0.0009186756223120535 (annotator 1) ; 0.0006894626946118742 (annotator 2)
	LSD: 0.22141623179238965 (annotator 1) ; 0.2387312687561134 (annotator 2)
	OLDA: 1.194001328164863e-30 (annotator 1) ; 2.824574945975957e-14 (annotator 2)
	LSDM: 7.6961564227475406e-90 (annotator 1) ; 2.568579303013612e-61 (annotator 2)
DMSCOM vs ... 
	LSD: 1.3387805178369627e-06 (annotator 1) ; 0.004725149039985884 (annotator 2)
	OLDA: 4.07354327630593e-48 (annotator 1) ; 1.2054189713166672e-28 (annotator 2)
	LSDM: 1.2413637054064433e-102 (annotator 1) ; 4.3218335360739897e-66 (annotator 2)


... L-MEASURE EXT ...

MSCOM vs ... 
	DMSCOM: 0.0010758720481016792 (annotator 1) ; 0.00022539534110383995 (annotator 2)
	LSD: 0.17684372952944585 (annotator 1) ; 0.30461848317857637 (annotator 2)
	OLDA: 1.3870257762083628e-29 (annotator 1) ; 1.948235489279115e-17 (annotator 2)
	LSDM: 1.6056632945425575e-94 (annotator 1) ; 1.5956743915327333e-65 (annotator 2)
DMSCOM vs ... 
	LS

In [16]:
# mscom baseline vs all
#
# Bonferroni correction of the L-measure KS p-values, one family of four comparisons
# per annotator / hierarchy setting. Order inside each family:
# MSCOM - DMSCOM ||| MSCOM - LSD ||| MSCOM - LSDM ||| MSCOM - OLDA
_families = [
    ('LM A1', [pks_lm_mVdm, pks_lm_mVlsd, pks_lm_mVlsdm, pks_lm_mVol]),
    ('LM A2', [pks_lm_mVdm_t, pks_lm_mVlsd_t, pks_lm_mVlsdm_t, pks_lm_mVol_t]),
    ('LM A1X', [pks_eh_lm_mVdm, pks_eh_lm_mVlsd, pks_eh_lm_mVlsdm, pks_eh_lm_mVol]),
    ('LM A2X', [pks_eh_lm_mVdm_t, pks_eh_lm_mVlsd_t, pks_eh_lm_mVlsdm_t, pks_eh_lm_mVol_t]),
]

for _label, _pvals in _families:
    _corrected = multipletests(_pvals, alpha=0.005, method='bonferroni')
    print('{} bonferroni-corrected p-values:\n{}'.format(_label, _corrected))

LM A1 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([3.67470249e-03, 8.85664927e-01, 3.07846257e-89, 4.77600531e-30]), 0.0012523506095245551, 0.00125)
LM A2 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([2.75785078e-03, 9.54925075e-01, 1.02743172e-60, 1.12982998e-13]), 0.0012523506095245551, 0.00125)
LM A1X bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([4.30348819e-03, 7.07374918e-01, 6.42265318e-94, 5.54810310e-29]), 0.0012523506095245551, 0.00125)
LM A2X bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([9.01581364e-04, 1.00000000e+00, 6.38269757e-65, 7.79294196e-17]), 0.0012523506095245551, 0.00125)


In [17]:
# dmscom baseline vs all
#
# Bonferroni correction of the L-measure KS p-values, one family of four comparisons
# per annotator / hierarchy setting. Order inside each family:
# MSCOM - DMSCOM ||| DMSCOM - LSD ||| DMSCOM - LSDM ||| DMSCOM - OLDA
_families = [
    ('LM A1', [pks_lm_mVdm, pks_lm_dmVlsd, pks_lm_dmVlsdm, pks_lm_dmVol]),
    ('LM A2', [pks_lm_mVdm_t, pks_lm_dmVlsd_t, pks_lm_dmVlsdm_t, pks_lm_dmVol_t]),
    ('LM A1X', [pks_eh_lm_mVdm, pks_eh_lm_dmVlsd, pks_eh_lm_dmVlsdm, pks_eh_lm_dmVol]),
    ('LM A2X', [pks_eh_lm_mVdm_t, pks_eh_lm_dmVlsd_t, pks_eh_lm_dmVlsdm_t, pks_eh_lm_dmVol_t]),
]

for _label, _pvals in _families:
    _corrected = multipletests(_pvals, alpha=0.005, method='bonferroni')
    print('{} bonferroni-corrected p-values:\n{}'.format(_label, _corrected))

LM A1 bonferroni-corrected p-values:
(array([ True,  True,  True,  True]), array([3.67470249e-003, 5.35512207e-006, 4.96545482e-102, 1.62941731e-047]), 0.0012523506095245551, 0.00125)
LM A2 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([2.75785078e-03, 1.89005962e-02, 1.72873341e-65, 4.82167589e-28]), 0.0012523506095245551, 0.00125)
LM A1X bonferroni-corrected p-values:
(array([ True,  True,  True,  True]), array([4.30348819e-003, 3.70961077e-006, 3.05663472e-111, 5.33743190e-049]), 0.0012523506095245551, 0.00125)
LM A2X bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([9.01581364e-04, 1.82035884e-02, 2.24627296e-70, 1.39942383e-27]), 0.0012523506095245551, 0.00125)


In [18]:
# Post-hoc pairwise two-sample Kolmogorov-Smirnov tests on the L-precision scores.
# Only the p-value of each test is kept. Naming: a `_t` suffix marks the annotator-two
# frames, a `pks_eh_` prefix marks results computed on the extended (`*_ext_*`) frames.
_meas = 'l_precision'


def _ks_p_lp(first, second, column=_meas):
    """Return the p-value of the two-sample KS test between `column` of two frames."""
    return ks_2samp(first[column], second[column])[1]


# mscom baseline vs all
pks_lp_mVdm = _ks_p_lp(mscom_ao, dmscom_ao)
pks_lp_mVdm_t = _ks_p_lp(mscom_at, dmscom_at)

pks_lp_mVlsd = _ks_p_lp(mscom_ao, lsd_ao)
pks_lp_mVlsd_t = _ks_p_lp(mscom_at, lsd_at)

pks_lp_mVlsdm = _ks_p_lp(mscom_ao, lsdm_ao)
pks_lp_mVlsdm_t = _ks_p_lp(mscom_at, lsdm_at)

pks_lp_mVol = _ks_p_lp(mscom_ao, olda_ao)
pks_lp_mVol_t = _ks_p_lp(mscom_at, olda_at)


# dmscom vs the rest
pks_lp_dmVlsd = _ks_p_lp(dmscom_ao, lsd_ao)
pks_lp_dmVlsd_t = _ks_p_lp(dmscom_at, lsd_at)

pks_lp_dmVlsdm = _ks_p_lp(dmscom_ao, lsdm_ao)
pks_lp_dmVlsdm_t = _ks_p_lp(dmscom_at, lsdm_at)

pks_lp_dmVol = _ks_p_lp(dmscom_ao, olda_ao)
pks_lp_dmVol_t = _ks_p_lp(dmscom_at, olda_at)


# --- extended ------------------------------------------------
# mscom baseline vs all
pks_eh_lp_mVdm = _ks_p_lp(mscom_ext_ao, dmscom_ext_ao)
pks_eh_lp_mVdm_t = _ks_p_lp(mscom_ext_at, dmscom_ext_at)

pks_eh_lp_mVlsd = _ks_p_lp(mscom_ext_ao, lsd_ext_ao)
pks_eh_lp_mVlsd_t = _ks_p_lp(mscom_ext_at, lsd_ext_at)

pks_eh_lp_mVlsdm = _ks_p_lp(mscom_ext_ao, lsdm_ext_ao)
pks_eh_lp_mVlsdm_t = _ks_p_lp(mscom_ext_at, lsdm_ext_at)

pks_eh_lp_mVol = _ks_p_lp(mscom_ext_ao, olda_ext_ao)
pks_eh_lp_mVol_t = _ks_p_lp(mscom_ext_at, olda_ext_at)


# dmscom vs the rest
pks_eh_lp_dmVlsd = _ks_p_lp(dmscom_ext_ao, lsd_ext_ao)
pks_eh_lp_dmVlsd_t = _ks_p_lp(dmscom_ext_at, lsd_ext_at)

pks_eh_lp_dmVlsdm = _ks_p_lp(dmscom_ext_ao, lsdm_ext_ao)
pks_eh_lp_dmVlsdm_t = _ks_p_lp(dmscom_ext_at, lsdm_ext_at)

pks_eh_lp_dmVol = _ks_p_lp(dmscom_ext_ao, olda_ext_ao)
pks_eh_lp_dmVol_t = _ks_p_lp(dmscom_ext_at, olda_ext_at)

In [19]:
# Report the raw (uncorrected) KS p-values for the L-precision, one line per compared
# algorithm with the annotator-one value first and the annotator-two value second.
_pair_row = '\n\t{}: {{}} (annotator 1) ; {{}} (annotator 2)'
_vs_mscom = 'MSCOM vs ... ' + ''.join(
    _pair_row.format(algo) for algo in ('DMSCOM', 'LSD', 'OLDA', 'LSDM'))
_vs_dmscom = 'DMSCOM vs ... ' + ''.join(
    _pair_row.format(algo) for algo in ('LSD', 'OLDA', 'LSDM'))

print('... L-PRECISION ...\n')
print(_vs_mscom.format(
    pks_lp_mVdm, pks_lp_mVdm_t,
    pks_lp_mVlsd, pks_lp_mVlsd_t,
    pks_lp_mVol, pks_lp_mVol_t,
    pks_lp_mVlsdm, pks_lp_mVlsdm_t))

print(_vs_dmscom.format(
    pks_lp_dmVlsd, pks_lp_dmVlsd_t,
    pks_lp_dmVol, pks_lp_dmVol_t,
    pks_lp_dmVlsdm, pks_lp_dmVlsdm_t))


print('\n\n... L-PRECISION EXT ...\n')
print(_vs_mscom.format(
    pks_eh_lp_mVdm, pks_eh_lp_mVdm_t,
    pks_eh_lp_mVlsd, pks_eh_lp_mVlsd_t,
    pks_eh_lp_mVol, pks_eh_lp_mVol_t,
    pks_eh_lp_mVlsdm, pks_eh_lp_mVlsdm_t))

print(_vs_dmscom.format(
    pks_eh_lp_dmVlsd, pks_eh_lp_dmVlsd_t,
    pks_eh_lp_dmVol, pks_eh_lp_dmVol_t,
    pks_eh_lp_dmVlsdm, pks_eh_lp_dmVlsdm_t))

... L-PRECISION ...

MSCOM vs ... 
	DMSCOM: 5.49542505878406e-05 (annotator 1) ; 0.0003128393912348299 (annotator 2)
	LSD: 0.02297833544384675 (annotator 1) ; 0.02483260180443838 (annotator 2)
	OLDA: 4.650077377856885e-29 (annotator 1) ; 4.0382927903984434e-16 (annotator 2)
	LSDM: 3.8104714497823494e-11 (annotator 1) ; 0.00010292346086552116 (annotator 2)
DMSCOM vs ... 
	LSD: 0.00012909236857204244 (annotator 1) ; 0.0299909828849617 (annotator 2)
	OLDA: 1.6536163118223736e-49 (annotator 1) ; 2.9146669578708804e-32 (annotator 2)
	LSDM: 7.176729868741062e-19 (annotator 1) ; 1.965331261936809e-09 (annotator 2)


... L-PRECISION EXT ...

MSCOM vs ... 
	DMSCOM: 1.3722182908820416e-05 (annotator 1) ; 4.0038932079391275e-05 (annotator 2)
	LSD: 0.008540167472645336 (annotator 1) ; 0.029218540613003708 (annotator 2)
	OLDA: 9.836045267392468e-32 (annotator 1) ; 1.948235489279115e-17 (annotator 2)
	LSDM: 5.05794314004289e-09 (annotator 1) ; 2.1768406847885015e-05 (annotator 2)
DMSCOM vs ... 
	LSD

In [20]:
# mscom baseline vs all
#
# Bonferroni correction of the L-precision KS p-values, one family of four comparisons
# per annotator / hierarchy setting. Order inside each family:
# MSCOM - DMSCOM ||| MSCOM - LSD ||| MSCOM - LSDM ||| MSCOM - OLDA
_families = [
    ('LP A1', [pks_lp_mVdm, pks_lp_mVlsd, pks_lp_mVlsdm, pks_lp_mVol]),
    ('LP A2', [pks_lp_mVdm_t, pks_lp_mVlsd_t, pks_lp_mVlsdm_t, pks_lp_mVol_t]),
    ('LP A1X', [pks_eh_lp_mVdm, pks_eh_lp_mVlsd, pks_eh_lp_mVlsdm, pks_eh_lp_mVol]),
    ('LP A2X', [pks_eh_lp_mVdm_t, pks_eh_lp_mVlsd_t, pks_eh_lp_mVlsdm_t, pks_eh_lp_mVol_t]),
]

for _label, _pvals in _families:
    _corrected = multipletests(_pvals, alpha=0.005, method='bonferroni')
    print('{} bonferroni-corrected p-values:\n{}'.format(_label, _corrected))

LP A1 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([2.19817002e-04, 9.19133418e-02, 1.52418858e-10, 1.86003095e-28]), 0.0012523506095245551, 0.00125)
LP A2 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([1.25135756e-03, 9.93304072e-02, 4.11693843e-04, 1.61531712e-15]), 0.0012523506095245551, 0.00125)
LP A1X bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([5.48887316e-05, 3.41606699e-02, 2.02317726e-08, 3.93441811e-31]), 0.0012523506095245551, 0.00125)
LP A2X bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([1.60155728e-04, 1.16874162e-01, 8.70736274e-05, 7.79294196e-17]), 0.0012523506095245551, 0.00125)


In [21]:
# dmscom baseline vs all
#
# Bonferroni correction of the L-precision KS p-values, one family of four comparisons
# per annotator / hierarchy setting. Order inside each family:
# MSCOM - DMSCOM ||| DMSCOM - LSD ||| DMSCOM - LSDM ||| DMSCOM - OLDA
#
# The labels are 'LP' because the p-values come from the l_precision tests, and the
# two families built from the `pks_eh_*` values carry the 'X' (extended) suffix so the
# four printed results can be told apart.
_families = [
    ('LP A1', [pks_lp_mVdm, pks_lp_dmVlsd, pks_lp_dmVlsdm, pks_lp_dmVol]),
    ('LP A2', [pks_lp_mVdm_t, pks_lp_dmVlsd_t, pks_lp_dmVlsdm_t, pks_lp_dmVol_t]),
    ('LP A1X', [pks_eh_lp_mVdm, pks_eh_lp_dmVlsd, pks_eh_lp_dmVlsdm, pks_eh_lp_dmVol]),
    ('LP A2X', [pks_eh_lp_mVdm_t, pks_eh_lp_dmVlsd_t, pks_eh_lp_dmVlsdm_t, pks_eh_lp_dmVol_t]),
]

for _label, _pvals in _families:
    _corrected = multipletests(_pvals, alpha=0.005, method='bonferroni')
    print('{} bonferroni-corrected p-values:\n{}'.format(_label, _corrected))

LM A1 bonferroni-corrected p-values:
(array([ True,  True,  True,  True]), array([2.19817002e-04, 5.16369474e-04, 2.87069195e-18, 6.61446525e-49]), 0.0012523506095245551, 0.00125)
LM A2 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([1.25135756e-03, 1.19963932e-01, 7.86132505e-09, 1.16586678e-31]), 0.0012523506095245551, 0.00125)
LM A1 bonferroni-corrected p-values:
(array([ True,  True,  True,  True]), array([5.48887316e-05, 1.49317992e-04, 4.16313439e-18, 1.09538771e-49]), 0.0012523506095245551, 0.00125)
LM A2 bonferroni-corrected p-values:
(array([ True, False,  True,  True]), array([1.60155728e-04, 5.33248971e-02, 5.09528905e-12, 6.89177010e-33]), 0.0012523506095245551, 0.00125)


In [22]:
# Post-hoc pairwise two-sample Kolmogorov-Smirnov tests on the L-recall scores.
# Only the p-value of each test is kept. Naming: a `_t` suffix marks the annotator-two
# frames, a `pks_eh_` prefix marks results computed on the extended (`*_ext_*`) frames.
_meas = 'l_recall'


def _ks_p_lr(first, second, column=_meas):
    """Return the p-value of the two-sample KS test between `column` of two frames."""
    return ks_2samp(first[column], second[column])[1]


# mscom baseline vs all
pks_lr_mVdm = _ks_p_lr(mscom_ao, dmscom_ao)
pks_lr_mVdm_t = _ks_p_lr(mscom_at, dmscom_at)

pks_lr_mVlsd = _ks_p_lr(mscom_ao, lsd_ao)
pks_lr_mVlsd_t = _ks_p_lr(mscom_at, lsd_at)

pks_lr_mVlsdm = _ks_p_lr(mscom_ao, lsdm_ao)
pks_lr_mVlsdm_t = _ks_p_lr(mscom_at, lsdm_at)

pks_lr_mVol = _ks_p_lr(mscom_ao, olda_ao)
pks_lr_mVol_t = _ks_p_lr(mscom_at, olda_at)


# dmscom vs the rest
pks_lr_dmVlsd = _ks_p_lr(dmscom_ao, lsd_ao)
pks_lr_dmVlsd_t = _ks_p_lr(dmscom_at, lsd_at)

pks_lr_dmVlsdm = _ks_p_lr(dmscom_ao, lsdm_ao)
pks_lr_dmVlsdm_t = _ks_p_lr(dmscom_at, lsdm_at)

pks_lr_dmVol = _ks_p_lr(dmscom_ao, olda_ao)
pks_lr_dmVol_t = _ks_p_lr(dmscom_at, olda_at)


# --- extended ------------------------------------------------
# mscom baseline vs all
pks_eh_lr_mVdm = _ks_p_lr(mscom_ext_ao, dmscom_ext_ao)
pks_eh_lr_mVdm_t = _ks_p_lr(mscom_ext_at, dmscom_ext_at)

pks_eh_lr_mVlsd = _ks_p_lr(mscom_ext_ao, lsd_ext_ao)
pks_eh_lr_mVlsd_t = _ks_p_lr(mscom_ext_at, lsd_ext_at)

pks_eh_lr_mVlsdm = _ks_p_lr(mscom_ext_ao, lsdm_ext_ao)
pks_eh_lr_mVlsdm_t = _ks_p_lr(mscom_ext_at, lsdm_ext_at)

pks_eh_lr_mVol = _ks_p_lr(mscom_ext_ao, olda_ext_ao)
pks_eh_lr_mVol_t = _ks_p_lr(mscom_ext_at, olda_ext_at)


# dmscom vs the rest
pks_eh_lr_dmVlsd = _ks_p_lr(dmscom_ext_ao, lsd_ext_ao)
pks_eh_lr_dmVlsd_t = _ks_p_lr(dmscom_ext_at, lsd_ext_at)

pks_eh_lr_dmVlsdm = _ks_p_lr(dmscom_ext_ao, lsdm_ext_ao)
pks_eh_lr_dmVlsdm_t = _ks_p_lr(dmscom_ext_at, lsdm_ext_at)

pks_eh_lr_dmVol = _ks_p_lr(dmscom_ext_ao, olda_ext_ao)
pks_eh_lr_dmVol_t = _ks_p_lr(dmscom_ext_at, olda_ext_at)

In [23]:
# Report the raw (uncorrected) KS p-values for the L-recall, one line per compared
# algorithm with the annotator-one value first and the annotator-two value second.
_pair_row = '\n\t{}: {{}} (annotator 1) ; {{}} (annotator 2)'
_vs_mscom = 'MSCOM vs ... ' + ''.join(
    _pair_row.format(algo) for algo in ('DMSCOM', 'LSD', 'OLDA', 'LSDM'))
_vs_dmscom = 'DMSCOM vs ... ' + ''.join(
    _pair_row.format(algo) for algo in ('LSD', 'OLDA', 'LSDM'))

print('... L-RECALL ...\n')
print(_vs_mscom.format(
    pks_lr_mVdm, pks_lr_mVdm_t,
    pks_lr_mVlsd, pks_lr_mVlsd_t,
    pks_lr_mVol, pks_lr_mVol_t,
    pks_lr_mVlsdm, pks_lr_mVlsdm_t))

print(_vs_dmscom.format(
    pks_lr_dmVlsd, pks_lr_dmVlsd_t,
    pks_lr_dmVol, pks_lr_dmVol_t,
    pks_lr_dmVlsdm, pks_lr_dmVlsdm_t))


print('\n\n... L-RECALL EXT ...\n')
print(_vs_mscom.format(
    pks_eh_lr_mVdm, pks_eh_lr_mVdm_t,
    pks_eh_lr_mVlsd, pks_eh_lr_mVlsd_t,
    pks_eh_lr_mVol, pks_eh_lr_mVol_t,
    pks_eh_lr_mVlsdm, pks_eh_lr_mVlsdm_t))

print(_vs_dmscom.format(
    pks_eh_lr_dmVlsd, pks_eh_lr_dmVlsd_t,
    pks_eh_lr_dmVol, pks_eh_lr_dmVol_t,
    pks_eh_lr_dmVlsdm, pks_eh_lr_dmVlsdm_t))

... L-RECALL ...

MSCOM vs ... 
	DMSCOM: 0.08924057134431122 (annotator 1) ; 0.051532705435294514 (annotator 2)
	LSD: 0.2740368732023246 (annotator 1) ; 0.6428885698853031 (annotator 2)
	OLDA: 1.7524262334596047e-16 (annotator 1) ; 1.4659346270513828e-07 (annotator 2)
	LSDM: 9.058591389021714e-144 (annotator 1) ; 4.063442698123771e-102 (annotator 2)
DMSCOM vs ... 
	LSD: 0.0035914897757606656 (annotator 1) ; 0.02999098288496154 (annotator 2)
	OLDA: 1.1441551721768639e-24 (annotator 1) ; 2.4461699240766623e-12 (annotator 2)
	LSDM: 3.8238264848062445e-150 (annotator 1) ; 6.545031891598002e-101 (annotator 2)


... L-RECALL EXT ...

MSCOM vs ... 
	DMSCOM: 0.048605753542437395 (annotator 1) ; 0.07093190667625568 (annotator 2)
	LSD: 0.1768437295294467 (annotator 1) ; 0.48082580158939386 (annotator 2)
	OLDA: 3.9542117355410587e-19 (annotator 1) ; 1.960973740278813e-08 (annotator 2)
	LSDM: 1.7789270109620653e-151 (annotator 1) ; 8.111341410236242e-106 (annotator 2)
DMSCOM vs ... 
	LSD: 0.000466

In [24]:
# mscom baseline vs all
#
# Bonferroni correction of the L-recall KS p-values, one family of four comparisons
# per annotator / hierarchy setting. Order inside each family:
# MSCOM - DMSCOM ||| MSCOM - LSD ||| MSCOM - LSDM ||| MSCOM - OLDA
#
# The labels are 'LR' because the p-values come from the l_recall tests; an 'X'
# suffix marks the families built from the `pks_eh_*` (extended) values.
_families = [
    ('LR A1', [pks_lr_mVdm, pks_lr_mVlsd, pks_lr_mVlsdm, pks_lr_mVol]),
    ('LR A2', [pks_lr_mVdm_t, pks_lr_mVlsd_t, pks_lr_mVlsdm_t, pks_lr_mVol_t]),
    ('LR A1X', [pks_eh_lr_mVdm, pks_eh_lr_mVlsd, pks_eh_lr_mVlsdm, pks_eh_lr_mVol]),
    ('LR A2X', [pks_eh_lr_mVdm_t, pks_eh_lr_mVlsd_t, pks_eh_lr_mVlsdm_t, pks_eh_lr_mVol_t]),
]

for _label, _pvals in _families:
    _corrected = multipletests(_pvals, alpha=0.005, method='bonferroni')
    print('{} bonferroni-corrected p-values:\n{}'.format(_label, _corrected))

LP A1 bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([3.56962285e-001, 1.00000000e+000, 3.62343656e-143, 7.00970493e-016]), 0.0012523506095245551, 0.00125)
LP A2 bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([2.06130822e-001, 1.00000000e+000, 1.62537708e-101, 5.86373851e-007]), 0.0012523506095245551, 0.00125)
LP A1X bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([1.94423014e-001, 7.07374918e-001, 7.11570804e-151, 1.58168469e-018]), 0.0012523506095245551, 0.00125)
LP A2X bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([2.83727627e-001, 1.00000000e+000, 3.24453656e-105, 7.84389496e-008]), 0.0012523506095245551, 0.00125)


In [25]:
# dmscom baseline vs all
#
# Bonferroni correction of the L-recall KS p-values, one family of four comparisons
# per annotator / hierarchy setting. Order inside each family:
# MSCOM - DMSCOM ||| DMSCOM - LSD ||| DMSCOM - LSDM ||| DMSCOM - OLDA
#
# The labels are 'LR' because the p-values come from the l_recall tests; an 'X'
# suffix marks the families built from the `pks_eh_*` (extended) values.
_families = [
    ('LR A1', [pks_lr_mVdm, pks_lr_dmVlsd, pks_lr_dmVlsdm, pks_lr_dmVol]),
    ('LR A2', [pks_lr_mVdm_t, pks_lr_dmVlsd_t, pks_lr_dmVlsdm_t, pks_lr_dmVol_t]),
    ('LR A1X', [pks_eh_lr_mVdm, pks_eh_lr_dmVlsd, pks_eh_lr_dmVlsdm, pks_eh_lr_dmVol]),
    ('LR A2X', [pks_eh_lr_mVdm_t, pks_eh_lr_dmVlsd_t, pks_eh_lr_dmVlsdm_t, pks_eh_lr_dmVol_t]),
]

for _label, _pvals in _families:
    _corrected = multipletests(_pvals, alpha=0.005, method='bonferroni')
    print('{} bonferroni-corrected p-values:\n{}'.format(_label, _corrected))

LM A1 bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([3.56962285e-001, 1.43659591e-002, 1.52953059e-149, 4.57662069e-024]), 0.0012523506095245551, 0.00125)
LM A2 bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([2.06130822e-001, 1.19963932e-001, 2.61801276e-100, 9.78467970e-012]), 0.0012523506095245551, 0.00125)
LM A1X bonferroni-corrected p-values:
(array([False,  True,  True,  True]), array([1.94423014e-001, 1.86566560e-003, 2.12228743e-159, 1.14182092e-024]), 0.0012523506095245551, 0.00125)
LM A2X bonferroni-corrected p-values:
(array([False, False,  True,  True]), array([2.83727627e-001, 5.33248971e-002, 7.49078101e-111, 1.55180048e-013]), 0.0012523506095245551, 0.00125)
