In [1]:
import pandas as pd
from collections import Counter

In [2]:
# Load the annotation results; later cells read the 'exp_id', 'ctrl' and 'label' columns.
df = pd.read_csv("annotation_result.csv")

In [3]:
# Distinct values present in the 'label' column (the tallying cell branches on these strings).
df['label'].unique()

array(['Slogan #1', 'Slogan #2', "Can't decide"], dtype=object)

In [4]:
# Pull the three columns used for tallying out of the frame as plain Python lists.
exp_ids, ctrls, labels = (
    df[column].tolist() for column in ('exp_id', 'ctrl', 'label')
)

In [5]:
# One fresh, independent Counter per control type; votes are tallied into these below.
result = dict(
    (ctrl, Counter())
    for ctrl in ('NN', 'JJ', 'VB', 'DT', 'PR', 'OTHER')
)
result

{'NN': Counter(),
 'JJ': Counter(),
 'VB': Counter(),
 'DT': Counter(),
 'PR': Counter(),
 'OTHER': Counter()}

In [6]:
# Turn every annotation into a "better" / "worse" / "tie" vote for its control type.
for exp_id, ctrl, label in zip(exp_ids, ctrls, labels):
    if label == "Can't decide":
        outcome = "tie"
    elif label in ("Slogan #1", "Slogan #2"):
        # With exp_id == 0 a vote for "Slogan #1" counts as "better"; for any
        # other exp_id the roles of the two slogans are swapped.
        # NOTE(review): presumably exp_id encodes the presentation order of the
        # two systems -- confirm against the annotation setup.
        picked_first = label == "Slogan #1"
        outcome = "better" if picked_first == (exp_id == 0) else "worse"
    else:
        # Any other label is ignored; result[ctrl] is not even looked up.
        continue
    result[ctrl][outcome] += 1

In [7]:
# Show the tallied votes, control type by control type (name on one line, counts on the next).
for ctrl in ('NN', 'JJ', 'VB', 'DT', 'PR', 'OTHER'):
    print(ctrl, result[ctrl], sep="\n")

NN
Counter({'better': 28, 'worse': 19, 'tie': 3})
JJ
Counter({'better': 32, 'worse': 15, 'tie': 3})
VB
Counter({'better': 36, 'worse': 14})
DT
Counter({'better': 41, 'worse': 8, 'tie': 1})
PR
Counter({'better': 37, 'worse': 13})
OTHER
Counter({'better': 39, 'worse': 11})


## Statistical test

In [8]:
from scipy.stats import wilcoxon

In [9]:
def prepare_test_data(ct):
    """Expand outcome counts into two paired lists of scores.

    Args:
        ct: Mapping from outcome name to count (for example a ``Counter``).
            Entries are expanded in the mapping's iteration order.

    Returns:
        A tuple ``(x, y)`` of equal-length lists. Each unit of a ``'better'``
        count contributes the pair ``(1, 0)``, each unit of a ``'tie'`` count
        contributes ``(0.5, 0.5)``, and every other key (``'worse'`` included)
        contributes ``(0, 1)``.
    """
    # (x score, y score) per outcome; anything not listed falls back to (0, 1).
    pair_for = {'better': (1, 0), 'tie': (0.5, 0.5)}
    x, y = [], []
    for outcome, count in ct.items():
        x_score, y_score = pair_for.get(outcome, (0, 1))
        x += [x_score] * count
        y += [y_score] * count
    return x, y

In [10]:
# Run a Wilcoxon signed-rank test per control type, then once more on the
# samples pooled across all control types.
# NOTE(review): 'tie' votes become (0.5, 0.5) pairs, i.e. zero differences --
# check how scipy's wilcoxon `zero_method` treats them before citing p-values.
x_all, y_all = [], []
for ctrl in ('NN', 'JJ', 'VB', 'DT', 'PR', 'OTHER'):
    print(ctrl, result[ctrl], sep="\n")
    x, y = prepare_test_data(result[ctrl])
    x_all += x
    y_all += y
    print(wilcoxon(x, y))
print("overall:")
print(wilcoxon(x_all, y_all))

NN
Counter({'better': 28, 'worse': 19, 'tie': 3})
WilcoxonResult(statistic=456.0, pvalue=0.18925543169201764)
JJ
Counter({'better': 32, 'worse': 15, 'tie': 3})
WilcoxonResult(statistic=360.0, pvalue=0.013149117330897018)
VB
Counter({'better': 36, 'worse': 14})
WilcoxonResult(statistic=357.0, pvalue=0.00186284629798189)
DT
Counter({'better': 41, 'worse': 8, 'tie': 1})
WilcoxonResult(statistic=200.0, pvalue=2.4256011426534493e-06)
PR
Counter({'better': 37, 'worse': 13})
WilcoxonResult(statistic=331.5, pvalue=0.0006885138966450773)
OTHER
Counter({'better': 39, 'worse': 11})
WilcoxonResult(statistic=280.5, pvalue=7.50131946654591e-05)
overall:
WilcoxonResult(statistic=11760.0, pvalue=7.852147576316655e-15)
