# Calculate inter-annotator agreement (IAA) using Fleiss' kappa

In [15]:
import pandas as pd
import numpy as np
from collections import Counter

In [43]:
# Load the annotation table. Later cells read the columns "원익", "수민" and
# "유정" (one per annotator) and "결론" (used as the gold label).
annotation_result_df = pd.read_csv("iaa.csv")

In [44]:
# Render the loaded table for a visual sanity check of the raw codes.
annotation_result_df

Unnamed: 0,num,원익,수민,유정,결론
0,11,2111,2111,2111,2111
1,12,2341,2141,2131,2341
2,19,4131,4131,4421,3131
3,25,2411,2131,2141,2431
4,27,1155,2151,2131,2151
...,...,...,...,...,...
633,2426,2133,2133,2131,2133
634,2432,4421,2421,2421,2421
635,2434,2262,2162,2431,2262
636,2435,3345,3355,3155,3345


In [45]:
""" data format
worker1	worker2	worker3
1	1	2
3	1	3
3	3	3
4	1	4
4	4	2
""" 
# One bucket per tag position (four positions); each bucket is later filled
# with one record per annotated item.
annotation_results = [[] for _ in range(4)]

# Raw code columns of the three annotators, in a fixed order.
annotator1, annotator2, annotator3 = (
    annotation_result_df[column] for column in ("원익", "수민", "유정")
)

In [46]:
# Split each annotator's code into its first four characters, one digit per
# tag position. annotation_results[i] receives one record per item holding the
# i-th digit chosen by each of the three annotators.
#
# The buckets are re-created here on purpose: this cell appends, so without
# the reset a second run would add a duplicate copy of every record and skew
# every statistic computed from annotation_results afterwards.
annotation_results = [[], [], [], []]
for a1, a2, a3 in zip(annotator1, annotator2, annotator3):
    for i in range(4):
        annotation_results[i].append(
        {
            "annotator1": int(str(a1)[i]),
            "annotator2": int(str(a2)[i]),
            "annotator3": int(str(a3)[i]),
        })

In [47]:
def calculate_iaa(annotation_result):
    """Compute Fleiss' kappa for one table of categorical annotations.

    Args:
        annotation_result: pandas DataFrame with one row per annotated item
            and one column per annotator. Every cell is an integer category
            label; labels are 1-based and the number of categories is taken
            to be the largest label present in the table.

    Returns:
        Whatever ``fleiss.fleissKappa`` returns for the per-item category
        counts (presumably the kappa score -- confirm against the ``fleiss``
        module). That function is also what prints the rater/subject/category
        summary seen in the notebook output.

    Raises:
        ValueError: if the table holds no annotations or contains a label
            smaller than 1.
    """
    # Kept local: `fleiss` is not imported anywhere else in this notebook.
    from fleiss import fleissKappa

    result = annotation_result.to_numpy()
    if result.size == 0:
        raise ValueError("annotation_result must contain at least one annotation")
    if np.min(result) < 1:
        # A label of 0 or below would turn into a negative index and be
        # silently counted as the highest category instead of failing.
        raise ValueError("category labels must be integers >= 1")

    num_classes = int(np.max(result))
    num_raters = len(result[0])

    # Convert raw labels into one count row per item:
    # counts[c - 1] == number of annotators who chose category c.
    transformed_result = []
    for item_labels in result:
        counts = np.zeros(num_classes, dtype=int)
        for label in item_labels:
            counts[int(label - 1)] += 1
        transformed_result.append(counts.tolist())

    kappa = fleissKappa(transformed_result, num_raters)
    # Hand the score back so callers can store or compare it; the original
    # bare `return` discarded it.
    return kappa

In [48]:
# Report agreement separately for each of the four tag positions.
for i in range(4):
    print(f"Tag Type : {i+1}")
    tag_table = pd.DataFrame(annotation_results[i])
    calculate_iaa(tag_table)
    print()

Tag Type : 1
#raters =  3 , #subjects =  638 , #categories =  5
PA =  0.6003134796238236
PE = 0.3414640404695534
Fleiss' Kappa = 0.393

Tag Type : 2
#raters =  3 , #subjects =  638 , #categories =  4
PA =  0.7126436781609167
PE = 0.38718773509606924
Fleiss' Kappa = 0.531

Tag Type : 3
#raters =  3 , #subjects =  638 , #categories =  6
PA =  0.6269592476489018
PE = 0.21707235581411347
Fleiss' Kappa = 0.524

Tag Type : 4
#raters =  3 , #subjects =  638 , #categories =  6
PA =  0.6844305120167168
PE = 0.3377543678051731
Fleiss' Kappa = 0.523



# Accuracy per annotator

In [8]:
# gold_labels[i] lists, for every item, the i-th digit of the code stored in
# the "결론" column, which the cells below treat as the gold label.
gold_labels = [
    [int(str(gold)[i]) for gold in annotation_result_df["결론"]]
    for i in range(4)
]

In [11]:
# Running sums of each annotator's per-tag accuracy; only needed by the
# optional average report that is commented out at the end of this cell.
ann1_avg = ann2_avg = ann3_avg = 0
for i in range(4):
    print(f"Tag Type : {i+1}")
    tag_frame = pd.DataFrame(annotation_results[i])
    gold_column = np.array(gold_labels[i])
    n_items = len(gold_labels[i])
    # Accuracy = share of items whose digit equals the gold digit.
    ann1_acc, ann2_acc, ann3_acc = (
        sum(tag_frame[column].to_numpy() == gold_column) / n_items
        for column in ('annotator1', 'annotator2', 'annotator3')
    )
    ann1_avg += ann1_acc
    ann2_avg += ann2_acc
    ann3_avg += ann3_acc
    print(f"원익 : {ann1_acc:.3f}")
    print(f"수민: {ann2_acc:.3f}")
    print(f"유정: {ann3_acc:.3f}")
    print()
# Optional: mean accuracy over the four tag positions.
# print(f"Average Accuracy")
# print(f"원익 : {ann1_avg/4}")
# print(f"수민: {ann2_avg/4}")
# print(f"유정: {ann3_avg/4}")

Tag Type : 1
원익 : 0.790
수민: 0.738
유정: 0.715

Tag Type : 2
원익 : 0.851
수민: 0.839
유정: 0.784

Tag Type : 3
원익 : 0.839
수민: 0.773
유정: 0.727

Tag Type : 4
원익 : 0.813
수민: 0.785
유정: 0.803



# IAA per label

In [53]:
import copy

# Number of attribute values scored for each label position (1-based keys).
num_categories_per_label = {1: 5, 2: 4, 3: 6, 4: 6}

for i in range(4):
    print(f"Label : {i+1}")
    # One-vs-rest: for every attribute value, recode each annotation as
    # 1 (this attribute) or 2 (any other attribute) and score the resulting
    # binary agreement.
    for j in range(num_categories_per_label[i+1]):
        print(f"Attribute {j+1}")
        target = j + 1
        # Work on a deep copy so the original records stay untouched.
        reformatted_annotation_result = copy.deepcopy(annotation_results[i])
        for record in reformatted_annotation_result:
            for annotator, value in record.items():
                record[annotator] = 1 if value == target else 2
        calculate_iaa(pd.DataFrame(reformatted_annotation_result))
        print()

Label : 1
Attribute 1
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.7763845350052229
PE = 0.6817597660749752
Fleiss' Kappa = 0.297

Attribute 2
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.7053291536050136
PE = 0.5010094458802706
Fleiss' Kappa = 0.409

Attribute 3
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.8578892371995809
PE = 0.7614317425689168
Fleiss' Kappa = 0.404

Attribute 4
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.9268547544409614
PE = 0.8626280096391437
Fleiss' Kappa = 0.468

Attribute 5
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.934169278996865
PE = 0.8760991167758003
Fleiss' Kappa = 0.469

Label : 2
Attribute 1
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.7648902821316585
PE = 0.5079931298718456
Fleiss' Kappa = 0.522

Attribute 2
#raters =  3 , #subjects =  638 , #categories =  2
PA =  0.9101358411703239
PE = 0.8161662020704286
Fleiss' Kappa = 0.511

Attribute 3
#raters =  3 , #subjects

# Label Distribution

In [13]:
# Rebind gold_labels to a NumPy array built from the existing value
# (one row per label position when it is still the list of four lists).
gold_labels = np.array(gold_labels)

In [30]:
# For each label position, print how often every attribute value occurs in
# the gold labels: absolute count, then share of all items, in ascending
# order of the attribute value.
counter_per_label = {}
for i, gold_column in enumerate(gold_labels):
    counter_per_label[i] = Counter(gold_column)
    print(f"Label {i+1}")
    total = len(gold_column)
    for label in sorted(counter_per_label[i]):
        attribute_cnt = counter_per_label[i][label]
        print(f"{attribute_cnt}\t{attribute_cnt/total}")

Label 1
109	0.17084639498432602
343	0.5376175548589341
83	0.13009404388714735
47	0.07366771159874608
56	0.0877742946708464
Label 2
346	0.542319749216301
62	0.09717868338557993
80	0.12539184952978055
150	0.23510971786833856
Label 3
55	0.08620689655172414
89	0.13949843260188088
181	0.2836990595611285
84	0.13166144200626959
165	0.25862068965517243
64	0.10031347962382445
Label 4
315	0.493730407523511
66	0.10344827586206896
21	0.032915360501567396
70	0.109717868338558
116	0.18181818181818182
50	0.07836990595611286
