# Interannotator agreement study

In [1]:
import nltk
from nltk.metrics.agreement import AnnotationTask
from nltk.probability import ConditionalFreqDist, FreqDist
import pandas as pd
from collections import defaultdict
import os 

In [2]:
# Load every annotation CSV and group the labels by annotator pair and title.
# Both mappings have the shape {pair_name: {title: [label, label, ...]}}, where the
# labels of a title are appended in the order the files are listed in `files`.
entity_dict = defaultdict(lambda: defaultdict(list))
type_dict = defaultdict(lambda: defaultdict(list))
annotation_path = 'annotation/'
files = ['anshul_chao_annotated_anshul.csv',
 'anshul_chao_annotated_chao.csv',
 'anshul_wei_annotated_anshul.csv',
 'anshul_wei_annotated_wei.csv',
 'chao_wei_annotated_chao.csv',
 'chao_wei_annotated_wei.csv']
for i in files:
    # 'anshul_chao_annotated_anshul.csv' -> 'anshul_chao': the two files of a pair share this key.
    file_name = '_'.join(i.split('_')[:2])
    read_file = pd.read_csv(os.path.join(annotation_path, i))
    rows = read_file[['Title', 'central_entity', 'type']].itertuples(index=False, name=None)
    for title, entity, types in rows:
        clean_title = title.strip()
        entity_dict[file_name][clean_title].append(entity.strip())
        # Strip before upper-casing, exactly as is done for titles and entities, so that
        # stray whitespace around a type label is not counted as a disagreement.
        type_dict[file_name][clean_title].append(types.strip().upper())

# Assertions to check if we have the exact number of annotations
# (these count the distinct titles collected for each annotator pair).
assert len(type_dict['anshul_wei']) == 165
assert len(type_dict['anshul_chao']) == 170
assert len(type_dict['chao_wei']) == 165
assert len(entity_dict['anshul_wei']) == 165
assert len(entity_dict['anshul_chao']) == 170
assert len(entity_dict['chao_wei']) == 165
print('success!')

success!


In [3]:
def _pair_triples(annotations_by_pair):
    """Flatten ``{pair: {title: [label_1, label_2]}}`` into ``(coder, item, label)`` triples.

    Each title contributes two triples: its first label under coder ``'c1'`` and
    its second label under coder ``'c2'``.

    The item id is ``'<pair>/<title>'`` rather than the bare title. Triples from
    all pairs are pooled into one list, so a title that occurs under more than
    one pair would otherwise produce duplicate (coder, item) entries and be
    treated as a single item downstream.

    Raises:
        AssertionError: if a title does not have exactly two labels, because
            only the first two would be compared and the rest silently ignored.
    """
    triples = []
    for pair, labels_by_title in annotations_by_pair.items():
        for title, labels in labels_by_title.items():
            assert len(labels) == 2, (
                'expected exactly 2 annotations for %r in %r, got %d'
                % (title, pair, len(labels))
            )
            item = pair + '/' + title
            triples.append(('c1', item, labels[0]))
            triples.append(('c2', item, labels[1]))
    return triples


entity_triples = _pair_triples(entity_dict)
type_triples = _pair_triples(type_dict)
# Assertions to check if we have the exact number of annotations
# (two triples per title, summed over every annotator pair).
assert len(entity_triples) == 1000
assert len(type_triples) == 1000
print('success!')

success!


The inter-annotator agreement measures:  
For the **central entity** annotation, we use percentage agreement (the observed agreement averaged over all coders and items), because the central-entity labels do not follow any particular distribution, so no chance correction is applied.  
For the central entity **type** annotation, we use Scott's π, which corrects for chance agreement; this matters here because all annotators observed that the label 'FILM' occurs far more often than the other labels.

In [4]:
# Wrap each list of (coder, item, label) triples in an NLTK AnnotationTask.
annotation_entity = AnnotationTask(entity_triples)
annotation_type = AnnotationTask(type_triples)

# Central entity: average observed agreement; entity type: Scott's pi.
# Both scores are rounded to two decimals for reporting.
entity_agreement = round(annotation_entity.avg_Ao(), 2)
type_agreement = round(annotation_type.pi(), 2)

print('The interannotator agreement for entity annotation is:', entity_agreement)
print('The interannotator agreement for type annotation is:', type_agreement)

The interannotator agreement for entity annotation is: 0.94
The interannotator agreement for type annotation is: 0.87


The inter-annotator agreement is 0.94 (percentage agreement) for the central entity annotation and 0.87 (Scott's π) for the type annotation. Both values are high, which indicates that the annotation is very reliable.  
All annotators followed the [annotation plan](https://github.ubc.ca/mds-cl-2021-22/523_group_9/blob/master/milestone_2/annotation_plan.ipynb) to ensure annotation quality.