In [2]:
import os

from sklearn.metrics import cohen_kappa_score
import itertools

from collections import defaultdict

import re

import numpy as np

In [3]:
# Root folder of the annotation export, relative to the notebook's working
# directory. The loading cell expects one sub-folder per annotator inside it.
DATA_PATH = os.path.join(os.pardir, 'data_2', 'Keep')

In [4]:
# doc_annotations[doc_name][author_folder] -> raw content of that author's
# "<doc_name>.ann" file.
doc_annotations = {}
# text_lengths[doc_name] -> number of characters in "<doc_name>.txt".
text_lengths = {}

for author_folder in os.listdir(DATA_PATH):
    full_path = os.path.join(DATA_PATH, author_folder)
    # Stray files next to the annotator folders (e.g. ".DS_Store") would make
    # os.listdir(full_path) raise, so only directories are visited.
    if not os.path.isdir(full_path):
        continue

    for filename in os.listdir(full_path):
        if not filename.endswith('.ann'):
            continue

        doc_name = os.path.splitext(filename)[0]
        annotations_for_doc = doc_annotations.setdefault(doc_name, {})

        # Defensive invariant: one annotation file per (author, document).
        if author_folder in annotations_for_doc:
            raise Exception(f'Author "{author_folder}" has duplicated annotation for document "{doc_name}"')

        # The encoding is pinned because span offsets are character offsets:
        # decoding with a different platform default would shift them.
        # NOTE(review): presumably these are brat standoff files, which are
        # UTF-8 -- confirm against the annotation tool's export settings.
        with open(os.path.join(full_path, filename), 'r', encoding='utf-8') as file_handler:
            annotations_for_doc[author_folder] = file_handler.read()

        # The length is taken from the first annotator folder that contains
        # the document; this assumes every annotator's copy of the .txt file
        # is identical -- TODO confirm.
        if doc_name not in text_lengths:
            with open(os.path.join(full_path, f'{doc_name}.txt'), 'r', encoding='utf-8') as txt_file_handler:
                text_lengths[doc_name] = len(txt_file_handler.read())

In [5]:
# Agreement needs at least two annotators, so documents that only a single
# author annotated are dropped here.
valid_annotations = dict(
    filter(lambda item: len(item[1]) >= 2, doc_annotations.items())
)

In [6]:
def process_annotations(annotations_str: str, text_length: int):
    """Turn standoff annotations into one label per character of the text.

    Every line of ``annotations_str`` is split on tabs. Lines whose first
    field contains ``T<digits>`` are treated as span annotations whose second
    field is ``LABEL START END`` or, for discontinuous spans,
    ``LABEL START END;START END;...``. All other lines are ignored.

    Args:
        annotations_str: Full content of one annotation file.
        text_length: Number of characters in the annotated text.

    Returns:
        A list with ``text_length`` entries: the span label for characters
        covered by an annotation (a later span overwrites an earlier one) and
        ``'O'`` for all remaining characters.

    Raises:
        ValueError: If a span line has no second tab-separated field, a span
            fragment is not exactly ``START END``, or an offset is not an
            integer.
        IndexError: If a non-empty span reaches outside ``[0, text_length]``.
    """
    result = ['O'] * text_length

    for annotation in annotations_str.split('\n'):
        annotation_parts = annotation.split('\t')
        # Only lines with a "T<number>" identifier carry character spans.
        if not re.search(r'T[0-9]+', annotation_parts[0]):
            continue

        if len(annotation_parts) < 2:
            raise ValueError(
                f'Malformed annotation line (missing span field): {annotation!r}')

        # "LABEL 0 5;10 15" -> label "LABEL", fragments "0 5" and "10 15".
        label, _, spans = annotation_parts[1].partition(' ')
        for span in spans.split(';'):
            offsets = span.split(' ')
            if len(offsets) != 2:
                raise ValueError(
                    f'Malformed span {span!r} in annotation line: {annotation!r}')

            start_pos, end_pos = int(offsets[0]), int(offsets[1])
            # Reject out-of-range spans up front: a negative start would
            # silently wrap around to the end of the list, and an oversized
            # end would otherwise grow the list in the slice assignment.
            if start_pos < end_pos and (start_pos < 0 or end_pos > text_length):
                raise IndexError(
                    f'Span {start_pos}-{end_pos} is outside the text '
                    f'(length {text_length}): {annotation!r}')

            # Empty or reversed spans (start >= end) label nothing.
            result[start_pos:end_pos] = [label] * (end_pos - start_pos)

    return result

In [14]:
# comparisons[a][b] collects one Cohen's kappa per document that both a and b
# annotated. Nested defaultdicts mean a pair of annotators that never shared a
# document yields an empty list instead of a KeyError when it is looked up.
comparisons = defaultdict(lambda: defaultdict(list))

for doc_key, annotations_per_author in valid_annotations.items():
    # Per-character label sequence of every annotator for this document.
    processed_annotations_per_author = {
        author: process_annotations(annotations, text_lengths[doc_key])
        for author, annotations in annotations_per_author.items()
    }
    authors = list(processed_annotations_per_author)

    # The diagonal is filled with 0 as a placeholder: it is not a computed
    # kappa, it only keeps every cell of the table populated.
    for author in authors:
        comparisons[author][author].append(0)

    # Visit each unordered pair once; kappa is stored in both directions so
    # the resulting table is symmetric without computing the score twice.
    for author_1, author_2 in itertools.combinations(authors, 2):
        kappa = cohen_kappa_score(
            processed_annotations_per_author[author_1],
            processed_annotations_per_author[author_2])

        comparisons[author_1][author_2].append(kappa)
        comparisons[author_2][author_1].append(kappa)


In [24]:
# Tab-separated agreement matrix: a header row with the annotator names, then
# one row per annotator holding the mean kappa against every other annotator,
# rounded to two decimals. Each value is followed by a tab.
names = list(comparisons)
print('\t' + '\t'.join(names))
for row_author in names:
    cells = [
        str(round(np.mean(comparisons[row_author][col_author]), 2))
        for col_author in names
    ]
    print(row_author + '\t' + ''.join(cell + '\t' for cell in cells))

	Bert	Silja	Jonas	Emma	Roos	Yolien
Bert	0.0	0.63	0.55	0.65	0.44	0.55	
Silja	0.63	0.0	0.65	0.58	0.52	0.59	
Jonas	0.55	0.65	0.0	0.57	0.52	0.54	
Emma	0.65	0.58	0.57	0.0	0.72	0.6	
Roos	0.44	0.52	0.52	0.72	0.0	0.53	
Yolien	0.55	0.59	0.54	0.6	0.53	0.0	
