In [41]:
import os

from sklearn.metrics import cohen_kappa_score
import itertools

from collections import defaultdict

import re

import numpy as np
import math

In [12]:
DATA_PATH = os.path.join('..', 'data_2', 'Keep')

In [72]:
def read_data():
    doc_annotations = {}
    text_lengths = {}

    authors_set = set()

    for author_folder in os.listdir(DATA_PATH):
        authors_set.add(author_folder)
        full_path = os.path.join(DATA_PATH, author_folder)
        ann_files = list(filter(lambda x: x.endswith('.ann'), os.listdir(full_path)))
        for filename in ann_files:
            doc_name = os.path.splitext(filename)[0]
            if doc_name not in doc_annotations.keys():
                doc_annotations[doc_name] = {}

            if author_folder in doc_annotations[doc_name].keys():
                raise Exception(f'Author "{author_folder}" has duplicated annotation for document "{doc_name}"')

            with open(os.path.join(full_path, filename), 'r') as file_handler:
                doc_annotations[doc_name][author_folder] = file_handler.read()

            if doc_name not in text_lengths.keys():
                with open(os.path.join(full_path, f'{doc_name}.txt'), 'r') as txt_file_handler:
                    text_lengths[doc_name] = len(txt_file_handler.read())

    return doc_annotations, text_lengths, authors_set

In [73]:
doc_annotations, text_lengths, authors_set = read_data()

In [74]:
valid_annotations = {
    k: v for k, v in doc_annotations.items()
    if len(v.keys()) > 1
}

In [88]:
# def process_annotations(annotations_str: str, text_length: int):
#     result = ['O' for _ in range(text_length)]
#     # entities_positions = {}

#     annotations = annotations_str.split('\n')
#     for annotation in annotations:
#         annotation_parts = annotation.split('\t')
#         if not re.search('([T]{1}[0-9]+)', annotation_parts[0]): continue

#         if len(annotation_parts) < 2: print(annotation_parts)
#         split_annotation = annotation_parts[1].split(' ')
#         label = split_annotation[0]
#         current_annotations_parts = ' '.join(split_annotation[1:]).split(';')
#         for current_annotations_part in current_annotations_parts:
#             current_annotations = current_annotations_part.split(' ')
#             if len(current_annotations) > 3: print(current_annotations)

#             start_pos, end_pos = current_annotations
#             for i in range(int(start_pos), int(end_pos)):
#                 result[i] = label

#     return result

In [89]:
class Annotation:
    def __init__(self, key: str, entity: str, start_pos: int, end_pos: int):
        self.key = key
        self.entity = entity
        self.start_pos = start_pos
        self.end_pos = end_pos

    def __str__(self):
        return f'<{self.key}-{self.entity}-[{self.start_pos}:{self.end_pos}]>'

In [196]:
class DocumentAnnotation:
    def __init__(self, doc_key: str, annotations_str: str, text_length: int):
        self.doc_key = doc_key
        self.text_length = text_length

        self.annotations = self._parse_annotations(annotations_str, text_length)

    def _parse_annotations(self, annotations_str: str, text_length: int):
        result = []
        annotations = annotations_str.split('\n')
        for annotation in annotations:
            annotation_parts = annotation.split('\t')
            if not re.search('([T]{1}[0-9]+)', annotation_parts[0]): continue
            if len(annotation_parts) < 2: print(annotation_parts)

            split_annotation = annotation_parts[1].split(' ')
            label = split_annotation[0]
            current_annotations_parts = ' '.join(split_annotation[1:]).split(';')

            start = int(current_annotations_parts[0].split(' ')[0])
            end = int(current_annotations_parts[-1].split(' ')[-1])

            result.append(
                Annotation(
                    key=annotation_parts[0],
                    entity=label,
                    start_pos=start,
                    end_pos=end))

        return result

In [197]:
def parse_annotations(valid_annotations, text_lengths):
    parsed_annotations = {
        doc_key: {
            author: DocumentAnnotation(doc_key, annotations, text_lengths[doc_key])
            for author, annotations in annotations_per_author.items()
        }
        for doc_key, annotations_per_author in valid_annotations.items()
    }

    return parsed_annotations

In [198]:
parsed_annotations = parse_annotations(valid_annotations, text_lengths)

In [199]:
str(parsed_annotations['NL-HaNA_1.04.02_6857_0037']['Bert'].annotations[0])

'<T1-Person-[286:294]>'

In [200]:
def annotations_overlap(annotation1: Annotation, annotation2: Annotation, offset_chars: int) -> bool:
    # if the two annotations do not even overlap with one character, we return false
    if (annotation1.start_pos > annotation2.end_pos or
        annotation1.end_pos < annotation2.start_pos):
        return False
  
    out_of_boundary_chars = abs(annotation1.start_pos - annotation2.start_pos) + abs(annotation1.end_pos - annotation2.end_pos)

    result = out_of_boundary_chars <= offset_chars
    return result

In [232]:
def calculate_entity_overlap(doc_annotation1: DocumentAnnotation, doc_annotation2: DocumentAnnotation, offset_chars: int):
    assert (doc_annotation1.doc_key == doc_annotation2.doc_key)

    mapped_annotations = {}
    used_counter_annotations = set()
    empty_positions = [1 for _ in range(0, doc_annotation1.text_length)]
    for annotation in doc_annotation1.annotations:
        for i in range(annotation.start_pos, annotation.end_pos):
            empty_positions[i] = 0

        mapped_annotations[annotation] = None
        for annotation2 in doc_annotation2.annotations:

            if annotation2.start_pos > annotation.end_pos:
                break

            if annotation.start_pos > annotation2.end_pos:
                continue

            if annotation2.key in used_counter_annotations:
                continue

            if annotations_overlap(annotation, annotation2, offset_chars):
                used_counter_annotations.add(annotation2.key)
                mapped_annotations[annotation] = annotation2
                break


    annotation_maps = [ x.entity for x in mapped_annotations.keys() ]
    counter_annotations = [ x.entity if x is not None else 'O' for x in mapped_annotations.values() ]

    for annotation2 in doc_annotation2.annotations:
        for i in range(annotation2.start_pos, annotation2.end_pos):
            empty_positions[i] = 0

        if annotation2.key in used_counter_annotations:
            continue

        for annotation in doc_annotation1.annotations:
            if ((annotation2.start_pos > annotation.start_pos and annotation2.start_pos < annotation.end_pos) or
                (annotation2.end_pos > annotation.start_pos and annotation2.end_pos < annotation.end_pos)):

                annotation_maps.append('O')
                counter_annotations.append(annotation2.entity)
                break

    free_positions = sum(empty_positions)
    for _ in range(free_positions):
        annotation_maps.append('O')
        counter_annotations.append('O')

    if annotation_maps == counter_annotations:
        result = 1
    else:
        result = cohen_kappa_score(annotation_maps, counter_annotations, weights='quadratic')

    # if result < 0:
    #     print(result)
    #     print(len(annotation_maps))
    #     print(annotation_maps)
    #     print(len(counter_annotations))
    #     print(counter_annotations)
    #     raise Exception('test')

    return result

In [233]:
def create_comparison_matrix(parsed_annotations, offset_chars: int, authors_set: set):
    comparisons = {
        author_1 : {
            author_2: []
            for author_2 in authors_set
        } for author_1 in authors_set
    }

    for _, annotations in parsed_annotations.items():
        for author_1, author_2 in itertools.product(authors_set, authors_set):
            kappa_score = 0

            if author_1 != author_2 and author_1 in annotations.keys() and author_2 in annotations.keys():
                kappa_score = calculate_entity_overlap(annotations[author_1], annotations[author_2], offset_chars)

            comparisons[author_1][author_2].append(kappa_score)

    return comparisons

In [234]:
# comparisons = {
#     author_1 : {
#         author_2: []
#         for author_2 in authors_set
#     } for author_1 in authors_set
# }

# for doc_key, annotations_per_author in valid_annotations.items():
#     processed_annotations_per_author = {}
#     for author, annotations in annotations_per_author.items():
#         processed_annotations_per_author[author] = parse_annotations(annotations, text_lengths[doc_key])
#         if author not in comparisons.keys():
#             comparisons[author] = {}

    # for author_1, author_2 in itertools.product(authors_set, authors_set):
    #     if author_1 == author_2:
    #         kappa = 0
    #     else:
    #         kappa = cohen_kappa_score(
    #             processed_annotations_per_author[author_1],
    #             processed_annotations_per_author[author_2])

    #     comparisons[author_1][author_2].append(kappa)
    #     comparisons[author_2][author_1].append(kappa)


In [235]:
def print_comparison_matrix(comparisons):
    names = list(comparisons.keys())
    print('\t', end='')
    print('\t'.join(names))
    for author_1 in names:
        print(author_1, end='\t')
        for author_2 in names:
            print(round(np.mean(comparisons[author_1][author_2]), 2), end='\t')

        print()

In [236]:
def run_comparison(parsed_annotations, authors_set, offset_chars: int):
    comparison_matrix = create_comparison_matrix(parsed_annotations, offset_chars, authors_set)
    print_comparison_matrix(comparison_matrix)

In [237]:
run_comparison(parsed_annotations, authors_set, offset_chars=0)

	Silja	Emma	Yolien	Jonas	Roos	Bert
Silja	0.0	0.13	0.09	0.12	0.15	0.04	
Emma	0.11	0.0	0.07	0.1	0.12	0.02	
Yolien	0.1	0.08	0.0	0.06	0.13	0.04	
Jonas	0.1	0.12	0.05	0.0	0.12	0.04	
Roos	0.17	0.12	0.12	0.11	0.0	0.03	
Bert	0.03	0.02	0.05	0.03	0.04	0.0	


In [238]:
run_comparison(parsed_annotations, authors_set, offset_chars=5000)

	Silja	Emma	Yolien	Jonas	Roos	Bert
Silja	0.0	0.16	0.13	0.15	0.18	0.05	
Emma	0.15	0.0	0.1	0.13	0.15	0.03	
Yolien	0.12	0.11	0.0	0.07	0.16	0.05	
Jonas	0.13	0.15	0.07	0.0	0.14	0.05	
Roos	0.21	0.15	0.16	0.13	0.0	0.05	
Bert	0.04	0.03	0.07	0.05	0.05	0.0	
