In [280]:
import os

from sklearn.metrics import cohen_kappa_score
import itertools

from collections import defaultdict

import re

from typing import Dict, List

import numpy as np
import math

In [123]:
DATA_PATH = os.path.join('..', 'data_2', 'Keep')

In [385]:
def read_data():
    doc_annotations = {}
    text_lengths = {}

    authors_set = set()

    for author_folder in os.listdir(DATA_PATH):
        authors_set.add(author_folder)
        full_path = os.path.join(DATA_PATH, author_folder)
        ann_files = list(filter(lambda x: x.endswith('.ann'), os.listdir(full_path)))
        for filename in ann_files:
            doc_name = os.path.splitext(filename)[0]
            if doc_name not in doc_annotations.keys():
                doc_annotations[doc_name] = {}

            if author_folder in doc_annotations[doc_name].keys():
                raise Exception(f'Author "{author_folder}" has duplicated annotation for document "{doc_name}"')

            with open(os.path.join(full_path, filename), 'r') as file_handler:
                doc_annotations[doc_name][author_folder] = file_handler.read()

            if doc_name not in text_lengths.keys():
                with open(os.path.join(full_path, f'{doc_name}.txt'), 'r') as txt_file_handler:
                    text_lengths[doc_name] = len(txt_file_handler.read())

    return doc_annotations, text_lengths, authors_set

In [558]:
doc_annotations, text_lengths, authors_set = read_data()

In [559]:
# i = 0
# for a, b in doc_annotations.items():
#     if len(b.keys()) > 1 and 'Bert' not in b.keys():
#         i += 1
#         if i > 1:
#             print(a)
#             break

# key = 'NL-HaNA_1.04.02_6863_0208'
# doc_annotations = {
#     k: v for k, v in doc_annotations.items() if k == key
# }

# print(doc_annotations[key].keys())

In [560]:
# print('\n'.join([x for x in doc_annotations[key]['Bert'].split('\n') if x.startswith('T')]))

In [561]:
# print('\n'.join([x for x in doc_annotations[key]['Roos'].split('\n') if x.startswith('T')]))

In [562]:
valid_annotations = {
    k: v for k, v in doc_annotations.items()
    if len(v.keys()) > 1
}

In [563]:
# def process_annotations(annotations_str: str, text_length: int):
#     result = ['O' for _ in range(text_length)]
#     # entities_positions = {}

#     annotations = annotations_str.split('\n')
#     for annotation in annotations:
#         annotation_parts = annotation.split('\t')
#         if not re.search('([T]{1}[0-9]+)', annotation_parts[0]): continue

#         if len(annotation_parts) < 2: print(annotation_parts)
#         split_annotation = annotation_parts[1].split(' ')
#         label = split_annotation[0]
#         current_annotations_parts = ' '.join(split_annotation[1:]).split(';')
#         for current_annotations_part in current_annotations_parts:
#             current_annotations = current_annotations_part.split(' ')
#             if len(current_annotations) > 3: print(current_annotations)

#             start_pos, end_pos = current_annotations
#             for i in range(int(start_pos), int(end_pos)):
#                 result[i] = label

#     return result

In [564]:
class Annotation:
    def __init__(self, key: str, entity: str, start_pos: int, end_pos: int):
        self.key = key
        self.entity = entity
        self.start_pos = start_pos
        self.end_pos = end_pos

    def __str__(self):
        return f'<{self.key}-{self.entity}-[{self.start_pos}:{self.end_pos}]>'

In [565]:
invalid_labels = ['DuplicatePage', 'TranscriptionError_Document']

class DocumentAnnotation:
    def __init__(self, doc_key: str, annotations_str: str, text_length: int):
        self.doc_key = doc_key
        self.text_length = text_length
        self.is_valid=True

        self.annotations = self._parse_annotations(annotations_str)

    def _parse_annotations(self, annotations_str: str):
        result = []
        annotations = annotations_str.split('\n')
        for annotation in annotations:
            annotation_parts = annotation.split('\t')
            if not re.search('([T]{1}[0-9]+)', annotation_parts[0]): continue
            if len(annotation_parts) < 2: print(annotation_parts)

            split_annotation = annotation_parts[1].split(' ')
            label = split_annotation[0]

            if label in invalid_labels:
                self.is_valid=False
                continue

            current_annotations_parts = ' '.join(split_annotation[1:]).split(';')

            start = int(current_annotations_parts[0].split(' ')[0])
            end = int(current_annotations_parts[-1].split(' ')[-1])

            result.append(
                Annotation(
                    key=annotation_parts[0],
                    entity=label,
                    start_pos=start,
                    end_pos=end))

        return result

In [566]:
def parse_annotations(valid_annotations, text_lengths):
    parsed_annotations = {
        doc_key: {
            author: DocumentAnnotation(doc_key, annotations, text_lengths[doc_key])
            for author, annotations in annotations_per_author.items()
        }
        for doc_key, annotations_per_author in valid_annotations.items()
    }

    # Remove invalid annotations
    parsed_annotations = {
        doc_key: {
            author: doc_annotation
            for author, doc_annotation in annotations_per_author.items()
            if doc_annotation.is_valid
        }
        for doc_key, annotations_per_author in parsed_annotations.items()
    }

    # Remove documents where we are left with only one annotation
    parsed_annotations = {
        doc_key: annotations_per_author
        for doc_key, annotations_per_author in parsed_annotations.items()
        if len(annotations_per_author.keys()) > 1
    }

    return parsed_annotations

In [567]:
parsed_annotations = parse_annotations(valid_annotations, text_lengths)

In [568]:
def annotations_overlap(annotation1: Annotation, annotation2: Annotation, offset_chars: int, match_entity: bool) -> bool:
    # if the two annotations do not even overlap with one character, we return false
    if (annotation1.start_pos > annotation2.end_pos or
        annotation1.end_pos < annotation2.start_pos):
        return False

    if match_entity and (annotation1.entity != annotation2.entity):
        return False

    out_of_boundary_chars = abs(annotation1.start_pos - annotation2.start_pos) + abs(annotation1.end_pos - annotation2.end_pos)

    result = out_of_boundary_chars <= offset_chars
    return result

In [569]:
def print_mapped_annotations(mapped_annotations: dict):
    for ann1, ann2 in mapped_annotations.items():
        print(f'{ann1.key} <{ann1.start_pos}-{ann1.end_pos}>  ---', end='')
        if ann2 is None:
            print('NONE')
        else:
            print(f'{ann2.key} <{ann2.start_pos}-{ann2.end_pos}>')

In [570]:
def get_overlapping_annotation(annotation_to_compare: Annotation, annotations: List[Annotation], keys_to_skip: List[str], offset_chars: int, match_entity: bool) -> Annotation:
    overlaps = []
    for annotation2 in annotations:

        if annotation_to_compare.start_pos > annotation2.end_pos:
            continue

        if annotation2.key in keys_to_skip:
            continue

        if annotations_overlap(annotation_to_compare, annotation2, offset_chars, match_entity):
            overlaps.append(annotation2)

    if len(overlaps) == 0:
        return None

    for overlap in overlaps:
        if overlap.entity == annotation_to_compare.entity:
            return overlap

    return overlaps[0]

In [571]:
def calculate_entity_overlap(doc_annotation1: DocumentAnnotation, doc_annotation2: DocumentAnnotation, offset_chars: int, debug: bool = False):
    assert (doc_annotation1.doc_key == doc_annotation2.doc_key)

    mapped_annotations = {}
    used_counter_annotations = set()
    empty_positions = [1 for _ in range(0, doc_annotation1.text_length)]
    for annotation in doc_annotation1.annotations:
        for i in range(annotation.start_pos, annotation.end_pos):
            empty_positions[i] = 0

    # Perform iteration using strict overlap matching
    for annotation in doc_annotation1.annotations:
        overlapping_annotation = get_overlapping_annotation(annotation, doc_annotation2.annotations, used_counter_annotations, offset_chars, match_entity=True)
        if overlapping_annotation is not None:
            used_counter_annotations.add(overlapping_annotation.key)
            mapped_annotations[annotation] = overlapping_annotation

    # Perform iteration using loose overlap matching
    for annotation in doc_annotation1.annotations:
        if annotation in mapped_annotations.keys():
            continue

        overlapping_annotation = get_overlapping_annotation(annotation, doc_annotation2.annotations, used_counter_annotations, offset_chars, match_entity=False)
        if overlapping_annotation is not None:
            used_counter_annotations.add(overlapping_annotation.key)

        mapped_annotations[annotation] = overlapping_annotation

    if debug:
        print_mapped_annotations(mapped_annotations)
    annotation_maps = [ x.entity for x in mapped_annotations.keys() ]
    counter_annotations = [ x.entity if x is not None else 'O' for x in mapped_annotations.values() ]

    for annotation2 in doc_annotation2.annotations:
        for i in range(annotation2.start_pos, annotation2.end_pos):
            empty_positions[i] = 0

        if annotation2.key in used_counter_annotations:
            continue

        # for annotation in doc_annotation1.annotations:
        #     if ((annotation2.start_pos > annotation.start_pos and annotation2.start_pos < annotation.end_pos) or
        #         (annotation2.end_pos > annotation.start_pos and annotation2.end_pos < annotation.end_pos)):

        annotation_maps.append('O')
        counter_annotations.append(annotation2.entity)
        #         break

    free_positions = sum(empty_positions)
    for _ in range(free_positions):
        annotation_maps.append('O')
        counter_annotations.append('O')

    if annotation_maps == counter_annotations:
        result = 1
    else:
        result = cohen_kappa_score(annotation_maps, counter_annotations)

    return result

In [584]:
def create_comparison_matrix(parsed_annotations, offset_chars: int, authors_set: set):
    comparisons = {
        author_1 : {
            author_2: []
            for author_2 in authors_set
        } for author_1 in authors_set
    }

    for _, annotations in parsed_annotations.items():
        for author_1, author_2 in itertools.product(authors_set, authors_set):
            kappa_score = 0

            if author_1 != author_2 and author_1 in annotations.keys() and author_2 in annotations.keys():
                debug=False#(author_2=='Roos' and author_1=='Bert')
                kappa_score = calculate_entity_overlap(annotations[author_1], annotations[author_2], offset_chars, debug=debug)
                # if debug:
                #     for a in annotations[author_1].annotations:
                #         print(a)
                # if kappa_score < 0.2:
                #     print(annotations[author_1].doc_key)
                #     print(author_1)
                #     print(author_2)
                #     raise Exception('test')
                # print(f'<{author_1}, {author_2}, {kappa_score}>')

            comparisons[author_1][author_2].append(kappa_score)

    return comparisons

In [585]:
# comparisons = {
#     author_1 : {
#         author_2: []
#         for author_2 in authors_set
#     } for author_1 in authors_set
# }

# for doc_key, annotations_per_author in valid_annotations.items():
#     processed_annotations_per_author = {}
#     for author, annotations in annotations_per_author.items():
#         processed_annotations_per_author[author] = parse_annotations(annotations, text_lengths[doc_key])
#         if author not in comparisons.keys():
#             comparisons[author] = {}

    # for author_1, author_2 in itertools.product(authors_set, authors_set):
    #     if author_1 == author_2:
    #         kappa = 0
    #     else:
    #         kappa = cohen_kappa_score(
    #             processed_annotations_per_author[author_1],
    #             processed_annotations_per_author[author_2])

    #     comparisons[author_1][author_2].append(kappa)
    #     comparisons[author_2][author_1].append(kappa)


In [586]:
def print_comparison_matrix(comparisons):
    names = list(comparisons.keys())
    print('\t', end='')
    print('\t'.join(names))
    for author_1 in names:
        print(author_1, end='\t')
        for author_2 in names:
            print(round(np.mean(comparisons[author_1][author_2]), 2), end='\t')

        print()

In [587]:
def run_comparison(parsed_annotations, authors_set, offset_chars: int):
    comparison_matrix = create_comparison_matrix(parsed_annotations, offset_chars, authors_set)
    print_comparison_matrix(comparison_matrix)

In [588]:
run_comparison(parsed_annotations, authors_set, offset_chars=0)

	Roos	Emma	Silja	Bert	Jonas	Yolien
Roos	0.0	0.15	0.15	0.02	0.1	0.08	
Emma	0.15	0.0	0.1	0.03	0.12	0.05	
Silja	0.15	0.1	0.0	0.01	0.06	0.08	
Bert	0.02	0.03	0.01	0.0	0.03	0.02	
Jonas	0.1	0.12	0.06	0.03	0.0	0.04	
Yolien	0.08	0.05	0.08	0.02	0.04	0.0	


In [589]:
run_comparison(parsed_annotations, authors_set, offset_chars=50)

	Roos	Emma	Silja	Bert	Jonas	Yolien
Roos	0.0	0.19	0.2	0.02	0.13	0.13	
Emma	0.19	0.0	0.18	0.05	0.19	0.09	
Silja	0.2	0.18	0.0	0.02	0.09	0.13	
Bert	0.02	0.05	0.02	0.0	0.05	0.03	
Jonas	0.13	0.19	0.09	0.05	0.0	0.06	
Yolien	0.13	0.09	0.13	0.03	0.06	0.0	


In [590]:
run_comparison(parsed_annotations, authors_set, offset_chars=500)

	Roos	Emma	Silja	Bert	Jonas	Yolien
Roos	0.0	0.19	0.2	0.02	0.13	0.13	
Emma	0.19	0.0	0.18	0.05	0.19	0.09	
Silja	0.2	0.18	0.0	0.02	0.09	0.13	
Bert	0.02	0.05	0.02	0.0	0.05	0.03	
Jonas	0.13	0.19	0.09	0.05	0.0	0.07	
Yolien	0.13	0.09	0.13	0.03	0.07	0.0	
