In [2]:
import os
import csv
from tqdm import tqdm
import re
import pandas as pd
from typing import Dict

In [3]:
OUTPUT_DIR = os.path.join('..', 'processed_data')
DATA_PATH = os.path.join('..', 'data', 'annotated_data')

In [4]:
def should_use_file(filepath) -> bool:
    # if file path is wrong for some reason, do not use it
    if not os.path.exists(filepath):
        return False

    file_stats = os.stat(filepath)

    # if file is empty, do not use it
    if file_stats.st_size == 0:
        return False

    return True

In [5]:
def get_filepaths(data_path, folders_to_ignore = []):
    result = []
    for folder_name in os.listdir(data_path):
        if folder_name in folders_to_ignore:
            continue

        folder_path = os.path.join(data_path, folder_name)

        if not os.path.isdir(folder_path): # it's a file
            if folder_path.endswith('.ann') and should_use_file(folder_path): # only work with .ann files
                result.append(os.path.splitext(folder_path)[0])

            continue

        sub_paths = get_filepaths(folder_path)
        if len(sub_paths) > 0:
            result.extend(sub_paths)

    return result

In [6]:
filepaths = get_filepaths(DATA_PATH, folders_to_ignore=['6847', 'Charles'])

In [7]:
def validate_line(line_text):
    '''
        Validate if a line is not marked as 'transcription error' or as a duplicated one.
        If any of those is true, this whole document must be skipped
    '''
    invalid = line_text.startswith('TranscriptionError') or line_text.startswith('DuplicatePage')
    return not invalid

In [8]:
class Constants():
    Empty = 'O'
    Beginning = 'B-'
    Inside = 'I-'

    MainEntityPrefix = 'T'
    SubEntityPrefix = 'A'

In [9]:
def process_annotation_file(filepath) -> Dict[int, Dict[str, str]]:
    result = []
    # Check for lines starting with TN, where N is a numeric value
    main_regex = re.compile(f'^[{Constants.MainEntityPrefix}][1-9]+')
    sub_regex = re.compile(f'^[{Constants.SubEntityPrefix}][1-9]+')

    length_limit = 19
    intervals = []
    long_annotations = []

    current_annotation = None

    with open(f'{filepath}.ann', 'r', encoding='utf-8') as file_handle:
        file_lines = file_handle.readlines()

        for file_line in file_lines:
            split_line = file_line.split('\t')
            line_key = split_line[0]

            if not main_regex.match(line_key): # Main entity type

                # if current_pos == (1388, 1503) and 'NL-HaNA_1.04.02_6848_0120' in filepath and current_entity == 'T15':
                #     print(current_entity)
                #     print(current_annotation)

                if not sub_regex.match(line_key) and len(split_line) == 1:
                    current_annotation += file_line
                elif current_annotation is not None:
                    words = current_annotation.replace('\n', ' ').split()
                    if len(words) >= length_limit:
                        long_annotation = (filepath, current_pos, current_entity, current_annotation)
                        if long_annotation not in long_annotations:
                            long_annotations.append(long_annotation)

                continue

            if current_annotation is not None:
                words = current_annotation.replace('\n', ' ').split()
                if len(words) >= length_limit:
                    long_annotation = (filepath, current_pos, current_entity, current_annotation)
                    if long_annotation not in long_annotations:
                        long_annotations.append(long_annotation)

            assert len(split_line) > 1, f'File line is invalid. Not enough tokens were found\n - Original split line: {split_line}\n - Filepath: "{filepath}"'

            # Skip documents that are not valid
            if not validate_line(split_line[1]):
                return None

            annotation = split_line[1].split()

            # some positions are doubled, e.g. '100 110; 111 120'
            positions = [[int(pos) for pos in x.strip().split()] for x in ' '.join(annotation[1:]).split(';')]

            start_pos = positions[0][0]
            end_pos = positions[-1][1]
            if start_pos > end_pos:
                print(filepath)
                print(positions)
                current_start_pos = None
                current_annotation = None
            else:
                interval = pd.Interval(start_pos, end_pos, closed='both')
                intervals.append((interval, line_key))

                current_pos = (start_pos, end_pos)
                current_entity = line_key
                current_annotation = split_line[-1]

                # if current_pos == (1388, 1503) and 'NL-HaNA_1.04.02_6848_0120' in filepath and current_entity == 'T15':
                #     print(current_entity)
                #     print(current_annotation)

    if current_annotation is not None:
        words = current_annotation.split()
        if len(words) >= length_limit:
            long_annotation = (filepath, current_pos, current_entity, current_annotation)
            if long_annotation not in long_annotations:
                long_annotations.append(long_annotation)


    for i, (interval_1, line_key_1) in enumerate(intervals):
        for k, (interval_2, line_key_2) in enumerate(intervals):
            if k <= i : continue
            if not interval_1.overlaps(interval_2): continue

            if interval_1.left < interval_2.left:
                if interval_2.right > interval_1.right and (interval_1, interval_2, line_key_1, line_key_2) not in result:
                    result.append((interval_1, interval_2, line_key_1, line_key_2))

            if interval_1.left > interval_2.left:
                if interval_2.right < interval_1.right and (interval_2, interval_1) not in result:
                    result.append((interval_2, interval_1, line_key_2, line_key_1))

    return result, long_annotations

def process_files(filepaths):
    result = {}
    long_annotations = []

    for filepath in tqdm(filepaths, desc='Processing files'):
        ann_result = process_annotation_file(filepath)
        if ann_result is None:
            continue

        overlapping_intervals, curr_long_annotations = ann_result
        long_annotations.extend(curr_long_annotations)
        if len(overlapping_intervals) == 0:
            continue

        result[filepath] = overlapping_intervals

    return result, long_annotations


In [10]:
intervals_by_file, long_annotations = process_files(filepaths)

Processing files:  43%|████████▌           | 941/2199 [00:00<00:00, 1933.82it/s]

../data/annotated_data/C/NL-HaNA_1.04.02_6857_0173
[[117, 134], [91, 116]]


Processing files:  60%|███████████▍       | 1321/2199 [00:00<00:00, 1853.79it/s]

../data/annotated_data/D/NL-HaNA_1.04.02_6883_0047
[[159, 196], [77, 157]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0079
[[102, 116], [81, 94]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0079
[[102, 116], [81, 94]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0073
[[105, 119], [84, 97]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0073
[[105, 119], [84, 97]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0075
[[104, 118], [83, 96]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0075
[[104, 118], [83, 96]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0077
[[106, 120], [86, 99]]
../data/annotated_data/D/NL-HaNA_1.04.02_6860_0077
[[106, 120], [86, 99]]


Processing files: 100%|███████████████████| 2199/2199 [00:01<00:00, 1867.66it/s]

../data/annotated_data/B/NL-HaNA_1.04.02_6860_0303
[[113, 129], [90, 108]]





In [11]:
for filepath, intervals in intervals_by_file.items():
    print(f'\nFilepath: "{filepath}"')
    for interval_1, interval_2, line_key_1, line_key_2 in intervals:
        print(f'  - {line_key_1:3s} <{interval_1.left:4d}, {interval_1.right:4d}> overlaps {line_key_2:3s} <{interval_2.left:4d}, {interval_2.right:4d}>')


Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6860_0219"
  - T1  <  23,   67> overlaps T2  <  40,   68>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6869_0188"
  - T19 < 766,  812> overlaps T20 < 796,  813>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6870_0190"
  - T3  <1319, 1355> overlaps T4  <1336, 1358>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6848_0100"
  - T28 < 621,  650> overlaps T8  < 622,  675>
  - T24 <1169, 1200> overlaps T25 <1178, 1201>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6870_0206"
  - T3  < 117,  140> overlaps T4  < 130,  141>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6848_0088"
  - T20 <1627, 1648> overlaps T21 <1642, 1649>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6869_0183"
  - T10 < 341,  371> overlaps T11 < 351,  372>
  - T10 < 341,  371> overlaps T14 < 351,  372>

Filepath: "../data/annotated_data/A/NL-HaNA_1.04.02_6860_0110"
  - T18 <1224, 1238> overlaps T19 <1225, 1252>

Filepath: "../dat

In [12]:
print(len(long_annotations))
for path, length, entity, text in long_annotations:
    print(f'Path: "...{path[52:]}"\n - length: {length}\n - entity: {entity}\n - text: {text}\n')

4
Path: "..."
 - length: (223, 339)
 - entity: T6
 - text: binne scheepsboort van het schip Tutpenburg zeijlende op de Z:breete van 10 9rd:s 14 me een en Lengte 104: 9: ro 21:


Path: "..."
 - length: (649, 765)
 - entity: T13
 - text: a seeker Exs beboud met een steene huijs, staande gelegen binnen dese stad, aan de oost sij„ de van de groote revier


Path: "..."
 - length: (1106, 1242)
 - entity: T20
 - text: seker Erf bebouwd met een steen pedak gemerkt numero Een staande ende gelegen bin„ 9 nen den stad aan de oost sijde van de groote revier


Path: "..."
 - length: (1649, 1822)
 - entity: T25
 - text: seeker Erf bebouwd met een steene pedak gemerkt numero vijftien staande en gelegen numen vijftien staande en gelegen binnen dese stadt aen de oost sijde van de groote revier


