In [10]:
import csv
import os
import re
from typing import Dict, List, Optional, Tuple

import pandas as pd
from tqdm import tqdm

In [11]:
# Both paths are relative, so they resolve against the notebook's current working directory.
# OUTPUT_DIR is not referenced by the cells visible here; presumably it receives processed output (TODO confirm).
OUTPUT_DIR = os.path.join('..', '..', 'processed')
# Root folder that is scanned recursively for `.ann` annotation files.
DATA_PATH = os.path.join('..', '..', '2021-national-archives-data-annotation-project', 'data')

In [12]:
def should_use_file(filepath) -> bool:
    '''
        Tell whether `filepath` points at something usable.

        A path is usable when it exists on disk and its reported size is not zero;
        a missing path or an empty file yields False.
    '''
    # The size is only queried once the path is known to exist
    return os.path.exists(filepath) and os.stat(filepath).st_size != 0

In [4]:
def get_filepaths(data_path, folders_to_ignore=None):
    '''
        Recursively collect all usable `.ann` files below `data_path`.

        Args:
            data_path: directory to walk.
            folders_to_ignore: names of directory entries to skip. The names are
                matched at every depth of the walk, not only directly under
                `data_path`. Defaults to skipping nothing.

        Returns:
            A list of file paths with the `.ann` extension stripped. Only files
            accepted by `should_use_file` are included. Entries are visited in
            sorted name order so the result is reproducible.
    '''
    # Avoid a mutable default argument; an empty tuple means "ignore nothing"
    ignored = folders_to_ignore or ()

    result = []
    # os.listdir returns entries in arbitrary order, hence the explicit sort
    for entry_name in sorted(os.listdir(data_path)):
        if entry_name in ignored:
            continue

        entry_path = os.path.join(data_path, entry_name)

        if os.path.isdir(entry_path):
            # Hand the ignore list down, otherwise nested folders could never be skipped
            result.extend(get_filepaths(entry_path, ignored))
            continue

        # only work with non-empty .ann files
        if entry_path.endswith('.ann') and should_use_file(entry_path):
            result.append(os.path.splitext(entry_path)[0])

    return result

In [5]:
# Gather the extension-less paths of the usable `.ann` files under DATA_PATH;
# '6847' and 'Charles' are passed as entry names to skip.
filepaths = get_filepaths(DATA_PATH, folders_to_ignore=['6847', 'Charles'])

In [None]:
def validate_line(line_text):
    '''
        Tell whether an annotation line is free of the 'TranscriptionError' and
        'DuplicatePage' markers.
        A False result means the whole document containing the line must be skipped
    '''
    rejected_markers = ('TranscriptionError', 'DuplicatePage')
    return not line_text.startswith(rejected_markers)

In [9]:
class Constants:
    '''
        Shared string constants used while reading annotation files.
    '''

    # Tag markers; not referenced by the code visible here
    # (presumably BIO-style tagging labels - TODO confirm)
    Empty, Beginning, Inside = 'O', 'B-', 'I-'

    # First character of an annotation line key: main entities vs. sub entities
    MainEntityPrefix, SubEntityPrefix = 'T', 'A'

In [67]:
def process_annotation_file(filepath) -> Optional[List[Tuple[pd.Interval, pd.Interval, str, str]]]:
    '''
        Find main-entity annotations in `<filepath>.ann` whose spans partially overlap.

        Only lines whose key starts with the main entity prefix followed by digits
        are considered. Each such annotation is reduced to one interval running from
        the start of its first position pair to the end of its last one. Two
        annotations are reported when their intervals overlap and one of them both
        starts and ends before the other. Nested spans and spans sharing a start
        offset are not reported.

        Args:
            filepath: path of the annotation file without the `.ann` extension.

        Returns:
            None when any main entity line is rejected by `validate_line` (the whole
            document is skipped). Otherwise a list, possibly empty, of unique tuples
            `(earlier_interval, later_interval, earlier_key, later_key)`.

        Raises:
            AssertionError: a main entity line has no tab-separated content after its key.
    '''
    # Check for lines starting with TN, where N is a numeric value.
    # [0-9] rather than [1-9] so that zero digits count as numeric too.
    main_regex = re.compile(f'^[{Constants.MainEntityPrefix}][0-9]+')

    intervals = []

    with open(f'{filepath}.ann', 'r', encoding='utf-8') as file_handle:
        for file_line in file_handle:
            split_line = file_line.split('\t')
            line_key = split_line[0]

            if not main_regex.match(line_key): # Main entity type
                continue

            assert len(split_line) > 1, f'File line is invalid. Not enough tokens were found\n - Original split line: {split_line}\n - Filepath: "{filepath}"'

            # Skip documents that are not valid
            if not validate_line(split_line[1]):
                return None

            # First token is the entity type, the remaining ones are the positions
            annotation = split_line[1].split()

            # some positions are doubled, e.g. '100 110; 111 120'
            positions = [[int(pos) for pos in x.strip().split()] for x in ' '.join(annotation[1:]).split(';')]

            start, end = positions[0][0], positions[-1][1]
            if start > end:
                # Position pairs are out of order; report the file and leave the line out
                print(filepath)
                print(positions)
                continue

            # NOTE(review): closed='both' makes spans that merely touch (end == next start)
            # count as overlapping - confirm that end offsets are meant to be inclusive.
            intervals.append((pd.Interval(start, end, closed='both'), line_key))

    result = []
    for index, (interval_1, line_key_1) in enumerate(intervals):
        # Only look at later entries so every pair is compared once
        for interval_2, line_key_2 in intervals[index + 1:]:
            if not interval_1.overlaps(interval_2):
                continue

            if interval_1.left < interval_2.left and interval_1.right < interval_2.right:
                overlap = (interval_1, interval_2, line_key_1, line_key_2)
            elif interval_2.left < interval_1.left and interval_2.right < interval_1.right:
                overlap = (interval_2, interval_1, line_key_2, line_key_1)
            else:
                # Nested spans or identical start offsets are not reported
                continue

            # Compare the complete tuple so repeated lines cannot add duplicates
            if overlap not in result:
                result.append(overlap)

    return result

def process_files(filepaths):
    '''
        Run `process_annotation_file` over every path and keep the non-empty results.

        Returns a dict mapping each file path to its list of overlapping intervals.
        Paths for which the result is None or an empty list are left out.
    '''
    overlaps_by_path = {}
    progress = tqdm(filepaths, desc='Processing files')

    for current_path in progress:
        found = process_annotation_file(current_path)

        # None or an empty result means there is nothing to record for this path
        if found is not None and len(found) != 0:
            overlaps_by_path[current_path] = found

    return overlaps_by_path


In [68]:
# Map each file path to its list of overlapping annotation pairs;
# files without any reported overlap are left out of the dict.
intervals_by_file = process_files(filepaths)

Processing files:  35%|███▌      | 1012/2889 [00:00<00:00, 2997.16it/s]

..\..\2021-national-archives-data-annotation-project\data\Jonas\6860\NL-HaNA_1.04.02_6860_0303
[[113, 129], [90, 108]]
..\..\2021-national-archives-data-annotation-project\data\Jonas\6860\NL-HaNA_1.04.02_6860_0394
[[1002, 1028], [983, 986], [987, 1000]]
..\..\2021-national-archives-data-annotation-project\data\Roos\6857\NL-HaNA_1.04.02_6857_0173
[[117, 134], [91, 116]]


Processing files:  80%|███████▉  | 2297/2889 [00:00<00:00, 3933.65it/s]

..\..\2021-national-archives-data-annotation-project\data\Silja\6860\NL-HaNA_1.04.02_6860_0077
[[106, 120], [86, 99]]
..\..\2021-national-archives-data-annotation-project\data\Silja\6860\NL-HaNA_1.04.02_6860_0077
[[106, 120], [86, 99]]


Processing files: 100%|██████████| 2889/2889 [00:00<00:00, 3684.80it/s]


In [79]:
# Print, per file, every reported pair of overlapping annotations with their keys and offsets.
for filepath, intervals in intervals_by_file.items():
    print(f'\nFilepath: "{filepath}"')
    for interval_1, interval_2, line_key_1, line_key_2 in intervals:
        # Keys are padded to 3 characters and offsets to 4 digits so the rows line up
        print(f'  - {line_key_1:3s} <{interval_1.left:4d}, {interval_1.right:4d}> overlaps {line_key_2:3s} <{interval_2.left:4d}, {interval_2.right:4d}>')


Filepath: "..\..\2021-national-archives-data-annotation-project\data\Bert\6848\NL-HaNA_1.04.02_6848_0007"
  - T34 <1603, 1616> overlaps T35 <1612, 1617>

Filepath: "..\..\2021-national-archives-data-annotation-project\data\Bert\6857\NL-HaNA_1.04.02_6857_0011"
  - T11 < 316,  343> overlaps T12 < 335,  344>

Filepath: "..\..\2021-national-archives-data-annotation-project\data\Emma\6848\NL-HaNA_1.04.02_6848_0088"
  - T20 <1627, 1648> overlaps T21 <1642, 1649>

Filepath: "..\..\2021-national-archives-data-annotation-project\data\Emma\6848\NL-HaNA_1.04.02_6848_0100"
  - T28 < 621,  650> overlaps T8  < 622,  675>
  - T24 <1169, 1200> overlaps T25 <1178, 1201>

Filepath: "..\..\2021-national-archives-data-annotation-project\data\Emma\6848\NL-HaNA_1.04.02_6848_0109"
  - T8  < 991, 1010> overlaps T24 <1002, 1011>

Filepath: "..\..\2021-national-archives-data-annotation-project\data\Emma\6857\NL-HaNA_1.04.02_6857_0121"
  - T11 < 551,  573> overlaps T12 < 556,  574>
  - T13 < 878,  901> overlaps