In [31]:
import csv
import os
import re
from typing import Dict, Optional

from tqdm import tqdm

In [32]:
# Both locations live two directory levels above the working directory.
_BASE_DIR = os.path.join('..', '..')

# Directory that receives the generated TSV file.
OUTPUT_DIR = os.path.join(_BASE_DIR, 'processed')

# Root directory that is scanned for .ann/.txt annotation pairs.
DATA_PATH = os.path.join(_BASE_DIR, '2021-national-archives-data-annotation-project', 'data')

In [33]:
def should_use_file(filepath) -> bool:
    """Tell whether ``filepath`` points to an existing, non-empty file.

    Args:
        filepath: Path of the file to check.

    Returns:
        ``False`` when the path does not exist or the file has a size of
        zero bytes, ``True`` otherwise.
    """
    # The size is only queried once the path is known to exist.
    return os.path.exists(filepath) and os.stat(filepath).st_size != 0

In [34]:
def get_filepaths(data_path, folders_to_ignore=None):
    """Recursively collect the usable ``.ann`` files below ``data_path``.

    Args:
        data_path: Directory to scan.
        folders_to_ignore: Optional collection of entry names to skip. The
            names are only compared against the direct children of
            ``data_path``; nested directories are scanned without an
            ignore list.

    Returns:
        A list of file paths with the ``.ann`` extension stripped, so callers
        can append ``.ann`` or ``.txt`` as needed. Entries are visited in
        sorted name order, so the result is the same on every run.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    ignored_names = folders_to_ignore if folders_to_ignore else ()

    result = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for entry_name in sorted(os.listdir(data_path)):
        if entry_name in ignored_names:
            continue

        entry_path = os.path.join(data_path, entry_name)

        if os.path.isdir(entry_path):
            result.extend(get_filepaths(entry_path))
            continue

        # Only work with .ann files that exist and are not empty.
        if entry_path.endswith('.ann') and should_use_file(entry_path):
            result.append(os.path.splitext(entry_path)[0])

    return result

In [35]:
# Collect every usable annotation file below DATA_PATH, skipping the two named top-level entries.
# NOTE(review): the reason '6847' and 'Charles' are excluded is not visible here — confirm.
filepaths = get_filepaths(DATA_PATH, folders_to_ignore=['6847', 'Charles'])

In [36]:
# Sanity check: number of annotation files that were found.
len(filepaths)

2889

In [42]:
def validate_line(line_text):
    '''
        Check that an annotation is neither flagged as a transcription error
        nor as a duplicated page.

        Returns True when the line is usable and False when it starts with one
        of the two markers, in which case the whole document must be skipped.
    '''
    skip_markers = ('TranscriptionError', 'DuplicatePage')
    return not line_text.startswith(skip_markers)

In [38]:
class Constants:
    """Fixed strings used while reading annotations and writing tags."""

    # Tag for a token without annotation, and the prefixes put in front of
    # the first / following tokens of an annotated span.
    Empty, Beginning, Inside = 'O', 'B-', 'I-'

    # First character of the line key of main-entity lines ('T<n>') and of
    # sub-entity lines ('A<n>') in an .ann file.
    MainEntityPrefix, SubEntityPrefix = 'T', 'A'

# Maps the annotation keys used internally to the output column they fill.
# The insertion order is the order in which the columns are computed.
entities_to_cols = dict(
    main='NE-MAIN',
    gender='NE-PER-GENDER',
    legalstatus='NE-PER-LEGAL-STATUS',
    role='NE-PER-ROLE',
    misc='MISC',
)

In [39]:
def prepare_output_file(output_dir=None):
    """Create ``train-nl.tsv`` and write its header row.

    Args:
        output_dir: Directory that receives the file. Defaults to
            ``OUTPUT_DIR``. The directory is created when it does not exist.

    Returns:
        A ``(file_handler, csv_writer)`` tuple. The caller owns
        ``file_handler`` and must close it once all rows have been written.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    # open() does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, 'train-nl.tsv')
    # newline='' lets the csv module control the line terminator itself.
    file_handler = open(output_file, 'w', encoding='utf-8', newline='')
    try:
        csv_writer = csv.DictWriter(file_handler, fieldnames=['TOKEN', 'NE-MAIN', 'NE-PER-GENDER', 'NE-PER-LEGAL-STATUS', 'NE-PER-ROLE', 'MISC'], delimiter='\t')
        csv_writer.writeheader()
    except Exception:
        # Do not leak the handle when the header cannot be written.
        file_handler.close()
        raise

    return (file_handler, csv_writer)

In [40]:
def process_annotation_file(filepath) -> Optional[Dict[int, Dict[str, str]]]:
    """Parse ``<filepath>.ann`` into annotations keyed by character position.

    Only lines whose key (the text before the first tab) starts with the main
    or the sub entity prefix followed by a digit are used; all other lines
    are ignored.

    Args:
        filepath: Path of the annotation file without the ``.ann`` extension.

    Returns:
        A dict mapping a character position to a dict with these keys:
        ``'main'`` (type of the first main entity covering the position),
        ``'misc'`` (comma-separated types of further main entities covering
        the same position) and one key per sub entity, named after the
        lower-cased sub entity type.
        ``None`` when any used line fails ``validate_line``, meaning the
        whole document has to be skipped.

    Raises:
        AssertionError: When a used line has no tab-separated second column
            or a position group does not consist of exactly two numbers.
    """
    # Maps a character position to the annotations that cover it.
    annotations_by_pos = {}

    # This contains the character positions for any main entity. Example: { 'T1' : [[100, 111], [112, 120]] }
    positions_by_main_entity = {}

    # Check for lines starting with TN, where N is a numeric value
    main_regex = re.compile(f'^[{Constants.MainEntityPrefix}][1-9]+')

    # Check for lines starting with AN, where N is a numeric value
    sub_regex = re.compile(f'^[{Constants.SubEntityPrefix}][1-9]+')

    with open(f'{filepath}.ann', 'r', encoding='utf-8') as file_handle:
        # Iterate lazily instead of loading all lines with readlines().
        for file_line in file_handle:
            split_line = file_line.split('\t')
            line_key = split_line[0]

            if main_regex.match(line_key): # Main entity type
                assert len(split_line) > 1, f'File line is invalid. Not enough tokens were found\n - Original split line: {split_line}\n - Filepath: "{filepath}"'

                # Skip documents that are not valid
                if not validate_line(split_line[1]):
                    return None

                annotation = split_line[1].split()
                main_entity_type = annotation[0]

                # some positions are doubled, e.g. '100 110; 111 120'
                positions = [[int(pos) for pos in x.strip().split()] for x in ' '.join(annotation[1:]).split(';')]
                positions_by_main_entity[line_key] = positions

                assert len(positions) > 0, f'Positions are invalid.\n - Original line: {file_line}\n - Positions: {positions}'
                for position_pair in positions:
                    assert len(position_pair) == 2, f'Position pair is invalid.\n - Position pair: {position_pair}'
                    # Both the start and the end position are marked (inclusive range).
                    for idx in range(position_pair[0], position_pair[1] + 1):
                        existing = annotations_by_pos.get(idx)
                        if existing is None:
                            # First main entity on this position.
                            annotations_by_pos[idx] = {'main': main_entity_type}
                        elif 'misc' in existing:
                            # Third or later entity: extend the comma-separated list.
                            existing['misc'] += ',' + main_entity_type
                        else:
                            # Second entity on this position: start the 'misc' list.
                            existing['misc'] = main_entity_type
            elif sub_regex.match(line_key): # Sub entity type
                assert len(split_line) > 1, f'{split_line}, {filepath}'
                if not validate_line(split_line[1]):
                    return None

                sub_entity_type, main_entity, sub_entity_value = split_line[1].split()

                # NOTE(review): the value is stored on every position of the
                # referenced main entity, including positions where that
                # entity only ended up in 'misc' — confirm this is intended.
                for position_pair in positions_by_main_entity[main_entity]:
                    for idx in range(position_pair[0], position_pair[1] + 1):
                        annotations_by_pos[idx][sub_entity_type.lower()] = sub_entity_value

    return annotations_by_pos

def calculate_entity_tag(annotations, entity_type, prev_entities, current_pos):
    """Compute the tag of one output column for the token at ``current_pos``.

    Args:
        annotations: Dict mapping a character position to its annotation dict.
        entity_type: Annotation key to look up (a key of ``entities_to_cols``).
        prev_entities: Dict holding, per entity type, the value seen on the
            previous token. It is updated in place.
        current_pos: Character position used to look up the token.

    Returns:
        A ``(tag, prev_entities)`` tuple. For ``'misc'`` the tag is the stored
        value as is, or ``'_'`` when there is none. For every other type it is
        the stored value prefixed with the inside marker when it equals the
        previous token's value and with the beginning marker otherwise, or the
        empty tag when there is no value.
    """
    is_misc = entity_type == 'misc'
    is_annotated = current_pos in annotations and entity_type in annotations[current_pos]

    if not is_annotated:
        # Nothing on this token: reset the running state for this type.
        prev_entities[entity_type] = Constants.Empty
        return ('_' if is_misc else Constants.Empty), prev_entities

    value = annotations[current_pos][entity_type]

    if is_misc:
        # 'misc' values are written verbatim and leave the running state untouched.
        return value, prev_entities

    same_as_previous = prev_entities[entity_type] == value
    marker = Constants.Inside if same_as_previous else Constants.Beginning
    prev_entities[entity_type] = value

    return f'{marker}{value}', prev_entities

def get_word_annotations(word, annotations, prev_entities, current_pos):
    """Build the output row for a single token.

    Args:
        word: Token text, written to the ``TOKEN`` column.
        annotations: Dict mapping a character position to its annotation dict.
        prev_entities: Per-type values of the previous token, or ``None`` for
            the first token of a document.
        current_pos: Character position used to look up the token.

    Returns:
        A ``(row, prev_entities)`` tuple: the row dict for the CSV writer and
        the running state to pass in for the next token.
    """
    if prev_entities is None:
        # First token: start with every entity type marked as empty.
        prev_entities = dict.fromkeys(entities_to_cols, Constants.Empty)

    row = {'TOKEN': word, 'MISC': '_'}

    for entity_type in entities_to_cols:
        tag, prev_entities = calculate_entity_tag(annotations, entity_type, prev_entities, current_pos)
        row[entities_to_cols[entity_type]] = tag

    return row, prev_entities


def process_txt_file(filepath, annotations, csv_writer: csv.DictWriter):
    """Write the rows of one document to ``csv_writer``.

    An empty row and two comment rows (language and document path) are
    written first, followed by one row per whitespace-delimited token of
    ``<filepath>.txt``.

    Args:
        filepath: Path of the document without the ``.txt`` extension.
        annotations: Dict mapping a character position to its annotation
            dict, as returned by ``process_annotation_file``.
        csv_writer: Writer that receives the rows.
    """
    csv_writer.writerow({})
    csv_writer.writerow({'TOKEN': '# language = nl'})
    csv_writer.writerow({'TOKEN': f'# document_path = {filepath}.txt'})

    with open(f'{filepath}.txt', 'r', encoding='utf-8') as file_handle:
        file_content = file_handle.read()

    prev_entities = None

    # Take each token's real offset from the text instead of summing
    # len(word) + 1. The running sum assumed exactly one whitespace character
    # between tokens and none at the start, so leading whitespace, blank
    # lines or double spaces shifted every later lookup position.
    for token_match in re.finditer(r'\S+', file_content):
        # The +1 keeps the lookup convention of the former counter, which
        # started at 1 for a token at the very beginning of the text.
        lookup_pos = token_match.start() + 1
        word_annotations, prev_entities = get_word_annotations(token_match.group(), annotations, prev_entities, lookup_pos)
        csv_writer.writerow(word_annotations)


def process_files(filepaths):
    """Convert all given documents into the single TSV output file.

    Documents for which ``process_annotation_file`` returns ``None`` are
    skipped.

    Args:
        filepaths: Iterable of document paths without file extension; both
            ``<path>.ann`` and ``<path>.txt`` are read.
    """
    file_handler, csv_writer = prepare_output_file()

    try:
        for filepath in tqdm(filepaths, desc='Processing files'):
            annotations = process_annotation_file(filepath)
            if annotations is None:
                continue

            process_txt_file(filepath, annotations, csv_writer)
    finally:
        # Close (and flush) the output even when a document raises.
        file_handler.close()

In [41]:
# Run the conversion; the result is written to train-nl.tsv inside OUTPUT_DIR.
process_files(filepaths)

Processing files: 100%|██████████| 2889/2889 [00:03<00:00, 742.73it/s] 
