In [56]:
# based on https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/client-libraries-rest-api?pivots=programming-language-python&tabs=version-3-1#named-entity-recognition-(ner)
# requirements:
# pip install azure-ai-textanalytics --pre

import json
import os.path
from collections import defaultdict

with open('.azure-key') as fh:
    key = fh.read()
endpoint = 'https://climate-law-entity-extraction.cognitiveservices.azure.com/'

In [9]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

def authenticate_client():
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint,
            credential=ta_credential)
    return text_analytics_client

client = authenticate_client()

In [30]:
def chunks(list, num_elements):
    for i in range(0, len(list), num_elements):
        yield list[i:i + num_elements]

def read_file_in_parts(file_path, part_length_limit):
    """
     splits the file along newlines to parts with given max length
    """
    document_parts = []
    with open(file_path, mode='r', encoding='utf8') as input_document:
            lines = input_document.readlines()
            part_character_count = 0
            part = ''
            for line in lines:
                line_length = len(line.encode('utf8'))
                if part_character_count + line_length > part_length_limit:
                    document_parts.append(part)
                    part = ''
                    part_character_count = 0

                if line_length > part_length_limit:
                    print("ERROR: line too long: " + line)
                    continue

                part += line + '\n'
                part_character_count += line_length
    return document_parts

In [58]:
# Actual limit of service is 5120 "text elements" but having a hard time matching this with characters or bytes in python
DOCUMENT_CHARACTER_LIMIT = 4000
VERBOSE = False

def entity_recognition_from_file(client, file_path, output_dir):
    """
    reads document from file, extracts entities using Azure and writes to output file
    """
    try:
        document_parts = read_file_in_parts(file_path, DOCUMENT_CHARACTER_LIMIT)

        aggregated_result = []
        for chunk in chunks(document_parts, 5):
            aggregated_result.extend(client.recognize_entities(documents=chunk))

        category_statistics = defaultdict(int)
        text_statistics = defaultdict(int)
        entities = []
        for result in aggregated_result:
            if VERBOSE: print("Named Entities:\n")
            for entity in result.entities:
                if entity.category == "Quantity":
                    # We skip quantities for now to reduce some noise, as we probably cant put it to use right now
                    continue

                category_statistics[entity.category] += 1
                subcategory_key = '{}_{}'.format(entity.category, entity.subcategory)
                category_statistics[subcategory_key] += 1
                text_statistics[entity.text] += 1
                entities.append({'text': entity.text, 'category': entity.category, 'subcategory': entity.subcategory, 'confidence_score': entity.confidence_score, 'offset': entity.offset, 'length': entity.length})
                if VERBOSE: print("\tText: \t", entity.text, "\tCategory: \t", entity.category, "\tSubCategory: \t", entity.subcategory,
                    "\n\tConfidence Score: \t", round(entity.confidence_score, 2), "\tLength: \t", entity.length, "\tOffset: \t", entity.offset, "\n")

        basename = os.path.basename(file_path)
        filename, extension = os.path.splitext(basename)
        target_filename = os.path.join(output_dir, filename + '_entities.json')
        print("Writing {} entities to {}".format(len(entities), target_filename))

        result_dump = {'category_statistics': category_statistics, 'text_statistics': text_statistics, 'entities': entities}
        with open(target_filename, mode='w', encoding='utf8') as entities_fh:
            entities_fh.write(json.dumps(result_dump))

    except Exception as err:
        print("Encountered exception. {}".format(err))

In [55]:
# Run for single document
entity_recognition_from_file(client, '../documents/1004_0.txt', '../entities')


Writing 338 entities to ../entities\1004_0_entities.json


In [57]:
documents_directory = '../documents'
entities_directory = '../entities'

In [None]:
# single threaded
for entry in os.listdir(documents_directory):
    source_file = os.path.join(documents_directory, entry)
    if not os.path.isfile(source_file):
        continue
    entity_recognition_from_file(client, source_file, entities_directory)

In [None]:
from concurrent.futures.thread import ThreadPoolExecutor

# multi threaded execution
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for entry in os.listdir(documents_directory):
        source_file = os.path.join(documents_directory, entry)
        if not os.path.isfile(source_file):
            continue

        future = executor.submit(entity_recognition_from_file, client, source_file, entities_directory)
        futures.append(future)

    print([f.result() for f in futures])

Writing 125 entities to ../entities\1014_0_entities.json
Writing 0 entities to ../entities\1059_0_entities.json
Writing 308 entities to ../entities\1058_0_entities.json
Writing 0 entities to ../entities\1060_0_entities.json
Writing 108 entities to ../entities\1020_0_entities.json
Writing 333 entities to ../entities\1053_0_entities.json
Writing 53 entities to ../entities\1073_0_entities.json
Writing 47 entities to ../entities\1060_1_entities.json
Writing 338 entities to ../entities\1004_0_entities.json
Writing 366 entities to ../entities\1078_0_entities.json
Writing 737 entities to ../entities\1073_1_entities.json
Writing 0 entities to ../entities\1092_0_entities.json
Writing 0 entities to ../entities\1092_1_entities.json
Writing 0 entities to ../entities\1093_0_entities.json
Writing 0 entities to ../entities\1093_1_entities.json
Writing 863 entities to ../entities\1059_1_entities.json
Writing 805 entities to ../entities\1079_0_entities.json
Writing 579 entities to ../entities\1094_0_en