In [1]:
# -----------------------------------------------------------
# Faza 2:
# Implementacija flair NER
# -----------------------------------------------------------


# import nltk
# from nltk import pos_tag, word_tokenize
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('gutenberg')
# import re
# import string
# from itertools import combinations
# from collections import Counter


from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from flair.models import SequenceTagger
from flair.data import Sentence


In [2]:
# Load flair's pretrained 'ner' sequence tagger once and keep it in the
# module-level name `tagger`; flair_NER reads this global when predicting.
tagger = SequenceTagger.load('ner')


2022-05-18 19:38:32,882 loading file /Users/matijagercer/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-05-18 19:38:39,180 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [4]:
def read_text(path):
    """
    Read a UTF-8 text file and return its contents flattened onto one line.

    Carriage returns and newlines each become a single space, the
    typographic apostrophe (’) becomes a plain ', and straight as well as
    curly double quotes are removed.

    :param path: path of the text file to read
    :return: str, the normalised text
    """
    # All substitutions are single-character and no replacement output is
    # itself a key, so one translation table matches the chained replaces.
    char_map = str.maketrans({
        '\r': ' ',
        '\n': ' ',
        "’": "'",
        "\"": None,
        "”": None,
        "“": None,
    })

    with open(path, encoding="utf-8") as handle:
        raw = handle.read()

    return raw.translate(char_map)


def flair_NER(book):

    """
    Run flair NER over a text, sentence by sentence, and collect every entity span.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    wrapped in a flair ``Sentence`` and tagged with the module-level ``tagger``.

    :param book: str, the full text to analyse
    :return: list of dicts, one per recognised entity span, with the keys
        "name"      - text of the span
        "tag"       - value of the span's "ner" label
        "start_pos" - the span's ``start_position`` as reported by flair
        "stop_pos"  - the span's ``end_position`` as reported by flair
        "line_num"  - zero-based index of the sentence the span was found in
        "token_num" - ``idx - 1`` of the LAST token of the span
                      (None if the span yields no tokens)
    """

    # NOTE(review): start_pos/stop_pos are presumably character offsets within
    # the individual sentence, not within the whole book - confirm before
    # using them to index into `book`.
    entities = []
    for line_num, line in enumerate(tqdm(sent_tokenize(book))):

        # Predict entities for this sentence (annotates `sentence` in place).
        sentence = Sentence(line)
        tagger.predict(sentence)

        for entity in sentence.get_spans('ner'):
            # Take the last token explicitly instead of relying on a loop
            # variable, so a value from a previous entity can never leak in.
            tokens = list(entity)
            token_num = tokens[-1].idx - 1 if tokens else None

            entities.append({
                "name": entity.text,
                "tag": entity.get_label("ner").value,
                "start_pos": entity.start_position,
                "stop_pos": entity.end_position,
                "line_num": line_num,
                "token_num": token_num,
            })

    return entities

def get_names_from_NER(entity_dict, tag="PER"):

    """
    Collect the distinct entity names that carry a given NER tag.

    :param entity_dict: iterable of dicts, each with at least the keys "tag"
        and "name" (e.g. the list returned by flair_NER)
    :param tag: str, the label to keep; defaults to "PER" (the previous
        hard-coded behaviour). The original inline note lists PER, MISC,
        LOC and ORG as the candidate values.
    :return: set of the "name" values of all entries whose "tag" equals `tag`
    """

    return {entity["name"] for entity in entity_dict if entity["tag"] == tag}

In [6]:
# Run the pipeline on a single chapter: read and normalise the text, tag the
# entities sentence by sentence, then collect the distinct names tagged "PER".
# The path is relative to the notebook's working directory.
book = read_text('../data/books/ASongOfIceAndFire/AGOT/chapters/Bran_1_1.txt')
entity_dict = flair_NER(book)
unique_names = get_names_from_NER(entity_dict)
print(unique_names)

100%|██████████| 284/284 [02:18<00:00,  2.05it/s]

{'Greyjoy', 'Robert', 'Eddard Stark', 'Lord Stark', 'Jon', 'Old Nan', 'Mance Rayder', 'Jory', 'Eddard', 'Gods', 'Theon Greyjoy', 'Harwin', 'Robert of the House Baratheon', 'Jon Snow', 'Snow', 'Theon', 'Father', 'Bran', 'Rickon', 'Hullen', 'Stark', 'Starks', 'Jory Cassel', 'House Stark', 'Lord of Winterfell', 'Robb', 'Desmond', 'Ser Rodrik'}





<function get_names_from_NER at 0x7f8c5cfe7dc0>
