In [6]:
# -----------------------------------------------------------
# Faza 2:
# Implementacija flair NER
# -----------------------------------------------------------


# import nltk
# from nltk import pos_tag, word_tokenize
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('gutenberg')
# import re
# import string
# from itertools import combinations
# from collections import Counter


from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from flair.models import SequenceTagger
from flair.data import Sentence
import operator


In [7]:
# Load the flair NER sequence tagger once. flair_NER() reads this module-level
# `tagger`, so this cell has to run before flair_NER() is called.
tagger = SequenceTagger.load('ner')     # other model names: ner-ontonotes, ...

2022-05-19 10:46:01,741 loading file /Users/matijagercer/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-05-19 10:46:06,352 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [12]:
def read_text(path):
    """
    Load a UTF-8 text file and flatten it into a single line of plain text.

    Carriage returns and newlines become spaces, the typographic apostrophe
    (’) becomes a plain one ('), and straight or curly double quotes are
    removed.

    :param path: path of the file to read
    :return: str, the cleaned file content
    """
    # A single translation table stands in for a chain of str.replace calls;
    # characters mapped to None are deleted.
    cleanup = str.maketrans({
        '\r': ' ',
        '\n': ' ',
        "’": "'",
        "\"": None,
        "”": None,
        "“": None,
    })
    with open(path, encoding="utf-8") as f:
        return f.read().translate(cleanup)

def flair_NER(book, model=None):
    """
    Run flair named-entity recognition over a text, one sentence at a time.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    tagged and every predicted "ner" span becomes one record in the result.

    :param book: str, the full text to analyse
    :param model: optional tagger exposing ``predict(sentence)``; when omitted
        the module-level ``tagger`` is used (it must already be loaded)
    :return: entity_dict, a list of dicts (one per recognised entity) with keys
        "name"      - text of the entity span
        "tag"       - value of the span's "ner" label (e.g. "PER")
        "start_pos" - ``idx - 1`` of the span's first token
        "stop_pos"  - ``idx`` of the span's last token
        "line_num"  - 0-based index of the sentence within ``book``
    """
    # Fall back to the notebook-level tagger so existing calls keep working.
    ner_model = tagger if model is None else model

    # 00 Split the book into sentences.
    sentences = sent_tokenize(book)

    # 01 Tag every sentence and collect the recognised entities.
    entity_dict = []
    for line_num, line in enumerate(tqdm(sentences)):

        # 01a Predict; the labels are read back from `sentence` afterwards.
        sentence = Sentence(line)
        ner_model.predict(sentence)

        # 01b Extract one record per predicted span.
        for entity in sentence.get_spans('ner'):
            tokens = list(entity)
            if not tokens:
                # No tokens means no position to report; skipping avoids
                # silently reusing the positions of the previous entity.
                continue

            # Token-based location (not character offsets).
            # NOTE(review): flair token idx looks 1-based, which would make
            # [start_pos, stop_pos) a half-open 0-based token range - confirm
            # against the flair documentation.
            entity_dict.append({
                "name": entity.text,
                "tag": entity.get_label("ner").value,
                "start_pos": tokens[0].idx - 1,
                "stop_pos": tokens[-1].idx,
                "line_num": line_num,
            })

    return entity_dict

def get_names_from_NER(entity_dict):
    """
    Count how often each person name occurs among the recognised entities.

    :param entity_dict: list of dicts, each with at least a "tag" key and,
        for entries tagged "PER", a "name" key
    :return: unique_names: list of (name, count) tuples for the entities
        tagged "PER", ordered by count descending; names with equal counts
        keep the order in which they were first seen
    """
    counts = {}
    for record in entity_dict:
        # Only person entities contribute to the tally.
        if record["tag"] != "PER":
            continue
        person = record["name"]
        counts[person] = counts.get(person, 0) + 1

    # Stable sort on the count, most frequent first.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [15]:
# Tag a single chapter and list every detected person name with its count.
book = read_text('../data/books/ASongOfIceAndFire/AGOT/chapters/Bran_1_1.txt')
entity_dict = flair_NER(book)
unique_names = get_names_from_NER(entity_dict)

# No frequency threshold is applied (a filter such as num > 1 would drop
# names that were seen only once).
for name, num in unique_names:
    print(name, num)

100%|██████████| 284/284 [02:19<00:00,  2.04it/s]

Bran 46
Robb 29
Jon 28
Jory 9
Greyjoy 7
Theon Greyjoy 6
Father 5
Hullen 5
Jon Snow 3
Old Nan 2
Lord Stark 2
Jory Cassel 2
House Stark 2
Stark 2
Mance Rayder 1
Eddard Stark 1
Robert of the House Baratheon 1
Eddard 1
Lord of Winterfell 1
Theon 1
Robert 1
Starks 1
Gods 1
Harwin 1
Ser Rodrik 1
Rickon 1
Snow 1
Desmond 1



