In [1]:
import stanza
import operator

In [6]:
# Shared English Stanza pipeline; only the tokenizer and the NER tagger are loaded.
# stanza_NER() below reads this module-level object.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-05-19 16:40:22 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-05-19 16:40:22 INFO: Use device: cpu
2022-05-19 16:40:22 INFO: Loading: tokenize
2022-05-19 16:40:22 INFO: Loading: ner
2022-05-19 16:40:23 INFO: Done loading processors!


In [25]:
def read_text(path):
    """
    Read a UTF-8 text file and return its contents flattened to a single line.

    Carriage returns and newlines are turned into spaces, the typographic
    apostrophe (’) is turned into a plain ', and straight as well as curly
    double quotes are removed.

    :param path: path of the text file to read
    :return: str with the cleaned-up file contents
    """
    # One translation table instead of a chain of replace() calls;
    # a value of None deletes the character.
    substitutions = str.maketrans({
        "\r": " ",
        "\n": " ",
        "’": "'",
        "\"": None,
        "”": None,
        "“": None,
    })

    with open(path, encoding="utf-8") as handle:
        raw = handle.read()

    return raw.translate(substitutions)

def stanza_NER(book, pipeline=None):
    """
    Run NER over ``book`` and collect every token tagged as (part of) a PERSON.

    Each hit is stored as a dict with the keys
      - "name":      text of the token
      - "tag":       the token's NER tag (S-/B-/I-/E-PERSON)
      - "start_pos": index of the token inside its sentence
      - "stop_pos":  start_pos + 1
      - "line_num":  index of the sentence inside the document

    :param book: str, the text to analyse
    :param pipeline: optional callable used instead of the module-level ``nlp``.
        It is called as ``pipeline(book)`` and must return an object exposing
        ``.sentences``, each with ``.tokens`` that carry ``.text`` and ``.ner``.
    :return: entity_dict (list of dicts as described above, in document order)
    """
    if pipeline is None:
        pipeline = nlp

    # Tag prefixes:
    # S - single-token entity
    # B - first token of an entity
    # I - token inside an entity
    # E - last token of an entity
    person_tags = ("S-PERSON", "B-PERSON", "E-PERSON", "I-PERSON")

    doc = pipeline(book)
    entity_dict = []

    for line_num, sent in enumerate(doc.sentences):
        # enumerate() advances the position for EVERY token. The counter used to be
        # bumped only for PERSON tokens, so start_pos/stop_pos were not token indices.
        for token_num, token in enumerate(sent.tokens):
            if token.ner in person_tags:
                entity_dict.append({
                    "name": token.text,
                    "tag": token.ner,
                    "start_pos": token_num,
                    "stop_pos": token_num + 1,
                    "line_num": line_num,
                })

    return entity_dict


def get_names_from_NER(entity_dict):
    """
    Count how often each person name occurs and rank the names by frequency.

    Only entries tagged "S-PERSON" or "N-PERSON" are counted; entries with any
    other tag are ignored.

    :param entity_dict: list of dicts, each with at least a "tag" key
        (and a "name" key for the counted entries)
    :return: unique_names: list of (name, count) tuples, highest count first;
        names with equal counts keep the order of their first occurrence
    """
    countable_tags = ("S-PERSON", "N-PERSON")
    tally = {}

    for record in entity_dict:
        if record["tag"] in countable_tags:
            label = record["name"]
            tally[label] = tally.get(label, 0) + 1

    # Stable in-place sort on the count, so ties stay in insertion order.
    ranked = list(tally.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked

def merge_person_names(entity_dict):
    """
    Join per-token PERSON entities (S-, B-, I-, E-PERSON) into whole names.

    - An "S-PERSON" entry is passed through unchanged (the same dict object).
    - A "B-PERSON" entry is copied and extended with the "I-PERSON"/"E-PERSON"
      entries that directly follow it in the same sentence ("line_num"). The merged
      copy gets the tag "N-PERSON", a space-joined "name" and the "stop_pos" of the
      last token that was merged in.
    - A "B-PERSON" entry with no continuation is kept as a copy, tag unchanged.
    - "I-PERSON"/"E-PERSON" entries never appear on their own in the result.

    The input list and its dicts are not modified.

    :param entity_dict: list of dicts with the keys "name", "tag", "stop_pos"
        and "line_num"
    :return: entity_dict_clean: list of dicts
    """
    continuation_tags = ("I-PERSON", "E-PERSON")
    entity_dict_clean = []

    for entity_num, entity in enumerate(entity_dict):
        tag = entity["tag"]

        if tag == "S-PERSON":
            entity_dict_clean.append(entity)

        elif tag == "B-PERSON":
            merged = entity.copy()

            # Walk the entries after the B token. The range is bounded by the list
            # length, so no exception handling is needed at the end of the list and
            # names of any length are merged completely.
            for idx in range(entity_num + 1, len(entity_dict)):
                next_entity = entity_dict[idx]

                # Anything that is not a continuation in the same sentence ends the
                # name; otherwise a later, unrelated I/E token could be attached.
                if (next_entity["line_num"] != merged["line_num"]
                        or next_entity["tag"] not in continuation_tags):
                    break

                merged["name"] = merged["name"] + " " + next_entity["name"]
                merged["tag"] = "N-PERSON"
                merged["stop_pos"] = next_entity["stop_pos"]

                if next_entity["tag"] == "E-PERSON":
                    break

            entity_dict_clean.append(merged)

    return entity_dict_clean

In [18]:
# Read one chapter file and collect its PERSON-tagged tokens.
book = read_text('../data/books/ASongOfIceAndFire/AGOT/chapters/Bran_1_1.txt')
entity_dict = stanza_NER(book)

In [29]:
# Join multi-token names, then count how often each name occurs.
entity_dict_clean = merge_person_names(entity_dict)
unique_names = get_names_from_NER(entity_dict_clean)

# Print every name with its count, followed by the total number of mentions.
# The always-true `if True:` guard was removed; the total used to be accumulated
# without ever being displayed.
sum_num = 0
for name, num in unique_names:
    print(name, num)
    sum_num += num
print(sum_num)


Bran 46
Robb 29
Jon 29
Jory 9
Greyjoy 7
Theon Greyjoy 6
Hullen 5
Stark 3
Snow 3
Winterfell 2
Jory Cassel 2
Jon Snow 2
Mance Rayder 1
Old Nan 1
Eddard Stark 1
Ice 1
Robert of the House Baratheon 1
King of the Andals 1
Rhoynar 1
Eddard 1
Warden of the North 1
Theon 1
Nan 1
Robert 1
Starks 1
Harwin 1
Ser Rodrik 's 1
Rickon 1
no Stark 1
Desmond 1
161
