In [1]:
import os
from nltk.tag import StanfordNERTagger
from nltk import sent_tokenize, word_tokenize
import operator
from sner import Ner


In [2]:
# Root directory of the Stanford NER distribution. Set the STANFORD_NER_DIR
# environment variable to point at a different install; the fallback is the
# location this notebook was originally developed against.
STANFORD_NER_DIR = os.environ.get(
    "STANFORD_NER_DIR",
    "/Users/matijagercer/Desktop/stanford-ner-2020-11-17",
)

# 3-class CRF classifier and the NER jar, both resolved relative to the
# distribution root so only one path has to change between machines.
model = os.path.join(STANFORD_NER_DIR, "classifiers", "english.all.3class.distsim.crf.ser.gz")
jar = os.path.join(STANFORD_NER_DIR, "stanford-ner-4.2.0.jar")
st = StanfordNERTagger(model, jar, encoding='utf-8')


In [3]:
# Run in a terminal to start the Stanford NER server that `tagger` talks to:
#   cd your_stanford_ner_dir
#   java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz

# For more details, see:
# https://stackoverflow.com/questions/33748554/how-to-speed-up-ne-recognition-with-stanford-ner-with-python-nltk

# Address of the NER server; NER_PORT must match the -port value used in the
# java command above.
NER_HOST = 'localhost'
NER_PORT = 9199

tagger = Ner(host=NER_HOST, port=NER_PORT)


In [15]:
def read_text(path):
    """
    Read a UTF-8 text file and return its content with line breaks and quotes normalised.

    Carriage returns and newlines become spaces, the typographic apostrophe
    becomes a plain apostrophe, and straight and typographic double quotes
    are removed.

    :param path: path of the text file to read
    :return: str, the cleaned text
    """
    # One translation table handles every character in a single pass:
    # a string value substitutes the character, None deletes it.
    cleanup_table = str.maketrans({
        '\r': ' ',
        '\n': ' ',
        "’": "'",
        "\"": None,
        "”": None,
        "“": None,
    })

    with open(path, encoding="utf-8") as f:
        raw_text = f.read()

    return raw_text.translate(cleanup_table)


def stanford_NER(book):
    """
    Collect every token that the NER tagger marks as an entity, sentence by sentence.

    The text is split into sentences with ``sent_tokenize`` and each sentence is
    passed to the module-level ``tagger``; tokens tagged 'O' are skipped.

    :param book: str, the text to analyse
    :return: entity_dict, a list of dicts, one per tagged token, with the keys
        "name" (the token), "tag" (the tagger's label), "start_pos" (index of
        the token in the tagger output for its sentence), "stop_pos"
        (start_pos + 1) and "line_num" (index of the sentence)
    """
    entity_dict = []

    for line_num, line in enumerate(sent_tokenize(book)):
        # Alternative that uses the NLTK wrapper `st` instead of the server:
        #   st.tag(word_tokenize(line))
        tagged_tokens = tagger.get_entities(line)

        for token_num, (name, tag) in enumerate(tagged_tokens):
            # 'O' marks a token outside of any entity.
            if tag == 'O':
                continue
            entity_dict.append({
                "name": name,
                "tag": tag,
                "start_pos": token_num,
                "stop_pos": token_num + 1,
                "line_num": line_num,
            })

    return entity_dict


def get_names_from_NER(entity_dict):
    """
    Count how often each PERSON entity occurs and rank the names by frequency.

    :param entity_dict: list of dicts, each with at least the keys "name" and "tag"
    :return: unique_names, a list of (name, count) tuples sorted by count in
        descending order; names with equal counts keep first-seen order
    """
    occurrences = {}

    for entity in entity_dict:
        if entity["tag"] != "PERSON":
            continue
        person = entity["name"]
        occurrences[person] = occurrences.get(person, 0) + 1

    # sorted() is stable, so ties stay in the order the names were first seen.
    unique_names = sorted(
        occurrences.items(),
        key=lambda name_and_count: name_and_count[1],
        reverse=True,
    )
    return unique_names



In [19]:
# Run the whole pipeline on a single chapter: load the text, tag it, then
# rank the detected person names.
book = read_text('../../data/books/ASongOfIceAndFire/AGOT/chapters/Bran_1_1.txt')
entity_dict = stanford_NER(book)
unique_names = get_names_from_NER(entity_dict)

# Every name is listed together with its count; no minimum-count filter is
# applied (a `num > 1` threshold was considered but is not active).
for name, num in unique_names:
    print(name, num)

# Raw per-token entity records.
print(entity_dict)

Jon 31
Robb 29
Greyjoy 12
Jory 11
Theon 7
Bran 6
Hullen 5
Stark 3
Snow 3
Nan 2
Starks 2
Cassel 2
Robert 2
of 2
the 2
Mance 1
Rayder 1
Eddard 1
House 1
Baratheon 1
Warden 1
North 1
Targaryen 1
Harwin 1
Ser 1
Rodrik 1
Rickon 1
Desmond 1
Winterfell 1
