In [72]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import sent_tokenize
import operator

# Fetch the NLTK data packages requested by name below. When a package is
# already present locally the call only logs "already up-to-date".
# The return value of the last call is what the cell displays as its output.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matijagercer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/matijagercer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/matijagercer/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/matijagercer/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [16]:

# Sample sentence used for a quick named-entity-recognition check.
sent= '''Prime Minister Jacinda Ardern has claimed that New Zealand had won a big
battle over the spread of coronavirus.'''

#sentences = sent_tokenize(sent)

# Pipeline: split into word tokens -> POS-tag each token -> chunk named entities.
words= word_tokenize(sent)
tags=pos_tag(words)
# binary=True: recognised entities all carry the single generic label NE
# (see the printed tree), rather than a per-type label.
ne = nltk.ne_chunk(tags,binary=True)
print(ne)


(S
  Prime/NNP
  Minister/NNP
  (NE Jacinda/NNP Ardern/NNP)
  has/VBZ
  claimed/VBN
  that/IN
  (NE New/NNP Zealand/NNP)
  had/VBD
  won/VBN
  a/DT
  big/JJ
  battle/NN
  over/IN
  the/DT
  spread/NN
  of/IN
  coronavirus/NN
  ./.)


In [17]:
from nltk.chunk import tree2conlltags
# Flatten the chunk tree `ne` into a list of (word, POS tag, IOB tag) triples:
# the first token of an entity is tagged B-NE, following ones I-NE, and
# tokens outside any entity O.
iob = tree2conlltags(ne)
# Bare expression so the notebook displays the resulting list.
iob



[('Prime', 'NNP', 'O'),
 ('Minister', 'NNP', 'O'),
 ('Jacinda', 'NNP', 'B-NE'),
 ('Ardern', 'NNP', 'I-NE'),
 ('has', 'VBZ', 'O'),
 ('claimed', 'VBN', 'O'),
 ('that', 'IN', 'O'),
 ('New', 'NNP', 'B-NE'),
 ('Zealand', 'NNP', 'I-NE'),
 ('had', 'VBD', 'O'),
 ('won', 'VBN', 'O'),
 ('a', 'DT', 'O'),
 ('big', 'JJ', 'O'),
 ('battle', 'NN', 'O'),
 ('over', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('spread', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('coronavirus', 'NN', 'O'),
 ('.', '.', 'O')]

In [76]:
def read_text(path):
    """
    Load a UTF-8 text file and flatten it into one cleaned-up string.

    Carriage returns and newlines become spaces, the typographic apostrophe
    (’) becomes a plain ', and straight as well as curly double quotes are
    dropped.

    :param path: path of the text file to read
    :return: str, the normalised file contents
    """
    # All substitutions are single-character, so one translate pass suffices.
    char_map = str.maketrans({
        '\r': ' ',
        '\n': ' ',
        "’": "'",
        "\"": None,
        "”": None,
        "“": None,
    })
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    return raw.translate(char_map)

def nltk_NER(book):
    """
    Run NLTK named-entity chunking over a text, one sentence at a time, and
    collect every recognised entity.

    :param book: str, the full text to analyse
    :return: entity_dict, a list of dicts (one per recognised entity) with keys
             "name" (the entity's tokens joined by spaces), "tag" (the chunk
             label), "start_pos" / "stop_pos" (token offsets inside the
             sentence, stop exclusive) and "line_num" (index of the sentence)
    """
    entity_dict = []

    # 00 Split the book into sentences:
    for line_num, line in enumerate(sent_tokenize(book)):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(line))
        tree = nltk.ne_chunk(tagged_tokens, binary=False)

        # Running token offset within the current sentence.
        offset = 0
        for node in tree:
            # Plain (word, tag) tuples have no label; only subtrees do.
            if not hasattr(node, 'label'):
                offset += 1
                continue

            width = len(node)
            entity_dict.append({
                "name": ' '.join(leaf[0] for leaf in node),
                "tag": node.label(),
                "start_pos": offset,
                "stop_pos": offset + width,
                "line_num": line_num,
            })
            offset += width

    return entity_dict


def get_names_from_NER(entity_dict):
    """
    Count how often each PERSON entity occurs and rank the names by frequency.

    :param entity_dict: list of dicts, each with at least "tag" and "name" keys
    :return: unique_names: list of (name, count) tuples, most frequent first;
             names with equal counts keep their first-seen order
    """
    counts = {}
    for entity in entity_dict:
        # Only entities tagged as persons are counted.
        if entity["tag"] != "PERSON":
            continue
        person = entity["name"]
        counts[person] = counts.get(person, 0) + 1

    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [82]:
# Load one chapter, run NER over it and tally the detected person names.
book = read_text('../../data/books/ASongOfIceAndFire/AGOT/chapters/Bran_1_1.txt')
entity_dict = nltk_NER(book)
unique_names = get_names_from_NER(entity_dict)

# Print every name with its count (the `num > 1` filter is left disabled).
for name, num in unique_names:
    print(name, num)







Bran 42
Jon 29
Robb 27
Father 8
Jory 7
Greyjoy 5
Hullen 4
Snow 3
Lord 2
Robert 2
Jon Snow 2
Theon Greyjoy 2
Mance Rayder 1
Old Nan 1
Eddard Stark 1
Jory Cassel 1
Blood 1
Stark 1
Old 1
Nan 1
Watch 1
Harwin 1
Ser 1
Rodrik 1
Rickon 1
Desmond 1
