In [41]:
import os
from nltk.tag import StanfordNERTagger
from nltk import sent_tokenize, word_tokenize
import operator
from sner import Ner

from nervaluate import Evaluator
from nltk.tokenize import sent_tokenize


In [42]:
model = "/Users/matijagercer/Desktop/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz"
jar = "/Users/matijagercer/Desktop/stanford-ner-2020-11-17/stanford-ner-4.2.0.jar"
st = StanfordNERTagger(model, jar,encoding='utf-8')


In [43]:
# Termina:
# cd your_stanford_ner_dir
# java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz

# Za več preberi:
# https://stackoverflow.com/questions/33748554/how-to-speed-up-ne-recognition-with-stanford-ner-with-python-nltk

tagger = Ner(host='localhost',port=9199)


In [44]:
def read_text(path):
    with open(path, encoding="utf-8") as f:
        text = f.read()
        text = text.replace('\r', ' ').replace('\n', ' ')\
            .replace("’", "'").replace("\"", "").replace("”", "").replace("“", "")
    return text


def stanford_NER(book, eval = False):
    """
    stanford_NER vrne seznam, v katerem so shranjene prepoznane identitete glede na posamezni stavek
    :param book: str
    :return: entity_dict (seznam slovarjev kot npr. [name, tag, start_pos, stop_pos, line_num. token_num])
    """
    
    
    # EVAL ---
    if eval == True:
        sentences, sentences_tag = conll_read()
        #print(len(sentences), "--------")
    else:
        sentences = sent_tokenize(book)
    sentences_NER = []
    sentences_NER_results = []
    # EVAL ---

    entity_dict = []

    for line_num, line in enumerate(sentences):
        
        # EVAL ---
        words_list = word_tokenize(line)
        tags_list = ["O"] * len(words_list)
        sentences_NER.append(words_list)
        # EVAL ---

        #tokenized_text = word_tokenize(line)
        #classified_text = st.tag(tokenized_text)

        classified_text = tagger.get_entities(line)

        token_num = 0
        for (name, tag) in classified_text:
            if tag != 'O':
                info_dict = {}
                info_dict["name"] = name
                info_dict["tag"] = tag
                info_dict["start_pos"] = token_num
                info_dict["stop_pos"] = token_num+1
                info_dict["line_num"] = line_num
                entity_dict.append(info_dict)
                
                
                # EVAL: ---
                #names = word.text.replace("'s", "")
                #names = names.split(" ")
                if tag == "PERSON":
                    try:
                        idx = words_list.index(name)
                        tags_list[idx] = "B-PER"
                    except:
                        print("Not found: ", name)
                # EVAL ---
                
                
            token_num += 1
        sentences_NER_results.append(tags_list)

    return entity_dict, sentences_NER, sentences_NER_results


def get_names_from_NER(entity_dict):
    """
    get_names_from_NER sprejme entity_dict in vrne urejen seznam terk ("ime", št_zaznano)
    :param entity_dict: dict (seznam dictov)
    :return: unique_names: list
    """
    unique_names = {}

    for entity in entity_dict:
        if entity["tag"] == "PERSON":
            if entity["name"] not in unique_names:
                unique_names[entity["name"]] = 1
            else:
                unique_names[entity["name"]] += 1
    unique_names = sorted(unique_names.items(), key=operator.itemgetter(1),reverse=True)

    return unique_names



def conll_read():
    with open("conll2003/valid.txt") as f:
        lines = f.readlines()
    sentences = []
    sentences_tag = []
    tmp = ""
    tmp_tag = ""
    tags = []
    for i in range(len(lines)):

        line = lines[i]
        line = line.split(" ")
        text_word = line[0]

        if text_word[0] != "-":
            if text_word == "\n":
                # print(tmp)
                # print(tmp_tag)
                # print("-------")
                sentences.append(tmp)
                tmp = ""
                #sentences_tag.append(tmp_tag)
                #tmp_tag = ""
                sentences_tag.append(tags)
                tags = []

            elif text_word == "(" or text_word == ")" or text_word == '"' or text_word == ':':
                pass
            else:
                if text_word[0] != "." and text_word[0] != "," and text_word[0] != ":" and text_word[0] != ";"\
                        and text_word[0] != "!" and text_word[0] != "?" and text_word[0] != ")" and text_word[0] != "'":   #or text_word[0] != "," or text_word[0] != "(" or text_word[0] != ")":
                    tmp = tmp + " " + text_word
                    line[3] = line[3].replace("\n", "")

                    if line[3] == "B-PER" or line[3] == "I-PER":
                        line[3] = "B-PER"
                        #tmp_tag = tmp_tag + " " + line[3]
                        tags.append(line[3])
                    else:
                        line[3] = "O"
                        #tmp_tag = tmp_tag + " " + line[3]
                        tags.append(line[3])
                else:
                    tmp = tmp +  text_word
                    line[3] = "O"
                    tags.append(line[3])
    return sentences, sentences_tag


In [45]:
book = read_text('../../data/books/ASongOfIceAndFire/AGOT/chapters/1_Bran_1.txt')
entity_dict, sentences_NER, sentences_NER_results = stanford_NER(book, eval = False)

unique_names = get_names_from_NER(entity_dict)

for (name, num) in unique_names:
    if True: #num > 1:
        print(name, num)


Jon 31
Robb 29
Greyjoy 12
Jory 11
Theon 7
Bran 6
Hullen 5
Stark 3
Snow 3
Nan 2
Starks 2
Cassel 2
Robert 2
of 2
the 2
Mance 1
Rayder 1
Eddard 1
House 1
Baratheon 1
Warden 1
North 1
Targaryen 1
Harwin 1
Ser 1
Rodrik 1
Rickon 1
Desmond 1
Winterfell 1


In [40]:
# Evalvacija na datasetu Conll2003 (osredotočena na imena)

def eval_wrapper():
    entity_dict, sentences_NER, sentences_NER_results = stanford_NER("", eval= True)
    unique_names = get_names_from_NER(entity_dict)

    # Kontrola podatkov
    sentences_org, sentences_org_tag = conll_read()
    # for x in range(len(sentences_NER)):
    #     print("-----------", x)
    #     #print(len(sentences_org_tag[x]))
    #     print(len(sentences_NER[x]))
    #     print(len(sentences_NER_results[x]))

    #     #print(sentences_org[x])    # 22 26
    #     print(sentences_NER[x])
    #     print(sentences_org_tag[x])
    #     print(sentences_NER_results[x])
    
    true = sentences_org_tag
    pred = sentences_NER_results
    evaluator = Evaluator(true, pred, tags=['PER'], loader="list")

    results, results_by_tag = evaluator.evaluate()
    print(results)

In [49]:
## Evalvacija za I. poglavje GoT

from chapter__1annotated_1class import chapter1_lst

# Kontrola:
# print(len(sentences_NER_results))
# print(len(chapter1_lst))

# for x in range(len(sentences_NER)):
#     print("-----------", x)
#     print(len(sentences_NER[x]))
#     print(len(sentences_NER_results[x]))
#     print(len(chapter1_lst[x]))
    

#     print(sentences_NER[x])
#     print(sentences_NER_results[x])
#     print(chapter1_lst[x])

true = chapter1_lst
pred = sentences_NER_results
evaluator = Evaluator(true, pred, tags=['PER'], loader="list")

results, results_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 102, 'incorrect': 0, 'partial': 0, 'missed': 62, 'spurious': 27, 'possible': 164, 'actual': 129, 'precision': 0.7906976744186046, 'recall': 0.6219512195121951, 'f1': 0.6962457337883959}, 'partial': {'correct': 102, 'incorrect': 0, 'partial': 0, 'missed': 62, 'spurious': 27, 'possible': 164, 'actual': 129, 'precision': 0.7906976744186046, 'recall': 0.6219512195121951, 'f1': 0.6962457337883959}, 'strict': {'correct': 102, 'incorrect': 0, 'partial': 0, 'missed': 62, 'spurious': 27, 'possible': 164, 'actual': 129, 'precision': 0.7906976744186046, 'recall': 0.6219512195121951, 'f1': 0.6962457337883959}, 'exact': {'correct': 102, 'incorrect': 0, 'partial': 0, 'missed': 62, 'spurious': 27, 'possible': 164, 'actual': 129, 'precision': 0.7906976744186046, 'recall': 0.6219512195121951, 'f1': 0.6962457337883959}}
