In [90]:
import stanza
import operator
from nltk import sent_tokenize, word_tokenize

from nervaluate import Evaluator
from nltk.tokenize import sent_tokenize

In [91]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-05-24 22:04:44 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-05-24 22:04:44 stanza INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-05-24 22:04:44 INFO: Use device: cpu
2022-05-24 22:04:44 stanza INFO: Use device: cpu
2022-05-24 22:04:44 INFO: Loading: tokenize
2022-05-24 22:04:44 stanza INFO: Loading: tokenize
2022-05-24 22:04:44 INFO: Loading: ner
2022-05-24 22:04:44 stanza INFO: Loading: ner
2022-05-24 22:04:45 INFO: Done loading processors!
2022-05-24 22:04:45 stanza INFO: Done loading processors!


In [92]:
def read_text(path):
    with open(path, encoding="utf-8") as f:
        text = f.read()
        text = text.replace('\r', ' ').replace('\n', ' ')\
            .replace("’", "'").replace("\"", "").replace("”", "").replace("“", "")
    return text

def stanza_NER(book):
    """
    stanza_NER vrne seznam, v katerem so shranjene prepoznane identitete glede na posamezni stavek
    :param book: str
    :return: entity_dict (seznam slovarjev kot npr. [name, tag, start_pos, stop_pos, line_num. token_num])
    """
    

    doc = nlp(book)
    entity_dict = []
    

    for line_num, sent in enumerate(doc.sentences):
        token_num = 0
        #print(sent.words_string())
        
        for token in sent.tokens:
           
            # S - samostojna
            # B - na zacetku
            # I - vmes
            # E - na koncu
            if token.ner == "S-PERSON" or token.ner == "B-PERSON" or token.ner == "E-PERSON" or token.ner == "I-PERSON":
                info_dict = {}
                info_dict["name"] = token.text
                info_dict["tag"] = token.ner
                info_dict["start_pos"] = token_num
                info_dict["stop_pos"] = token_num +1
                info_dict["line_num"] = line_num
                entity_dict.append(info_dict)
                token_num += 1

    return entity_dict


def get_names_from_NER(entity_dict):
    """
    get_names_from_NER sprejme entity_dict in vrne urejen seznam terk ("ime", št_zaznano)
    :param entity_dict: dict (seznam dictov)
    :return: unique_names: list
    """
    unique_names = {}

    for entity in entity_dict:
        if entity["tag"] == "S-PERSON" or entity["tag"] == "N-PERSON":
            if entity["name"] not in unique_names:
                unique_names[entity["name"]] = 1
            else:
                unique_names[entity["name"]] += 1
    unique_names = sorted(unique_names.items(), key=operator.itemgetter(1),reverse=True)

    return unique_names

def merge_person_names(entity_dict):
    """
    merge_person_names sprejme vse označene persone (S,B,I,E - PERSON) in smiselno zloži imena. Vrne (S - Person) če je
    nespremenjeno in N - Person če je bila združitev... 
    :param entity_dict: dict (seznam dictov)
    :return: entity_dict_clean: list
    """
    entity_dict_clean = []

    for entity_num, entity in enumerate(entity_dict):

        if entity["tag"] == "S-PERSON":
            entity_dict_clean.append(entity)

        elif entity["tag"] == "B-PERSON":
            tmp_entity = entity.copy()
            try:
                for i in range(10):
                    next_entity = entity_dict[entity_num + i]
                    if next_entity["line_num"] ==  tmp_entity["line_num"] and (next_entity["tag"] == "I-PERSON" or next_entity["tag"] == "E-PERSON"):
                        tmp_entity["name"] = tmp_entity["name"] + " " + next_entity["name"]
                        tmp_entity["tag"] = "N-PERSON"
                        tmp_entity["stop_pos"] = next_entity["stop_pos"]
                        if next_entity["tag"] == "E-PERSON":
                            break
            except:
                pass
            entity_dict_clean.append(tmp_entity)
    return entity_dict_clean

def conll_read():
    with open("conll2003/valid.txt") as f:
        lines = f.readlines()
    sentences = []
    sentences_tag = []
    tmp = ""
    tmp_tag = ""
    tags = []
    for i in range(len(lines)):

        line = lines[i]
        line = line.split(" ")
        text_word = line[0]

        if text_word[0] != "-":
            if text_word == "\n":
                # print(tmp)
                # print(tmp_tag)
                # print("-------")
                sentences.append(tmp)
                tmp = ""
                #sentences_tag.append(tmp_tag)
                #tmp_tag = ""
                sentences_tag.append(tags)
                tags = []

            elif text_word == "(" or text_word == ")" or text_word == '"' or text_word == ':':
                pass
            else:
                if text_word[0] != "." and text_word[0] != "," and text_word[0] != ":" and text_word[0] != ";"\
                        and text_word[0] != "!" and text_word[0] != "?" and text_word[0] != ")" and text_word[0] != "'":   #or text_word[0] != "," or text_word[0] != "(" or text_word[0] != ")":
                    tmp = tmp + " " + text_word
                    line[3] = line[3].replace("\n", "")

                    if line[3] == "B-PER" or line[3] == "I-PER":
                        #line[3] = "B-PER"
                        #tmp_tag = tmp_tag + " " + line[3]
                        tags.append(line[3])
                    else:
                        line[3] = "O"
                        #tmp_tag = tmp_tag + " " + line[3]
                        tags.append(line[3])
                else:
                    tmp = tmp +  text_word
                    line[3] = "O"
                    tags.append(line[3])
    return sentences, sentences_tag

In [96]:
book = read_text('../../data/books/ASongOfIceAndFire/AGOT/chapters/1_Bran_1.txt')
entity_dict = stanza_NER(book)


entity_dict_clean = merge_person_names(entity_dict)
unique_names = get_names_from_NER(entity_dict_clean)

sum_num = 0
for (name, num) in unique_names:
    if True: #num > 1:
        print(name, num)
        sum_num = sum_num + num


    


Bran 46
Robb 29
Jon 29
Jory 9
Greyjoy 7
Theon Greyjoy 6
Hullen 5
Stark 3
Snow 3
Winterfell 2
Jory Cassel 2
Jon Snow 2
Mance Rayder 1
Old Nan 1
Eddard Stark 1
Ice 1
Robert of the House Baratheon 1
King of the Andals 1
Rhoynar 1
Eddard 1
Warden of the North 1
Theon 1
Nan 1
Robert 1
Starks 1
Harwin 1
Ser Rodrik 's 1
Rickon 1
no Stark 1
Desmond 1


In [97]:
# Evalvacija na datasetu Conll2003 (osredotočena na imena)

def eval_wrapper():
    sentences, sentences_tag = conll_read()
    sentences_NER = []
    sentences_NER_results = []
    print(len(sentences))
    

    for lin_num, sent in enumerate(sentences):
        
        if lin_num % 100 == 0:
            print(lin_num)
        
        words_list = word_tokenize(sent)
        tags_list = ["O"] * len(words_list)
        sentences_NER.append(words_list)
        
        
        entity_dict = stanza_NER(sent)
        
        for entity in entity_dict:
            name = entity["name"]
            tag = entity["tag"]
            
            if tag == "B-PERSON" or tag == "S-PERSON":
                tag = "B-PER"
            if tag == "I-PERSON" or tag == "E-PERSON":
                tag = "I-PER"
            try:
                idx = words_list.index(name)
                tags_list[idx] = tag
            except:
                print("Not found: ", name)
                
        sentences_NER_results.append(tags_list)
        
        
        
        sentences_org, sentences_org_tag = conll_read()
        # for x in range(len(sentences_NER)):
        #     print("-----------", x)
        #     #print(len(sentences_org_tag[x]))
        #     print(len(sentences_NER[x]))
        #     print(len(sentences_NER_results[x]))

        #     #print(sentences_org[x])    # 22 26
        #     print(sentences_NER[x])
        #     print(sentences_org_tag[x])
        #     print(sentences_NER_results[x])
        
        true = sentences_org_tag
        pred = sentences_NER_results
        evaluator = Evaluator(true, pred, tags=['PER'], loader="list")

        results, results_by_tag = evaluator.evaluate()
        print(results)
           

In [102]:
## Evalvacija za I. poglavje GoT

from chapter__1annotated import chapter1_lst

def eval_got():
    
    book = read_text('../../data/books/ASongOfIceAndFire/AGOT/chapters/1_Bran_1.txt')
    sentences = sent_tokenize(book)
    
    sentences_NER = []
    sentences_NER_results = []
    print(len(sentences))
    

    for lin_num, sent in enumerate(sentences):
        
        if lin_num % 100 == 0:
            print(lin_num)
        
        words_list = word_tokenize(sent)
        tags_list = ["O"] * len(words_list)
        sentences_NER.append(words_list)
        
        
        entity_dict = stanza_NER(sent)
        
        for entity in entity_dict:
            name = entity["name"]
            tag = entity["tag"]
            
            if tag == "B-PERSON" or tag == "S-PERSON":
                tag = "B-PER"
            if tag == "I-PERSON" or tag == "E-PERSON":
                tag = "I-PER"
            try:
                idx = words_list.index(name)
                tags_list[idx] = tag
            except:
                print("Not found: ", name)
                
        sentences_NER_results.append(tags_list)
    return sentences_NER, sentences_NER_results
    
sentences_NER, sentences_NER_results = eval_got()  

284
0
100
200


In [105]:
from chapter__1annotated import chapter1_lst

# Kontrola:
# print(len(sentences_NER_results))
# print(len(chapter1_lst))
# for x in range(len(sentences_NER)):
#     print("-----------", x)
#     print(len(sentences_NER[x]))
#     print(len(sentences_NER_results[x]))
#     print(len(chapter1_lst[x]))
    

#     print(sentences_NER[x])
#     print(sentences_NER_results[x])
#     print(chapter1_lst[x])

true = chapter1_lst
pred = sentences_NER_results
evaluator = Evaluator(true, pred, tags=['PER'], loader="list")

results, results_by_tag = evaluator.evaluate()
print(results)

{'ent_type': {'correct': 149, 'incorrect': 0, 'partial': 0, 'missed': 15, 'spurious': 13, 'possible': 164, 'actual': 162, 'precision': 0.9197530864197531, 'recall': 0.9085365853658537, 'f1': 0.9141104294478528}, 'partial': {'correct': 143, 'incorrect': 0, 'partial': 6, 'missed': 15, 'spurious': 13, 'possible': 164, 'actual': 162, 'precision': 0.9012345679012346, 'recall': 0.8902439024390244, 'f1': 0.8957055214723927}, 'strict': {'correct': 143, 'incorrect': 6, 'partial': 0, 'missed': 15, 'spurious': 13, 'possible': 164, 'actual': 162, 'precision': 0.8827160493827161, 'recall': 0.8719512195121951, 'f1': 0.8773006134969324}, 'exact': {'correct': 143, 'incorrect': 6, 'partial': 0, 'missed': 15, 'spurious': 13, 'possible': 164, 'actual': 162, 'precision': 0.8827160493827161, 'recall': 0.8719512195121951, 'f1': 0.8773006134969324}}
