## Importing libraries

In [9]:
import os
os.chdir("..")

import json
import numpy as np
import re
from copy import deepcopy
import pandas as pd
import tabulate

from tool.file_and_directory_management import read_file_to_list, read_sentences_from_file
from tool.gender_checker import get_personal_titles

## Preparing test data

In [4]:
def generate_test_data(sentences_per_novel = 5, save_path = None):
    personal_titles = get_personal_titles()
    words_re = re.compile('\\b('+"|".join(personal_titles)+")(?= )")
    test_sentences = []
    for book in read_file_to_list('data/novels_titles/combined_set.txt'):
        sentences = read_sentences_from_file(os.path.join('data/testing_sets', 'test', book))
        ps_sentences = []
        for sent in sentences:
            found_titles = words_re.findall(sent)
            if len(found_titles) == 1:
                ps_sentences.append(sent)
        ps_sentences = np.random.choice(ps_sentences, min(sentences_per_novel, len(ps_sentences)), False)
        for sent in ps_sentences:
            found_title = words_re.findall(sent)[0]
            for ps in personal_titles:
                new_sent = re.sub(found_title, ps + ' ', sent)
                test_sentences.append(new_sent)
                    
    if save_path is not None:
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
            
        with open(save_path, 'w') as f:
            f.write('\n'.join(test_sentences))
            
    return test_sentences

In [5]:
test_sentences = generate_test_data(5, 'data/experiments/personal_titles')

## Gettig statistics (when predictions are computed)

In [2]:
def get_model_stats(experiments_dir = 'experiments/personal_titles/ner'):
    
    stats_dict = {}
    personal_titles = get_personal_titles()
    words_re = re.compile('\\b('+"|".join(personal_titles)+")(?= )")

    for model in os.listdir(experiments_dir):
        model_stats_dict = {}
        for ps in personal_titles:
            model_stats_dict[ps] = [0,0]

        with open(os.path.join(experiments_dir, model, 'ner_model_annotated', 'personal_titles.json')) as f:
            data = json.loads(f.read())

        for sent_id, sent in enumerate(data):
            for e in sent['entities']:
                entity_text = sent['content'][e[0]:e[1]]
                found_titles = words_re.findall(entity_text)
                if any(found_titles):
                    model_stats_dict[found_titles[0]][0] += 1
                else:
                    found_previous_titles = words_re.findall(sent['content'][e[0]-10:e[0]])
                    if any(found_previous_titles):
                        model_stats_dict[found_previous_titles[0]][1] += 1
        stats_dict[model] = model_stats_dict
    return stats_dict

In [3]:
model_stats = get_model_stats()

In [4]:
for key in model_stats.keys():
    for subkey in model_stats[key].keys():
        if any(model_stats[key][subkey]):
            model_stats[key][subkey] = round(100*model_stats[key][subkey][0]/(model_stats[key][subkey][0]+model_stats[key][subkey][1]))
#             if model_stats[key][subkey] >= 50.0:
#                 model_stats[key][subkey] = '--' + str(model_stats[key][subkey]) + '--'
        else:
            model_stats[key][subkey] = '-'
 

In [5]:
pd.DataFrame(model_stats)

Unnamed: 0,spacy__en_core_web_lg,flair__ner-large,nltk,stanza,flair__ner-fast,flair__ner,spacy__en_core_web_md,spacy__en_core_web_sm
Miss,0,51,0,0,94,91,0,20
Ms.,0,0,0,0,0,0,0,0
Mrs.,0,2,0,0,27,20,0,0
Mistress,0,19,0,11,9,52,0,4
Lady,92,28,0,64,74,92,95,36
Madam,0,79,-,30,100,100,0,14
Madame,27,100,0,44,100,100,98,5
Mr.,0,0,0,0,18,8,0,0
Sir,0,0,-,0,5,0,0,0
Dr.,0,3,0,5,10,29,0,0


In [6]:
pd.DataFrame(model_stats)[['nltk', 'spacy__en_core_web_lg', 'flair__ner-large', 'stanza']]

Unnamed: 0,nltk,spacy__en_core_web_lg,flair__ner-large,stanza
Miss,0,0,51,0
Ms.,0,0,0,0
Mrs.,0,0,2,0
Mistress,0,0,19,11
Lady,0,92,28,64
Madam,-,0,79,30
Madame,0,27,100,44
Mr.,0,0,0,0
Sir,-,0,0,0
Dr.,0,0,3,5


In [7]:
results_table = pd.DataFrame(model_stats)[['nltk', 'spacy__en_core_web_lg', 'flair__ner-large', 'stanza']]

In [10]:
print(tabulate.tabulate(results_table, headers = ['Personal Title', 'nltk', 'spacy__en_core_web_lg', 'flair__ner-large', 'stanza'], tablefmt='latex_booktabs'))

\begin{tabular}{llrrr}
\toprule
 Personal Title   & nltk   &   spacy\_\_en\_core\_web\_lg &   flair\_\_ner-large &   stanza \\
\midrule
 Miss             & 0      &                       0 &                 51 &        0 \\
 Ms.              & 0      &                       0 &                  0 &        0 \\
 Mrs.             & 0      &                       0 &                  2 &        0 \\
 Mistress         & 0      &                       0 &                 19 &       11 \\
 Lady             & 0      &                      92 &                 28 &       64 \\
 Madam            & -      &                       0 &                 79 &       30 \\
 Madame           & 0      &                      27 &                100 &       44 \\
 Mr.              & 0      &                       0 &                  0 &        0 \\
 Sir              & -      &                       0 &                  0 &        0 \\
 Dr.              & 0      &                       0 &                  