## Importing libraries

In [None]:
import os
# Move the working directory one level up before the project imports and the
# relative 'data/...' and 'experiments/...' paths below are used.
# NOTE(review): presumably the notebook lives in a subdirectory of the project root - confirm.
os.chdir("..")

import json
import numpy as np
import re
from copy import deepcopy
import pandas as pd
import tabulate

from tool.file_and_directory_management import read_file_to_list, read_sentences_from_file
from tool.gender_checker import get_personal_titles

## Preparing test data

In [None]:
def generate_test_data(sentences_per_novel = 5, save_path = None):
    """Generate test sentences by swapping the personal title in sampled sentences.

    For every novel listed in 'data/novels_titles/combined_set.txt', the
    sentences containing exactly one personal title (followed by a space) are
    collected, up to ``sentences_per_novel`` of them are drawn at random
    without replacement, and each drawn sentence is emitted once per known
    personal title with the original title replaced by that title.

    Args:
        sentences_per_novel: maximum number of sentences sampled per novel.
        save_path: optional file path; when given, the generated sentences are
            written there, one per line (missing parent directories are created).

    Returns:
        List of generated sentences.
    """
    personal_titles = get_personal_titles()
    # Titles are used as literal text (they are also inserted verbatim into the
    # sentences below), so any regex metacharacter they contain must be escaped.
    words_re = re.compile('\\b(' + "|".join(re.escape(title) for title in personal_titles) + ")(?= )")
    test_sentences = []
    for book in read_file_to_list('data/novels_titles/combined_set.txt'):
        sentences = read_sentences_from_file(os.path.join('data/testing_sets', 'test', book))
        # Keep only sentences with exactly one title so the substitution is unambiguous
        ps_sentences = [sent for sent in sentences if len(words_re.findall(sent)) == 1]
        sample_size = min(sentences_per_novel, len(ps_sentences))
        ps_sentences = np.random.choice(ps_sentences, sample_size, False)
        for sent in ps_sentences:
            for ps in personal_titles:
                # Substitute exactly the matched title occurrence. The lookahead
                # leaves the following space in the sentence, so no extra space
                # is inserted; a callable replacement keeps the title literal.
                new_sent = words_re.sub(lambda _match, title=ps: title, sent)
                test_sentences.append(new_sent)

    if save_path is not None:
        target_dir = os.path.dirname(save_path)
        # dirname is '' for a bare file name; makedirs('') would raise
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(save_path, 'w') as f:
            f.write('\n'.join(test_sentences))

    return test_sentences

In [None]:
# Draw up to 5 single-title sentences per novel and store all generated variants on disk.
test_sentences = generate_test_data(
    sentences_per_novel=5,
    save_path='data/experiments/personal_titles',
)

## Getting statistics (when predictions are computed)

In [None]:
def get_model_stats(experiments_dir = 'experiments/personal_titles/ner'):
    """Count, per model and personal title, how recognised entities relate to the title.

    Every sub-directory of ``experiments_dir`` is treated as one model and must
    contain a 'personal_titles.json' file: a list of items with a 'content'
    string and an 'entities' list whose elements hold the start and end
    offsets of an entity at positions 0 and 1.

    For each entity, the first title found inside the entity text increments
    the first counter of that title; otherwise the first title found in the
    10 characters directly preceding the entity increments the second counter.

    Args:
        experiments_dir: directory holding one sub-directory per model.

    Returns:
        Dict mapping model name to a dict mapping each personal title to
        ``[count_inside_entity, count_before_entity]``.
    """
    stats_dict = {}
    personal_titles = get_personal_titles()
    # Escape the titles so that only literal titles match; the matched text is
    # used as a dictionary key below and must therefore be one of the titles.
    words_re = re.compile('\\b(' + "|".join(re.escape(title) for title in personal_titles) + ")(?= )")
    context_width = 10  # number of characters inspected in front of an entity

    for model in os.listdir(experiments_dir):
        model_dir = os.path.join(experiments_dir, model)
        # Stray plain files in the experiments directory are not model results
        if not os.path.isdir(model_dir):
            continue

        model_stats_dict = {ps: [0, 0] for ps in personal_titles}

        with open(os.path.join(model_dir, 'personal_titles.json')) as f:
            data = json.load(f)

        for sent in data:
            content = sent['content']
            for e in sent['entities']:
                start, end = e[0], e[1]
                found_titles = words_re.findall(content[start:end])
                if any(found_titles):
                    model_stats_dict[found_titles[0]][0] += 1
                    continue
                # Clamp at 0: a negative start would index from the end of the
                # string and miss a title at the very beginning of the sentence.
                window_start = max(0, start - context_width)
                found_previous_titles = words_re.findall(content[window_start:start])
                if any(found_previous_titles):
                    model_stats_dict[found_previous_titles[0]][1] += 1
        stats_dict[model] = model_stats_dict
    return stats_dict

In [None]:
# Raw counts per model and personal title: [title inside entity, title just before entity].
model_stats = get_model_stats()

In [None]:
# Replace each raw [inside, before] count pair with the rounded percentage of
# cases in which the title was part of the recognised entity; '-' marks titles
# for which nothing was counted.
for model_name, title_counts in model_stats.items():
    for title, counts in title_counts.items():
        if not isinstance(counts, list):
            # Already converted by an earlier run of this cell; converting
            # again would fail, so leave the value as it is.
            continue
        inside, before = counts
        total = inside + before
        title_counts[title] = round(100 * inside / total) if total else '-'

In [None]:
# Show the statistics as a table: one column per model, one row per personal title.
pd.DataFrame(data=model_stats)

In [None]:
# Restrict the table to the reported models, in presentation order.
results_table = pd.DataFrame(data=model_stats).loc[
    :,
    ['nltk', 'spacy__en_core_web_lg', 'flair__ner-large', 'stanza'],
]

In [None]:
# Display the selected results as the cell output.
results_table

In [None]:
# Emit the results as a LaTeX (booktabs) table; the first header labels the index column.
print(
    tabulate.tabulate(
        results_table,
        headers=[
            'Personal Title',
            'nltk',
            'spacy__en_core_web_lg',
            'flair__ner-large',
            'stanza',
        ],
        tablefmt='latex_booktabs',
    )
)