## Importing libraries

In [None]:
import os
os.chdir('..')

import json
import pandas as pd
from pylighter import Annotation
import ast
import tabulate

from tool.gender_checker import get_personal_titles
from tool.pylighter_utils import annotations_to_pylighter, csv_to_json
from tool.annotations_utils import read_annotations, has_intersection, fix_personal_titles, personal_titles_stats

## Fixing gold_standard annotations by cutting personal_titles

In [None]:
# Rewrite every gold-standard annotation file through fix_personal_titles and
# store the result under the same file name in a sibling directory.
gold_standard_path = "data/testing_sets/test_person_gold_standard"
new_gold_standard_path = "data/testing_sets/test_person_gold_standard_titles"
# exist_ok=True lets this cell be re-run without failing on the directory
# created by a previous run.
os.makedirs(new_gold_standard_path, exist_ok=True)
# NOTE(review): these two dicts are not used in this cell; kept in case other
# cells rely on them — confirm and remove if not.
results_annotated = {}
results_not_annotated = {}
for title in os.listdir(gold_standard_path):
    annotations = read_annotations(os.path.join(gold_standard_path, title))
    annotations = fix_personal_titles(annotations)
    with open(os.path.join(new_gold_standard_path, title), 'w') as f:
        json.dump(annotations, f)

## Calculating error statistics

In [None]:
def get_errors_stats(path_1, path_2, path_3, limit=1):
    """Compare two annotation sets title by title and tabulate the differences.

    File names are taken from ``path_1``; for each of them the file with the
    same name is read from ``path_2`` and ``path_3``. Every entity of the
    ``path_2`` file is compared with the entities of the ``path_3`` file at
    the same position: an equal entity is an exact match, otherwise entities
    for which ``has_intersection`` is true are counted as wrong boundaries.
    Entities of ``path_3`` that were matched by no ``path_2`` entity are
    counted as missing.

    Args:
        path_1: Directory that drives the iteration. Its annotations are only
            used to obtain the personal-title count from
            ``personal_titles_stats``.
        path_2: Directory with the annotations being evaluated.
        path_3: Directory with the annotations used as reference.
        limit: Maximum number of titles to process. The default of ``1``
            keeps the historical behaviour (only the first directory entry);
            pass ``None`` to process every title. ``os.listdir`` returns
            entries in arbitrary order, so which title is "first" is not
            guaranteed.

    Returns:
        A ``pandas.DataFrame`` with one row per processed title.
    """
    results = []
    for title in os.listdir(path_1)[:limit]:
        annotations = read_annotations(os.path.join(path_1, title))
        # Assumes the first returned value maps something to integer counts
        # (it is summed below) — TODO confirm against personal_titles_stats.
        personal_title_annotated, _ = personal_titles_stats(annotations)
        titles_annotated_count = sum(personal_title_annotated.values())
        annotations_2 = read_annotations(os.path.join(path_2, title))
        annotations_3 = read_annotations(os.path.join(path_3, title))

        old_count = 0
        exact_count = 0
        intersections_count = 0
        missing_count = 0

        # NOTE(review): zip stops at the shorter list, so the two files are
        # assumed to contain the same number of items — confirm.
        for anno, anno2 in zip(annotations_2, annotations_3):
            entities = anno['entities']
            entities2 = anno2['entities']

            old_count += len(entities)

            # Indices of reference entities matched by at least one entity.
            # Tracking distinct indices (instead of counting matched pairs)
            # keeps the missing count from being undercounted, or going
            # negative, when several entities hit the same reference entity.
            matched = set()
            for ent1 in entities:
                exact_hits = [i for i, ent2 in enumerate(entities2) if ent1 == ent2]
                if exact_hits:
                    exact_count += len(exact_hits)
                    matched.update(exact_hits)
                    continue

                # Overlaps are only considered when there is no exact match.
                overlap_hits = [i for i, ent2 in enumerate(entities2)
                                if has_intersection(ent1, ent2)]
                intersections_count += len(overlap_hits)
                matched.update(overlap_hits)

            missing_count += len(entities2) - len(matched)

        # NOTE(review): presumably annotations that contained a personal title
        # should not be reported as correct — confirm the intent.
        exact_count -= titles_annotated_count

        title_results = {'Title': title.split('.')[0].replace('_', ' '),
                         '# previously annotated': old_count,
                         '# correct annotations': exact_count,
                         '# personal titles annotated': titles_annotated_count,
                         '# annotations with wrong boundaries (except personal titles)': intersections_count,
                         '# missing annotations': missing_count}
        results.append(title_results)
    return pd.DataFrame(results)

In [None]:
# Build the error-statistics table from the three annotation directories.
titles_source_dir = "data/testing_sets/test_person_gold_standard"
evaluated_dir = "data/testing_sets/test_person_gold_standard_titles_corrected"
reference_dir = "data/testing_sets/test_person_gold_standard_corrected"
results_df = get_errors_stats(titles_source_dir, evaluated_dir, reference_dir)
# Bare expression so the notebook displays the table.
results_df

In [None]:
# Render the statistics as LaTeX and wrap every header cell in
# \rot{\textbf{...}}.
table_latex = results_df.to_latex(index=False).split('\n')
# Normalise the spacing around column separators on every line.
table_latex = [' & '.join(cell.strip() for cell in line.split('&')) for line in table_latex]

# Line 2 is expected to be the header row (after \begin{tabular} and \toprule).
header = table_latex[2].rstrip()
row_end = ''
# Keep the row terminator outside of the last cell; otherwise it ends up
# inside \rot{\textbf{...}}.
if header.endswith('\\\\'):
    header = header[:-2].rstrip()
    row_end = ' \\\\'
table_latex[2] = ' & '.join(
    '\\rot{\\textbf{' + cell.strip() + '}}' for cell in header.split('&')
) + row_end
print('\n'.join(table_latex))

## Correcting annotations with pylighter

In [None]:
# Load one annotation file, convert it with annotations_to_pylighter and open
# it in a pylighter Annotation widget that saves to a CSV file.
corrected_dir = "data/testing_sets/test_person_gold_standard_titles_corrected"
annotations = read_annotations(os.path.join(corrected_dir, 'The_Catcher_in_the_Rye.json'))
labels, corpus = annotations_to_pylighter(annotations)
annotation = Annotation(
    corpus,
    labels_names=["PERSON"],
    labels=labels,
    save_path="notebooks/annotations/The_Catcher_in_the_Rye.csv",
)

In [None]:
# Convert the annotation CSV into a JSON annotation file.
csv_path, json_path = (
    "notebooks/annotations/The_Catcher_in_the_Rye.csv",
    "test_person_gold_standard/The_Catcher_in_the_Rye.json",
)
csv_to_json(csv_path, json_path)