In [93]:
import re
import json
import stanza
import argparse

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval
from collections import defaultdict

from functions_score_section import read_alignments, score_vec_rslts_chapter_level, \
build__src_2_tgt_dict, build_tgt_2_src_dict, score_fr_sents

In [94]:
# get vecalign results
vec_rslts_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_vecrslts"
vec_rslts = read_alignments(vec_rslts_path)

In [95]:
lat_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lat_sent2book_dict.json"

with open(lat_dict_path) as f:
    lat_sent2book_name = json.load(f)

en1893_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict.json"

with open(en1893_dict_path) as f:
    en1893_sent2section_name = json.load(f)
    

In [96]:
# get peritext sections in en1893
extraneous_sections = []
for item in list(en1893_sent2section_name.values()):
    if isinstance(item, str):
        extraneous_sections.append(item)
    else:
        extraneous_sections.extend(item)

In [97]:
extraneous_sections = np.unique(np.array(extraneous_sections))

# Score vecalign results: by prediction

In [98]:
tp_strict_, tp_lax_, overlaps_, errors_, correct_nulls_ = score_vec_rslts_chapter_level(
    vec_rslts, lat_sent2book_name, en1893_sent2section_name, extraneous_sections)

In [99]:
print(tp_strict_)
print(tp_lax_)
print(overlaps_)
print(correct_nulls_)

8146
0
[]
8146


In [100]:
len(vec_rslts)

10534

# Analyze vecalign results: by French sentences

In [101]:
# get dict of tgt sentences' alignments to src sents
tgt_sent_2_src_sent_aligns = build_tgt_2_src_dict(vec_rslts)

In [102]:
# test
keys = list(tgt_sent_2_src_sent_aligns.keys())
keys = sorted(keys)
keys == [x for x in range(0, 13966)]

True

In [115]:
test = "index"
set(test)

{'d', 'e', 'i', 'n', 'x'}

In [118]:
def score_fr_sents(fr2el_sent_aligns_dict, fr_sent2section_name_dict,
                   el_sent2section_name_dict, fr_extraneous_chapter_names):
    extraneous2null_tpstrict = 0
    extraneous2null_tplax = 0 # at least one overlap
    extraneous2text = 0 # no overlap

    text2text_tpstrict = 0
    text2text_tplax = 0
    text2text_incorrect = 0
    text2text_incorrect_lst = []

    text2null_incorrect = 0
    text2null_lst = []

    for fr_sent_idx in fr2el_sent_aligns_dict.keys():
    # for fr_sent_idx in [0,1000,10000]:
        # get grk sentences aligned to it
        el_aligned_sents = fr2el_sent_aligns_dict[fr_sent_idx]
        print(f"el aligned sents is {el_aligned_sents}")
        
        # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
        if str(fr_sent_idx) in fr_sent2section_name_dict.keys():
            # get fr sent chapter (keys are str). only 1 chapter per french sent
            fr_sent_chapter = fr_sent2section_name_dict[str(fr_sent_idx)]
            print(f"fr chapter is {fr_sent_chapter}")
            if isinstance(fr_sent_chapter, str):
                fr_sent_chapter = [fr_sent_chapter]

            for tgt_chapter in fr_sent_chapter:
                if tgt_chapter in fr_extraneous_chapter_names:
                    # get num of fr - null alignments
                    extraneous2null_counter = 0
                    for item in el_aligned_sents:
                        if item == "null":
                            extraneous2null_counter += 1
                    # compare to number of el sents in alignmnent
                    if extraneous2null_counter == len(el_aligned_sents):
                        # then all grk aligned sents are null
                        extraneous2null_tpstrict += 1
                    elif extraneous2null_counter > 0:
                        # then at least one grk sent is null (also captures tpstrict)
                        extraneous2null_tplax += 1
                    else:
                        # no greek sents are null
                        extraneous2text += 1

                    # fr_extraneous2null_correct += el_counter/len(el_aligned_sents)
                    # fr_extraneous2text += (len(el_aligned_sents) - el_counter)/len(el_aligned_sents)
                        # if item == "null":
                        #     fr_extraneous2null_correct += 1
                        # else:
                        #     fr_extraneous2text += 1

                else: # compare fr and grk chapters
                    el_aligned_chapters = set()
                    el_text2text_correct_counter = 0
                    el_text2text_incorrect_counter = 0

                    for item in el_aligned_sents:
                        if item == "null":
                            text2null_incorrect += 1
                            text2null_lst.append(fr_sent_idx)
                        # if item == "null":
                        #     fr_text2null += 1
                        #     fr_text2null_lst.append(fr_sent_idx)
                        else:
                            # get chapters of el sent (keys are str)
                            if isinstance(el_sent2section_name_dict[str(item)], list):
                                for section_name in el_sent2section_name_dict[str(item)]:
                                    el_aligned_chapters.add(section_name)
                            else:
                                el_aligned_chapters.add(el_sent2section_name_dict[str(item)])

                    print(f"el chapters are {el_aligned_chapters}")

                    for item in el_aligned_chapters:
                        if tgt_chapter == item:
                            el_text2text_correct_counter += 1
                            # fr_text2text_correct += 1
                        else:
                            el_text2text_incorrect_counter += 1
                            # fr_text2text_incorrect += 1

                    if el_text2text_correct_counter == len(el_aligned_sents):
                        text2text_tpstrict += 1
                    elif el_text2text_correct_counter > 0:
                        text2text_tplax += 1
                    else:
                        text2text_incorrect += 1
                        text2text_incorrect_lst.append(fr_sent_idx)

                    # fr_text2text_correct += el_counter_text2text_correct/(len(el_aligned_sents))
                    # fr_text2text_incorrect += el_counter_text2text_incorrect/(len(el_aligned_sents))

    # remove text2null from text2text_incorrect_lst
    text2null_lst = set(text2null_lst)
    text2text_incorrect_lst = set(text2text_incorrect_lst)
    text2text_incorrect_lst -= text2null_lst
    # update num of text2text_incorrect
    text2text_incorrect -= text2null_incorrect
    
    results = [extraneous2null_tpstrict, extraneous2null_tplax, extraneous2text,
               text2text_tpstrict, text2text_tplax, 
               text2text_incorrect, text2text_incorrect_lst,
               text2null_incorrect, text2null_lst]
    
    return results

In [119]:
rslts_en1893_sents = score_fr_sents(tgt_sent_2_src_sent_aligns, en1893_sent2section_name,
                   lat_sent2book_name, extraneous_sections)

el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el aligned sents is {'null'}
fr chapter is forewordbook0title
el align

In [120]:
rslts_en1893_sents

[8654, 0, 6255, 0, 0, 0, set(), 0, set()]