In [1]:
import re
import json
import stanza
import argparse

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval
from collections import defaultdict

from functions_score_section import read_alignments, score_vec_rslts_chapter_level, \
build__src_2_tgt_dict, build_tgt_2_src_dict, score_fr_sents

In [2]:
# get vecalign results
vec_rslts_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_vecrslts"
vec_rslts = read_alignments(vec_rslts_path)

In [3]:
lat_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lat_sent2book_dict.json"

with open(lat_dict_path) as f:
    lat_sent2book_name = json.load(f)

en1893_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict.json"

with open(en1893_dict_path) as f:
    en1893_sent2section_name = json.load(f)
    

In [4]:
# get peritext sections in en1893
extraneous_sections = []
for item in list(en1893_sent2section_name.values()):
    if isinstance(item, str):
        extraneous_sections.append(item)
    else:
        extraneous_sections.extend(item)

In [5]:
extraneous_sections = np.unique(np.array(extraneous_sections))

# Score vecalign results: by prediction

In [6]:
def score_vec_rslts_chapter_level(vr_rslts_lst, el_sent2section_dict,
                                 fr_sent2section_dict, fr_extra_section_names):

    tp_strict = 0 # +1 per alignment if there's an exact match
    tp_lax = 0 # +1 per alignment if there's any overlap
    overlaps = []
    errors = []
    correct_nulls = 0
    correct_text2text = {}

    for idx_align, alignment in enumerate(vr_rslts_lst):
        # skip alignments null on both sides
        if alignment == ([],[]):
            continue
        else:
            src_sents = alignment[0]
            tgt_sents = alignment[1]
            # get set of chapters from src, then from tgt
            chapters_from_src = set()
            chapters_from_tgt = set()
            # if alignment is null on src side, then chapters_from_src remains empty set
            if src_sents != []:
                for src_id in src_sents:
                    if isinstance(el_sent2section_dict[str(src_id)], list):
                        for section_name in el_sent2section_dict[str(src_id)]:
                            chapters_from_src.add(section_name)
                    else:
                        chapters_from_src.add(el_sent2section_dict[str(src_id)])
            # if alignment is null on tgt side, then chapters_from_tgt remains empty set
            if tgt_sents != []:
                for tgt_id in tgt_sents:
                    if isinstance(fr_sent2section_dict[str(tgt_id)], list):
                        for section_name_ in fr_sent2section_dict[str(tgt_id)]:
                            chapters_from_tgt.add(section_name_)
                    else:
                        chapters_from_tgt.add(fr_sent2section_dict[str(tgt_id)])

            # compare the sets: if text, then last 5 characters in format "book[1-6]"
            src_chaps_compare = set()
            for src_chap in chapters_from_src:
                src_chaps_compare.add(src_chap[-5:])
            tgt_chaps_compare = set()
            for tgt_chap in chapters_from_tgt:
                tgt_chaps_compare.add(tgt_chap[-5:])       
            if src_chaps_compare == tgt_chaps_compare:
                tp_strict += 1
                # for correct text2text aligns, en is from prose or metric translation?
                correct_text2text[str(alignment)] = chapters_from_tgt

            # account for correct null : fr extraneous sections 
            elif chapters_from_src == set():
                tgt_counter = 0
                for chapter in chapters_from_tgt:
                    if chapter in fr_extra_section_names:
                        tgt_counter += 1
                # tp_strict if all tgt chapters are extraneous
                if tgt_counter == len(chapters_from_tgt):
                    # tp_strict += 1
                    correct_nulls += 1

            else:
                overlap = src_chaps_compare.intersection(tgt_chaps_compare)
                if len(overlap) != 0:
                    tp_lax += 1
                    overlaps.append(alignment)
                else:
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
        
    return tp_strict, tp_lax, overlaps, errors, correct_nulls, correct_text2text


In [7]:
test = {}
test[0] = [1,3]

In [8]:
lat_sent2book_name["789"][-5:]

'book3'

In [9]:
en1893_sent2section_name["423"][-5:] == lat_sent2book_name["0"][-5:]

True

In [10]:
tp_strict_, tp_lax_, overlaps_, errors_, correct_nulls_, correct_text2text_ = score_vec_rslts_chapter_level(
    vec_rslts, lat_sent2book_name, en1893_sent2section_name, extraneous_sections)

In [11]:
print(tp_strict_)
print(tp_lax_)
# print(overlaps_)
print(correct_nulls_)

1700
455
8146


In [12]:
len(vec_rslts)

10534

In [13]:
# was latin text aligned to any metric translation sentences?
correct_lat2entext = set()
for tgt_set in list(correct_text2text_.values()):
    for item in tgt_set:
        correct_lat2entext.add(item)

In [14]:
correct_lat2entext

{'metric_translationbook6',
 'prose_translationbook1',
 'prose_translationbook2',
 'prose_translationbook3',
 'prose_translationbook4',
 'prose_translationbook5',
 'prose_translationbook6'}

In [27]:
correct_metric2lat = []
for keys, values in correct_text2text_.items():
    for section_set in values:
        if section_set.startswith('metric_translation'):
            correct_metric2lat.append((keys, values))

In [28]:
correct_metric2lat

[('([2423], [12667, 12668, 12669, 12670, 12671, 12672, 12673])',
  {'metric_translationbook6'}),
 ('([2424], [12691, 12692, 12693, 12694, 12695, 12696, 12697])',
  {'metric_translationbook6'})]

In [30]:
overlaps_

[([0], [421, 422, 423, 424, 425, 426]),
 ([1], [465, 466, 467, 468, 469, 470, 471]),
 ([6], [477, 478, 479, 480, 481, 482, 483]),
 ([9], [512, 513, 514, 515, 516, 517]),
 ([10], [541, 542, 543, 544, 545, 546]),
 ([15], [583, 584, 585, 586, 587, 588, 589]),
 ([16], [592, 593, 594, 595, 596]),
 ([17], [621, 622]),
 ([23], [631, 632, 633, 634, 635]),
 ([25], [644, 645, 646, 647, 648, 649, 650]),
 ([31], [658, 659, 660, 661, 662, 663, 664]),
 ([34], [673, 674, 675, 676, 677, 678, 679]),
 ([36], [681, 682, 683, 684, 685, 686, 687]),
 ([41], [731, 732, 733, 734, 735, 736]),
 ([48], [746, 747, 748, 749, 750]),
 ([51], [756, 757, 758]),
 ([56], [763, 764, 765, 766, 767, 768, 769]),
 ([58], [777, 778, 779, 780]),
 ([64], [786, 787, 788, 789, 790, 791, 792]),
 ([67], [803, 804]),
 ([76], [831, 832, 833, 834, 835, 836, 837]),
 ([79], [842, 843, 844, 845, 846, 847, 848]),
 ([81], [855, 856, 857, 858, 859, 860, 861]),
 ([89], [875, 876, 877, 878]),
 ([90], [883, 884, 885]),
 ([101], [918, 919]),
 (

In [17]:
errors_

[{'alignment': ([13], [555]),
  'alignmnent_idx': 526,
  'src_chapters': {'book1'},
  'tgt_chapters': {'prose_translationbook1note'}},
 {'alignment': ([14], [569, 570, 571, 572, 573, 574, 575]),
  'alignmnent_idx': 540,
  'src_chapters': {'book1'},
  'tgt_chapters': {'prose_translationbook1note'}},
 {'alignment': ([24], [642, 643]),
  'alignmnent_idx': 589,
  'src_chapters': {'book1'},
  'tgt_chapters': {'prose_translationbook1note'}},
 {'alignment': ([32], [667, 668, 669, 670, 671]),
  'alignmnent_idx': 599,
  'src_chapters': {'book1'},
  'tgt_chapters': {'prose_translationbook1note'}},
 {'alignment': ([33], [672]),
  'alignmnent_idx': 600,
  'src_chapters': {'book1'},
  'tgt_chapters': {'prose_translationbook1note'}},
 {'alignment': ([40], [721, 722, 723, 724, 725, 726, 727]),
  'alignmnent_idx': 635,
  'src_chapters': {'book1'},
  'tgt_chapters': {'prose_translationbook1note'}},
 {'alignment': ([49], [751, 752, 753, 754]),
  'alignmnent_idx': 647,
  'src_chapters': {'book1'},
  'tgt

# Analyze vecalign results: by English sentences

In [18]:
# get dict of tgt sentences' alignments to src sents
tgt_sent_2_src_sent_aligns = build_tgt_2_src_dict(vec_rslts)

In [19]:
# test
keys = list(tgt_sent_2_src_sent_aligns.keys())
keys = sorted(keys)
keys == [x for x in range(0, 13966)]

True

In [20]:
extraneous_sections

array(['forewordbook0', 'forewordbook0note', 'forewordbook0title',
       'index', 'metric_translationbook0title', 'metric_translationbook1',
       'metric_translationbook1note', 'metric_translationbook1title',
       'metric_translationbook2', 'metric_translationbook2note',
       'metric_translationbook2title', 'metric_translationbook3',
       'metric_translationbook3note', 'metric_translationbook3title',
       'metric_translationbook4', 'metric_translationbook4note',
       'metric_translationbook4title', 'metric_translationbook5',
       'metric_translationbook5note', 'metric_translationbook5title',
       'metric_translationbook6', 'metric_translationbook6note',
       'metric_translationbook6title', 'prose_translationbook0title',
       'prose_translationbook1', 'prose_translationbook1commentary',
       'prose_translationbook1note', 'prose_translationbook1title',
       'prose_translationbook2', 'prose_translationbook2commentary',
       'prose_translationbook2note', 'prose_t

In [21]:
"forewordbook0".endswith("0")

True

In [42]:
def score_fr_sents(fr2el_sent_aligns_dict, fr_sent2section_name_dict,
                   el_sent2section_name_dict, fr_extraneous_chapter_names):
    extraneous2null_tpstrict = 0
    extraneous2null_tplax = 0 # at least one overlap
    extraneous2text = 0 # no overlap

    text2text_tpstrict = 0
    text2text_tplax = 0
    text2text_incorrect = 0
    text2text_incorrect_lst = []

    text2null_incorrect = 0
    text2null_lst = []

    for fr_sent_idx in fr2el_sent_aligns_dict.keys():
        # get grk sentences aligned to it (returns a set)
        el_aligned_sents = fr2el_sent_aligns_dict[fr_sent_idx]
        print(f"el aligned sents is {el_aligned_sents}")
        
        # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
        if str(fr_sent_idx) in fr_sent2section_name_dict.keys():
            # get fr sent chapter (keys are str)
            fr_sent_chapter = fr_sent2section_name_dict[str(fr_sent_idx)]
            # print(f"fr chapter is {fr_sent_chapter}")
            # convert to list in case of multiple chapters per sent
            if isinstance(fr_sent_chapter, str):
                fr_sent_chapter = [fr_sent_chapter]

            for tgt_chapter in fr_sent_chapter:
                if tgt_chapter in fr_extraneous_chapter_names:
                    # get num of fr to null alignments
                    extraneous2null_counter = 0
                    for item in el_aligned_sents:
                        if item == "null":
                            extraneous2null_counter += 1
                    # compare to number of el sents in alignmnent
                    if extraneous2null_counter == len(el_aligned_sents):
                        # then all grk aligned sents are null
                        extraneous2null_tpstrict += 1
                    elif extraneous2null_counter > 0:
                        # then at least one grk sent is null (also captures tpstrict)
                        extraneous2null_tplax += 1
                    else:
                        # no greek sents are null
                        extraneous2text += 1
                
                #### TODO: ADAPT TO ACCOUNT FOR DIFFERENT FORMATTING ("BOOK 2" for LATIN) ####
                else: # compare fr and grk chapters
                    el_aligned_chapters = set()
                    el_text2text_correct_counter = 0
                    el_text2text_incorrect_counter = 0

                    for item in el_aligned_sents:
                        if item == "null":
                            text2null_incorrect += 1
                            text2null_lst.append(fr_sent_idx)
                        else:
                            # get chapters of el sent (keys are str)
                            if isinstance(el_sent2section_name_dict[str(item)], list):
                                for section_name in el_sent2section_name_dict[str(item)]:
                                    el_aligned_chapters.add(section_name)
                            else:
                                el_aligned_chapters.add(el_sent2section_name_dict[str(item)])

                    # print(f"el chapters are {el_aligned_chapters}")
                    #### TODO: IS THIS WRONG? ############
                    tgt_chaps_compare = tgt_chapter[-5:]
                    for item in el_aligned_chapters:
                        if tgt_chaps_compare == item:
                            el_text2text_correct_counter += 1
                        else:
                            el_text2text_incorrect_counter += 1

                    if el_text2text_correct_counter == len(el_aligned_sents):
                        text2text_tpstrict += 1
                    elif el_text2text_correct_counter > 0:
                        text2text_tplax += 1
                    else:
                        text2text_incorrect += 1
                        text2text_incorrect_lst.append(fr_sent_idx)

    # remove text2null from text2text_incorrect_lst
    text2null_lst = set(text2null_lst)
    text2text_incorrect_lst = set(text2text_incorrect_lst)
    text2text_incorrect_lst -= text2null_lst
    # update num of text2text_incorrect
    text2text_incorrect -= text2null_incorrect
    
    results = [extraneous2null_tpstrict, extraneous2null_tplax, extraneous2text,
               text2text_tpstrict, text2text_tplax, 
               text2text_incorrect, text2text_incorrect_lst,
               text2null_incorrect, text2null_lst]
    
    return results

In [40]:
rslts_en1893_sents = score_fr_sents(tgt_sent_2_src_sent_aligns, en1893_sent2section_name,
                   lat_sent2book_name, extraneous_sections)

el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sents is {'null'}
el aligned sen

In [41]:
rslts_en1893_sents

[8654, 0, 6255, 0, 0, 0, set(), 0, set()]

In [25]:
8654+6255

14909