In [1]:
import re
import json
import stanza
import argparse

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval
from collections import defaultdict

from functions_score_section import read_alignments, score_vec_rslts_chapter_level, \
build__src_2_tgt_dict, build_tgt_2_src_dict, score_fr_sents

In [2]:
# get vecalign results
vec_rslts_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_vecrslts"
vec_rslts = read_alignments(vec_rslts_path)

In [3]:
lat_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lat_sent2book_dict.json"

with open(lat_dict_path) as f:
    lat_sent2book_name = json.load(f)

en1893_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_NEW.json"

with open(en1893_dict_path) as f:
    en1893_sent2section_name = json.load(f)
    

In [4]:
# get peritext sections in en1893
section_names = []
for item in list(en1893_sent2section_name.values()):
    if isinstance(item, str):
        section_names.append(item)
    else:
        section_names.extend(item)

In [5]:
section_names = np.unique(np.array(section_names))

In [6]:
section_names

array(['forewordbook0', 'forewordbook0note', 'forewordbook0title',
       'index', 'metric_translationbook0title', 'metric_translationbook1',
       'metric_translationbook1note', 'metric_translationbook1title',
       'metric_translationbook2', 'metric_translationbook2note',
       'metric_translationbook2title', 'metric_translationbook3',
       'metric_translationbook3note', 'metric_translationbook3title',
       'metric_translationbook4', 'metric_translationbook4note',
       'metric_translationbook4title', 'metric_translationbook5',
       'metric_translationbook5note', 'metric_translationbook5title',
       'metric_translationbook6', 'metric_translationbook6note',
       'metric_translationbook6title', 'prose_translationbook0title',
       'prose_translationbook1', 'prose_translationbook1commentary',
       'prose_translationbook1note', 'prose_translationbook1title',
       'prose_translationbook2', 'prose_translationbook2commentary',
       'prose_translationbook2note', 'prose_t

In [7]:
"test1"[-2].isdigit()

False

In [8]:
extraneous_sections = []
for item in section_names:
    if item[-1].isdigit():
        continue
    else:
        extraneous_sections.append(item)
extraneous_sections

['forewordbook0note',
 'forewordbook0title',
 'index',
 'metric_translationbook0title',
 'metric_translationbook1note',
 'metric_translationbook1title',
 'metric_translationbook2note',
 'metric_translationbook2title',
 'metric_translationbook3note',
 'metric_translationbook3title',
 'metric_translationbook4note',
 'metric_translationbook4title',
 'metric_translationbook5note',
 'metric_translationbook5title',
 'metric_translationbook6note',
 'metric_translationbook6title',
 'prose_translationbook0title',
 'prose_translationbook1commentary',
 'prose_translationbook1note',
 'prose_translationbook1title',
 'prose_translationbook2commentary',
 'prose_translationbook2note',
 'prose_translationbook2title',
 'prose_translationbook3commentary',
 'prose_translationbook3note',
 'prose_translationbook3title',
 'prose_translationbook4commentary',
 'prose_translationbook4note',
 'prose_translationbook4title',
 'prose_translationbook5commentary',
 'prose_translationbook5note',
 'prose_translationboo

# Score vecalign results: by prediction

In [9]:
def score_vec_rslts_chapter_level(vr_rslts_lst, el_sent2section_dict,
                                 fr_sent2section_dict, fr_extra_section_names):

    tp_strict = 0 # +1 per alignment if there's an exact match
    tp_lax = 0 # +1 per alignment if there's any overlap
    overlaps = []
    errors = []
    correct_nulls = 0
    correct_text2text = {}

    for idx_align, alignment in enumerate(vr_rslts_lst):
    # for idx_align, alignment in enumerate(vr_rslts_lst[518:523]):
        # skip alignments null on both sides
        if alignment == ([],[]):
            continue
        else:
            src_sents = alignment[0]
            tgt_sents = alignment[1]
            # get set of chapters from src, then from tgt
            chapters_from_src = set()
            chapters_from_tgt = set()
            # if alignment is null on src side, then chapters_from_src remains empty set
            if src_sents != []:
                for src_id in src_sents:
                    if isinstance(el_sent2section_dict[str(src_id)], list):
                        for section_name in el_sent2section_dict[str(src_id)]:
                            chapters_from_src.add(section_name)
                    else:
                        chapters_from_src.add(el_sent2section_dict[str(src_id)])
            # if alignment is null on tgt side, then chapters_from_tgt remains empty set
            if tgt_sents != []:
                for tgt_id in tgt_sents:
                    if isinstance(fr_sent2section_dict[str(tgt_id)], list):
                        for section_name_ in fr_sent2section_dict[str(tgt_id)]:
                            chapters_from_tgt.add(section_name_)
                    else:
                        chapters_from_tgt.add(fr_sent2section_dict[str(tgt_id)])

            # compare the sets: if text, then last 5 characters in format "book[1-6]"
            src_chaps_compare = set()
            for src_chap in chapters_from_src:
                src_chaps_compare.add(src_chap[-5:])
            tgt_chaps_compare = set()
            for tgt_chap in chapters_from_tgt:
                tgt_chaps_compare.add(tgt_chap[-5:])
            print(f"{src_chaps_compare} : {tgt_chaps_compare}")
            
            if src_chaps_compare == tgt_chaps_compare:
                tp_strict += 1
                # for correct text2text aligns, en is from prose or metric translation?
                correct_text2text[str(alignment)] = chapters_from_tgt
            
            # account for correct null : fr extraneous sections 
            elif chapters_from_src == set():
                print(f"have null on src")
                tgt_counter = 0
                for chapter in chapters_from_tgt:
                    if chapter in fr_extra_section_names:
                        tgt_counter += 1
                # tp_strict if all tgt chapters are extraneous
                if tgt_counter == len(chapters_from_tgt):
                    # tp_strict += 1
                    correct_nulls += 1
                elif tgt_counter == 0:
                    print("HAVE ERROR")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
                else: # null to multiple tgt including peritext and text
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)

            else:
                overlap = src_chaps_compare.intersection(tgt_chaps_compare)
                print(f"len overlap {len(overlap)}")
                if len(overlap) != 0:
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)
                else:
                    print("HAVE ERROR%%%%")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
        
    return tp_strict, tp_lax, overlaps, errors, correct_nulls, correct_text2text


In [10]:
vec_rslts[10423]

([2427], [13849, 13850, 13851, 13852, 13853, 13854, 13855])

In [11]:
test = {}
test[0] = [1,3]

In [12]:
len(lat_sent2book_name)

2428

In [13]:
len(en1893_sent2section_name)

14648

In [14]:
len(vec_rslts)

10534

In [19]:
tp_strict_, tp_lax_, overlaps_, errors_, correct_nulls_, correct_text2text_ = score_vec_rslts_chapter_level(
    vec_rslts, lat_sent2book_name, en1893_sent2section_name, extraneous_sections)


set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'book0'}
have null on src
HAVE ERROR
set() : {'book0'}
have null on src
HAVE ERROR
set() : {'book0'}
have null on sr

In [54]:
print(prose_sent_counter)
print(metric_sent_counter)

3936
4518


In [20]:
print(tp_strict_)
print(tp_lax_)
# print(len(overlaps_))
print(len(errors_))
print(correct_nulls_)
# print(len(correct_text2text_))

839
194
7227
2274


In [21]:
(tp_strict_ + correct_nulls_) / len(vec_rslts)

0.29551927093221947

In [22]:
len(errors_) / len(vec_rslts)

0.6860641731535979

In [24]:
num_ensents_manychaps = 0
for values in en1893_sent2section_name.values():
    if isinstance(values, list):
        if len(values) > 1:
            num_ensents_manychaps += 1

In [26]:
num_ensents_manychaps

0

In [27]:
len(en1893_sent2section_name["2"])

18

In [28]:
# was latin text aligned to any metric translation sentences?
correct_lat2entext = set()
for tgt_set in list(correct_text2text_.values()):
    for item in tgt_set:
        correct_lat2entext.add(item)

In [29]:
correct_lat2entext

{'metric_translationbook6',
 'prose_translationbook1',
 'prose_translationbook2',
 'prose_translationbook3',
 'prose_translationbook4',
 'prose_translationbook5',
 'prose_translationbook6'}

In [30]:
correct_metric2lat = []
for keys, values in correct_text2text_.items():
    for section_set in values:
        if section_set.startswith('metric_translation'):
            correct_metric2lat.append((keys, values))

In [31]:
correct_metric2lat

[('([2425], [13300, 13301, 13302, 13303, 13304, 13305, 13306])',
  {'metric_translationbook6'})]

In [48]:
metric_sents = ['metric_translationbook1',
                'metric_translationbook2',
                'metric_translationbook3',
                'metric_translationbook4',
                'metric_translationbook5',
                'metric_translationbook6']

prose_sents = ['prose_translationbook1',
               'prose_translationbook2',
               'prose_translationbook3',
               'prose_translationbook4',
               'prose_translationbook5',
               'prose_translationbook6']

In [50]:
prose_sent_counter = 0
metric_sent_counter = 0
for vals in en1893_sent2section_name.values():
    if isinstance(vals, list):
        for val in vals:
            if val in metric_sents:
                metric_sent_counter += 1
            elif val in prose_sents:
                prose_sent_counter += 1
    else:
        if vals in metric_sents:
                metric_sent_counter += 1
        elif vals in prose_sents:
            prose_sent_counter += 1

In [53]:
print(prose_sent_counter)
print(metric_sent_counter)

3936
4518


# Analyze vecalign results: by English sentences

In [32]:
# get dict of tgt sentences' alignments to src sents
en1893_sent2lat_sent_aligns = build_tgt_2_src_dict(vec_rslts)

In [33]:
# test
keys = list(en1893_sent2lat_sent_aligns.keys())
keys = sorted(keys)
keys == [x for x in range(0, 13966)]

True

In [55]:
# def score_tgt_sents(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
#                    src_sent2section_name_dict, extraneous_chapter_names):
#     extraneous2null_tpstrict = 0
#     extraneous2null_tplax = 0 # at least one overlap
#     extraneous2text = 0 # no overlap

#     text2text_tpstrict = 0
#     text2text_tplax = 0
#     text2text_incorrect = 0
#     text2text_incorrect_lst = []

#     text2null_incorrect = 0
#     text2null_lst = []

#     for tgt_sent_idx in tgt2src_sent_aligns_dict.keys():
#         # get src sentences aligned to it (returns a set)
#         src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
#         # print(f"src aligned sents is {src_aligned_sents}")
        
#         # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
#         if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
#             # get tgt sent chapter (keys are str)
#             tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
#             # print(f"tgt chapter is {tgt_sent_chapter}")
#             # convert to list in case of multiple chapters per sent
#             if isinstance(tgt_sent_chapter, str):
#                 tgt_sent_chapter = [tgt_sent_chapter]
#                 # print(tgt_sent_chapter)
            
#             tgt_chapter_counter = 0
#             for tgt_chapter in tgt_sent_chapter:
#                 if tgt_chapter in extraneous_chapter_names:
#                     # print(tgt_sent_idx)
#                     # print(tgt_chapter)
#                     # get num of tgt to null alignments
#                     extraneous2null_counter = 0
#                     for item in src_aligned_sents:
#                         if item == "null":
#                             extraneous2null_counter += 1
#                     # compare to number of src sents in alignmnent
#                     if extraneous2null_counter == len(src_aligned_sents):
#                         # then all src aligned sents are null
#                         extraneous2null_tpstrict += 1
#                     elif extraneous2null_counter > 0:
#                         # then at least one src sent is null (also captures tpstrict)
#                         extraneous2null_tplax += 1
#                     else:
#                         # no src sents are null
#                         extraneous2text += 1
                
#                 #### TODO: ADAPT TO ACCOUNT FOR DIFFERENT FORMATTING ("BOOK 2" for LATIN) ####
#                 else: # compare src and tgt chapters
#                     src_aligned_chapters = set()
#                     src_text2text_correct_counter = 0
#                     src_text2text_incorrect_counter = 0

#                     for item in src_aligned_sents:
#                         if item == "null":
#                             text2null_incorrect += 1
#                             text2null_lst.append(tgt_sent_idx)
#                         else:
#                             # get chapters of src sent (keys are str)
#                             if isinstance(src_sent2section_name_dict[str(item)], list):
#                                 for section_name in src_sent2section_name_dict[str(item)]:
#                                     src_aligned_chapters.add(section_name)
#                             else:
#                                 src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

#                     # print(f"src chapters are {src_aligned_chapters}")
#                     #### TODO: IS THIS WRONG? ############
#                     tgt_chaps_compare = tgt_chapter[-5:]
#                     for item in src_aligned_chapters:
#                         if tgt_chaps_compare == item:
#                             src_text2text_correct_counter += 1
#                         else:
#                             src_text2text_incorrect_counter += 1

#                     if src_text2text_correct_counter == len(src_aligned_sents):
#                         text2text_tpstrict += 1
#                     elif src_text2text_correct_counter > 0:
#                         text2text_tplax += 1
#                     else:
#                         text2text_incorrect += 1
#                         text2text_incorrect_lst.append(tgt_sent_idx)

#                 tgt_chapter_counter += 1
#             if tgt_chapter_counter > 1:
#                 print(tgt_sent_idx)

#     # remove text2null from text2text_incorrect_lst
#     text2null_lst = set(text2null_lst)
#     text2text_incorrect_lst = set(text2text_incorrect_lst)
#     text2text_incorrect_lst -= text2null_lst
#     # update num of text2text_incorrect
#     text2text_incorrect -= text2null_incorrect
    
#     results = [extraneous2null_tpstrict, extraneous2null_tplax, extraneous2text,
#                text2text_tpstrict, text2text_tplax, 
#                text2text_incorrect, text2text_incorrect_lst,
#                text2null_incorrect, text2null_lst]
    
#     return results

In [60]:
def score_tgt_sents(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
                   src_sent2section_name_dict, extraneous_chapter_names):
    extraneous2null_tpstrict = 0
    extraneous2null_tplax = 0 # at least one overlap
    extraneous2text = 0 # no overlap

    text2text_tpstrict = 0
    text2text_tplax = 0
    text2text_incorrect = 0
    text2text_incorrect_lst = []

    text2null_incorrect = 0
    text2null_lst = []

    # for tgt_sent_idx in tgt2src_sent_aligns_dict.keys():
    for tgt_sent_idx in range(423, 433):
        sent_extraneous2null_tpstrict = 0
        sent_extraneous2null_tplax = 0
        sent_extraneous2text = 0
        sent_text2text_tpstrict = 0
        sent_text2text_tplax = 0
        sent_text2text_incorrect = 0
        sent_text2null_incorrect = 0
        
        # get src sentences aligned to it (returns a set)
        src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
        # print(f"src aligned sents is {src_aligned_sents}")
        
        # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
        if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
            # get tgt sent chapter (keys are str)
            tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
            # print(f"tgt chapter is {tgt_sent_chapter}")
            # convert to list in case of multiple chapters per sent
            if isinstance(tgt_sent_chapter, str):
                tgt_sent_chapter = [tgt_sent_chapter]
                print(tgt_sent_chapter)
            
            tgt_chapter_counter = 0
            for tgt_chapter in tgt_sent_chapter:
                if tgt_chapter in extraneous_chapter_names:
                    # print(tgt_sent_idx)
                    # print(tgt_chapter)
                    # get num of tgt to null alignments
                    extraneous2null_counter = 0
                    for item in src_aligned_sents:
                        if item == "null":
                            extraneous2null_counter += 1
                    # compare to number of src sents in alignmnent
                    if extraneous2null_counter == len(src_aligned_sents):
                        # then all src aligned sents are null
                        sent_extraneous2null_tpstrict += 1
                    elif extraneous2null_counter == 0:
                        # no src sents are null
                        sent_extraneous2text += 1
                    else:
                        # then at least one src sent is null
                        sent_extraneous2null_tplax += 1
                        
                
                #### TODO: ADAPT TO ACCOUNT FOR DIFFERENT FORMATTING ("BOOK 2" for LATIN) ####
                else: # compare src and tgt chapters
                    src_aligned_chapters = set()
                    src_text2text_correct_counter = 0
                    src_text2text_incorrect_counter = 0

                    for item in src_aligned_sents:
                        if item == "null":
                            sent_text2null_incorrect += 1
                            text2null_lst.append(tgt_sent_idx)
                        else:
                            # get chapters of src sent (keys are str)
                            if isinstance(src_sent2section_name_dict[str(item)], list):
                                for section_name in src_sent2section_name_dict[str(item)]:
                                    src_aligned_chapters.add(section_name)
                            else:
                                src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

                    print(f"src chapters are {src_aligned_chapters}")
                    #### TODO: IS THIS WRONG? ############
                    tgt_chaps_compare = tgt_chapter[-5:]
                    for item in src_aligned_chapters:
                        if tgt_chaps_compare == item:
                            src_text2text_correct_counter += 1
                        else:
                            src_text2text_incorrect_counter += 1

                    if src_text2text_correct_counter == len(src_aligned_sents):
                        sent_text2text_tpstrict += 1
                    elif src_text2text_correct_counter > 0:
                        sent_text2text_tplax += 1
                    else:
                        sent_text2text_incorrect += 1
                        text2text_incorrect_lst.append(tgt_sent_idx)

                # tgt_chapter_counter += 1
                    
            if sent_extraneous2null_tpstrict == len(tgt_sent_chapter):
                extraneous2null_tpstrict += 1
            if ((sent_extraneous2null_tplax > 0) & (sent_extraneous2null_tplax < len(tgt_sent_chapter))):
                extraneous2null_tplax += 1
            if sent_extraneous2text > 0:
                extraneous2text += 1
            if sent_text2text_tpstrict > 0:
                text2text_tpstrict += 1
            if sent_text2text_tplax > 0:
                text2text_tplax += 1
            if sent_text2text_incorrect > 0:
                text2text_incorrect += 1
            if sent_text2null_incorrect > 0:
                text2null_incorrect += 1

    # remove text2null from text2text_incorrect_lst
    text2null_lst = set(text2null_lst)
    text2text_incorrect_lst = set(text2text_incorrect_lst)
    text2text_incorrect_lst -= text2null_lst
    # update num of text2text_incorrect
    text2text_incorrect -= text2null_incorrect
    
    results = [extraneous2null_tpstrict, #0
               extraneous2null_tplax, #1
               extraneous2text, #2
               text2text_tpstrict, #3 
               text2text_tplax, #4
               text2text_incorrect, #5 
               text2text_incorrect_lst, #6
               text2null_incorrect, #7
               text2null_lst]
    
    return results

In [61]:
rslts_en1893_sents = score_tgt_sents(en1893_sent2lat_sent_aligns, en1893_sent2section_name,
                   lat_sent2book_name, extraneous_sections)

['forewordbook0']
src chapters are {'book1'}
['forewordbook0']
src chapters are {'book1'}
['forewordbook0']
src chapters are {'book1'}
['forewordbook0']
src chapters are {'book1'}
['forewordbook0']
src chapters are set()
['forewordbook0']
src chapters are set()
['forewordbook0']
src chapters are set()
['forewordbook0']
src chapters are set()
['forewordbook0']
src chapters are set()
['forewordbook0']
src chapters are set()


In [62]:
print(rslts_en1893_sents[0])
print(rslts_en1893_sents[1])
print(rslts_en1893_sents[2])
print(rslts_en1893_sents[3])
print(rslts_en1893_sents[4])
print(rslts_en1893_sents[5])
print(rslts_en1893_sents[7])

0
0
0
0
0
4
6


In [45]:
sum(rslts_en1893_sents[0:5])+rslts_en1893_sents[7]

13314