In [1]:
import re
import json
import stanza
import argparse

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval
from collections import defaultdict

from functions_score_section import read_alignments, score_vec_rslts_chapter_level, \
build__src_2_tgt_dict, build_tgt_2_src_dict, score_fr_sents

In [138]:
# get vecalign results
vec_rslts_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_vecrsltsNEW"
vec_rslts = read_alignments(vec_rslts_path)

In [3]:
lat_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lat_sent2book_dict.json"

with open(lat_dict_path) as f:
    lat_sent2book_name = json.load(f)

en1893_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_NEW.json"

with open(en1893_dict_path) as f:
    en1893_sent2section_name = json.load(f)
    

In [4]:
en1893_dict_path_jul25 = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_jul25_NEW.json"

with open(en1893_dict_path_jul25) as f:
    en1893_sent2section_name_jul25 = json.load(f)
    

In [5]:
en1893_sent2section_name_jul25 == en1893_sent2section_name

False

In [6]:
# get paratext sections in en1893
section_names = []
for item in list(en1893_sent2section_name_jul25.values()):
    if isinstance(item, str):
        section_names.append(item)
    else:
        section_names.extend(item)

In [7]:
section_names = np.unique(np.array(section_names))

In [8]:
section_names

array(['forewordbook0', 'forewordbook0note', 'forewordbook0title',
       'index', 'metric_translationbook0title', 'metric_translationbook1',
       'metric_translationbook1note', 'metric_translationbook1title',
       'metric_translationbook2', 'metric_translationbook2note',
       'metric_translationbook2title', 'metric_translationbook3',
       'metric_translationbook3note', 'metric_translationbook3title',
       'metric_translationbook4', 'metric_translationbook4note',
       'metric_translationbook4title', 'metric_translationbook5',
       'metric_translationbook5note', 'metric_translationbook5title',
       'metric_translationbook6', 'metric_translationbook6note',
       'prose_translationbook0title', 'prose_translationbook1',
       'prose_translationbook1commentary', 'prose_translationbook1note',
       'prose_translationbook1title', 'prose_translationbook2',
       'prose_translationbook2commentary', 'prose_translationbook2note',
       'prose_translationbook2title', 'prose_tr

In [9]:
extraneous_sections = []
for item in section_names:
    if item == "forewordbook0":
        extraneous_sections.append(item)
    elif item[-1].isdigit():
        continue
    else:
        extraneous_sections.append(item)
extraneous_sections

['forewordbook0',
 'forewordbook0note',
 'forewordbook0title',
 'index',
 'metric_translationbook0title',
 'metric_translationbook1note',
 'metric_translationbook1title',
 'metric_translationbook2note',
 'metric_translationbook2title',
 'metric_translationbook3note',
 'metric_translationbook3title',
 'metric_translationbook4note',
 'metric_translationbook4title',
 'metric_translationbook5note',
 'metric_translationbook5title',
 'metric_translationbook6note',
 'prose_translationbook0title',
 'prose_translationbook1commentary',
 'prose_translationbook1note',
 'prose_translationbook1title',
 'prose_translationbook2commentary',
 'prose_translationbook2note',
 'prose_translationbook2title',
 'prose_translationbook3commentary',
 'prose_translationbook3note',
 'prose_translationbook3title',
 'prose_translationbook4commentary',
 'prose_translationbook4note',
 'prose_translationbook4title',
 'prose_translationbook5commentary',
 'prose_translationbook5note',
 'prose_translationbook5title',
 'pro

# Score vecalign results: by prediction

In [10]:
def score_vec_rslts_chapter_level(vr_rslts_lst, el_sent2section_dict,
                                 fr_sent2section_dict, fr_extra_section_names):

    tp_strict = 0 # +1 per alignment if there's an exact match
    tp_lax = 0 # +1 per alignment if there's any overlap
    overlaps = []
    errors = []
    correct_nulls = 0
    correct_text2text = {}

    for idx_align, alignment in enumerate(vr_rslts_lst):
    # for idx_align, alignment in enumerate(vr_rslts_lst[518:523]):
        # skip alignments null on both sides
        if alignment == ([],[]):
            continue
        else:
            src_sents = alignment[0]
            tgt_sents = alignment[1]
            # get set of chapters from src, then from tgt
            chapters_from_src = set()
            chapters_from_tgt = set()
            # if alignment is null on src side, then chapters_from_src remains empty set
            if src_sents != []:
                for src_id in src_sents:
                    if isinstance(el_sent2section_dict[str(src_id)], list):
                        for section_name in el_sent2section_dict[str(src_id)]:
                            chapters_from_src.add(section_name)
                    else:
                        chapters_from_src.add(el_sent2section_dict[str(src_id)])
            # if alignment is null on tgt side, then chapters_from_tgt remains empty set
            if tgt_sents != []:
                for tgt_id in tgt_sents:
                    if isinstance(fr_sent2section_dict[str(tgt_id)], list):
                        for section_name_ in fr_sent2section_dict[str(tgt_id)]:
                            chapters_from_tgt.add(section_name_)
                    else:
                        chapters_from_tgt.add(fr_sent2section_dict[str(tgt_id)])

            # compare the sets: if text, then last 5 characters in format "book[1-6]"
            src_chaps_compare = set()
            for src_chap in chapters_from_src:
                src_chaps_compare.add(src_chap[-5:])
            tgt_chaps_compare = set()
            for tgt_chap in chapters_from_tgt:
                tgt_chaps_compare.add(tgt_chap[-5:])
            print(f"{src_chaps_compare} : {tgt_chaps_compare}")
            
            if src_chaps_compare == tgt_chaps_compare:
                tp_strict += 1
                # for correct text2text aligns, en is from prose or metric translation?
                correct_text2text[str(alignment)] = chapters_from_tgt
            
            # account for correct null : fr extraneous sections 
            elif chapters_from_src == set():
                print(f"have null on src")
                tgt_counter = 0
                for chapter in chapters_from_tgt:
                    if chapter in fr_extra_section_names:
                        tgt_counter += 1
                # tp_strict if all tgt chapters are extraneous
                if tgt_counter == len(chapters_from_tgt):
                    # tp_strict += 1
                    correct_nulls += 1
                elif tgt_counter == 0:
                    print("HAVE ERROR")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
                else: # null to multiple tgt including peritext and text
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)

            else:
                overlap = src_chaps_compare.intersection(tgt_chaps_compare)
                print(f"len overlap {len(overlap)}")
                if len(overlap) != 0:
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)
                else:
                    print("HAVE ERROR%%%%")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
        
    return tp_strict, tp_lax, overlaps, errors, correct_nulls, correct_text2text


In [11]:
vec_rslts[10423]

([], [13980])

In [12]:
test = {}
test[0] = [1,3]

In [13]:
len(lat_sent2book_name)

2428

In [14]:
len(en1893_sent2section_name_jul25)

14648

In [15]:
len(vec_rslts)

11073

In [16]:
tp_strict_, tp_lax_, overlaps_, errors_, correct_nulls_, correct_text2text_ = score_vec_rslts_chapter_level(
    vec_rslts, lat_sent2book_name, en1893_sent2section_name_jul25, extraneous_sections)


set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
ha

In [17]:
tp_strict_ + tp_lax_ + len(errors_) + correct_nulls_

11073

In [18]:
# with en1893_sent2section_name - results:
# 1707
# 451
# 4867
# 4048

In [19]:
print(tp_strict_)
print(tp_lax_)
# print(len(overlaps_))
print(len(errors_))
print(correct_nulls_)
# print(len(correct_text2text_))

1707
451
4934
3981


In [20]:
# old results: 0.5197326831030434
(tp_strict_ + correct_nulls_) / len(vec_rslts)

0.5136819290165267

In [21]:
# old results: 0.1541587645624492
tp_strict_ / len(vec_rslts)

0.1541587645624492

In [22]:
# old: 0.04072970288088142
tp_lax_ / len(vec_rslts)

0.04072970288088142

In [23]:
(tp_strict_ + tp_lax_) / len(vec_rslts)

0.19488846744333063

In [24]:
len(errors_) / len(vec_rslts)

0.4455883681025919

In [25]:
correct_nulls_ / len(vec_rslts)

0.3595231644540775

In [26]:
0.3595231644540775 + 0.4455883681025919 + 0.04072970288088142 + 0.1541587645624492

1.0

In [27]:
# check if any en sents overlap sections
num_ensents_manychaps = 0
for values in en1893_sent2section_name_jul25.values():
    if isinstance(values, list):
        if len(values) > 1:
            num_ensents_manychaps += 1
num_ensents_manychaps

0

In [28]:
correct_text2text_

{'([2], [532, 533])': {'prose_translationbook1'},
 '([3], [534])': {'prose_translationbook1'},
 '([4], [535])': {'prose_translationbook1'},
 '([5], [536])': {'prose_translationbook1'},
 '([7], [574])': {'prose_translationbook1'},
 '([8], [575])': {'prose_translationbook1'},
 '([11, 12], [614])': {'prose_translationbook1'},
 '([13], [615])': {'prose_translationbook1'},
 '([14], [616])': {'prose_translationbook1'},
 '([18], [693, 694, 695])': {'prose_translationbook1'},
 '([19], [696, 697])': {'prose_translationbook1'},
 '([20], [698])': {'prose_translationbook1'},
 '([21], [699])': {'prose_translationbook1'},
 '([22], [700])': {'prose_translationbook1'},
 '([26], [725])': {'prose_translationbook1'},
 '([27], [726])': {'prose_translationbook1'},
 '([28], [727, 728, 729])': {'prose_translationbook1'},
 '([29], [730])': {'prose_translationbook1'},
 '([30], [731])': {'prose_translationbook1'},
 '([35], [756, 757])': {'prose_translationbook1'},
 '([38], [779])': {'prose_translationbook1'},
 

In [29]:
# was latin text aligned to any metric translation sentences?
correct_lat2entext = set()
for tgt_set in list(correct_text2text_.values()):
    for item in tgt_set:
        correct_lat2entext.add(item)

In [30]:
correct_lat2entext

{'metric_translationbook6',
 'prose_translationbook1',
 'prose_translationbook2',
 'prose_translationbook3',
 'prose_translationbook4',
 'prose_translationbook5',
 'prose_translationbook6'}

In [31]:
correct_metric2lat = []
for keys, values in correct_text2text_.items():
    for section_set in values:
        if section_set.startswith('metric_translation'):
            correct_metric2lat.append((keys, values))

In [32]:
correct_metric2lat

[('([2423], [13348, 13349, 13350, 13351, 13352, 13353, 13354])',
  {'metric_translationbook6'}),
 ('([2424], [13375, 13376, 13377, 13378, 13379, 13380, 13381])',
  {'metric_translationbook6'})]

# Analyze vecalign results: by English sentences

In [33]:
# get dict of tgt sentences' alignments to src sents
en1893_sent2lat_sent_aligns = build_tgt_2_src_dict(vec_rslts)

In [34]:
# test
keys = list(en1893_sent2lat_sent_aligns.keys())
keys = sorted(keys)
keys == [x for x in range(0, 13966)]

False

In [35]:
# def score_tgt_sents(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
#                    src_sent2section_name_dict, extraneous_chapter_names):
#     extraneous2null_tpstrict = 0
#     extraneous2null_tplax = 0 # at least one overlap
#     extraneous2text = 0 # no overlap

#     text2text_tpstrict = 0
#     text2text_tplax = 0
#     text2text_incorrect = 0
#     text2text_incorrect_lst = []

#     text2null_incorrect = 0
#     text2null_lst = []

#     for tgt_sent_idx in tgt2src_sent_aligns_dict.keys():
#         # get src sentences aligned to it (returns a set)
#         src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
#         # print(f"src aligned sents is {src_aligned_sents}")
        
#         # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
#         if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
#             # get tgt sent chapter (keys are str)
#             tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
#             # print(f"tgt chapter is {tgt_sent_chapter}")
#             # convert to list in case of multiple chapters per sent
#             if isinstance(tgt_sent_chapter, str):
#                 tgt_sent_chapter = [tgt_sent_chapter]
#                 # print(tgt_sent_chapter)
            
#             tgt_chapter_counter = 0
#             for tgt_chapter in tgt_sent_chapter:
#                 if tgt_chapter in extraneous_chapter_names:
#                     # print(tgt_sent_idx)
#                     # print(tgt_chapter)
#                     # get num of tgt to null alignments
#                     extraneous2null_counter = 0
#                     for item in src_aligned_sents:
#                         if item == "null":
#                             extraneous2null_counter += 1
#                     # compare to number of src sents in alignmnent
#                     if extraneous2null_counter == len(src_aligned_sents):
#                         # then all src aligned sents are null
#                         extraneous2null_tpstrict += 1
#                     elif extraneous2null_counter > 0:
#                         # then at least one src sent is null (also captures tpstrict)
#                         extraneous2null_tplax += 1
#                     else:
#                         # no src sents are null
#                         extraneous2text += 1
                
#                 #### TODO: ADAPT TO ACCOUNT FOR DIFFERENT FORMATTING ("BOOK 2" for LATIN) ####
#                 else: # compare src and tgt chapters
#                     src_aligned_chapters = set()
#                     src_text2text_correct_counter = 0
#                     src_text2text_incorrect_counter = 0

#                     for item in src_aligned_sents:
#                         if item == "null":
#                             text2null_incorrect += 1
#                             text2null_lst.append(tgt_sent_idx)
#                         else:
#                             # get chapters of src sent (keys are str)
#                             if isinstance(src_sent2section_name_dict[str(item)], list):
#                                 for section_name in src_sent2section_name_dict[str(item)]:
#                                     src_aligned_chapters.add(section_name)
#                             else:
#                                 src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

#                     # print(f"src chapters are {src_aligned_chapters}")
#                     #### TODO: IS THIS WRONG? ############
#                     tgt_chaps_compare = tgt_chapter[-5:]
#                     for item in src_aligned_chapters:
#                         if tgt_chaps_compare == item:
#                             src_text2text_correct_counter += 1
#                         else:
#                             src_text2text_incorrect_counter += 1

#                     if src_text2text_correct_counter == len(src_aligned_sents):
#                         text2text_tpstrict += 1
#                     elif src_text2text_correct_counter > 0:
#                         text2text_tplax += 1
#                     else:
#                         text2text_incorrect += 1
#                         text2text_incorrect_lst.append(tgt_sent_idx)

#                 tgt_chapter_counter += 1
#             if tgt_chapter_counter > 1:
#                 print(tgt_sent_idx)

#     # remove text2null from text2text_incorrect_lst
#     text2null_lst = set(text2null_lst)
#     text2text_incorrect_lst = set(text2text_incorrect_lst)
#     text2text_incorrect_lst -= text2null_lst
#     # update num of text2text_incorrect
#     text2text_incorrect -= text2null_incorrect
    
#     results = [extraneous2null_tpstrict, extraneous2null_tplax, extraneous2text,
#                text2text_tpstrict, text2text_tplax, 
#                text2text_incorrect, text2text_incorrect_lst,
#                text2null_incorrect, text2null_lst]
    
#     return results

In [36]:
def score_tgt_sents(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
                   src_sent2section_name_dict, extraneous_chapter_names):
    extraneous2null_tpstrict = 0
    extraneous2null_tpstrict_lst = []
    extraneous2null_tplax = 0 # at least one overlap
    extraneous2text = 0 # no overlap
    extraneous2text_lst = []

    text2text_tpstrict = 0
    text2text_tpstrict_lst = []
    
    text2text_tplax = 0
    text2text_tplax_lst = []
    
    text2text_incorrect = 0
    text2text_incorrect_lst = []

    text2null_incorrect = 0
    text2null_lst = []
    
    for tgt_sent_idx in range(len(tgt_sent2section_name_dict)):        
        # get src sentences aligned to it (returns a set)
        src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
        # print(f"src aligned sents is {src_aligned_sents}")
        
        # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
        if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
            # get tgt sent chapter (keys are str)
            tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
            # print(f"tgt chapter is {tgt_sent_chapter}")
            
            if tgt_sent_chapter in extraneous_chapter_names:
            # tgt_chapter_counter = 0
            # for tgt_chapter in tgt_sent_chapter:
                # if tgt_chapter in extraneous_chapter_names:
                    # print(tgt_sent_idx)
                    # print(tgt_chapter)
                    # get num of tgt to null alignments
                extraneous2null_counter = 0
                for item in src_aligned_sents:
                    if item == "null":
                        extraneous2null_counter += 1
                        # print("have extra2null")
                # compare to number of src sents in alignmnent
                if extraneous2null_counter == len(src_aligned_sents):
                    # then all src aligned sents are null
                    extraneous2null_tpstrict += 1
                    extraneous2null_tpstrict_lst.append((src_aligned_sents,tgt_sent_chapter))
                elif extraneous2null_counter == 0:
                    # no src sents are null
                    extraneous2text += 1
                    extraneous2text_lst.append((src_aligned_sents,tgt_sent_chapter))
                    # print("have extra2text")
                else:
                    # then at least one src sent is null
                    extraneous2null_tplax += 1

            else: # compare src and tgt chapters
                src_aligned_chapters = set()
                src_text2text_correct_counter = 0
                src_text2text_incorrect_counter = 0

                for item in src_aligned_sents:
                    if item == "null":
                        text2null_incorrect += 1
                        text2null_lst.append(tgt_sent_idx)
                    else:
                        # get chapters of src sent (keys are str)
                        if isinstance(src_sent2section_name_dict[str(item)], list):
                            for section_name in src_sent2section_name_dict[str(item)]:
                                src_aligned_chapters.add(section_name)
                        else:
                            src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

                # print(f"src chapters are {src_aligned_chapters}")
                tgt_chaps_compare = tgt_sent_chapter[-5:]
                for item in src_aligned_chapters:
                    if tgt_chaps_compare == item:
                        src_text2text_correct_counter += 1
                    else:
                        src_text2text_incorrect_counter += 1

                if src_text2text_correct_counter == len(src_aligned_sents):
                    text2text_tpstrict += 1
                    text2text_tpstrict_lst.append((src_aligned_sents,tgt_sent_chapter))
                elif src_text2text_correct_counter > 0:
                    text2text_tplax += 1
                    text2text_tplax_lst.append((src_aligned_sents,tgt_sent_chapter)) 
                else:
                    text2text_incorrect += 1
                    text2text_incorrect_lst.append(tgt_sent_idx)

    # remove text2null from text2text_incorrect_lst
    text2null_lst = set(text2null_lst)
    text2text_incorrect_lst = set(text2text_incorrect_lst)
    text2text_incorrect_lst -= text2null_lst
    # update num of text2text_incorrect
    text2text_incorrect -= text2null_incorrect
    
    results = [extraneous2null_tpstrict, #0
               extraneous2null_tplax, #1
               extraneous2text, #2
               text2text_tpstrict, #3 
               text2text_tplax, #4
               text2text_incorrect, #5 
               text2text_incorrect_lst, #6
               text2null_incorrect, #7
               text2null_lst,
               extraneous2null_tpstrict_lst,
               extraneous2text_lst,
               text2text_tpstrict_lst,
               text2text_tplax_lst]
    
    return results

In [37]:
len(vec_rslts)

11073

In [38]:
len(en1893_sent2section_name)

14648

In [39]:
len(en1893_sent2lat_sent_aligns)

14648

In [40]:
en1893_sent2section_name["8412"]

'prose_translationbook6note'

In [41]:
en1893_sent2lat_sent_aligns[14647]

{'null'}

In [42]:
lat_sent2book_name["2395"]

'book6'

In [43]:
rslts_en1893_sents = score_tgt_sents(en1893_sent2lat_sent_aligns, en1893_sent2section_name_jul25,
                   lat_sent2book_name, extraneous_sections)

In [44]:
# old results:
# 4048
# 0
# 2146
# 3770
# 42
# 3
# 4639

In [45]:
print(rslts_en1893_sents[0])
print(rslts_en1893_sents[1])
print(rslts_en1893_sents[2])
print(rslts_en1893_sents[3])
print(rslts_en1893_sents[4])
print(rslts_en1893_sents[5])
print(rslts_en1893_sents[7])

3981
0
2146
3770
42
3
4706


In [46]:
sum(rslts_en1893_sents[0:6])+rslts_en1893_sents[7]

14648

In [47]:
print(rslts_en1893_sents[6])

{8760, 8763, 8764}


## Get number of paratext and text sentences

In [48]:
metric_sents = ['metric_translationbook1',
                'metric_translationbook2',
                'metric_translationbook3',
                'metric_translationbook4',
                'metric_translationbook5',
                'metric_translationbook6']

metric_paratext = ['metric_translationbook0title',
                   'metric_translationbook1note',
                   'metric_translationbook1title',
                   'metric_translationbook2note',
                   'metric_translationbook2title',
                   'metric_translationbook3note',
                   'metric_translationbook3title',
                   'metric_translationbook4note',
                   'metric_translationbook4title',
                   'metric_translationbook5note',
                   'metric_translationbook5title',
                   'metric_translationbook6note',
                   'metric_translationbook6title'
                  ]

prose_sents = ['prose_translationbook1',
               'prose_translationbook2',
               'prose_translationbook3',
               'prose_translationbook4',
               'prose_translationbook5',
               'prose_translationbook6']

prose_paratext = ['prose_translationbook0title',
                  'prose_translationbook1commentary',
                  'prose_translationbook1note',
                  'prose_translationbook1title',
                  'prose_translationbook2commentary',
                  'prose_translationbook2note',
                  'prose_translationbook2title',
                  'prose_translationbook3commentary',
                  'prose_translationbook3note',
                  'prose_translationbook3title',
                  'prose_translationbook4commentary',
                  'prose_translationbook4note',
                  'prose_translationbook4title',
                  'prose_translationbook5commentary',
                  'prose_translationbook5note',
                  'prose_translationbook5title',
                  'prose_translationbook6commentary',
                  'prose_translationbook6note',
                  'prose_translationbook6title'
                 ]

prose_sent_counter = 0
metric_sent_counter = 0
prose_paratext_counter = 0
metric_paratext_counter = 0
num_paratext_sents = 0
num_notes = 0
num_index = 0
num_foreword = 0
for vals in en1893_sent2section_name_jul25.values():    
    if vals in metric_sents:
        metric_sent_counter += 1
    elif vals in prose_sents:
        prose_sent_counter += 1
    elif vals in metric_paratext:
        metric_paratext_counter += 1
    elif vals in prose_paratext:
        prose_paratext_counter += 1

for vals in en1893_sent2section_name_jul25.values():        
    if vals in extraneous_sections:
        num_paratext_sents += 1

num_en1893_text_sents = prose_sent_counter + metric_sent_counter

num_footnote_sents = 0
for vals in en1893_sent2section_name_jul25.values():    
    if vals.endswith("note"):
        num_footnote_sents += 1

for vals in en1893_sent2section_name_jul25.values():      
    if vals == "index":
        num_index += 1
    elif vals.startswith("foreword"):
        num_foreword += 1


print(prose_sent_counter)
print(metric_sent_counter)
print(num_en1893_text_sents)
print("===")
print(prose_paratext_counter)
print(metric_paratext_counter)
print(num_index)
print(num_foreword)
print(num_paratext_sents)
print("===")
print((num_en1893_text_sents + num_paratext_sents) == len(en1893_sent2section_name_jul25))
print((num_en1893_text_sents + num_paratext_sents) == len(en1893_sent2lat_sent_aligns))
print(num_footnote_sents)

3902
4619
8521
===
4247
214
1205
461
6127
===
True
True
4376


In [49]:
# extraneous2null_tpstrict, #0
#                extraneous2null_tplax, #1
#                extraneous2text, #2
#                text2text_tpstrict, #3 
#                text2text_tplax, #4
#                text2text_incorrect, #5 
#                text2text_incorrect_lst, #6
#                text2null_incorrect, #7
               # text2null_lst

In [50]:
# extraneous2null_tpstrict to num paratext sents
rslts_en1893_sents[0]/num_paratext_sents

0.6497470213807737

In [51]:
# extraneous2null_tplax to num paratext sents
rslts_en1893_sents[1]/num_paratext_sents

0.0

In [52]:
# extraneous2text to num paratext sents
rslts_en1893_sents[2]/num_paratext_sents

0.3502529786192264

In [53]:
# extraneous2text to num text sents
rslts_en1893_sents[2]/num_en1893_text_sents

0.25184837460391973

In [54]:
# text2text_tpstrict to num text sents
rslts_en1893_sents[3]/num_en1893_text_sents

0.4424363337636428

In [55]:
# text2text_tplax to num text sents
rslts_en1893_sents[4]/num_en1893_text_sents

0.00492899894378594

In [56]:
# text2text_incorrect to num text sents
rslts_en1893_sents[5]/num_en1893_text_sents

0.0003520713531275672

In [57]:
# text2null_incorrect to num text sents
rslts_en1893_sents[7]/num_en1893_text_sents

0.5522825959394437

In [58]:
len(vec_rslts)

11073

# 2nd eval by English sents: without 2nd translation

In [59]:
# rslts_en1893_sents
               #  extraneous2null_tpstrict, #0
               # extraneous2null_tplax, #1
               # extraneous2text, #2
               # text2text_tpstrict, #3 
               # text2text_tplax, #4
               # text2text_incorrect, #5 
               # text2text_incorrect_lst, #6
               # text2null_incorrect, #7
               # text2null_lst, #8
               # extraneous2null_tpstrict_lst, #9
               # extraneous2text_lst, #10
               # text2text_tpstrict_lst, #11
               # text2text_tplax_lst #12

In [60]:
print(prose_sent_counter)
print(metric_sent_counter)
print(num_en1893_text_sents)
print(num_paratext_sents)

3902
4619
8521
6127


In [61]:
prose_sent_counter + metric_sent_counter

8521

## Get verse translation sentences in each metric

In [62]:
# text2text_incorrect_lst: tgt_sent_idx's
# These are all from book 1 of the metric translation aligned to Latin book 6
text2text_incorrect_counter = 0
for tgt_sent in rslts_en1893_sents[6]:
    tgt_text_section = en1893_sent2section_name_jul25[str(tgt_sent)]
    if tgt_text_section.startswith("metric_translation"):
        text2text_incorrect_counter += 1
        print(tgt_text_section)
        # get book of aligned Latin sents
        lat_sent = en1893_sent2lat_sent_aligns[tgt_sent]
        for sent in lat_sent:
            print(f"lat section is {lat_sent2book_name[str(sent)]}")
text2text_incorrect_counter

metric_translationbook1
lat section is book6
metric_translationbook1
lat section is book6
metric_translationbook1
lat section is book6


3

In [63]:
# text2null_lst: tgt_sent_idx's
text2null_metric = 0
text2null_prose = 0
for tgt_sent in rslts_en1893_sents[8]:
    tgt_text_section = en1893_sent2section_name_jul25[str(tgt_sent)]
    if tgt_text_section.startswith("metric_translation"):
        text2null_metric += 1
    elif tgt_text_section.startswith("prose_translation"):
        text2null_prose += 1
print(text2null_metric)
print(text2null_prose)

4602
104


In [64]:
rslts_en1893_sents[0]

3981

In [65]:
# extraneous2null_tpstrict_lst, #9 (src_sents, tgt_sent)
# these will be paratext sents within the metric translation, correctly aligned to null
extraneous2null_tpstrict_metric = 0
extraneous2null_tpstrict_prose = 0
extraneous2null_tpstrict_index = 0
extraneous2null_tpstrict_foreword = 0
for src_tgt_tuple in rslts_en1893_sents[9]:
    tgt_section = src_tgt_tuple[1]
    if tgt_section.startswith("metric_translation"):
        extraneous2null_tpstrict_metric += 1
    elif tgt_section.startswith("prose_translation"):
        extraneous2null_tpstrict_prose += 1
    elif tgt_section.startswith("foreword"):
        extraneous2null_tpstrict_foreword += 1
    else:
        extraneous2null_tpstrict_index += 1
print(extraneous2null_tpstrict_metric)
print(extraneous2null_tpstrict_prose)
print(extraneous2null_tpstrict_index)
print(extraneous2null_tpstrict_foreword)

212
2124
1184
461


In [66]:
# extraneous2text_lst, #10 (src_sents, tgt_sent)
extraneous2text_counter_metric = 0
extraneous2text_counter_prose = 0
extraneous2text_counter_index = 0
extraneous2text_counter_foreword = 0

for src_tgt_tuple in rslts_en1893_sents[10]:
    tgt_section = src_tgt_tuple[1]
    if tgt_section.startswith("metric_translation"):
        extraneous2text_counter_metric += 1
    elif tgt_section.startswith("prose_translation"):
        extraneous2text_counter_prose += 1
    elif tgt_section == "index":
        extraneous2text_counter_index += 1
    elif tgt_section.startswith("foreword"):
        extraneous2text_counter_foreword += 1
print(extraneous2text_counter_metric)
print(extraneous2text_counter_prose)
print(extraneous2text_counter_index)
print(extraneous2text_counter_foreword)

2
2123
21
0


In [67]:
# text2text_tpstrict_lst, #11
# These are metric translation text sentences aligned to Latin sents from the same Latin book
# They are all from book 6 of the metric translation
text2text_tpstrict_counter = 0
for src_tgt_tuple in rslts_en1893_sents[11]:
    tgt_section = src_tgt_tuple[1]
    if tgt_section.startswith("metric_translation"):
        text2text_tpstrict_counter += 1
        # get book of aligned Latin sents
        for idx in src_tgt_tuple[0]: # idx is in set
            lat_sent = lat_sent2book_name[str(idx)]
            print(f"lat section is {lat_sent}")
        print(src_tgt_tuple[1])
text2text_tpstrict_counter

lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6
lat section is book6
metric_translationbook6


14

In [68]:
# text2text_tplax_lst #12
text2text_tplax_counter = 0
for src_tgt_tuple in rslts_en1893_sents[12]:
    tgt_section = src_tgt_tuple[1]
    if tgt_section.startswith("metric_translation"):
        text2text_tplax_counter += 1
text2text_tplax_counter

0

## Redo eval after backing out sents related to 2nd translation

In [69]:
# rslts_en1893_sents
               #  extraneous2null_tpstrict, #0
               # extraneous2null_tplax, #1
               # extraneous2text, #2
               # text2text_tpstrict, #3 
               # text2text_tplax, #4
               # text2text_incorrect, #5 
               # text2text_incorrect_lst, #6
               # text2null_incorrect, #7
               # text2null_lst, #8
               # extraneous2null_tpstrict_lst, #9
               # extraneous2text_lst, #10
               # text2text_tpstrict_lst, #11
               # text2text_tplax_lst #12

In [70]:
# extraneous2null_tpstrict to num paratext sents
(rslts_en1893_sents[0] - extraneous2null_tpstrict_metric) / (num_paratext_sents - metric_paratext_counter)

0.6374090985963132

In [71]:
# extraneous2text to num paratext sents
(rslts_en1893_sents[2] - extraneous2text_counter_metric) / (num_paratext_sents - metric_paratext_counter)

0.36259090140368677

In [72]:
# text2text_tpstrict to num text sents
(rslts_en1893_sents[3] - text2text_tpstrict_counter) / prose_sent_counter

0.9625832906201948

In [73]:
# text2text_tplax to num text sents
(rslts_en1893_sents[4] - text2text_tplax_counter) / prose_sent_counter

0.010763710917478216

In [74]:
# text2text_incorrect to num text sents
(rslts_en1893_sents[5] - text2text_incorrect_counter) / prose_sent_counter

0.0

In [75]:
# text2null_incorrect to num text sents
(rslts_en1893_sents[7] - text2null_metric) / prose_sent_counter

0.02665299846232701

In [76]:
0.0350609756097561 + 0.9542682926829268 + 0.010670731707317074

0.9999999999999999

In [77]:
# vals to back out of numerator and denominator
print(prose_sent_counter)
print(metric_sent_counter)
print(num_en1893_text_sents)
print("===")
print(prose_paratext_counter)
print(metric_paratext_counter)
print(num_index)
print(num_foreword)
print(num_paratext_sents)
print("===")
print((num_en1893_text_sents + num_paratext_sents) == len(en1893_sent2section_name_jul25))
print((num_en1893_text_sents + num_paratext_sents) == len(en1893_sent2lat_sent_aligns))
print(num_footnote_sents)

3902
4619
8521
===
4247
214
1205
461
6127
===
True
True
4376


In [78]:
rslts_en1893_sents[2]

2146

In [79]:
en1893_sent2section_name_jul25["461"]

'prose_translationbook0title'

In [80]:
6194-metric_paratext_counter

5980

# Score Text only experiment

In [143]:
textonly_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_textonly.json"

with open(textonly_dict_path) as f:
    en1893_sent2section_name_textonly = json.load(f)


In [144]:
vecrslts_textonly_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_vecrslts_textonly"
vecrslts_textonly = read_alignments(vecrslts_textonly_path)

## Score by prediction

In [194]:
def score_textonly_book_level(vr_rslts_lst, src_sent2section_dict,
                                 tgt_sent2section_dict):

    tp_strict = 0 # +1 per alignment if there's an exact match
    tp_lax = 0 # +1 per alignment if there's any overlap
    overlaps = []
    errors = []
    correct_text2text = {}
    text2null = 0

    for idx_align, alignment in enumerate(vr_rslts_lst):
    # for idx_align, alignment in enumerate(vr_rslts_lst[518:523]):
        # skip alignments null on both sides
        if alignment == ([],[]):
            continue
        else:
            src_sents = alignment[0]
            tgt_sents = alignment[1]
            # get set of chapters from src, then from tgt
            chapters_from_src = set()
            chapters_from_tgt = set()
            # if alignment is null on src side, then chapters_from_src remains empty set
            if src_sents != []:
                for src_id in src_sents:
                    if isinstance(src_sent2section_dict[str(src_id)], list):
                        for section_name in src_sent2section_dict[str(src_id)]:
                            chapters_from_src.add(section_name)
                    else:
                        chapters_from_src.add(src_sent2section_dict[str(src_id)])
            # if alignment is null on tgt side, then chapters_from_tgt remains empty set
            if tgt_sents != []:
                for tgt_id in tgt_sents:
                    if isinstance(tgt_sent2section_dict[str(tgt_id)], list):
                        for section_name_ in tgt_sent2section_dict[str(tgt_id)]:
                            chapters_from_tgt.add(section_name_)
                    else:
                        chapters_from_tgt.add(tgt_sent2section_dict[str(tgt_id)])

            # compare the sets: if text, then last 5 characters in format "book[1-6]"
            src_chaps_compare = set()
            for src_chap in chapters_from_src:
                src_chaps_compare.add(src_chap[-5:])
            tgt_chaps_compare = set()
            for tgt_chap in chapters_from_tgt:
                tgt_chaps_compare.add(tgt_chap[-5:])
            # print(f"{src_chaps_compare} : {tgt_chaps_compare}")
            
            if src_chaps_compare == tgt_chaps_compare:
                tp_strict += 1
                # for correct text2text aligns, en is from prose or metric translation?
                correct_text2text[str(alignment)] = chapters_from_tgt
            
            elif chapters_from_src == set():
                text2null += 1
                print(alignment)

            else:
                overlap = src_chaps_compare.intersection(tgt_chaps_compare)
                # print(f"len overlap {len(overlap)}")
                if len(overlap) != 0:
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)
                else:
                    # print("HAVE ERROR%%%%")
                    # print(f"len overlap {len(overlap)}")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
        
    return [tp_strict, tp_lax, overlaps, errors, correct_text2text, text2null]


In [195]:
rslts_txtonly_byprediction = score_textonly_book_level(vecrslts_textonly,
                                                       lat_sent2book_name,
                                                       en1893_sent2section_name_textonly)

([], [1441])
([], [1442])
([], [1443])
([], [1444])
([], [1611])
([], [2160])
([], [2217])
([], [2253])
([], [2280])
([], [2808])
([], [3009])
([], [3646])
([], [3647])
([], [3648])
([], [3649])
([], [3650])
([], [3651])
([], [3652])
([], [3653])
([], [3654])
([], [3655])
([], [3656])
([], [3657])
([], [3658])
([], [3659])
([], [3660])
([], [3661])
([], [3662])
([], [3663])
([], [3664])
([], [3665])
([], [3666])
([], [3667])
([], [3668])
([], [3669])
([], [3670])
([], [3671])
([], [3672])
([], [3673])
([], [3674])
([], [3675])
([], [3676])
([], [3677])
([], [3678])
([], [3679])
([], [3680])
([], [3681])
([], [3682])
([], [3683])
([], [3684])
([], [3685])
([], [3686])
([], [3687])
([], [3688])
([], [3689])
([], [3690])
([], [3691])
([], [3692])
([], [3693])
([], [3694])
([], [3695])
([], [3696])
([], [3697])
([], [3698])
([], [3699])
([], [3700])
([], [3701])
([], [3702])
([], [3703])
([], [3704])
([], [3705])
([], [3706])
([], [3707])
([], [3708])
([], [3709])
([], [3710])
([], [3711])

In [196]:
# old results (before adding text2null):
# 2367
# 0
# 4509

In [197]:
print(rslts_txtonly_byprediction[0]) # tpstrict
print(rslts_txtonly_byprediction[1]) # tplax
print(len(rslts_txtonly_byprediction[3]))
print(rslts_txtonly_byprediction[5]) # text2null

2367
0
1
4508


In [198]:
2367+4509 == len(vecrslts_textonly)

True

In [203]:
rslts_txtonly_byprediction[0] / len(vecrslts_textonly)

0.34424083769633507

In [201]:
len(rslts_txtonly_byprediction[3]) / len(vecrslts_textonly)

0.00014543339150668993

In [202]:
rslts_txtonly_byprediction[5] / len(vecrslts_textonly)

0.6556137289121582

In [152]:
len(vecrslts_textonly)

6876

## Score by English sentences

In [153]:
# get dict of tgt sentences' alignments to src sents
en1893_sent2lat_sent_aligns_textonly = build_tgt_2_src_dict(vecrslts_textonly)

In [154]:
def score_tgt_sents_textonly(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
                   src_sent2section_name_dict):

    text2text_tpstrict = 0
    text2text_tpstrict_lst = []
    
    text2text_tplax = 0
    text2text_tplax_lst = []
    
    text2text_incorrect = 0
    text2text_incorrect_lst = []

    text2null = 0
    text2null_lst = []
    
    for tgt_sent_idx in range(len(tgt_sent2section_name_dict)):        
        # get src sentences aligned to it (returns a set)
        src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
        # print(f"src aligned sents is {src_aligned_sents}")
        
        # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
        if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
            # get tgt sent chapter (keys are str)
            tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
            # print(f"tgt chapter is {tgt_sent_chapter}")

            src_aligned_chapters = set()
            src_text2text_correct_counter = 0
            src_text2text_incorrect_counter = 0

            for item in src_aligned_sents:
                if item == "null":
                    text2null += 1
                    text2null_lst.append(tgt_sent_idx)
                else:
                    # get chapters of src sent (keys are str)
                    if isinstance(src_sent2section_name_dict[str(item)], list):
                        for section_name in src_sent2section_name_dict[str(item)]:
                            src_aligned_chapters.add(section_name)
                    else:
                        src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

            # print(f"src chapters are {src_aligned_chapters}")
            tgt_chaps_compare = tgt_sent_chapter[-5:]
            for item in src_aligned_chapters:
                if tgt_chaps_compare == item:
                    src_text2text_correct_counter += 1
                else:
                    src_text2text_incorrect_counter += 1

            if src_text2text_correct_counter == len(src_aligned_sents):
                text2text_tpstrict += 1
                text2text_tpstrict_lst.append((src_aligned_chapters,tgt_sent_chapter))
            elif src_text2text_correct_counter > 0:
                text2text_tplax += 1
                text2text_tplax_lst.append((src_aligned_chapters,tgt_sent_chapter)) 
            else:
                text2text_incorrect += 1
                text2text_incorrect_lst.append(tgt_sent_idx)

    # remove text2null from text2text_incorrect_lst
    text2null_lst = set(text2null_lst)
    text2text_incorrect_lst = set(text2text_incorrect_lst)
    text2text_incorrect_lst -= text2null_lst
    # update num of text2text_incorrect
    text2text_incorrect -= text2null
    
    results = [text2text_tpstrict, #0 
               text2text_tplax, #1
               text2text_incorrect, #2
               text2text_incorrect_lst, #3
               text2null, #4
               text2null_lst, #5
               text2text_tpstrict_lst, #6
               text2text_tplax_lst] #7
    
    return results

In [155]:
rslts_txtonly_by_ensentence = score_tgt_sents_textonly(en1893_sent2lat_sent_aligns_textonly,
                                                       en1893_sent2section_name_textonly,
                                                       lat_sent2book_name)

In [156]:
lat_sent2book_name[str(0)]

'book1'

In [167]:
print(rslts_txtonly_by_ensentence[0])
print(rslts_txtonly_by_ensentence[1])
print(rslts_txtonly_by_ensentence[2])
print(rslts_txtonly_by_ensentence[4])
print(len(rslts_txtonly_by_ensentence[5]))

3592
61
4
4508
4508


In [171]:
len(en1893_sent2section_name_textonly)

8165

In [158]:
3592+61+4+4508 == len(en1893_sent2section_name_textonly)

True

In [172]:
# tpstrict
rslts_txtonly_by_ensentence[0]/ len(en1893_sent2section_name_textonly)

0.4399265156154317

In [173]:
#tplax
rslts_txtonly_by_ensentence[1] / len(en1893_sent2section_name_textonly)

0.007470912431108389

In [174]:
#text2text incorrect
rslts_txtonly_by_ensentence[2] / len(en1893_sent2section_name_textonly)

0.0004898958971218616

In [175]:
# text2null - all incorrect
rslts_txtonly_by_ensentence[4] / len(en1893_sent2section_name_textonly)

0.5521126760563381

In [176]:
# total errors: text2textincorrect + text2null
(rslts_txtonly_by_ensentence[2] + rslts_txtonly_by_ensentence[4]) / len(en1893_sent2section_name_textonly)

0.5526025719534599

### get number of prose and metric sentences

In [179]:
num_prose_textonly = 0
num_metric_textonly = 0

for vals in en1893_sent2section_name_textonly.values():
    # b/c of one sentence that overlaps two metric books
    if isinstance(vals, list):
        num_metric_textonly += 1
    elif vals.startswith("metric_translation"):
        num_metric_textonly += 1
    elif vals.startswith("prose_translation"):
        num_prose_textonly += 1
print(num_prose_textonly)
print(num_metric_textonly)

3648
4517


In [180]:
3648+4517

8165

### get breakdown by translation per result

In [160]:
# text2text_tpstrict ({src_sents},{tgt_sents})
text2text_tpstrict_prose = 0
for alignment in rslts_txtonly_by_ensentence[6]:
    tgt_text_section = alignment[1]
    if tgt_text_section.startswith("prose_translation"):
        text2text_tpstrict_prose += 1
        # get book of aligned Latin sents
        lat_sent = alignment[0]
text2text_tpstrict_metric = rslts_txtonly_by_ensentence[0] - text2text_tpstrict_prose

print(text2text_tpstrict_prose)
print(text2text_tpstrict_metric)

3574
18


In [161]:
# text2text_tplax ({src_sents},{tgt_sents})
text2text_tplax_prose = 0
for alignment in rslts_txtonly_by_ensentence[7]:
    tgt_text_section = alignment[1]
    if tgt_text_section.startswith("prose_translation"):
        text2text_tplax_prose += 1
        # get book of aligned Latin sents
        lat_sent = alignment[0]
text2text_tplax_metric = rslts_txtonly_by_ensentence[1] - text2text_tplax_prose

print(text2text_tplax_prose)
print(text2text_tplax_metric)

61
0


TODO: why are there [null] : [sent 1, sent 2] alignments below?

In [162]:
en1893_sent2lat_sent_aligns_textonly[2253]

{'null'}

In [163]:
en1893_sent2section_name_textonly[str(2253)]

'prose_translationbook4'

In [170]:
# text2null_lst: tgt_sent_idx's
text2null_metric = 0
text2null_prose = 0
for tgt_sent in rslts_txtonly_by_ensentence[5]:
    tgt_text_section = en1893_sent2section_name_textonly[str(tgt_sent)]
    # print(tgt_text_section)
    if isinstance(tgt_text_section, list):
        # means tgt sentence overlaps two sections; don't doublecount. 
        # this is sent 6519, the first sent of metric book5, which contains "* * * * * * *" from end of metric book 4
        lst_counter_prose = 0
        lst_counter_metric = 0
        for item in tgt_text_section:
            if item.startswith("metric_translation"):
                lst_counter_metric += 1
            elif item.startswith("prose_translation"):
                lst_counter_prose += 1
        if lst_counter_metric != 0:
            text2null_metric += 1
        elif lst_counter_prose != 0:
            text2null_prose += 1
    else:
        if tgt_text_section.startswith("metric_translation"):
            text2null_metric += 1
        elif tgt_text_section.startswith("prose_translation"):
            text2null_prose += 1
print(text2null_metric)
print(text2null_prose)

4495
13


In [165]:
# text2text_incorrect_lst: tgt_sent_idx's
text2text_incorrect_metric = 0
text2text_incorrect_prose = 0
for tgt_sent in rslts_txtonly_by_ensentence[3]:
    tgt_text_section = en1893_sent2section_name_textonly[str(tgt_sent)]
    if isinstance(tgt_text_section, list):
        for item in tgt_text_section:
            if item.startswith("metric_translation"):
                text2text_incorrect_metric += 1
            elif item.startswith("prose_translation"):
                text2text_incorrect_prose += 1
    else:
        if tgt_text_section.startswith("metric_translation"):
            text2text_incorrect_metric += 1
        elif tgt_text_section.startswith("prose_translation"):
            text2text_incorrect_prose += 1
print(text2text_incorrect_metric)
print(text2text_incorrect_prose)

4
0


In [130]:
en1893_sent2section_name_textonly["0"]

'prose_translationbook1'

### Redo eval after backing out sents related to 2nd translation

In [181]:
    # results = [text2text_tpstrict, #0 
    #            text2text_tplax, #1
    #            text2text_incorrect, #2
    #            text2text_incorrect_lst, #3
    #            text2null, #4
    #            text2null_lst, #5
    #            text2text_tpstrict_lst, #6
    #            text2text_tplax_lst] #7

In [182]:
print(num_prose_textonly)
print(num_metric_textonly)

3648
4517


In [183]:
# text2text_tpstrict
(rslts_txtonly_by_ensentence[0] - text2text_tpstrict_metric) / num_prose_textonly

0.9797149122807017

In [184]:
# text2text_tplax
(rslts_txtonly_by_ensentence[1] - text2text_tplax_metric) / num_prose_textonly

0.016721491228070175

In [185]:
# text2text_incorrect
(rslts_txtonly_by_ensentence[2] - text2text_incorrect_metric) / num_prose_textonly

0.0

In [186]:
#text2null
(rslts_txtonly_by_ensentence[4] - text2null_metric) / num_prose_textonly

0.00356359649122807