In [1]:
import re
import json
import stanza
import argparse

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval
from collections import defaultdict

from functions_score_section import read_alignments, score_vec_rslts_chapter_level, \
build__src_2_tgt_dict, build_tgt_2_src_dict, score_fr_sents

In [2]:
# get vecalign results
vec_rslts_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_vecrsltsNEW"
vec_rslts = read_alignments(vec_rslts_path)

In [3]:
lat_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lat_sent2book_dict.json"

with open(lat_dict_path) as f:
    lat_sent2book_name = json.load(f)

en1893_dict_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_NEW.json"

with open(en1893_dict_path) as f:
    en1893_sent2section_name = json.load(f)
    

In [4]:
# get paratext sections in en1893
section_names = []
for item in list(en1893_sent2section_name.values()):
    if isinstance(item, str):
        section_names.append(item)
    else:
        section_names.extend(item)

In [5]:
section_names = np.unique(np.array(section_names))

In [6]:
section_names

array(['forewordbook0', 'forewordbook0note', 'forewordbook0title',
       'index', 'metric_translationbook0title', 'metric_translationbook1',
       'metric_translationbook1note', 'metric_translationbook1title',
       'metric_translationbook2', 'metric_translationbook2note',
       'metric_translationbook2title', 'metric_translationbook3',
       'metric_translationbook3note', 'metric_translationbook3title',
       'metric_translationbook4', 'metric_translationbook4note',
       'metric_translationbook4title', 'metric_translationbook5',
       'metric_translationbook5note', 'metric_translationbook5title',
       'metric_translationbook6', 'metric_translationbook6note',
       'metric_translationbook6title', 'prose_translationbook0title',
       'prose_translationbook1', 'prose_translationbook1commentary',
       'prose_translationbook1note', 'prose_translationbook1title',
       'prose_translationbook2', 'prose_translationbook2commentary',
       'prose_translationbook2note', 'prose_t

In [7]:
extraneous_sections = []
for item in section_names:
    if item == "forewordbook0":
        extraneous_sections.append(item)
    elif item[-1].isdigit():
        continue
    else:
        extraneous_sections.append(item)
extraneous_sections

['forewordbook0',
 'forewordbook0note',
 'forewordbook0title',
 'index',
 'metric_translationbook0title',
 'metric_translationbook1note',
 'metric_translationbook1title',
 'metric_translationbook2note',
 'metric_translationbook2title',
 'metric_translationbook3note',
 'metric_translationbook3title',
 'metric_translationbook4note',
 'metric_translationbook4title',
 'metric_translationbook5note',
 'metric_translationbook5title',
 'metric_translationbook6note',
 'metric_translationbook6title',
 'prose_translationbook0title',
 'prose_translationbook1commentary',
 'prose_translationbook1note',
 'prose_translationbook1title',
 'prose_translationbook2commentary',
 'prose_translationbook2note',
 'prose_translationbook2title',
 'prose_translationbook3commentary',
 'prose_translationbook3note',
 'prose_translationbook3title',
 'prose_translationbook4commentary',
 'prose_translationbook4note',
 'prose_translationbook4title',
 'prose_translationbook5commentary',
 'prose_translationbook5note',
 'pr

# Score vecalign results: by prediction

In [8]:
def score_vec_rslts_chapter_level(vr_rslts_lst, el_sent2section_dict,
                                 fr_sent2section_dict, fr_extra_section_names):

    tp_strict = 0 # +1 per alignment if there's an exact match
    tp_lax = 0 # +1 per alignment if there's any overlap
    overlaps = []
    errors = []
    correct_nulls = 0
    correct_text2text = {}

    for idx_align, alignment in enumerate(vr_rslts_lst):
    # for idx_align, alignment in enumerate(vr_rslts_lst[518:523]):
        # skip alignments null on both sides
        if alignment == ([],[]):
            continue
        else:
            src_sents = alignment[0]
            tgt_sents = alignment[1]
            # get set of chapters from src, then from tgt
            chapters_from_src = set()
            chapters_from_tgt = set()
            # if alignment is null on src side, then chapters_from_src remains empty set
            if src_sents != []:
                for src_id in src_sents:
                    if isinstance(el_sent2section_dict[str(src_id)], list):
                        for section_name in el_sent2section_dict[str(src_id)]:
                            chapters_from_src.add(section_name)
                    else:
                        chapters_from_src.add(el_sent2section_dict[str(src_id)])
            # if alignment is null on tgt side, then chapters_from_tgt remains empty set
            if tgt_sents != []:
                for tgt_id in tgt_sents:
                    if isinstance(fr_sent2section_dict[str(tgt_id)], list):
                        for section_name_ in fr_sent2section_dict[str(tgt_id)]:
                            chapters_from_tgt.add(section_name_)
                    else:
                        chapters_from_tgt.add(fr_sent2section_dict[str(tgt_id)])

            # compare the sets: if text, then last 5 characters in format "book[1-6]"
            src_chaps_compare = set()
            for src_chap in chapters_from_src:
                src_chaps_compare.add(src_chap[-5:])
            tgt_chaps_compare = set()
            for tgt_chap in chapters_from_tgt:
                tgt_chaps_compare.add(tgt_chap[-5:])
            print(f"{src_chaps_compare} : {tgt_chaps_compare}")
            
            if src_chaps_compare == tgt_chaps_compare:
                tp_strict += 1
                # for correct text2text aligns, en is from prose or metric translation?
                correct_text2text[str(alignment)] = chapters_from_tgt
            
            # account for correct null : fr extraneous sections 
            elif chapters_from_src == set():
                print(f"have null on src")
                tgt_counter = 0
                for chapter in chapters_from_tgt:
                    if chapter in fr_extra_section_names:
                        tgt_counter += 1
                # tp_strict if all tgt chapters are extraneous
                if tgt_counter == len(chapters_from_tgt):
                    # tp_strict += 1
                    correct_nulls += 1
                elif tgt_counter == 0:
                    print("HAVE ERROR")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
                else: # null to multiple tgt including peritext and text
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)

            else:
                overlap = src_chaps_compare.intersection(tgt_chaps_compare)
                print(f"len overlap {len(overlap)}")
                if len(overlap) != 0:
                    tp_lax += 1
                    overlaps.append(alignment)
                    # print(src_chaps_compare)
                    # print(tgt_chaps_compare)
                else:
                    print("HAVE ERROR%%%%")
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
        
    return tp_strict, tp_lax, overlaps, errors, correct_nulls, correct_text2text


In [9]:
vec_rslts[10423]

([], [13980])

In [10]:
test = {}
test[0] = [1,3]

In [11]:
len(lat_sent2book_name)

2428

In [12]:
len(en1893_sent2section_name)

14648

In [13]:
len(vec_rslts)

11073

In [14]:
tp_strict_, tp_lax_, overlaps_, errors_, correct_nulls_, correct_text2text_ = score_vec_rslts_chapter_level(
    vec_rslts, lat_sent2book_name, en1893_sent2section_name, extraneous_sections)


set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'title'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'0note'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
have null on src
set() : {'book0'}
ha

In [15]:
print(tp_strict_)
print(tp_lax_)
# print(len(overlaps_))
print(len(errors_))
print(correct_nulls_)
# print(len(correct_text2text_))

1707
451
4867
4048


In [16]:
(tp_strict_ + correct_nulls_) / len(vec_rslts)

0.5197326831030434

In [43]:
tp_strict_ / len(vec_rslts)

0.1541587645624492

In [17]:
tp_lax_ / len(vec_rslts)

0.04072970288088142

In [18]:
(tp_strict_ + tp_lax_) / len(vec_rslts)

0.19488846744333063

In [19]:
len(errors_) / len(vec_rslts)

0.43953761401607516

In [20]:
correct_nulls_ / len(vec_rslts)

0.36557391854059423

In [21]:
# check if any en sents overlap sections
num_ensents_manychaps = 0
for values in en1893_sent2section_name.values():
    if isinstance(values, list):
        if len(values) > 1:
            num_ensents_manychaps += 1
num_ensents_manychaps

0

In [22]:
correct_text2text_

{'([2], [532, 533])': {'prose_translationbook1'},
 '([3], [534])': {'prose_translationbook1'},
 '([4], [535])': {'prose_translationbook1'},
 '([5], [536])': {'prose_translationbook1'},
 '([7], [574])': {'prose_translationbook1'},
 '([8], [575])': {'prose_translationbook1'},
 '([11, 12], [614])': {'prose_translationbook1'},
 '([13], [615])': {'prose_translationbook1'},
 '([14], [616])': {'prose_translationbook1'},
 '([18], [693, 694, 695])': {'prose_translationbook1'},
 '([19], [696, 697])': {'prose_translationbook1'},
 '([20], [698])': {'prose_translationbook1'},
 '([21], [699])': {'prose_translationbook1'},
 '([22], [700])': {'prose_translationbook1'},
 '([26], [725])': {'prose_translationbook1'},
 '([27], [726])': {'prose_translationbook1'},
 '([28], [727, 728, 729])': {'prose_translationbook1'},
 '([29], [730])': {'prose_translationbook1'},
 '([30], [731])': {'prose_translationbook1'},
 '([35], [756, 757])': {'prose_translationbook1'},
 '([38], [779])': {'prose_translationbook1'},
 

In [23]:
# was latin text aligned to any metric translation sentences?
correct_lat2entext = set()
for tgt_set in list(correct_text2text_.values()):
    for item in tgt_set:
        correct_lat2entext.add(item)

In [24]:
correct_lat2entext

{'metric_translationbook6',
 'prose_translationbook1',
 'prose_translationbook2',
 'prose_translationbook3',
 'prose_translationbook4',
 'prose_translationbook5',
 'prose_translationbook6'}

In [25]:
correct_metric2lat = []
for keys, values in correct_text2text_.items():
    for section_set in values:
        if section_set.startswith('metric_translation'):
            correct_metric2lat.append((keys, values))

In [26]:
correct_metric2lat

[('([2423], [13348, 13349, 13350, 13351, 13352, 13353, 13354])',
  {'metric_translationbook6'}),
 ('([2424], [13375, 13376, 13377, 13378, 13379, 13380, 13381])',
  {'metric_translationbook6'})]

# Analyze vecalign results: by English sentences

In [27]:
# get dict of tgt sentences' alignments to src sents
en1893_sent2lat_sent_aligns = build_tgt_2_src_dict(vec_rslts)

In [28]:
# test
keys = list(en1893_sent2lat_sent_aligns.keys())
keys = sorted(keys)
keys == [x for x in range(0, 13966)]

False

In [29]:
# def score_tgt_sents(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
#                    src_sent2section_name_dict, extraneous_chapter_names):
#     extraneous2null_tpstrict = 0
#     extraneous2null_tplax = 0 # at least one overlap
#     extraneous2text = 0 # no overlap

#     text2text_tpstrict = 0
#     text2text_tplax = 0
#     text2text_incorrect = 0
#     text2text_incorrect_lst = []

#     text2null_incorrect = 0
#     text2null_lst = []

#     for tgt_sent_idx in tgt2src_sent_aligns_dict.keys():
#         # get src sentences aligned to it (returns a set)
#         src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
#         # print(f"src aligned sents is {src_aligned_sents}")
        
#         # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
#         if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
#             # get tgt sent chapter (keys are str)
#             tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
#             # print(f"tgt chapter is {tgt_sent_chapter}")
#             # convert to list in case of multiple chapters per sent
#             if isinstance(tgt_sent_chapter, str):
#                 tgt_sent_chapter = [tgt_sent_chapter]
#                 # print(tgt_sent_chapter)
            
#             tgt_chapter_counter = 0
#             for tgt_chapter in tgt_sent_chapter:
#                 if tgt_chapter in extraneous_chapter_names:
#                     # print(tgt_sent_idx)
#                     # print(tgt_chapter)
#                     # get num of tgt to null alignments
#                     extraneous2null_counter = 0
#                     for item in src_aligned_sents:
#                         if item == "null":
#                             extraneous2null_counter += 1
#                     # compare to number of src sents in alignmnent
#                     if extraneous2null_counter == len(src_aligned_sents):
#                         # then all src aligned sents are null
#                         extraneous2null_tpstrict += 1
#                     elif extraneous2null_counter > 0:
#                         # then at least one src sent is null (also captures tpstrict)
#                         extraneous2null_tplax += 1
#                     else:
#                         # no src sents are null
#                         extraneous2text += 1
                
#                 #### TODO: ADAPT TO ACCOUNT FOR DIFFERENT FORMATTING ("BOOK 2" for LATIN) ####
#                 else: # compare src and tgt chapters
#                     src_aligned_chapters = set()
#                     src_text2text_correct_counter = 0
#                     src_text2text_incorrect_counter = 0

#                     for item in src_aligned_sents:
#                         if item == "null":
#                             text2null_incorrect += 1
#                             text2null_lst.append(tgt_sent_idx)
#                         else:
#                             # get chapters of src sent (keys are str)
#                             if isinstance(src_sent2section_name_dict[str(item)], list):
#                                 for section_name in src_sent2section_name_dict[str(item)]:
#                                     src_aligned_chapters.add(section_name)
#                             else:
#                                 src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

#                     # print(f"src chapters are {src_aligned_chapters}")
#                     #### TODO: IS THIS WRONG? ############
#                     tgt_chaps_compare = tgt_chapter[-5:]
#                     for item in src_aligned_chapters:
#                         if tgt_chaps_compare == item:
#                             src_text2text_correct_counter += 1
#                         else:
#                             src_text2text_incorrect_counter += 1

#                     if src_text2text_correct_counter == len(src_aligned_sents):
#                         text2text_tpstrict += 1
#                     elif src_text2text_correct_counter > 0:
#                         text2text_tplax += 1
#                     else:
#                         text2text_incorrect += 1
#                         text2text_incorrect_lst.append(tgt_sent_idx)

#                 tgt_chapter_counter += 1
#             if tgt_chapter_counter > 1:
#                 print(tgt_sent_idx)

#     # remove text2null from text2text_incorrect_lst
#     text2null_lst = set(text2null_lst)
#     text2text_incorrect_lst = set(text2text_incorrect_lst)
#     text2text_incorrect_lst -= text2null_lst
#     # update num of text2text_incorrect
#     text2text_incorrect -= text2null_incorrect
    
#     results = [extraneous2null_tpstrict, extraneous2null_tplax, extraneous2text,
#                text2text_tpstrict, text2text_tplax, 
#                text2text_incorrect, text2text_incorrect_lst,
#                text2null_incorrect, text2null_lst]
    
#     return results

In [30]:
def score_tgt_sents(tgt2src_sent_aligns_dict, tgt_sent2section_name_dict,
                   src_sent2section_name_dict, extraneous_chapter_names):
    extraneous2null_tpstrict = 0
    extraneous2null_tplax = 0 # at least one overlap
    extraneous2text = 0 # no overlap

    text2text_tpstrict = 0
    text2text_tplax = 0
    text2text_incorrect = 0
    text2text_incorrect_lst = []

    text2null_incorrect = 0
    text2null_lst = []
    
    for tgt_sent_idx in range(len(tgt_sent2section_name_dict)):
    # for tgt_sent_idx in tgt2src_sent_aligns_dict.keys():
    # for tgt_sent_idx in range(8411, 8414):
#         sent_extraneous2null_tpstrict = 0
#         sent_extraneous2null_tplax = 0
#         sent_extraneous2text = 0
#         sent_text2text_tpstrict = 0
#         sent_text2text_tplax = 0
#         sent_text2text_incorrect = 0
#         sent_text2null_incorrect = 0
        
        # get src sentences aligned to it (returns a set)
        src_aligned_sents = tgt2src_sent_aligns_dict[tgt_sent_idx]
        # print(f"src aligned sents is {src_aligned_sents}")
        
        # TODO: necessary? to skip null-null alignments ("null" will not appear as key in dict)
        if str(tgt_sent_idx) in tgt_sent2section_name_dict.keys():
            # get tgt sent chapter (keys are str)
            tgt_sent_chapter = tgt_sent2section_name_dict[str(tgt_sent_idx)]
            print(f"tgt chapter is {tgt_sent_chapter}")

            #### TODO: UNDO THIS iF DOESN'T WORK
            # # convert to list in case of multiple chapters per sent
            # if isinstance(tgt_sent_chapter, str):
            #     tgt_sent_chapter = [tgt_sent_chapter]
                # print(tgt_sent_chapter)
            
            if tgt_sent_chapter in extraneous_chapter_names:
            # tgt_chapter_counter = 0
            # for tgt_chapter in tgt_sent_chapter:
                # if tgt_chapter in extraneous_chapter_names:
                    # print(tgt_sent_idx)
                    # print(tgt_chapter)
                    # get num of tgt to null alignments
                extraneous2null_counter = 0
                for item in src_aligned_sents:
                    if item == "null":
                        extraneous2null_counter += 1
                        print("have extra2null")
                # compare to number of src sents in alignmnent
                if extraneous2null_counter == len(src_aligned_sents):
                    # then all src aligned sents are null
                    extraneous2null_tpstrict += 1
                elif extraneous2null_counter == 0:
                    # no src sents are null
                    extraneous2text += 1
                    print("have extra2text")
                else:
                    # then at least one src sent is null
                    extraneous2null_tplax += 1


            #### TODO: ADAPT TO ACCOUNT FOR DIFFERENT FORMATTING ("BOOK 2" for LATIN) ####
            else: # compare src and tgt chapters
                src_aligned_chapters = set()
                src_text2text_correct_counter = 0
                src_text2text_incorrect_counter = 0

                for item in src_aligned_sents:
                    if item == "null":
                        text2null_incorrect += 1
                        text2null_lst.append(tgt_sent_idx)
                    else:
                        # get chapters of src sent (keys are str)
                        if isinstance(src_sent2section_name_dict[str(item)], list):
                            for section_name in src_sent2section_name_dict[str(item)]:
                                src_aligned_chapters.add(section_name)
                        else:
                            src_aligned_chapters.add(src_sent2section_name_dict[str(item)])

                print(f"src chapters are {src_aligned_chapters}")
                #### TODO: IS THIS WRONG? ############
                tgt_chaps_compare = tgt_sent_chapter[-5:]
                for item in src_aligned_chapters:
                    if tgt_chaps_compare == item:
                        src_text2text_correct_counter += 1
                    else:
                        src_text2text_incorrect_counter += 1

                if src_text2text_correct_counter == len(src_aligned_sents):
                    text2text_tpstrict += 1
                elif src_text2text_correct_counter > 0:
                    text2text_tplax += 1
                else:
                    text2text_incorrect += 1
                    text2text_incorrect_lst.append(tgt_sent_idx)

                # tgt_chapter_counter += 1
                    
            # if sent_extraneous2null_tpstrict == len(tgt_sent_chapter):
            #     extraneous2null_tpstrict += 1
            # if ((sent_extraneous2null_tplax > 0) & (sent_extraneous2null_tplax < len(tgt_sent_chapter))):
            #     extraneous2null_tplax += 1
            # if sent_extraneous2text > 0:
            #     extraneous2text += 1
            # if sent_text2text_tpstrict > 0:
            #     text2text_tpstrict += 1
            # if sent_text2text_tplax > 0:
            #     text2text_tplax += 1
            # if sent_text2text_incorrect > 0:
            #     text2text_incorrect += 1
            # if sent_text2null_incorrect > 0:
            #     text2null_incorrect += 1

    # remove text2null from text2text_incorrect_lst
    text2null_lst = set(text2null_lst)
    text2text_incorrect_lst = set(text2text_incorrect_lst)
    text2text_incorrect_lst -= text2null_lst
    # update num of text2text_incorrect
    text2text_incorrect -= text2null_incorrect
    
    results = [extraneous2null_tpstrict, #0
               extraneous2null_tplax, #1
               extraneous2text, #2
               text2text_tpstrict, #3 
               text2text_tplax, #4
               text2text_incorrect, #5 
               text2text_incorrect_lst, #6
               text2null_incorrect, #7
               text2null_lst]
    
    return results

In [31]:
len(vec_rslts)

11073

In [32]:
len(en1893_sent2section_name)

14648

In [33]:
len(en1893_sent2lat_sent_aligns)

14648

In [34]:
en1893_sent2section_name["8412"]

'prose_translationbook6note'

In [35]:
en1893_sent2lat_sent_aligns[14647]

{'null'}

In [36]:
lat_sent2book_name["2395"]

'book6'

In [37]:
rslts_en1893_sents = score_tgt_sents(en1893_sent2lat_sent_aligns, en1893_sent2section_name,
                   lat_sent2book_name, extraneous_sections)

tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0title
have extra2null
tgt chapter is forewordbook0note
have extra2null
tgt chapter is forewordbook0note
have extra2null
tg

In [38]:
print(rslts_en1893_sents[0])
print(rslts_en1893_sents[1])
print(rslts_en1893_sents[2])
print(rslts_en1893_sents[3])
print(rslts_en1893_sents[4])
print(rslts_en1893_sents[5])
print(rslts_en1893_sents[7])

4048
0
2146
3770
42
3
4639


In [39]:
sum(rslts_en1893_sents[0:6])+rslts_en1893_sents[7]

14648

In [40]:
print(rslts_en1893_sents[6])

{8760, 8763, 8764}


## Get number of paratext and text sentences

In [41]:
metric_sents = ['metric_translationbook1',
                'metric_translationbook2',
                'metric_translationbook3',
                'metric_translationbook4',
                'metric_translationbook5',
                'metric_translationbook6']

prose_sents = ['prose_translationbook1',
               'prose_translationbook2',
               'prose_translationbook3',
               'prose_translationbook4',
               'prose_translationbook5',
               'prose_translationbook6']

prose_sent_counter = 0
metric_sent_counter = 0
num_paratext_sents = 0
num_notes = 0
for vals in en1893_sent2section_name.values():    
    if vals in metric_sents:
        metric_sent_counter += 1
    elif vals in prose_sents:
        prose_sent_counter += 1
    elif vals in extraneous_sections:
        num_paratext_sents += 1

num_en1893_text_sents = prose_sent_counter + metric_sent_counter

num_footnote_sents = 0
for vals in en1893_sent2section_name.values():    
    if vals.endswith("note"):
        num_footnote_sents += 1


print(prose_sent_counter)
print(metric_sent_counter)
print(num_en1893_text_sents)
print(num_paratext_sents)
print((num_en1893_text_sents + num_paratext_sents) == len(en1893_sent2section_name))
print((num_en1893_text_sents + num_paratext_sents) == len(en1893_sent2lat_sent_aligns))
print(num_footnote_sents)

3936
4518
8454
6194
True
True
4375


In [44]:
# extraneous2null_tpstrict, #0
#                extraneous2null_tplax, #1
#                extraneous2text, #2
#                text2text_tpstrict, #3 
#                text2text_tplax, #4
#                text2text_incorrect, #5 
#                text2text_incorrect_lst, #6
#                text2null_incorrect, #7
               # text2null_lst

In [45]:
# extraneous2null_tpstrict to num paratext sents
rslts_en1893_sents[0]/num_paratext_sents

0.6535356796900226

In [46]:
# extraneous2null_tplax to num paratext sents
rslts_en1893_sents[1]/num_paratext_sents

0.0

In [47]:
# extraneous2text to num paratext sents
rslts_en1893_sents[2]/num_paratext_sents

0.3464643203099774

In [48]:
# extraneous2text to num text sents
rslts_en1893_sents[2]/num_en1893_text_sents

0.25384433404305656

In [49]:
# text2text_tpstrict to num text sents
rslts_en1893_sents[3]/num_en1893_text_sents

0.44594274899455877

In [50]:
# text2text_tplax to num text sents
rslts_en1893_sents[4]/num_en1893_text_sents

0.0049680624556423

In [51]:
# text2text_incorrect to num text sents
rslts_en1893_sents[5]/num_en1893_text_sents

0.00035486160397445

In [52]:
# text2null_incorrect to num text sents
rslts_en1893_sents[7]/num_en1893_text_sents

0.5487343269458245

In [53]:
len(vec_rslts)

11073