In [22]:
import re
import json
import stanza

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval

from preprocessing_functions import load_txt_as_lst, split_txt, \
run_stanza, flatten_list, segment_series, preprocess_series, write_file

# Lucretius en1893 no section markers

In [23]:
# raw_str_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str.txt"
raw_str_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1983_morenewlines.txt"
en1893_raw_lst = load_txt_as_lst(raw_str_path)

In [24]:
en1893_raw_lst[:10]

['REMARKS\n',
 '\n',
 'ON THE\n',
 '\n',
 'LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n']

In [25]:
en1893_str = " ".join(en1893_raw_lst)

In [26]:
en1893_str[:500]

'REMARKS\n \n ON THE\n \n LIFE AND POEM OF LUCRETIUS.\n   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n \n That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n \n As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ni'

In [27]:
# load stanza model for lang
lang_ = "en"
stanza_model_ = stanza.Pipeline(lang=lang_, processors='tokenize', use_gpu=True)

2023-07-31 20:30:05 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-07-31 20:30:05 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-07-31 20:30:05 INFO: Use device: cpu
2023-07-31 20:30:05 INFO: Loading: tokenize
2023-07-31 20:30:05 INFO: Done loading processors!


In [28]:
en1983_sents = preprocess_series(en1893_str, "en", stanza_model_)

segmented str into sentences


In [29]:
# without additional newlines around notes and titles, len(en1983_sents) was 13966

In [30]:
len(en1983_sents)

14648

In [31]:
en1983_sents[:10]

['REMARKS',
 'ON THE',
 'LIFE AND POEM OF LUCRETIUS.',
 'OF the life of Lucretius but little information has reached us.',
 'Ad nos vix tenuis famæ perlabitur aura.',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.',
 '2',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.',
 'At this period, Ennius had been dead about seventy years ;',
 'Cicero was in his twelfth year;']

In [32]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_sents_NEW.txt"
# write_file(en1983_sents, path_out)

# Lucretius en1893 with section markers

In [33]:
raw_path_markers = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str_withsections.txt"
en1893_raw_lst_markers = load_txt_as_lst(raw_path_markers)

In [34]:
en1893_raw_lst_markers[:10]

['#@$% chapter= #@$%\n',
 '\n',
 '#@$%title#@$%REMARKS\n',
 '\n',
 '#@$%title#@$%ON THE\n',
 '   #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n']

In [35]:
en1893_str_markers = " ".join(en1893_raw_lst_markers)

In [36]:
en1893_str_markers[:1000]

'#@$% chapter= #@$%\n \n #@$%title#@$%REMARKS\n \n #@$%title#@$%ON THE\n    #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n \n That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n \n As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n \n Concerning his famil

In [37]:
en1893_str_markers = en1893_str_markers.replace("\n", "")

In [38]:
en1893_str_markers[:1000]

'#@$% chapter= #@$%  #@$%title#@$%REMARKS  #@$%title#@$%ON THE    #@$%title#@$%LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is known. The 

## Split by section (including chapters)
Note: the delimiter "#@$%" was added at the beginning and end of each section marker when the text was extracted from the XML.

In [39]:
en1893_str_markers_split = en1893_str_markers.split("#@$%")

In [40]:
en1893_str_markers_split[:20]

['',
 ' chapter= ',
 '  ',
 'title',
 'REMARKS  ',
 'title',
 'ON THE    ',
 'title',
 'LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is kn

In [41]:
# define patterns
# NOTE: re.match only anchors at the START of the string, so each pattern below
# also matches any longer string that merely begins with it.
chapter_marker = re.compile(" chapter= ") # 4: commentary, prose translation, metric translation, index
# title_start / title_end are plain strings, not compiled patterns (re.match accepts both).
# As a prefix match, "title" also matches "title_end".
title_start = "title"
title_end = "title_end"
title_commentary = re.compile("ARGUMENT") # startswith
title_book = re.compile("BOOK")
# the "." is unescaped, so it matches any character, not only a literal period
title_index = re.compile("INDEX.")
para_end = re.compile("paragraph end")
# as a prefix match, "note" also matches "note_end"
note_start = re.compile("note")
note_end = re.compile("note_end")

In [42]:
if re.match(title_start, "title_end"):
    print("yes")

yes


In [43]:
test = "ARGUMENT. asoidau"

In [44]:
test.startswith("ARGUMENT.")

True

In [45]:
len(en1893_str_markers_split)

5765

In [46]:
# Chapter position (0-based) -> chapter name used as the prefix of section labels.
chapteridx2chaptername = dict(enumerate(
    ["foreword", "prose_translation", "metric_translation", "index"]
))

In [47]:
en1893_str_markers_split[:25]

['',
 ' chapter= ',
 '  ',
 'title',
 'REMARKS  ',
 'title',
 'ON THE    ',
 'title',
 'LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is kn

## remove empty strings

In [48]:
df = pd.DataFrame(en1893_str_markers_split)
df.head()

Unnamed: 0,0
0,
1,chapter=
2,
3,title
4,REMARKS


In [49]:
df.shape

(5765, 1)

In [50]:
df.isna().sum()

0    0
dtype: int64

In [51]:
# Mark empty / whitespace-only chunks (exactly these four strings) as missing so
# that dropna() can remove them.
# The result is assigned back to the column rather than calling
# `df[0].replace(..., inplace=True)`: an in-place edit made through a column
# selection is chained assignment, which pandas warns about and which leaves
# `df` unchanged when Copy-on-Write is enabled.
df[0] = df[0].replace(["   ", "  ", " ", ""], np.nan)

In [52]:
df.isna().sum()

0    1238
dtype: int64

In [53]:
df.dropna(inplace=True)

In [54]:
df.head()

Unnamed: 0,0
1,chapter=
3,title
4,REMARKS
5,title
6,ON THE


In [55]:
df.shape

(4527, 1)

In [56]:
en1893_str_markers_split = list(df[0])

In [57]:
en1893_str_markers_split[11]

' 2 I. 31; ii. 259. '

## Reconstruct text without markers

In [58]:
# Walk the marker-split chunks in order and rebuild the text without markers.
# Outputs:
#   en_reconstructed          - every non-marker chunk, in document order
#   idx2section_name          - index into en_reconstructed -> section label
#   chap2idx_start            - chapter number -> value of en_reconstructed_idx when
#                               the chapter marker was seen
#   lucretius_en1893_textonly - unlabelled chunks (no title/note flag) from chapters > 0
en_reconstructed = []
# index of the most recently appended chunk; -1 means nothing appended yet
en_reconstructed_idx = -1

# -1 until the first chapter marker is seen; looking up chapteridx2chaptername
# before that would use key -1
chapter_counter = -1
chap2idx_start = {}

book_counter = 0

idx2section_name = {}

# True when a title/note marker has just been seen, i.e. the next non-marker
# chunk should receive label_nextitem instead of the default chapter/book label
hit_section_marker = False

# records the flag value at the start of every iteration (debugging aid; not read
# anywhere in this cell)
lst_test = []

lucretius_en1893_textonly = []

for section_idx, section in enumerate(en1893_str_markers_split):
    lst_test.append(hit_section_marker)
    
    # skip empty / short whitespace-only chunks
    if section == "":
        continue
    elif section == " ":
        continue
    elif section == "  ":
        continue
        
    # if section is chapter marker
    elif re.match(chapter_marker, section):
        # increment chapter counter
        chapter_counter += 1
        # reset book counter
        book_counter = 0
        # mark idx of en_reconstructed where chapter starts
        # NOTE: this stores the index of the last chunk appended BEFORE the marker
        # (-1 for the first chapter), i.e. one less than the chapter's first chunk
        chap2idx_start[chapter_counter] = en_reconstructed_idx
    
    # if section is title marker
    # (re.match is a prefix match, so any chunk beginning with title_start qualifies)
    elif re.match(title_start, section):
        hit_section_marker = True
        # then next section is a title, which will be added to en_reconstructed
        # label as title for titles that don't match instances below (index, book, argument)
        label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "title"
        
        # NOTE(review): the section_idx+1 look-ahead assumes a marker is never the
        # last element of the list; otherwise this raises IndexError
        # if title (next item in list) starts with "INDEX"
        if en1893_str_markers_split[section_idx+1].startswith("INDEX."):
            # index is in one section only; rename label
            label_nextitem = "index"

        # if next item in list starts with "BOOK"
        elif en1893_str_markers_split[section_idx+1].startswith("BOOK"):
            # new book is starting
            book_counter += 1
            # rename label
            label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "title"
        
        # if next item in list starts with "ARGUMENT"
        elif en1893_str_markers_split[section_idx+1].startswith("ARGUMENT"):
            # label as commentary
            label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "commentary"
        
        continue
    
    # if section marks the start of a note
    # elif re.match(note_start, section):
    elif section == "note":
        # "note" immediately followed by "note_end": nothing to label
        if en1893_str_markers_split[section_idx+1] == "note_end":
            hit_section_marker = False
            continue
        else:
            hit_section_marker = True
            # then next section is a footnote, add to en_reconstructed
            label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "note"
    
    elif re.match(note_end, section):
        # do nothing; just there to separate footnote from following text
        hit_section_marker = False
        continue
    
    elif re.match(para_end, section):
        # do nothing; just there to create separations in sections from raw text
        hit_section_marker = False
        continue
    
    else:
        # all markers have been excluded; append to en_reconstructed
        en_reconstructed.append(section)
        en_reconstructed_idx += 1
        
        if hit_section_marker == True:
            # chunk directly follows a title/note marker: use the pending label
            # and consume the flag so only this one chunk is labelled with it
            idx2section_name[en_reconstructed_idx] = label_nextitem
            hit_section_marker = False
            
        else:
            # ordinary body text: label with the current chapter and book
            idx2section_name[en_reconstructed_idx] = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter)
            # keep body text from every chapter except chapter 0
            if chapter_counter > 0:
                lucretius_en1893_textonly.append(section)


In [59]:
# fix errors here, then initialize en1983_sents_tokenized again
# Ordered (old, new) substring rewrites applied to every reconstructed chunk.
# Order matters: each rule sees the output of the rules before it, and several
# later rules undo an earlier broad rule for one specific context
# (e.g. ".1" -> ". 1" followed by "Tacitus. 11" -> "Tacitus.11").
_spacing_fixes = [
    (".”1", ".” 1"),
    ("tongue.2", "tongue. 2"),
    ("veneration.3", "veneration. 3"),
    ("insane.3", "insane. 3"),
    (".1", ". 1"),
    ("opinion.3", "opinion. 3"),
    (".7", ". 7"),
    (".2", ". 2"),
    (".4", ". 4"),
    ('language;"', 'language; "'),
    ('Lucretius;"', 'Lucretius; "'),
    ("?1", "? 1"),
    (".5", ". 5"),
    (".6", ". 6"),
    ("It;.", "It; ."),
    (":8", ": 8"),
    (".3", ". 3"),
    (":2", ": 2"),
    ("?2", "? 2"),
    ('ἔχειπρός.”Wakefield.', 'ἔχειπρός. ”Wakefield.'),
    ("Tacitus. 11", "Tacitus.11"),
    ('Æolia.”What', 'Æolia. ”What'),
    ('concerned).—', 'concerned). —'),
    ('.—', '. —'),
    ("nix.—Are", "nix. —Are"),
    ("improvida.—Find", "improvida. —Find"),
    (";", "; "),
    ("fleshes.”", "fleshes. ”"),
    (".]Ver", ".] Ver"),
    (":so", ": so"),
    ("flumina. —“", "flumina.—“"),
    ("armis.–Fortè", "armis. –Fortè"),
    ("—But", "— But"),
    ("—Or", "— Or"),
    ("minds.", "minds. "),
    (':"', ': "'),
    ('first.-The', 'first. -The'),
    (':)', ': )'),
    ('body. ——I', 'body.— —I'),
    ('curtains. —“', 'curtains.—“'),
    ('593. —"', '593.—"'),
    ('borne. 3', 'borne.3'),
    ('words.J', 'words. J'),
    ('compose. 1', 'compose.1'),
    ("systems.”Whewell's", "systems. ”Whewell's"),
    ('life?3', 'life? 3'),
    ("? 1  Or,", "?1 Or,"),
    ('true.]1', 'true.] 1'),
    ('Phil. i. 1. 1.', 'Phil. i. 1.1.'),
    ('up. 1 adhering', 'up.1 adhering'),
    ('6, 6. —With', '6, 6.—With'),
    ('THE YEAR. 3', 'THE YEAR.3'),
    ('elbows?3', 'elbows? 3'),
    (':—idem', ': —idem'),
    ('moisture.-Motion', 'moisture. -Motion'),
    ('ENTIRE WHOLE. 1', 'ENTIRE WHOLE.1'),
    ('sui.”Aul.Gell.', 'sui. ”Aul. Gell.'),
    ('itself?3', 'itself? 3'),
    ('sun?4', 'sun? 4'),
    ('day :—', 'day : —'),
    ('every nerve. — ', 'every nerve.—'),
    ('fury !— But', 'fury ! —But'),
    ('flame. —', 'flame.—'),
    ('concealed. —', 'concealed.—'),
    ('rose:-', 'rose: -'),
    ('exhaustion:—such', 'exhaustion: —such'),
    ('beneath.-On', 'beneath. -On'),
    ('bring:-', 'bring: -'),
    ('Corporeal:—hence', 'Corporeal: —hence'),
    (':-can', ': -can'),
    (':-a', ': -a'),
    (':-by', ': -by'),
    ('.And', '. And'),
    ('death.-Since', 'death. -Since'),
    (':-some', ': -some'),
    ('ever!"— But', 'ever!" —But'),
    ('peace. —', 'peace.—'),
    ('aid.-No', 'aid. -No'),
    ('man. —', 'man.—'),
    ('?or', '? or'),
    (':-it', ': -it'),
    ('ill. —', 'ill.—'),
    ('prevail.-Thus', 'prevail. -Thus'),
    (':-for', ': -for'),
    ('well-spring:—creed', 'well-spring: —creed'),
    (':different', ': different'),
    ('an:!', 'an: !'),
    ('v. 2. 5', 'v. 2.5'),
    ('v. 1116;', 'v.1116;'),
    ('1. 57,', '1.57,'),
    (':voluntary', ': voluntary'),
    ('ib. ;   grow', 'ib. grow'),
    ('in, v. 30', 'in, v.30'),
    ('tones ? 1', 'tones ?1'),
]

for sent_idx, sent in enumerate(en_reconstructed):
    patched = sent
    for old_text, new_text in _spacing_fixes:
        patched = patched.replace(old_text, new_text)
    # write the fully patched chunk back in place
    en_reconstructed[sent_idx] = patched

# Tokenize en1893 with and without section markers

In [60]:
# Whitespace-tokenize every stanza-segmented sentence.
en1983_sents_tokenized = [sentence.split() for sentence in en1983_sents]

In [61]:
# Whitespace-tokenize every reconstructed (marker-free) chunk.
en_reconstructed_tokenized = [chunk.split() for chunk in en_reconstructed]

In [62]:
# Total token counts of both tokenizations; they must agree for the
# sentence-to-section alignment to work.
num_tokens_sents = sum(len(tokens) for tokens in en1983_sents_tokenized)

num_tokens_chapts = sum(len(tokens) for tokens in en_reconstructed_tokenized)

In [63]:
num_tokens_chapts == num_tokens_sents

True

In [64]:
print(num_tokens_chapts)
print(num_tokens_sents)

202634
202634


## Find data errors in tokenized docs

In [65]:
en1983_tokens_from_sents = flatten_list(en1983_sents_tokenized)
en1983_tokens_from_chapts = flatten_list(en_reconstructed_tokenized)

In [66]:
en1983_tokens_from_sents == en1983_tokens_from_chapts

True

In [67]:
en1983_tokens_from_sents[135670:135682]

['others.]',
 'Ver.',
 '1282.',
 'Aliena',
 'rogorum—',
 'extructa.',
 'Thus',
 'abruptly',
 'ends',
 'the',
 'description',
 'of']

In [68]:
en1983_tokens_from_chapts[135670:135682]

['others.]',
 'Ver.',
 '1282.',
 'Aliena',
 'rogorum—',
 'extructa.',
 'Thus',
 'abruptly',
 'ends',
 'the',
 'description',
 'of']

In [69]:
for idx, token in enumerate(en1983_tokens_from_chapts):
    if token != en1983_tokens_from_sents[idx]:
        print(idx)

# Get sent idx to section name dict for en1893

In [70]:
def build_sent_to_section_dict(lst_tokenized_sents, lst_tokenized_chapts,
                               dict_chapter_2_section):
    """
    Build dict of sentence idx to section name

    Both token lists are expected to tokenize the same text in the same order,
    once cut into sentences and once cut into sections. Sentences are laid
    over the sections purely by counting tokens.

    Args:
        lst_tokenized_sents: list of sentences, each a list of tokens.
        lst_tokenized_chapts: list of sections, each a list of tokens.
        dict_chapter_2_section: maps a section's position in
            ``lst_tokenized_chapts`` to its section name.

    Returns:
        dict mapping sentence idx to either a single section name (sentence
        lies inside one section) or a list with the name of every section the
        sentence touches, in document order.

    Raises:
        IndexError: if the sentences hold more tokens than the sections, i.e.
            the sections run out before the sentences do.
    """
    sent_idx_2_section_name = {}
    num_sections = len(lst_tokenized_chapts)
    current_section_idx = 0
    # tokens of the current section already claimed by earlier sentences
    tokens_used = 0

    for idx_sent, sent in enumerate(lst_tokenized_sents):
        tokens_to_place = len(sent)
        spanned_sections = []

        while True:
            # Step past sections that are fully used up. This also skips
            # zero-token sections, which cannot contain any sentence.
            while (current_section_idx < num_sections and
                   tokens_used >= len(lst_tokenized_chapts[current_section_idx])):
                current_section_idx += 1
                tokens_used = 0

            if current_section_idx >= num_sections:
                raise IndexError(
                    "sentence %d extends past the last section; sentence and "
                    "section token counts do not match" % idx_sent)

            spanned_sections.append(dict_chapter_2_section[current_section_idx])
            room = len(lst_tokenized_chapts[current_section_idx]) - tokens_used

            if tokens_to_place <= room:
                # the rest of the sentence fits in this section
                tokens_used += tokens_to_place
                break

            # Sentence crosses the section boundary: fill this section and carry
            # the remainder forward. Looping (rather than assuming exactly one
            # boundary) keeps the counter aligned when the remainder covers one
            # or more whole sections.
            tokens_to_place -= room
            tokens_used += room

        if len(spanned_sections) == 1:
            sent_idx_2_section_name[idx_sent] = spanned_sections[0]
        else:
            sent_idx_2_section_name[idx_sent] = spanned_sections

    return sent_idx_2_section_name

In [71]:
en1893_sent2section_name = build_sent_to_section_dict(
    en1983_sents_tokenized, en_reconstructed_tokenized, idx2section_name)


## Write to json

In [72]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_jul25.json"
# with open(path_out, 'w') as fp:
#     json.dump(en1893_sent2section_name, fp)

In [73]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_NEW.json"
# with open(path_out, 'w') as fp:
#     json.dump(en1893_sent2section_name, fp)

# Get Lucretius text only

In [74]:
# Section labels for the body text of books 1-6 in each translation
# (chapter name + "book" + book number).
prose_sents = ["prose_translationbook" + str(book_no) for book_no in range(1, 7)]

metric_sents = ["metric_translationbook" + str(book_no) for book_no in range(1, 7)]

In [75]:
# get raw text again and do not remove newlines (as did when building en_reconstructed)
# raw_path_markers = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str_withsections.txt"
raw_path_markers = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str_withsections_titleend.txt"
en1893_raw_withnewlines = load_txt_as_lst(raw_path_markers)

In [76]:
en1893_raw_withnewlines[:10]

['#@$% chapter= #@$%\n',
 '\n',
 '#@$%title#@$%REMARKS\n',
 '#@$%title_end#@$%\n',
 '#@$%paragraph end#@$%\n',
 '#@$%title#@$%ON THE\n',
 '#@$%title_end#@$%   #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n',
 '#@$%title_end#@$%  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '#@$%paragraph end#@$%\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n']

In [77]:
en1893_markers_newlines = " ".join(en1893_raw_withnewlines)

In [78]:
en1893_markers_newlines[:500]

'#@$% chapter= #@$%\n \n #@$%title#@$%REMARKS\n #@$%title_end#@$%\n #@$%paragraph end#@$%\n #@$%title#@$%ON THE\n #@$%title_end#@$%   #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n #@$%title_end#@$%  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n #@$%paragraph end#@$%\n That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n #@$'

In [79]:
en1893_str_markers_newlines_split = en1893_markers_newlines.split("#@$%")

In [80]:
# Lose newlines at end of extracted chunks after splitting
en1893_str_markers_newlines_split[:25]

['',
 ' chapter= ',
 '\n \n ',
 'title',
 'REMARKS\n ',
 'title_end',
 '\n ',
 'paragraph end',
 '\n ',
 'title',
 'ON THE\n ',
 'title_end',
 '   ',
 'title',
 'LIFE AND POEM OF LUCRETIUS.\n ',
 'title_end',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n ',
 'paragraph end',
 '\n That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n ',
 'paragraph end',
 '\n As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquat

In [81]:
"title_end" == "title_end"

True

In [82]:
def reconstruct_txt_markers(text_markers_split, chapteridx2chaptername):
    """Rebuild the text chunks of a marker-split edition and label each chunk.

    Walks the list obtained by splitting the marked-up edition on its marker
    delimiter. Marker tokens ("title", "title_end", "note", plus anything
    matching the chapter / note-end / paragraph-end patterns) and the
    whitespace-only separators are consumed; every other item is kept as a
    text chunk and assigned a section label.

    Relies on the module-level regex patterns ``chapter_marker``, ``note_end``
    and ``para_end`` being defined before the call.

    Args:
        text_markers_split: list of str, the edition split on the marker
            delimiter (markers and text interleaved).
        chapteridx2chaptername: object indexed by the running chapter counter
            (starting at 0 for the first chapter marker) that yields the
            chapter name used as the label prefix.

    Returns:
        A tuple ``(en_reconstructed, idx2section_name, lucretius_en1893_textonly)``:
        * ``en_reconstructed``: the kept chunks, in order.
        * ``idx2section_name``: dict mapping each position in
          ``en_reconstructed`` to its label.
        * ``lucretius_en1893_textonly``: the kept chunks that received the plain
          ``"<chapter>book<N>"`` label while ``chapter_counter > 0``.

    Side effects:
        Prints the number of non-empty footnotes encountered.
    """
    # Whitespace-only items are artefacts of the split and carry no text.
    blank_chunks = ("", " ", "  ", "   ", "\n ", "\n \n ")
    last_idx = len(text_markers_split) - 1

    en_reconstructed = []
    idx2section_name = {}
    lucretius_en1893_textonly = []

    chapter_counter = -1
    book_counter = 0
    footnotes_counter = 0

    # True while the next kept chunk must take `label_nextitem` as its label.
    hit_section_marker = False
    # True once the "INDEX." title was seen; unlabelled chunks after it are index.
    hit_index = False
    # True between the two sentinel sentences of the prose book 6 end commentary.
    hit_endprose6 = False
    label_nextitem = None

    def neighbour(idx):
        # Out-of-range neighbours read as "" so a marker at the very start or
        # end of the list neither raises IndexError nor wraps around via a
        # negative index.
        if 0 <= idx <= last_idx:
            return text_markers_split[idx]
        return ""

    def section_label(suffix=""):
        # "<chapter name>book<book number><suffix>" for the current position.
        return str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + suffix

    for section_idx, section in enumerate(text_markers_split):
        if section in blank_chunks:
            continue

        # chapter marker: start a new chapter and restart book numbering
        if re.match(chapter_marker, section):
            chapter_counter += 1
            book_counter = 0
            continue

        # title start marker: the next item is the title text
        if section == "title":
            hit_section_marker = True
            # default label for titles that are not INDEX / ARGUMENT
            label_nextitem = section_label("title")
            upcoming = neighbour(section_idx + 1)

            if upcoming.startswith("INDEX."):
                # everything from here on belongs to the index
                hit_index = True
                label_nextitem = "index"
            elif upcoming.startswith("BOOK"):
                # A new book starts. The label above was built before this
                # increment, so the BOOK heading itself keeps the previous
                # book number.
                book_counter += 1
            elif upcoming.startswith("ARGUMENT."):
                label_nextitem = section_label("commentary")
            continue

        # note start marker: the next item is a footnote unless the note is empty
        if section == "note":
            if neighbour(section_idx + 1) == "note_end":
                hit_section_marker = False
            else:
                hit_section_marker = True
                label_nextitem = section_label("note")
                footnotes_counter += 1
            continue

        # note-end / paragraph-end markers only separate chunks
        if re.match(note_end, section) or re.match(para_end, section):
            hit_section_marker = False
            continue

        if section == "title_end":
            # the chunk following an "ARGUMENT." title is commentary
            if neighbour(section_idx - 1).startswith("ARGUMENT."):
                hit_section_marker = True
                label_nextitem = section_label("commentary")
            continue

        # All markers have been excluded: this is a text chunk.
        en_reconstructed.append(section)
        chunk_idx = len(en_reconstructed) - 1

        if hit_section_marker:
            idx2section_name[chunk_idx] = label_nextitem
            hit_section_marker = False

        elif hit_index:
            idx2section_name[chunk_idx] = "index"

        elif "Thus abruptly ends the description of the plague," in section:
            idx2section_name[chunk_idx] = "prose_translationbook6commentary"
            hit_endprose6 = True

        elif "If the reader wish to see more accounts of pestilence" in section:
            idx2section_name[chunk_idx] = "prose_translationbook6commentary"
            hit_endprose6 = False

        elif hit_endprose6:
            idx2section_name[chunk_idx] = "prose_translationbook6commentary"

        elif "THE END." in section:
            idx2section_name[chunk_idx] = "metric_translationbook6note"

        else:
            idx2section_name[chunk_idx] = section_label()
            if chapter_counter > 0:
                lucretius_en1893_textonly.append(section)

    print(f"there are {footnotes_counter} footnotes in Lucretius English edition")
    return en_reconstructed, idx2section_name, lucretius_en1893_textonly

In [83]:
en_reconstructed_withnewlines, idx2section_name_withnewlines, lucretius_en1893_textonly_withnewlines = reconstruct_txt_markers(
    en1893_str_markers_newlines_split, chapteridx2chaptername)

there are 1426 footnotes in Lucretius English edition


In [84]:
# apply same changes to en_reconstructed_withnewlines in order to match tokenization
# Each (old, new) pair is applied to every chunk, strictly in the order listed:
# later pairs rely on (or undo) the effect of earlier, broader ones.
chunk_tokenization_fixes = (
    (".”1", ".” 1"),
    ("tongue.2", "tongue. 2"),
    ("veneration.3", "veneration. 3"),
    ("insane.3", "insane. 3"),
    (".1", ". 1"),
    ("opinion.3", "opinion. 3"),
    (".7", ". 7"),
    (".2", ". 2"),
    (".4", ". 4"),
    ('language;"', 'language; "'),
    ('Lucretius;"', 'Lucretius; "'),
    ("?1", "? 1"),
    (".5", ". 5"),
    (".6", ". 6"),
    ("It;.", "It; ."),
    (":8", ": 8"),
    (".3", ". 3"),
    (":2", ": 2"),
    ("?2", "? 2"),
    ('ἔχειπρός.”Wakefield.', 'ἔχειπρός. ”Wakefield.'),
    ("Tacitus. 11", "Tacitus.11"),
    ('Æolia.”What', 'Æolia. ”What'),
    ('concerned).—', 'concerned). —'),
    ('.—', '. —'),
    ("nix.—Are", "nix. —Are"),
    ("improvida.—Find", "improvida. —Find"),
    (";", "; "),
    ("fleshes.”", "fleshes. ”"),
    (".]Ver", ".] Ver"),
    (":so", ": so"),
    ("flumina. —“", "flumina.—“"),
    ("armis.–Fortè", "armis. –Fortè"),
    ("—But", "— But"),
    ("—Or", "— Or"),
    ("minds.", "minds. "),
    (':"', ': "'),
    ('first.-The', 'first. -The'),
    (':)', ': )'),
    ('body. ——I', 'body.— —I'),
    ('curtains. —“', 'curtains.—“'),
    ('593. —"', '593.—"'),
    ('borne. 3', 'borne.3'),
    ('words.J', 'words. J'),
    ('compose. 1', 'compose.1'),
    ("systems.”Whewell's", "systems. ”Whewell's"),
    ('life?3', 'life? 3'),
    ("? 1  Or,", "?1 Or,"),
    ('true.]1', 'true.] 1'),
    ('Phil. i. 1. 1.', 'Phil. i. 1.1.'),
    ('up. 1 adhering', 'up.1 adhering'),
    ('6, 6. —With', '6, 6.—With'),
    ('THE YEAR. 3', 'THE YEAR.3'),
    ('elbows?3', 'elbows? 3'),
    (':—idem', ': —idem'),
    ('moisture.-Motion', 'moisture. -Motion'),
    ('ENTIRE WHOLE. 1', 'ENTIRE WHOLE.1'),
    ('sui.”Aul.Gell.', 'sui. ”Aul. Gell.'),
    ('itself?3', 'itself? 3'),
    ('sun?4', 'sun? 4'),
    ('day :—', 'day : —'),
    ('every nerve. — ', 'every nerve.—'),
    ('fury !— But', 'fury ! —But'),
    ('flame. —', 'flame.—'),
    ('concealed. —', 'concealed.—'),
    ('rose:-', 'rose: -'),
    ('exhaustion:—such', 'exhaustion: —such'),
    ('beneath.-On', 'beneath. -On'),
    ('bring:-', 'bring: -'),
    ('Corporeal:—hence', 'Corporeal: —hence'),
    (':-can', ': -can'),
    (':-a', ': -a'),
    (':-by', ': -by'),
    ('.And', '. And'),
    ('death.-Since', 'death. -Since'),
    (':-some', ': -some'),
    ('ever!"— But', 'ever!" —But'),
    ('peace. —', 'peace.—'),
    ('aid.-No', 'aid. -No'),
    ('man. —', 'man.—'),
    ('?or', '? or'),
    (':-it', ': -it'),
    ('ill. —', 'ill.—'),
    ('prevail.-Thus', 'prevail. -Thus'),
    (':-for', ': -for'),
    ('well-spring:—creed', 'well-spring: —creed'),
    (':different', ': different'),
    ('an:!', 'an: !'),
    ('v. 2. 5', 'v. 2.5'),
    ('v. 1116;', 'v.1116;'),
    ('1. 57,', '1.57,'),
    (':voluntary', ': voluntary'),
    ('ib. ;   grow', 'ib. grow'),
    ('in, v. 30', 'in, v.30'),
    ('tones ? 1', 'tones ?1'),
)

for sent_idx, sent in enumerate(en_reconstructed_withnewlines):
    patched = sent
    for old, new in chunk_tokenization_fixes:
        patched = patched.replace(old, new)
    # write the fully patched chunk back in place
    en_reconstructed_withnewlines[sent_idx] = patched

In [85]:
len(idx2section_name_withnewlines)

3399

In [86]:
idx2section_name_withnewlines[2009]

'metric_translationbook1'

In [87]:
# Keep only the chunks whose label appears in prose_sents or metric_sents,
# and re-index their labels from 0 so they line up with the filtered list.
en1893_textonly_newlines_fromdict = []
idx2section_name_textonly = {}
idx_counter = 0
for key in range(len(idx2section_name_withnewlines)):
    section_name = idx2section_name_withnewlines[key]
    if not (section_name in prose_sents or section_name in metric_sents):
        continue
    en1893_textonly_newlines_fromdict.append(en_reconstructed_withnewlines[key])
    idx2section_name_textonly[idx_counter] = section_name
    idx_counter += 1

In [88]:
# check there's a space after the semicolon in "fruit-producing earth; 5 since"
# (the blanket ";" -> "; " replacement applied to the reconstructed chunks should have inserted it)
for idx, item in enumerate(en1893_textonly_newlines_fromdict):
    if "O BOUN riFUL" in item:
        print(item)

 O BOUN riFUL Venus,1 mother of the race of Æneas,2 delight  of gods and men, who, beneath the gliding constellations  of heaven,3 fillest with life4 the ship-bearing sea and the  fruit-producing earth; 5 since by thy influence every kind of
 


In [89]:
len(en1893_textonly_newlines_fromdict)

1743

In [90]:
len(en_reconstructed_withnewlines)

3399

## Test dict: idx2section_name_textonly (chunk-level from xml)

In [91]:
# Prose Book 1: 0 - 139
for keys in range(0,140):
    if idx2section_name_textonly[keys] != "prose_translationbook1":
        print(keys)

In [92]:
# Prose Book 2: 140 - 276
for keys in range(140, 277):
    if idx2section_name_textonly[keys] != "prose_translationbook2":
        print(keys)

In [93]:
# Prose Book 3: 277 - 415
# Start at 277 so the check is contiguous with the Prose Book 2 check (which
# ends at 276); starting at 281 left chunks 277-280 unverified by any cell.
for keys in range(277, 416):
    if idx2section_name_textonly[keys] != "prose_translationbook3":
        print(keys)

In [94]:
# Prose Book 4: 416 - 576
for keys in range(416, 577):
    if idx2section_name_textonly[keys] != "prose_translationbook4":
        print(keys)

In [95]:
# Prose Book 5: 577 - 764
for keys in range(577, 765):
    if idx2section_name_textonly[keys] != "prose_translationbook5":
        print(keys)

In [96]:
# Prose Book 6: 765 - 967
for keys in range(765, 968):
    if idx2section_name_textonly[keys] != "prose_translationbook6":
        print(keys)

In [97]:
# Metric Book 1: 968 - 1075
for keys in range(968, 1076):
    if idx2section_name_textonly[keys] != "metric_translationbook1":
        print(keys)

In [98]:
# Metric Book 2: 1076 - 1187
for keys in range(1076, 1188):
    if idx2section_name_textonly[keys] != "metric_translationbook2":
        print(keys)

In [99]:
# Metric Book 3: 1188 - 1305
for keys in range(1188, 1306):
    if idx2section_name_textonly[keys] != "metric_translationbook3":
        print(keys)

In [100]:
# Metric Book 4: 1306 - 1415
for keys in range(1306, 1416):
    if idx2section_name_textonly[keys] != "metric_translationbook4":
        print(keys)

In [101]:
# Metric Book 5: 1416 - 1571
for keys in range(1416, 1572):
    if idx2section_name_textonly[keys] != "metric_translationbook5":
        print(keys)

In [102]:
# Metric Book 6: 1572 - 1742 (end)
for keys in range(1572, 1743):
    if idx2section_name_textonly[keys] != "metric_translationbook6":
        print(keys)

## Segment text only into sentences

In [103]:
# strip newlines at beginning of each chunk
en1893_textonly_newlines_fromdict = [x.lstrip() for x in en1893_textonly_newlines_fromdict]

In [104]:
en1893_textonly_newlines_fromdict[0]

'O BOUN riFUL Venus,1 mother of the race of Æneas,2 delight  of gods and men, who, beneath the gliding constellations  of heaven,3 fillest with life4 the ship-bearing sea and the  fruit-producing earth; 5 since by thy influence every kind of\n '

In [105]:
en1893_textonly_newlines_fromdict = [x.rstrip() for x in en1893_textonly_newlines_fromdict]

In [106]:
en1893_textonly_newlines_fromdict[0]

'O BOUN riFUL Venus,1 mother of the race of Æneas,2 delight  of gods and men, who, beneath the gliding constellations  of heaven,3 fillest with life4 the ship-bearing sea and the  fruit-producing earth; 5 since by thy influence every kind of'

In [107]:
en1893_str_textonly = " ".join(en1893_textonly_newlines_fromdict)

In [108]:
en1893_sents_textonly = preprocess_series(en1893_str_textonly, "en", stanza_model_)

segmented str into sentences


In [109]:
en1893_sents_textonly[:10]

['O BOUN riFUL Venus,1 mother of the race of Æneas,2 delight  of gods and men, who, beneath the gliding constellations  of heaven,3 fillest with life4 the ship-bearing sea and the  fruit-producing earth;',
 '5 since by thy influence every kind of living creature is conceived, and, springing forth, hails the  light of the sun.',
 'Thee, 0 goddess, thee the winds flee ;',
 'be  fore thee, and thy approach, the clouds of heaven disperse;',
 'for  thee the variegated earth2 puts forth3 her fragrant flowers;',
 'on thee the waters of ocean smile, and the calmed heaven  beams with effulgent4 light.',
 'For, as soon as the vernal face  of day5 is unveiled, and the genial gale of Favonius exerts  its power unconfined, the birds of the air first, 0 goddess,  testify of thee and thy coming, smitten in heart by thy influ  ence.',
 'Next, the wild herds bound over the joyous pastures,  and swim across the rapid streams.',
 'So all kinds of living  creatures, captivated by thy charms and thy allure

In [110]:
en1893_sents_textonly[-10:]

['At length the temples of the gods themselves,  Changed into charnels, and their sacred shrines  Thronged with the dead:',
 'for superstition now,',
 'The power of altars, half their sway had lost,  Whelmed in the pressure of the present woe.',
 'Nor longer now the costly rites prevailed  Of ancient burial, erst punctilious kept:',
 'For all roved restless, with distracted mind,  From scene to scene ;',
 "and worn with grief and toil  Gave to their friends th' interment chance allowed.",
 'And direst exigence impelled them oft,  Headlong, to deeds most impious;',
 "for the pyres  Funereal seized they, reared not by themselves,  And with loud dirge, and wailing wild, o'er these  Placed their own dead;",
 "amid th' unhallowed blaze  With blood contending, rather than resign",
 "The tomb thus gained, or quit th' enkindling corse."]

In [111]:
len(en1893_sents_textonly)

8165

In [112]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_sents_textonly.txt"
# write_file(en1893_sents_textonly, path_out)

## Tokenize text only

In [113]:
# fix tokenization discrepancies in en1893_textonly_newlines_fromdict
# (old, new) pairs, applied to every chunk in the order listed
textonly_tokenization_fixes = (
    ('borne.3', 'borne. 3'),
    ('compose.1', 'compose. 1'),
    ('tones ?1', 'tones ? 1'),
)
for sent_idx, sent in enumerate(en1893_textonly_newlines_fromdict):
    patched = sent
    for old, new in textonly_tokenization_fixes:
        patched = patched.replace(old, new)
    en1893_textonly_newlines_fromdict[sent_idx] = patched

In [114]:
# whitespace-tokenize every segmented sentence
en1893_sents_textonly_tokenized = [sent.split() for sent in en1893_sents_textonly]

# whitespace-tokenize every text-only chunk
en1893_chapts_textonly_tokenized = [sent.split() for sent in en1893_textonly_newlines_fromdict]

In [115]:
# Total whitespace-token counts of the sentence-level and chunk-level views.
# The totals are stored under the same names that are compared below: the
# previous loops accumulated into `num_tokens_sents` / `num_tokens_chapts`,
# leaving both `*_textonly` counters at 0, so the comparison was always
# `0 == 0` and depended on variables left over from earlier cells.
num_tokens_sents_textonly = sum(len(sent) for sent in en1893_sents_textonly_tokenized)
num_tokens_chapts_textonly = sum(len(sent) for sent in en1893_chapts_textonly_tokenized)

num_tokens_sents_textonly == num_tokens_chapts_textonly

True

In [116]:
en1893_tokens_from_sents_textonly = flatten_list(en1893_sents_textonly_tokenized)
en1893_tokens_from_chapts_textonly = flatten_list(en1893_chapts_textonly_tokenized)

en1893_tokens_from_sents_textonly == en1893_tokens_from_chapts_textonly

True

In [117]:
en1893_tokens_from_sents_textonly[57618:57623]

['?1', 'Or,', 'forsooth,', 'the', 'life']

In [118]:
en1893_tokens_from_chapts_textonly[57618:57623]

['?1', 'Or,', 'forsooth,', 'the', 'life']

In [119]:
for idx, token in enumerate(en1893_tokens_from_chapts_textonly):
    if en1893_tokens_from_chapts_textonly[idx] != en1893_tokens_from_sents_textonly[idx]:
        print(idx)

## build sent2section_name_textonly dict

In [120]:
en1893_sent2section_name_textonly = build_sent_to_section_dict(en1893_sents_textonly_tokenized,
                                                               en1893_chapts_textonly_tokenized,
                                                               idx2section_name_textonly)

In [121]:
len(en1893_sent2section_name_textonly)

8165

In [122]:
len(en1893_sents_textonly_tokenized)

8165

In [123]:
# write to file
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_textonly.json"
# with open(path_out, 'w') as fp:
#     json.dump(en1893_sent2section_name_textonly, fp)

### TODO: test sent2section_name_textonly dict

In [124]:
en1893_sent2section_name_textonly[2253]

['prose_translationbook4', 'prose_translationbook4']

In [125]:
en1893_sents_textonly[2253]

"In scenes like these, the genial soil lock up,  Or curse with barren love the man unblest, No lovely race who Ioasts to hail him sire,-  As deem the many, who, in sadness drown'd,  Oft offer victims, and, with fragrant gums,  Kindle the blazing altar, wearying heav'n  Vainly, tc fill the void reluctant womb."

In [126]:
en1893_sents_textonly[6519]

'* * * * * * * WHO, from his burning breast, a strain may strike  Meet for the boundless majesty of things ?'

In [127]:
en1893_sent2section_name_textonly[6519]

['metric_translationbook4', 'metric_translationbook5']

In [128]:
en1893_sent2section_name_textonly[6520]

'metric_translationbook5'

In [129]:
# check if any en sents overlap sections
# num_ensents_manychaps: sentences whose label is a list with more than one entry
# overlaps_same_chap: those whose entries are all the same section name
num_ensents_manychaps = 0
overlaps_same_chap = 0
for key, values in en1893_sent2section_name_textonly.items():
    if not (isinstance(values, list) and len(values) > 1):
        continue
    num_ensents_manychaps += 1
    values_set = set(values)
    if len(values_set) == 1:
        overlaps_same_chap += 1
print(num_ensents_manychaps)
print(overlaps_same_chap)

433
432


Note: 432 sentences "overlap" within the same book because each book is spread across multiple chunks when extracted from the original XML file. These sentences' section names are therefore collapsed back to a single section name.

The remaining 1 sentence is the following: "* * * * * * * WHO, from his burning breast, a strain may strike  Meet for the boundless majesty of things ?" (en1893_sents_textonly[6519]). The asterisks are from the end of Book 4. I've kept this one intact.

In [130]:
# collapse label lists that repeat a single section name into that one name
# (only existing keys are reassigned, so the dict size is stable during iteration)
for key, values in en1893_sent2section_name_textonly.items():
    if not (isinstance(values, list) and len(values) > 1):
        continue
    values_set = set(values)
    if len(values_set) == 1:
        en1893_sent2section_name_textonly[key] = "".join(values_set)

# check if any en sents overlap sections
num_ensents_manychaps = 0
overlaps_same_chap = 0
for key, values in en1893_sent2section_name_textonly.items():
    if not (isinstance(values, list) and len(values) > 1):
        continue
    num_ensents_manychaps += 1
    values_set = set(values)
    if len(values_set) == 1:
        overlaps_same_chap += 1
    else:
        print(f"Remaining sentence belonging to two sections has index {key} and is from sections {values}")
print(num_ensents_manychaps)
print(overlaps_same_chap)

Remaining sentence belonging to two sections has index 6519 and is from sections ['metric_translationbook4', 'metric_translationbook5']
1
0


# TODO: full edition experiment
1. TEST new dict (see end of notebook)
2. Run through score vec results notebook
3. Check results
4. Update numbers in paper

In [131]:
en_reconstructed_TEST, idx2section_name_TEST, lucretius_en1893_textonly_TEST = reconstruct_txt_markers(
    en1893_str_markers_newlines_split, chapteridx2chaptername)

there are 1426 footnotes in Lucretius English edition


In [132]:
# Ordered (old, new) substring rewrites applied to every reconstructed chunk.
# ORDER MATTERS: several later rules deliberately undo what an earlier, broader
# rule did (e.g. ".1" -> ". 1" followed by "Tacitus. 11" -> "Tacitus.11"), so
# new rules must be inserted with that in mind.
_RECONCILE_REPLACEMENTS = (
    (".”1", ".” 1"),
    ("tongue.2", "tongue. 2"),
    ("veneration.3", "veneration. 3"),
    ("insane.3", "insane. 3"),
    (".1", ". 1"),
    ("opinion.3", "opinion. 3"),
    (".7", ". 7"),
    (".2", ". 2"),
    (".4", ". 4"),
    ('language;"', 'language; "'),
    ('Lucretius;"', 'Lucretius; "'),
    ("?1", "? 1"),
    (".5", ". 5"),
    (".6", ". 6"),
    ("It;.", "It; ."),
    (":8", ": 8"),
    (".3", ". 3"),
    (":2", ": 2"),
    ("?2", "? 2"),
    ('ἔχειπρός.”Wakefield.', 'ἔχειπρός. ”Wakefield.'),
    ("Tacitus. 11", "Tacitus.11"),
    ('Æolia.”What', 'Æolia. ”What'),
    ('concerned).—', 'concerned). —'),
    ('.—', '. —'),
    ("nix.—Are", "nix. —Are"),
    ("improvida.—Find", "improvida. —Find"),
    (";", "; "),
    ("fleshes.”", "fleshes. ”"),
    (".]Ver", ".] Ver"),
    (":so", ": so"),
    ("flumina. —“", "flumina.—“"),
    ("armis.–Fortè", "armis. –Fortè"),
    ("—But", "— But"),
    ("—Or", "— Or"),
    ("minds.", "minds. "),
    (':"', ': "'),
    ('first.-The', 'first. -The'),
    (':)', ': )'),
    ('body. ——I', 'body.— —I'),
    ('curtains. —“', 'curtains.—“'),
    ('593. —"', '593.—"'),
    ('borne. 3', 'borne.3'),
    ('words.J', 'words. J'),
    ('compose. 1', 'compose.1'),
    ("systems.”Whewell's", "systems. ”Whewell's"),
    ('life?3', 'life? 3'),
    ("? 1  Or,", "?1 Or,"),
    ('true.]1', 'true.] 1'),
    ('Phil. i. 1. 1.', 'Phil. i. 1.1.'),
    ('up. 1 adhering', 'up.1 adhering'),
    ('6, 6. —With', '6, 6.—With'),
    ('THE YEAR. 3', 'THE YEAR.3'),
    ('elbows?3', 'elbows? 3'),
    (':—idem', ': —idem'),
    ('moisture.-Motion', 'moisture. -Motion'),
    ('ENTIRE WHOLE. 1', 'ENTIRE WHOLE.1'),
    ('sui.”Aul.Gell.', 'sui. ”Aul. Gell.'),
    ('itself?3', 'itself? 3'),
    ('sun?4', 'sun? 4'),
    ('day :—', 'day : —'),
    ('every nerve. — ', 'every nerve.—'),
    ('fury !— But', 'fury ! —But'),
    ('flame. —', 'flame.—'),
    ('concealed. —', 'concealed.—'),
    ('rose:-', 'rose: -'),
    ('exhaustion:—such', 'exhaustion: —such'),
    ('beneath.-On', 'beneath. -On'),
    ('bring:-', 'bring: -'),
    ('Corporeal:—hence', 'Corporeal: —hence'),
    (':-can', ': -can'),
    (':-a', ': -a'),
    (':-by', ': -by'),
    ('.And', '. And'),
    ('death.-Since', 'death. -Since'),
    (':-some', ': -some'),
    ('ever!"— But', 'ever!" —But'),
    ('peace. —', 'peace.—'),
    ('aid.-No', 'aid. -No'),
    ('man. —', 'man.—'),
    ('?or', '? or'),
    (':-it', ': -it'),
    ('ill. —', 'ill.—'),
    ('prevail.-Thus', 'prevail. -Thus'),
    (':-for', ': -for'),
    ('well-spring:—creed', 'well-spring: —creed'),
    (':different', ': different'),
    ('an:!', 'an: !'),
    ('v. 2. 5', 'v. 2.5'),
    ('v. 1116;', 'v.1116;'),
    ('1. 57,', '1.57,'),
    (':voluntary', ': voluntary'),
    ('ib. ;   grow', 'ib. grow'),
    ('in, v. 30', 'in, v.30'),
    ('tones ? 1', 'tones ?1'),
)


def reconcile_sents_chapts(en_reconstructed_lst):
    """
    Apply the whitespace fix-ups to each reconstructed chunk in order to match tokenization.

    Every (old, new) pair in ``_RECONCILE_REPLACEMENTS`` is applied with
    ``str.replace`` to every string in the list, in table order.

    Parameters
    ----------
    en_reconstructed_lst : list of str
        Reconstructed text chunks. The list is MODIFIED IN PLACE: each element
        is overwritten with its rewritten string.

    Returns
    -------
    list of str
        The same list object that was passed in (not a copy), so the caller's
        original name and the returned name refer to the same reconciled data.
    """
    for sent_idx, sent in enumerate(en_reconstructed_lst):
        # Chain the rewrites on a local string, then store the result once.
        for old, new in _RECONCILE_REPLACEMENTS:
            sent = sent.replace(old, new)
        en_reconstructed_lst[sent_idx] = sent
    return en_reconstructed_lst

In [133]:
# reconcile_sents_chapts rewrites the list it is given and returns that same list, so
# en_reconstructed_TEST_rec and en_reconstructed_TEST refer to the same reconciled chunks.
en_reconstructed_TEST_rec = reconcile_sents_chapts(en_reconstructed_TEST)

In [134]:
# Whitespace-tokenize every sentence.
en1893_sents_tokenized_TEST = [sent.split() for sent in en1983_sents]

In [135]:
# Whitespace-tokenize every reconstructed chunk. This reads en_reconstructed_TEST, which
# reconcile_sents_chapts has already rewritten in place (en_reconstructed_TEST_rec is the
# same list object).
en_reconstructed_tokenized_TEST = [sent.split() for sent in en_reconstructed_TEST]

In [136]:
# Total token count on the sentence side and on the chunk side.
num_tokens_sents = sum(len(sent) for sent in en1893_sents_tokenized_TEST)

num_tokens_chapts = sum(len(sent) for sent in en_reconstructed_tokenized_TEST)

In [137]:
# Show both totals, one per line; they are expected to be equal once the chunks are reconciled.
print(num_tokens_sents, num_tokens_chapts, sep="\n")

202634
202634


In [138]:
# Flatten both tokenizations so they can be compared token by token.
# NOTE(review): flatten_list comes from preprocessing_functions; presumably it turns a
# list of token lists into one flat list of tokens — confirm there.
en1983_tokens_from_sents_TEST = flatten_list(en1893_sents_tokenized_TEST)
en1983_tokens_from_chapts_TEST = flatten_list(en_reconstructed_tokenized_TEST)

In [139]:
# Exact equality of the sentence-derived and chunk-derived token sequences.
en1983_tokens_from_sents_TEST == en1983_tokens_from_chapts_TEST

True

In [140]:
# Inspect tokens 61-64 of the chunk-derived sequence.
en1983_tokens_from_chapts_TEST[61:65]

['tongue.', '2', 'As', 'to']

In [141]:
# Print the position of every token that differs between the two sequences
# (prints nothing when they agree everywhere).
for idx, token in enumerate(en1983_tokens_from_chapts_TEST):
    if en1983_tokens_from_sents_TEST[idx] == token:
        continue
    print(idx)

In [142]:
# Build the sentence-index -> section-name lookup that the cells below spot-check.
# NOTE(review): this calls build_sent_to_section_dict (defined outside this part of the
# notebook), not the build_sent_to_section_dict_NEW variant defined further down.
en1893_sent2section_name_TEST = build_sent_to_section_dict(
    en1893_sents_tokenized_TEST, en_reconstructed_tokenized_TEST, idx2section_name_TEST)

In [143]:
# Locate the closing commentary sentence: print each matching sentence and then its index.
for idx, item in enumerate(en1983_sents):
    if "Thus abruptly ends" not in item:
        continue
    print(item, idx, sep="\n")

Thus abruptly ends the description of the plague, and the poem  of Lucretius.
8576


In [144]:
# Section label of sentence 8576, the "Thus abruptly ends ..." sentence found above.
en1893_sent2section_name_TEST[8576]

'prose_translationbook6commentary'

In [145]:
# Section label of sentence 8614 (its text is displayed in the next cell).
en1893_sent2section_name_TEST[8614]

'metric_translationbook1'

In [146]:
# Text of sentence 8614, to compare against its section label.
en1983_sents[8614]

'PARENT of ROME ! by gods and men beloved,  Benignant VENUS!'

In [147]:
# Locate the reconstructed chunk containing the same opening words: print the chunk and then its index.
for idx, item in enumerate(en_reconstructed_TEST_rec):
    if "PARENT of ROME ! by" not in item:
        continue
    print(item, idx, sep="\n")

  PARENT of ROME ! by gods and men beloved,  Benignant VENUS! thou, the sail-clad main  And fruitful earth, as round the seasons roll,  With life who swellest, for by thee all live,  And, living, hail the cheerful light of day : —  Thee, goddess, at thy glad approach, the winds.  The tempests fly : dedalian Earth to thee  Pours forth her sweetest flow'rets: Ocean laughs,  And the blue heavens in cloudless splendour decked.  For, when the Spring first opes her frolic eye,  And genial zephyrs long locked up respire,  Thee, goddess, then, th' aerial birds confess,  To rapture stung through every shivering plume:  Thee, the wild herds ;  hence, o'er the joyous glebe  Bounding at large;  or, with undaunted chest,  Stemming the torrent tides. Through all that lives  So, by thy charms, thy blandishments overpowered,  Springs the warm wish thy footsteps to pursue:  Till through the seas, the mountains, and the floods,  The verdant meads, and woodlands filled with song  Spurred by desire each p

In [148]:
# Section name recorded for reconstructed chunk 2007.
idx2section_name_TEST[2007]

'metric_translationbook0title'

In [149]:
# Text of reconstructed chunk 2007, to compare against its section name.
en_reconstructed_TEST_rec[2007]

'BY JOHN MASON GOOD.\n '

In [150]:
# Number of tokenized chunks.
len(en_reconstructed_tokenized_TEST)

3399

In [151]:
# Number of reconciled chunks; each chunk was tokenized once, so this should equal the count above.
len(en_reconstructed_TEST_rec)

3399

In [152]:
def build_sent_to_section_dict_NEW(lst_tokenized_sents, lst_tokenized_chapts,
                               dict_chapter_2_section):
    """
    Build dict of sentence idx to section name.

    The sentences are walked in order while their tokens are counted against
    the token lists of the chunks ("chapters"), also taken in order. Only the
    LENGTHS of the token lists are used; token text is never compared.

    Parameters
    ----------
    lst_tokenized_sents : list of list
        One token list per sentence.
    lst_tokenized_chapts : list of list
        One token list per chunk, in the same overall token order.
    dict_chapter_2_section : mapping or sequence
        Indexed by chunk position; gives that chunk's section name.

    Returns
    -------
    dict
        Sentence index -> section name when the sentence ends inside (or
        exactly at the end of) the chunk it started in; otherwise a list of
        the section names of every chunk the sentence runs through, in order.

    Notes
    -----
    The index of every boundary-crossing sentence is printed.
    If the sentences hold more tokens than the chunks, indexing past the last
    chunk raises (IndexError when lst_tokenized_chapts is a list).
    """
    sent_idx_2_section_name = {}
    token_counter = 0  # tokens consumed so far inside the current chunk
    current_section_idx = 0
    for idx_sent, sent in enumerate(lst_tokenized_sents):
        token_counter += len(sent)
        current_chapter_length = len(lst_tokenized_chapts[current_section_idx])
        if token_counter <= current_chapter_length:
            # sentence ends inside, or exactly at the end of, the current chunk
            sent_idx_2_section_name[idx_sent] = dict_chapter_2_section[current_section_idx]
        else:
            # we've crossed at least one chunk boundary
            print(idx_sent)
            touched_sections = [dict_chapter_2_section[current_section_idx]]
            # Keep advancing until the leftover tokens fit in the chunk we are in;
            # a single step is not enough when the sentence spans several chunks.
            while token_counter > current_chapter_length:
                token_counter -= current_chapter_length
                current_section_idx += 1
                current_chapter_length = len(lst_tokenized_chapts[current_section_idx])
                touched_sections.append(dict_chapter_2_section[current_section_idx])
            sent_idx_2_section_name[idx_sent] = touched_sections
        if token_counter == current_chapter_length:
            # The sentence ended exactly on a chunk boundary (also possible right
            # after a crossing): start the next sentence in the next chunk.
            token_counter = 0
            current_section_idx += 1
    return sent_idx_2_section_name

In [153]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/en1893_sent2section_dict_jul25_NEW.json"
# with open(path_out, 'w') as fp:
#     json.dump(en1893_sent2section_name_TEST, fp)

## TEST DICT: sent2section_name full edition

In [154]:
# Section label of sentence 11678 (its text is displayed in the next cell).
en1893_sent2section_name_TEST[11678]

'metric_translationbook4note'

In [155]:
# Text of sentence 11678, to compare against its section label.
en1983_sents[11678]

"As the remaining portion of Good's Translation of this book has already  been given at page 182, it has been deemed unnecessary to reprint it in its  place here."

In [156]:
# Number of sentence indices that received a section label.
len(en1893_sent2section_name_TEST)

14648

In [157]:
# Eyeball every 100th sentence: 1-based position, section label, sentence text, separator.
# NOTE(review): the stored output under this cell has no separator lines, so it was
# produced by an earlier version of the cell.
for sent_idx in range(0, len(en1893_sent2section_name_TEST), 100):
    print(
        sent_idx+1,
        en1893_sent2section_name_TEST[sent_idx],
        en1983_sents[sent_idx],
        "=============",
        sep="\n",
    )

1
forewordbook0title
REMARKS
101
forewordbook0
Statius did him more justice, when he spoke of the  docti furor arduus Lucret, “ the lofty rage of the learned  Lucretius.”6
201
forewordbook0
vision ends.
301
forewordbook0
It is therefore probable that there are many  worlds of many kinds.
401
forewordbook0
His work, says  Lachmann, was mercenary;
501
prose_translationbook1note
3 Gliding constellations of heaven.]
601
prose_translationbook1note
Ver. 35. Æterno vulnere amoris.
701
prose_translationbook1
as when  at Aulis the chosen leaders of the Greeks, the chief of men,  foully stained the altar of the virgin Trivia with the blood  of Iphigenia.
801
prose_translationbook1note
but who will second  him ?
901
prose_translationbook1note
I Further, the same force, &c.] Ver. 239.
1001
prose_translationbook1note
3 Corroding salt.]
1101
prose_translationbook1note
To Berkeley has been imputed a thousand and a  thousand times that which he never believed or imagined.
1201
prose_translationbook1
B