In [1]:
import re
import json
import stanza

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval

from preprocessing_functions import load_txt_as_lst, split_txt, \
run_stanza, flatten_list, segment_series, preprocess_series, write_file

# Lucretius en1893 no section markers

In [2]:
raw_str_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str.txt"
en1893_raw_lst = load_txt_as_lst(raw_str_path)

In [3]:
en1893_raw_lst[:10]

['REMARKS\n',
 '\n',
 'ON THE\n',
 '   LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n',
 '\n']

In [4]:
en1893_str = " ".join(en1893_raw_lst)

In [5]:
# load stanza model for lang
lang_ = "en"
stanza_model_ = stanza.Pipeline(lang=lang_, processors='tokenize', use_gpu=True)

2023-06-27 11:19:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-27 11:19:02 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-06-27 11:19:02 INFO: Using device: cpu
2023-06-27 11:19:02 INFO: Loading: tokenize
2023-06-27 11:19:02 INFO: Done loading processors!


In [6]:
en1983_sents = preprocess_series(en1893_str, "en", stanza_model_)

segmented str into sentences


In [7]:
len(en1983_sents)

13966

In [8]:
en1983_sents[:10]

['REMARKS',
 'ON THE',
 'LIFE AND POEM OF LUCRETIUS.',
 'OF the life of Lucretius but little information has reached us.',
 'Ad nos vix tenuis famæ perlabitur aura.',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.',
 '2',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.',
 'At this period, Ennius had been dead about seventy years ;',
 'Cicero was in his twelfth year;']

In [9]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_sents.txt"
# write_file(en1983_sents, path_out)

# Lucretius en1893 with section markers

In [10]:
raw_path_markers = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str_withsections.txt"
en1893_raw_lst_markers = load_txt_as_lst(raw_path_markers)

In [11]:
en1893_raw_lst_markers[:10]

['#@$% chapter= #@$%\n',
 '\n',
 '#@$%title#@$%REMARKS\n',
 '\n',
 '#@$%title#@$%ON THE\n',
 '   #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n']

In [12]:
en1893_str_markers = " ".join(en1893_raw_lst_markers)

In [13]:
en1893_str_markers[:1000]

'#@$% chapter= #@$%\n \n #@$%title#@$%REMARKS\n \n #@$%title#@$%ON THE\n    #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n \n That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n \n As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n \n Concerning his famil

In [14]:
en1893_str_markers = en1893_str_markers.replace("\n", "")

In [15]:
en1893_str_markers[:1000]

'#@$% chapter= #@$%  #@$%title#@$%REMARKS  #@$%title#@$%ON THE    #@$%title#@$%LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is known. The 

## Split by section (including chapters)
Note: "#@$%" added at beginning and end of section markers when extracting text from xml

In [16]:
en1893_str_markers_split = en1893_str_markers.split("#@$%")

In [17]:
en1893_str_markers_split[:20]

['',
 ' chapter= ',
 '  ',
 'title',
 'REMARKS  ',
 'title',
 'ON THE    ',
 'title',
 'LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is kn

In [18]:
# define patterns
chapter_marker = re.compile(" chapter= ") # 4: commentary, prose translation, metric translation, index
title_marker = re.compile("title")
title_commentary = re.compile("ARGUMENT") # startswith
title_book = re.compile("BOOK")
title_index = re.compile("INDEX.")
para_end = re.compile("paragraph end")
note_start = re.compile("note")
note_end = re.compile("note_end")

In [19]:
test = "ARGUMENT. asoidau"

In [20]:
test.startswith("ARGUMENT.")

True

In [31]:
len(en1893_str_markers_split)

5765

In [21]:
chapteridx2chaptername = {}
chapteridx2chaptername[0] = "foreword"
chapteridx2chaptername[1] = "prose_translation"
chapteridx2chaptername[2] = "metric_translation"
chapteridx2chaptername[3] = "index"

In [176]:
en_reconstructed = []
en_reconstructed_idx = -1

chapter_counter = -1
chap2idx_start = {}

book_counter = 0

idx2section_name = {}

for section_idx, section in enumerate(en1893_str_markers_split):
    if section == "":
        continue
    
    # if section is chapter marker
    elif re.match(chapter_marker, section):
        # increment chapter counter
        chapter_counter += 1
        # reset book counter
        book_counter = 0
        # mark idx of en_reconstructed where chapter starts
        chap2idx_start[chapter_counter] = en_reconstructed_idx
    
    # if section is title marker
    elif re.match(title_marker, section):
        # then next section is a title, which will be added to en_reconstructed
        # en_reconstructed.append(en1893_str_markers_split[section_idx+1])
        # en_reconstructed_idx += 1
        idx2section_name[en_reconstructed_idx+1] = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "title"
        
        # if title starts with "INDEX"
        if en1893_str_markers_split[section_idx+1].startswith("INDEX."):
            ### TODO: everything after this is index. add everything after to en_reconstructed, stop iteration, and update dict idx2section_name
            # en_reconstructed.append(section)
            # en_reconstructed_idx += 1
            # idx2section_name[en_reconstructed_idx] = "index"
            pass

        # if title starts with "BOOK"
        elif en1893_str_markers_split[section_idx+1].startswith("BOOK"):
            # new book is starting
            book_counter += 1
            # "BOOK" will be added to en_reconstructed
            # en_reconstructed_idx += 1
        
        # if title starts with "ARGUMENT"
        elif en1893_str_markers_split[section_idx+2].startswith("ARGUMENT"):
            # then next section is commentary
            # en_reconstructed.append(en1893_str_markers_split[section_idx+1])
            # en_reconstructed_idx += 1
            idx2section_name[en_reconstructed_idx+2] = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "commentary"
            # en_reconstructed_idx += 1
        
        continue
    
    # if section marks the start of a note
    elif re.match(note_start, section):
        # then next section is a footnote, add to en_reconstructed
        # en_reconstructed.append(en1893_str_markers_split[section_idx+1])
        # en_reconstructed_idx += 1
        idx2section_name[en_reconstructed_idx+1] = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "note"
        # en_reconstructed_idx += 1
    
    elif re.match(note_end, section):
        # do nothing; just there to separate footnote from following text
        continue
    
    elif re.match(para_end, section):
        # do nothing; just there to create separations in sections from raw text
        continue
    
    else:
        # all markers have been excluded; append to en_reconstructed
        en_reconstructed.append(section)
        if en_reconstructed_idx in idx2section_name.keys():
            pass
        else:
            idx2section_name[en_reconstructed_idx] = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter)
        en_reconstructed_idx += 1
        

In [177]:
len(en_reconstructed)

2882

In [178]:
len(idx2section_name)

2883

In [179]:
idx2section_name

{-1: 'forewordbook0',
 1: 'forewordbook0title',
 0: 'forewordbook0',
 2: 'forewordbook0title',
 3: 'forewordbook0title',
 4: 'forewordbook0note',
 5: 'forewordbook0note',
 6: 'forewordbook0note',
 7: 'forewordbook0note',
 8: 'forewordbook0note',
 9: 'forewordbook0note',
 10: 'forewordbook0note',
 11: 'forewordbook0note',
 12: 'forewordbook0note',
 13: 'forewordbook0note',
 14: 'forewordbook0note',
 15: 'forewordbook0note',
 16: 'forewordbook0note',
 17: 'forewordbook0note',
 18: 'forewordbook0note',
 19: 'forewordbook0note',
 20: 'forewordbook0note',
 21: 'forewordbook0note',
 22: 'forewordbook0note',
 23: 'forewordbook0note',
 24: 'forewordbook0note',
 25: 'forewordbook0note',
 26: 'forewordbook0note',
 27: 'forewordbook0note',
 28: 'forewordbook0note',
 29: 'forewordbook0note',
 30: 'forewordbook0note',
 31: 'forewordbook0note',
 32: 'forewordbook0note',
 33: 'forewordbook0note',
 34: 'forewordbook0note',
 35: 'forewordbook0note',
 36: 'forewordbook0note',
 37: 'forewordbook0note',
 

In [152]:
idx2section_name[2879]

'metric_translationbook6'

In [170]:
en_reconstructed[592]

' I Blind atoms.] Ver. 1103. Primordia cæcæ. The imperceptible  primary elements of all things. '

In [147]:
en_reconstructed[-5:]

[' ',
 'BOOK VI. ',
 "With rags obscene scarce covered; o'er the bones  Skin only, nought but skin; and drowned alike  Within and outwards, with putrescent grume.  At length the temples of the gods themselves,  Changed into charnels, and their sacred shrines  Thronged with the dead: for superstition now,  The power of altars, half their sway had lost,  Whelmed in the pressure of the present woe.  Nor longer now the costly rites prevailed  Of ancient burial, erst punctilious kept:  For all roved restless, with distracted mind,  From scene to scene ; and worn with grief and toil  Gave to their friends th' interment chance allowed.  And direst exigence impelled them oft,  Headlong, to deeds most impious; for the pyres  Funereal seized they, reared not by themselves,  And with loud dirge, and wailing wild, o'er these  Placed their own dead; amid th' unhallowed blaze  With blood contending, rather than resign  The tomb thus gained, or quit th' enkindling corse.  THE END.  ",
 '  ',
 "INDEX.