In [1]:
import re
import json
import stanza

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval

from preprocessing_functions import load_txt_as_lst, split_txt, \
run_stanza, flatten_list, segment_series, preprocess_series, write_file

# Lucretius en1893 no section markers

In [2]:
raw_str_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str.txt"
en1893_raw_lst = load_txt_as_lst(raw_str_path)

In [3]:
en1893_raw_lst[:10]

['REMARKS\n',
 '\n',
 'ON THE\n',
 '   LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n',
 '\n']

In [4]:
en1893_str = " ".join(en1893_raw_lst)

In [5]:
# load stanza model for lang
lang_ = "en"
stanza_model_ = stanza.Pipeline(lang=lang_, processors='tokenize', use_gpu=True)

2023-06-30 12:00:52 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-30 12:00:52 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-06-30 12:00:52 INFO: Using device: cpu
2023-06-30 12:00:52 INFO: Loading: tokenize
2023-06-30 12:00:52 INFO: Done loading processors!


In [6]:
en1983_sents = preprocess_series(en1893_str, "en", stanza_model_)

segmented str into sentences


In [7]:
len(en1983_sents)

13966

In [8]:
en1983_sents[:10]

['REMARKS',
 'ON THE',
 'LIFE AND POEM OF LUCRETIUS.',
 'OF the life of Lucretius but little information has reached us.',
 'Ad nos vix tenuis famæ perlabitur aura.',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.',
 '2',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.',
 'At this period, Ennius had been dead about seventy years ;',
 'Cicero was in his twelfth year;']

In [9]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_sents.txt"
# write_file(en1983_sents, path_out)

# Lucretius en1893 with section markers

In [10]:
raw_path_markers = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str_withsections.txt"
en1893_raw_lst_markers = load_txt_as_lst(raw_path_markers)

In [11]:
en1893_raw_lst_markers[:10]

['#@$% chapter= #@$%\n',
 '\n',
 '#@$%title#@$%REMARKS\n',
 '\n',
 '#@$%title#@$%ON THE\n',
 '   #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n']

In [12]:
en1893_str_markers = " ".join(en1893_raw_lst_markers)

In [13]:
en1893_str_markers[:1000]

'#@$% chapter= #@$%\n \n #@$%title#@$%REMARKS\n \n #@$%title#@$%ON THE\n    #@$%title#@$%LIFE AND POEM OF LUCRETIUS.\n   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n \n That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n \n As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n \n Concerning his famil

In [14]:
en1893_str_markers = en1893_str_markers.replace("\n", "")

In [15]:
en1893_str_markers[:1000]

'#@$% chapter= #@$%  #@$%title#@$%REMARKS  #@$%title#@$%ON THE    #@$%title#@$%LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is known. The 

## Split by section (including chapters)
Note: "#@$%" added at beginning and end of section markers when extracting text from xml

In [16]:
en1893_str_markers_split = en1893_str_markers.split("#@$%")

In [17]:
en1893_str_markers_split[:20]

['',
 ' chapter= ',
 '  ',
 'title',
 'REMARKS  ',
 'title',
 'ON THE    ',
 'title',
 'LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is kn

In [18]:
# define patterns
chapter_marker = re.compile(" chapter= ") # 4: commentary, prose translation, metric translation, index
title_marker = re.compile("title")
title_commentary = re.compile("ARGUMENT") # startswith
title_book = re.compile("BOOK")
title_index = re.compile("INDEX.")
para_end = re.compile("paragraph end")
note_start = re.compile("note")
note_end = re.compile("note_end")

In [19]:
test = "ARGUMENT. asoidau"

In [20]:
test.startswith("ARGUMENT.")

True

In [21]:
len(en1893_str_markers_split)

5765

In [22]:
chapteridx2chaptername = {}
chapteridx2chaptername[0] = "foreword"
chapteridx2chaptername[1] = "prose_translation"
chapteridx2chaptername[2] = "metric_translation"
chapteridx2chaptername[3] = "index"

In [23]:
en1893_str_markers_split[:25]

['',
 ' chapter= ',
 '  ',
 'title',
 'REMARKS  ',
 'title',
 'ON THE    ',
 'title',
 'LIFE AND POEM OF LUCRETIUS.   OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.  That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2  As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3  Concerning his family nothing is kn

## remove empty strings

In [24]:
df = pd.DataFrame(en1893_str_markers_split)
df.head()

Unnamed: 0,0
0,
1,chapter=
2,
3,title
4,REMARKS


In [25]:
df.shape

(5765, 1)

In [26]:
df.isna().sum()

0    0
dtype: int64

In [27]:
df[0].replace("   ", np.nan, inplace=True)
df[0].replace("  ", np.nan, inplace=True)
df[0].replace(" ", np.nan, inplace=True)
df[0].replace("", np.nan, inplace=True)

In [28]:
df.isna().sum()

0    1238
dtype: int64

In [29]:
df.dropna(inplace=True)

In [30]:
df.head()

Unnamed: 0,0
1,chapter=
3,title
4,REMARKS
5,title
6,ON THE


In [31]:
df.shape

(4527, 1)

In [32]:
en1893_str_markers_split = list(df[0])

In [33]:
en1893_str_markers_split[11]

' 2 I. 31; ii. 259. '

## Reconstruct text without markers

In [260]:
en_reconstructed = []
en_reconstructed_idx = -1

chapter_counter = -1
chap2idx_start = {}

book_counter = 0

idx2section_name = {}

hit_section_marker = False

lst = []

for section_idx, section in enumerate(en1893_str_markers_split):
    lst.append(hit_section_marker)
    
    if section == "":
        continue
    elif section == " ":
        continue
    elif section == "  ":
        continue
        
    # if section is chapter marker
    elif re.match(chapter_marker, section):
        # increment chapter counter
        chapter_counter += 1
        # reset book counter
        book_counter = 0
        # mark idx of en_reconstructed where chapter starts
        chap2idx_start[chapter_counter] = en_reconstructed_idx
    
    # if section is title marker
    elif re.match(title_marker, section):
        hit_section_marker = True
        # then next section is a title, which will be added to en_reconstructed
        # label as title for titles that don't match instances below (index, book, argument)
        label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "title"
        
        # if title (next item in list) starts with "INDEX"
        if en1893_str_markers_split[section_idx+1].startswith("INDEX."):
            # index is in one section only; rename label
            label_nextitem = "index"

        # if next item in list starts with "BOOK"
        elif en1893_str_markers_split[section_idx+1].startswith("BOOK"):
            # new book is starting
            book_counter += 1
            # rename label
            label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "title"
        
        # if next item in list starts with "ARGUMENT"
        elif en1893_str_markers_split[section_idx+1].startswith("ARGUMENT"):
            # label as commentary
            label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "commentary"
        
        continue
    
    # if section marks the start of a note
    # elif re.match(note_start, section):
    elif section == "note":
        if en1893_str_markers_split[section_idx+1] == "note_end":
            hit_section_marker = False
            continue
        else:
            hit_section_marker = True
            # then next section is a footnote, add to en_reconstructed
            label_nextitem = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter) + "note"
    
    elif re.match(note_end, section):
        # do nothing; just there to separate footnote from following text
        hit_section_marker = False
        continue
    
    elif re.match(para_end, section):
        # do nothing; just there to create separations in sections from raw text
        hit_section_marker = False
        continue
    
    else:
        # all markers have been excluded; append to en_reconstructed
        en_reconstructed.append(section)
        en_reconstructed_idx += 1
        
        if hit_section_marker == True:
            idx2section_name[en_reconstructed_idx] = label_nextitem
            hit_section_marker = False
            
        else:
            idx2section_name[en_reconstructed_idx] = str(chapteridx2chaptername[chapter_counter]) + "book" + str(book_counter)


In [261]:
# fix errors here, then initialize en1983_sents_tokenized again
for sent_idx, sent in enumerate(en_reconstructed):
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".”1", ".” 1")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("tongue.2", "tongue. 2")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("veneration.3", "veneration. 3")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("insane.3", "insane. 3")
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("ferrea;”", "ferrea; ”")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".1", ". 1")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".3", ". 3")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".7", ". 7")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".2", ". 2")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".4", ". 4")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace('language;"', 'language; "')
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace('Lucretius;"', 'Lucretius; "')
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(";5", "; 5")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("?1", "? 1")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".5", ". 5")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".6", ". 6")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("It;.", "It; .")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(":8", ": 8")
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(";-it", "; -it")
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(";3", "; 3")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(":2", ": 2")
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(";1", "; 1")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("?2", "? 2")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace('ἔχειπρός.”Wakefield.', 'ἔχειπρός. ”Wakefield.')
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("Tacitus. 11", "Tacitus.11")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace('Æolia.”What', 'Æolia. ”What')
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace('arisen;"', 'arisen; " ')
    # en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("nix.—Are", "nix. —Are")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".—", ". —")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(";", "; ")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("fleshes.”", "fleshes. ”")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(".]Ver", ".] Ver")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace(":so", ": so")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("flumina. —“", "flumina.—“")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("armis.–Fortè", "armis. –Fortè")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("—But", "— But")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("—Or", "— Or")
    en_reconstructed[sent_idx] = en_reconstructed[sent_idx].replace("minds.", "minds. ")
    

# Tokenize both docs

In [262]:
en1983_sents_tokenized = []
for idx, sent in enumerate(en1983_sents):
    en1983_sents_tokenized.append(sent.split())

In [263]:
en_reconstructed_tokenized = []
for idx, sent in enumerate(en_reconstructed):
    en_reconstructed_tokenized.append(sent.split())

In [264]:
num_tokens_sents = 0
for sent in en1983_sents_tokenized:
    num_tokens_sents += len(sent)

num_tokens_chapts = 0
for sent in en_reconstructed_tokenized:
    num_tokens_chapts += len(sent)

In [265]:
num_tokens_chapts == num_tokens_sents

False

In [266]:
print(num_tokens_chapts)
print(num_tokens_sents)

202617
202635


## Find data errors in tokenized docs

In [267]:
en1983_tokens_from_sents = flatten_list(en1983_sents_tokenized)
en1983_tokens_from_chapts = flatten_list(en_reconstructed_tokenized)

In [271]:
en1983_tokens_from_sents[59992:59997]

['seems:', '"', 'anima', 'atque', 'animw-videtur.-']

In [273]:
en1983_tokens_from_chapts[59992:59997]

['seems:"', 'anima', 'atque', 'animw-videtur.-', 'Vere']

In [270]:
for idx, token in enumerate(en1983_tokens_from_chapts):
    if token != en1983_tokens_from_sents[idx]:
        print(idx)

59992
59993
59994
59995
59996
59997
59998
59999
60000
60001
60002
60003
60004
60005
60006
60007
60008
60009
60010
60011
60012
60013
60014
60015
60016
60017
60018
60019
60020
60021
60022
60023
60024
60025
60026
60027
60028
60029
60030
60031
60032
60033
60034
60035
60036
60037
60038
60039
60040
60041
60042
60043
60044
60045
60046
60047
60048
60049
60050
60051
60052
60053
60054
60055
60056
60057
60058
60059
60060
60061
60062
60063
60064
60065
60066
60067
60068
60069
60070
60071
60072
60073
60074
60075
60076
60077
60078
60079
60081
60082
60083
60084
60085
60086
60087
60088
60089
60090
60091
60092
60093
60094
60095
60096
60097
60098
60099
60100
60101
60102
60103
60104
60105
60106
60107
60108
60109
60110
60111
60112
60113
60114
60115
60116
60117
60118
60119
60120
60121
60122
60123
60124
60125
60126
60127
60128
60129
60130
60131
60132
60133
60134
60135
60136
60137
60138
60139
60140
60141
60142
60143
60144
60145
60146
60147
60148
60149
60150
60151
60152
60153
60154
60155
60156
60157
60158
6015