In [1]:
import re
import json
import stanza

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval

from functions import load_txt_as_lst, split_txt, \
run_stanza, flatten_list, segment_series, preprocess_series, write_file, \
build_sent_to_section_dict

In [2]:
# Load the corpus from a JSON Lines file (lines=True: one JSON record per line).
# NOTE(review): absolute, user-specific path — the notebook only runs on a
# machine with this exact directory layout.
top10_path = "/home/craig.car/repos/chiron/align_texts_project/data/top10-redo.jsonl"
top10_df = pd.read_json(top10_path, lines=True)

In [3]:
top10_df.head()

Unnamed: 0,id,book,seq,loc,text
0,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.1,urn:cts:latinLit:phi0550.phi001.perseus-lat1,0,urn:cts:latinLit:phi0550.phi001:1.1,"Aeneadum genetrix, hominum divomque voluptas,\n"
1,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.2,urn:cts:latinLit:phi0550.phi001.perseus-lat1,1,urn:cts:latinLit:phi0550.phi001:1.2,"alma Venus, caeli subter labentia signa\n"
2,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.3,urn:cts:latinLit:phi0550.phi001.perseus-lat1,2,urn:cts:latinLit:phi0550.phi001:1.3,"quae mare navigerum, quae terras frugiferentis\n"
3,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.4,urn:cts:latinLit:phi0550.phi001.perseus-lat1,3,urn:cts:latinLit:phi0550.phi001:1.4,"concelebras, per te quoniam genus omne animant..."
4,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.5,urn:cts:latinLit:phi0550.phi001.perseus-lat1,4,urn:cts:latinLit:phi0550.phi001:1.5,concipitur visitque exortum lumina solis:\n


In [4]:
top10_df.groupby("book").count()

Unnamed: 0_level_0,id,seq,loc,text
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
urn:cts:greekLit:tlg0011.tlg002.perseus-grc2,1257,1257,1257,1257
urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,15686,15686,15686,15686
urn:cts:greekLit:tlg0012.tlg002.perseus-grc2,12107,12107,12107,12107
urn:cts:greekLit:tlg0085.tlg005.opp-grc3,1643,1643,1643,1643
urn:cts:greekLit:tlg0085.tlg005.perseus-grc2,1649,1649,1649,1649
urn:cts:latinLit:phi0550.phi001.perseus-lat1,7420,7420,7420,7420
urn:cts:latinLit:phi0959.phi006.perseus-lat2,11927,11927,11927,11927


In [5]:
# restrict the corpus to the rows of the Latin Lucretius edition
lucretius_book_urn = "urn:cts:latinLit:phi0550.phi001.perseus-lat1"
lucretius_lat = top10_df[top10_df["book"] == lucretius_book_urn]

In [6]:
lucretius_lat.head()

Unnamed: 0,id,book,seq,loc,text
0,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.1,urn:cts:latinLit:phi0550.phi001.perseus-lat1,0,urn:cts:latinLit:phi0550.phi001:1.1,"Aeneadum genetrix, hominum divomque voluptas,\n"
1,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.2,urn:cts:latinLit:phi0550.phi001.perseus-lat1,1,urn:cts:latinLit:phi0550.phi001:1.2,"alma Venus, caeli subter labentia signa\n"
2,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.3,urn:cts:latinLit:phi0550.phi001.perseus-lat1,2,urn:cts:latinLit:phi0550.phi001:1.3,"quae mare navigerum, quae terras frugiferentis\n"
3,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.4,urn:cts:latinLit:phi0550.phi001.perseus-lat1,3,urn:cts:latinLit:phi0550.phi001:1.4,"concelebras, per te quoniam genus omne animant..."
4,urn:cts:latinLit:phi0550.phi001.perseus-lat1:1.5,urn:cts:latinLit:phi0550.phi001.perseus-lat1,4,urn:cts:latinLit:phi0550.phi001:1.5,concipitur visitque exortum lumina solis:\n


In [7]:
lucretius_lat.shape

(7420, 5)

Each row corresponds to one line of text in the [Perseus version](http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.02.0130%3Abook%3D1%3Acard%3D1).

# Get Lucretius sentences, no section markings

In [8]:
# strip the newline characters from the edges of every verse line
lucretius_lat_lst = lucretius_lat["text"].str.strip("\n").tolist()

In [9]:
lucretius_lat_lst[:10]

['Aeneadum genetrix, hominum divomque voluptas,',
 'alma Venus, caeli subter labentia signa',
 'quae mare navigerum, quae terras frugiferentis',
 'concelebras, per te quoniam genus omne animantum',
 'concipitur visitque exortum lumina solis:',
 'te, dea, te fugiunt venti, te nubila caeli',
 'adventumque tuum, tibi suavis daedala tellus',
 'summittit flores, tibi rident aequora ponti',
 'placatumque nitet diffuso lumine caelum.',
 'nam simul ac species patefactast verna diei']

In [10]:
lucretius_lat_str = " ".join(lucretius_lat_lst)

In [11]:
lucretius_lat_str[:1000]

'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis: te, dea, te fugiunt venti, te nubila caeli adventumque tuum, tibi suavis daedala tellus summittit flores, tibi rident aequora ponti placatumque nitet diffuso lumine caelum. nam simul ac species patefactast verna diei et reserata viget genitabilis aura favoni, aeriae primum volucris te, diva, tuumque significant initum perculsae corda tua vi. inde ferae pecudes persultant pabula laeta et rapidos tranant amnis: ita capta lepore te sequitur cupide quo quamque inducere pergis. denique per maria ac montis fluviosque rapacis frondiferasque domos avium camposque virentis omnibus incutiens blandum per pectora amorem efficis ut cupide generatim saecla propagent. quae quoniam rerum naturam sola gubernas nec sine te quicquam dias in luminis oras exoritur neque fit laetum nequ

In [12]:
# load stanza model for Latin
# Only the 'tokenize' processor is loaded into the pipeline.
# NOTE(review): use_gpu=True is requested, but the recorded log for this cell
# reports "Using device: cpu" — the run shown here did not use a GPU.
lang_ = "la"
stanza_model_ = stanza.Pipeline(lang=lang_, processors='tokenize', use_gpu=True)

2023-07-25 15:00:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-07-25 15:00:40 INFO: Loading these models for language: la (Latin):
| Processor | Package |
-----------------------
| tokenize  | ittb    |

2023-07-25 15:00:40 INFO: Using device: cpu
2023-07-25 15:00:40 INFO: Loading: tokenize
2023-07-25 15:00:40 INFO: Done loading processors!


In [13]:
lucretius_lat_sents_stanza = preprocess_series(lucretius_lat_str, "la", stanza_model_)

segmented str into sentences


In [14]:
# Same text and same Latin stanza pipeline as the previous call, but with the
# language code "el" instead of "la".
# NOTE(review): presumably "el" makes preprocess_series take a segmentation
# path that does not use stanza (hence the "_nostanza" name) — confirm in
# functions.preprocess_series.
lucretius_lat_sents_nostanza = preprocess_series(lucretius_lat_str, "el", stanza_model_)

segmented str into sentences


In [15]:
lucretius_lat_sents_stanza == lucretius_lat_sents_nostanza

False

In [16]:
lucretius_lat_sents_stanza[45:47] 

['quippe ubi non essent genitalia corpora cuique, qui posset mater rebus consistere certa?',
 'at nunc seminibus quia certis quaeque creantur, inde enascitur atque oras in luminis exit, materies ubi inest cuiusque et corpora prima;']

In [17]:
lucretius_lat_sents_nostanza[45:47]

['quippe ubi non essent genitalia corpora cuique, qui posset mater rebus consistere certa? at nunc seminibus quia certis quaeque creantur, inde enascitur atque oras in luminis exit, materies ubi inest cuiusque et corpora prima;',
 'atque hac re nequeunt ex omnibus omnia gigni, quod certis in rebus inest secreta facultas.']

In [18]:
len(lucretius_lat_sents_stanza)

2428

In [19]:
len(lucretius_lat_sents_nostanza)

2264

In [20]:
lucretius_lat_sents_stanza[:10]

['Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis:',
 'te, dea, te fugiunt venti, te nubila caeli adventumque tuum, tibi suavis daedala tellus summittit flores, tibi rident aequora ponti placatumque nitet diffuso lumine caelum.',
 'nam simul ac species patefactast verna diei et reserata viget genitabilis aura favoni, aeriae primum volucris te, diva, tuumque significant initum perculsae corda tua vi.',
 'inde ferae pecudes persultant pabula laeta et rapidos tranant amnis:',
 'ita capta lepore te sequitur cupide quo quamque inducere pergis.',
 'denique per maria ac montis fluviosque rapacis frondiferasque domos avium camposque virentis omnibus incutiens blandum per pectora amorem efficis ut cupide generatim saecla propagent.',
 'quae quoniam rerum naturam sola gubernas nec sine te quicquam dias in luminis oras exori

In [21]:
lucretius_lat_sents_stanza[-10:]

['omnia denique sancta deum delubra replerat corporibus mors exanimis onerataque passim cuncta cadaveribus caelestum templa manebant, hospitibus loca quae complerant aedituentes.',
 'nec iam religio divom nec numina magni pendebantur enim:',
 'praesens dolor exsuperabat.',
 'nec mos ille sepulturae remanebat in urbe, quo prius hic populus semper consuerat humari;',
 'perturbatus enim totus trepidabat et unus quisque suum pro re cognatum maestus humabat.',
 'multaque res subita et paupertas horrida suasit;',
 'namque suos consanguineos aliena rogorum insuper extructa ingenti clamore locabant subdebantque faces, multo cum sanguine saepe rixantes, potius quam corpora desererentur, inque aliis alium populum sepelire suorum certantes;',
 'lacrimis lassi luctuque redibant;',
 'inde bonam partem in lectum maerore dabantur;',
 'nec poterat quisquam reperiri, quem neque morbus nec mors nec luctus temptaret tempore tali.']

In [22]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_lat_sents.txt"
# write_file(lucretius_lat_sents_stanza, path_out)

# Get Latin text, by book

In [23]:
def concatenate_txt(txt_series):
    '''
    Join all rows of a pandas Series into one space-separated string.

    Every row is passed through str() first, so non-string values
    (e.g. a float NaN) do not break the join; they appear in the result
    as their string form (NaN becomes "nan").
    '''
    return ' '.join(txt_series.apply(str))

In [24]:
def get_perseus_txt_by_book(df, cts_tag, num_books):
    '''
    Extract Perseus text in df by book.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'loc' column holding CTS locations of the form
        "<cts_tag><book>.<line>" and a 'text' column holding the text rows.
    cts_tag : str
        CTS prefix that precedes the book number in 'loc',
        e.g. "urn:cts:latinLit:phi0550.phi001:".
    num_books : int
        Number of books; books 1..num_books are extracted in order.

    Returns
    -------
    txt_by_book : list of str
        One concatenated string per book (newlines replaced by spaces).
    idx2book_name : dict
        Maps the 0-based position in txt_by_book to the name "book<N>".
    '''
    txt_by_book = []
    idx2book_name = {}
    for book_idx in range(1, num_books + 1):
        # The trailing "." anchors the match to the whole book number, so the
        # prefix for book 1 does not also select books 10-19.
        loc_tag = cts_tag + str(book_idx) + "."
        # na=False: rows with a missing 'loc' simply belong to no book.
        in_book = df['loc'].str.startswith(loc_tag, na=False)
        book_rows = df.loc[in_book, 'text'].replace('\n', ' ', regex=True)
        txt_by_book.append(concatenate_txt(book_rows))
        # add to dict. chap name format: "book<num>"
        idx2book_name[book_idx - 1] = "book" + str(book_idx)
    return txt_by_book, idx2book_name

In [25]:
# CTS prefix that precedes the book number in the 'loc' column
cts_tag_ = "urn:cts:latinLit:phi0550.phi001:"
# Extract 6 books. The second return value maps the 0-based book index to
# "book<N>" (despite the "sent2book" in its name); it is passed to
# build_sent_to_section_dict further down.
lucretius_by_book, lucretius_sent2book_name = get_perseus_txt_by_book(lucretius_lat, cts_tag_, 6)
# number of book texts returned
len(lucretius_by_book)

6

In [26]:
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_lat_bybook.txt"
# write_file(lucretius_by_book, path_out)

# Tokenize Latin sents and by book

In [27]:
# whitespace-tokenize every stanza-segmented sentence
sents_tokenized = [sentence.split() for sentence in lucretius_lat_sents_stanza]

In [28]:
# whitespace-tokenize the concatenated text of every book
books_tokenized = [book_text.split() for book_text in lucretius_by_book]

In [29]:
tokens_from_sents = flatten_list(sents_tokenized)
tokens_from_books = flatten_list(books_tokenized)

In [30]:
tokens_from_sents == tokens_from_books

True

In [31]:
len(tokens_from_sents)

49036

In [32]:
len(tokens_from_books)

49036

## Build sentence-to-book-name dict and write to JSON

In [33]:
lat_sent2bookname = build_sent_to_section_dict(sents_tokenized, books_tokenized, 
                                               lucretius_sent2book_name)

In [34]:
# serialise the sentence -> book-name mapping to disk as JSON
path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lat_sent2book_dict.json"
with open(path_out, 'w') as json_file:
    json_file.write(json.dumps(lat_sent2bookname))

## Check for sentences that cross book boundaries (there should be none)

In [35]:
values = list(lat_sent2bookname.values())

In [36]:
# print the index of every sentence whose mapped value is a list, i.e. a
# sentence that spans more than one book (verified)
crossing_sent_idxs = [
    sent_idx for sent_idx, section in enumerate(values)
    if isinstance(section, list)
]
for sent_idx in crossing_sent_idxs:
    print(sent_idx)