In [1]:
import re
import json
import stanza

import numpy as np
import pandas as pd

from itertools import chain
from ast import literal_eval

from preprocessing_functions import load_txt_as_lst, split_txt, \
run_stanza, flatten_list, segment_series

# Lucretius en1893 no section markers

In [20]:
# Location of the raw text on disk.
# NOTE(review): absolute, machine-specific path -- it has to be adjusted before
# this notebook can be run on another machine.
raw_str_path = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_str.txt"
# load_txt_as_lst is imported from preprocessing_functions; judging by the
# preview in the next cell it yields one string per line, trailing newline kept.
en1893_raw_lst = load_txt_as_lst(raw_str_path)

In [5]:
# Peek at the first ten entries of the loaded list.
en1893_raw_lst[:10]

['REMARKS\n',
 '\n',
 'ON THE\n',
 '   LIFE AND POEM OF LUCRETIUS.\n',
 '  OF the life of Lucretius but little information has reached us.  Ad nos vix tenuis famæ perlabitur aura.\n',
 '\n',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.2\n',
 '\n',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.  At this period, Ennius had been dead about seventy years ;  Cicero was in his twelfth year; twenty-five years were to  elapse before the birth of Virgil, and four before that of Julius  Caesar. His style, indeed, would make him seem older, but  its antiquated character may be partly affected, in imitation,  perhaps, of Ennius, for whom he expresses great veneration.3\n',
 '\n']

In [7]:
# Collapse the loaded entries into one string for sentence segmentation.
# The entries keep their trailing "\n" (visible in the preview output), so
# consecutive lines end up separated by a newline followed by a space.
en1893_str = " ".join(en1893_raw_lst)

In [6]:
# Load the Stanza pipeline for the target language; only the tokenize
# processor is requested.
# NOTE(review): use_gpu=True is requested, yet the captured log for this cell
# reports "Using device: cpu" -- confirm whether a GPU was expected here.
lang_ = "en"
stanza_model_ = stanza.Pipeline(lang=lang_, processors='tokenize', use_gpu=True)

2023-06-13 16:06:43 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-13 16:06:43 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-06-13 16:06:43 INFO: Using device: cpu
2023-06-13 16:06:43 INFO: Loading: tokenize
2023-06-13 16:06:43 INFO: Done loading processors!


In [38]:
def preprocess_series(txt_str, lang, stanza_model):
    """Segment ``txt_str`` and return its non-empty segments, whitespace-trimmed.

    The text is handed to ``segment_series`` together with ``lang`` and
    ``stanza_model``; whatever comes back is treated as a single column of
    strings (assumed to be one sentence per item -- confirm against
    ``segment_series``).  A progress message is printed once segmentation
    has finished.

    Args:
        txt_str: Text to segment.
        lang: Language code, forwarded unchanged to ``segment_series``.
        stanza_model: Model object, forwarded unchanged to ``segment_series``.

    Returns:
        list: Stripped segments in their original order.  Missing values and
        segments that are empty after stripping are left out.  An empty list
        is returned when segmentation yields nothing.
    """
    # split text into sentences
    series_split = segment_series(txt_str, lang, stanza_model)
    print("segmented str into sentences")

    series_df = pd.DataFrame(series_split)
    # An empty segmentation result produces a frame without columns; renaming
    # its (non-existent) column would raise, so answer with an empty list.
    if series_df.empty:
        return []
    series_df.columns = ['text']

    # remove whitespace at beginning and end
    stripped = series_df['text'].str.strip()
    # keep only rows that are neither missing nor blank; a boolean mask avoids
    # mutating the frame in place
    kept = stripped[stripped.notna() & (stripped != '')]
    return kept.tolist()

def write_file(input_lst, name_out, encoding="utf-8"):
    """Write every item of ``input_lst`` to ``name_out``, one item per line.

    Args:
        input_lst: Iterable of items; each is formatted with ``str`` semantics
            and followed by a newline.
        name_out: Path of the file to create or overwrite.
        encoding: Text encoding of the output file.  Defaults to UTF-8 so that
            non-ASCII characters are written identically regardless of the
            platform's locale.
    """
    # An explicit encoding keeps the output independent of the locale default.
    with open(name_out, 'w', encoding=encoding) as file:
        file.writelines(f"{sentence}\n" for sentence in input_lst)

In [32]:
# Reuse the language code the pipeline was built with so the two cannot drift apart.
# NOTE(review): "en1983" looks like a transposition of "en1893"; the name is kept
# because the following cells refer to it.
en1983_sents = preprocess_series(en1893_str, lang_, stanza_model_)

segmented str into sentences


In [33]:
# Number of segments kept after preprocessing.
len(en1983_sents)

13966

In [34]:
# Inspect the first ten segments.
en1983_sents[:10]

['REMARKS',
 'ON THE',
 'LIFE AND POEM OF LUCRETIUS.',
 'OF the life of Lucretius but little information has reached us.',
 'Ad nos vix tenuis famæ perlabitur aura.',
 'That he was a Roman by birth, is inferred from the pas  sages in his poem in which he speaks of the Roman world as  his country,1 and of the Roman language as his native tongue.',
 '2',
 'As to the time of his birth, it is stated by Eusebius in his  Chronicon, that he was born in the second year of the hundred  and seventy-first Olympiad, or ninety-five years before Christ.',
 'At this period, Ennius had been dead about seventy years ;',
 'Cicero was in his twelfth year;']

In [40]:
# Export step, currently disabled: uncomment both lines below to write the
# segmented sentences to disk, one per line, via write_file.
# path_out = "/home/craig.car/repos/chiron/align_texts_project/data/lucretius/lucretius_en1893_sents.txt"
# write_file(en1983_sents, path_out)