In [370]:
import re
import json
import stanza
import argparse

import pandas as pd

from itertools import chain
from ast import literal_eval

In [2]:
def load_txt_as_lst(path_in):
    output_lst = []
    with open(path_in, "rt") as f:
        for line in f:
            output_lst.append(line)
    return output_lst

Do differently: use thuc segmented on full string, then use num of tokens to compare back to full text with chapter annotations.

# Greek text from David's file

In [3]:
pathin_thuc = "/home/craig.car/spring2023/data/cts-sections.jsonl.gz"

In [4]:
thuc_raw = pd.read_json(
    pathin_thuc,
    lines=True,
    compression='gzip')

In [5]:
thuc_grc_df = thuc_raw.loc[thuc_raw['book']=='urn:cts:greekLit:tlg0003.tlg001.perseus-grc2']


In [6]:
thuc_grc_df.head()

Unnamed: 0,book,id,loc,seq,text,cites,group,lang,title,translation,wlang,work
0,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.1.1,0,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν\...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.1.2,1,κίνησις γὰρ αὕτη μεγίστη δὴ τοῖς Ἕλλησιν ἐγένε...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.1.3,2,τὰ γὰρ πρὸ αὐτῶν καὶ τὰ ἔτι παλαίτερα σαφῶς μὲ...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
3,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.2.1,3,φαίνεται γὰρ ἡ νῦν Ἑλλὰς καλουμένη οὐ πάλαι βε...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
4,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.2.2,4,"τῆς γὰρ ἐμπορίας οὐκ οὔσης, οὐδ’ ἐπιμειγνύντες...","[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001


In [7]:
thuc_grc_df.loc[thuc_grc_df["loc"]=="urn:cts:greekLit:tlg0003.tlg001:1.36.4", ["text"]]

Unnamed: 0,text
132,τοιαῦτα μὲν οἱ Κερκυραῖοι εἶπον: οἱ δὲ Κορίνθι...


In [8]:
thuc_grc_series = thuc_grc_df['text'].replace('\n',' ', regex=True).str.strip()

In [9]:
thuc_grc_series[132]

'τοιαῦτα μὲν οἱ Κερκυραῖοι εἶπον: οἱ δὲ Κορίνθιοι μετ’ αὐτοὺς τοιάδε.'

In [10]:
def concatenate_txt(txt_series):
    '''
    Conveinplace= to str (in case of NaN present as float) and concatenates rows 
    into one continuous string
    '''
    # convert all rows to string
    txt_series = txt_series.apply(str)
    # join into a single string
    return ' '.join(txt_series)

def split_txt(txt_str, lang):
    if lang=='en':
        # split keeping split char
        split_lst = re.split("([;:])", txt_str)
        delimiters = ";:"
    else:
        split_lst = re.split("([;:.])", txt_str)
        delimiters = ";:."
    delete_idx = []
    for idx, token in enumerate(split_lst):
        if token in delimiters:
            split_lst[idx-1] = split_lst[idx-1]+split_lst[idx]
            delete_idx.append(idx)
    for index in sorted(delete_idx, reverse=True):
        del split_lst[index]
    return split_lst

def run_stanza(en_str):
    '''
    returns sentences as list
    '''
    # load stanza model for en
    nlp = stanza.Pipeline(lang='en', processors='tokenize')
    doc = nlp(en_str)
    return [sentence.text for sentence in doc.sentences]

def flatten_list(nested_list):
    return list(chain.from_iterable(nested_list))

def segment_series(txt_series, lang):
    '''
    Modified from segment_en in preprocess_functions.py:
        uses split_txt function (works for en or el)
        removes text between % (Faroosh's comments)
    '''
    # join into one str
    series_str = concatenate_txt(txt_series)
    if lang == 'el':
        # remove "\n" from text
        
        # split on ;:. for el
        return split_txt(series_str, lang)
    # TODO: update split_txt() for German and Persian
    else:
        # run stanza
        series_sents = run_stanza(series_str)
        # split further on ;:
        series_split = []
        for sent in series_sents:
            series_split.append(split_txt(sent, lang))
        return flatten_list(series_split)

def preprocess_series(txt_series, lang, keep_speaker_label, speaker_label_names):
    # convert all rows to string
    txt_series = txt_series.apply(str)
    # remove whitespace at beginning and end
    txt_series = txt_series.str.strip()
    # remove speaker labels if present
    # TODO: simpler way to do this?
    if keep_speaker_label == False:
        if lang == 'el':
            for idx, item in enumerate(txt_series):
                if item.startswith('Σωκράτης.'):
                    txt_series.loc[idx] = txt_series.loc[idx].lstrip('Σωκράτης.')
                elif item.startswith('Κρίτων.'):
                    txt_series.loc[idx] = txt_series.loc[idx].lstrip('Κρίτων.')
        else:
            for idx, item in enumerate(txt_series):
                if txt_series.loc[idx].startswith(speaker_label_names[0]):
                    txt_series.loc[idx] = txt_series.loc[idx].lstrip(speaker_label_names[0])
                elif txt_series.loc[idx].startswith(speaker_label_names[1]):
                    txt_series.loc[idx] = txt_series.loc[idx].lstrip(speaker_label_names[1])
        # remove whitespace at beginning and end
        txt_series = txt_series.str.strip()
    # split text into sentences
    series_split = segment_series(txt_series, lang)
    
    # save as df and change col name
    series_df = pd.DataFrame(series_split)
    series_df.columns = ['text']
    # remove whitespace at beginning and end
    series_df['text'] = series_df['text'].str.strip()
    # drop rows with NaN
    series_df.dropna(how='any', inplace=True)
    # # drop rows with empty strings
    # series_df.drop(series_df.loc[series_df['text']==''].index, inplace=True)
    # send to list
    series_lst = list(series_df['text'])

    return series_lst
    # return series_split

def write_file(input_lst, name_out):
    filename = name_out
    with open(filename, 'w') as file:
        for sentence in input_lst:
            file.write(f"{sentence}\n")


In [11]:
thuc_grc_series = thuc_grc_df['text'].apply(str)

In [12]:
thuc_grc_series = thuc_grc_series.replace('\n',' ', regex=True).str.strip()

In [13]:
thuc_grk_processed = preprocess_series(thuc_grc_series, "el",
                                       True, None)
    

In [14]:
# add ' and ] back to end of sentence they belong to (moved to next sentence by sentence splitting on .)
for idx, sent in enumerate(thuc_grk_processed):
    if sent.startswith("’ "):
        thuc_grk_processed[idx-1] += "’"
        thuc_grk_processed[idx] = thuc_grk_processed[idx][2:]
    elif sent.startswith("]"):
        thuc_grk_processed[idx-1] += "]"
        thuc_grk_processed[idx] = thuc_grk_processed[idx][2:]

In [15]:
len(thuc_grk_processed)

6098

In [379]:
# delete last row (empty)
thuc_grk_processed[-1]

'[ὅταν ὁ μετὰ τοῦτο τὸ θέρος χειμὼν τελευτήσῃ, ἓν καὶ εἰκοστὸν ἔτος πληροῦται.]'

In [336]:
del thuc_grk_processed[-1]

In [337]:
len(thuc_grk_processed)

6097

In [384]:
thuc_grk_processed[-1]

'[ὅταν ὁ μετὰ τοῦτο τὸ θέρος χειμὼν τελευτήσῃ, ἓν καὶ εἰκοστὸν ἔτος πληροῦται.]'

In [16]:
for idx, sent in enumerate(thuc_grk_processed):
    thuc_grk_processed[idx] = thuc_grk_processed[idx].replace(" .", ".")

In [17]:
thuc_grk_processed[3279]

'ἐκκλησίαν δὲ ποιήσαντας τοὺς στρατηγοὺς καὶ τοὺς πρυτάνεις πρῶτον περὶ τῆς εἰρήνης.'

In [386]:
# thuc_grk_processed is original
# thuc_grk_processed_1 is after stripping whitespaces
# thuc_grk_processed_3 is after moving last ] to previous sent
# grk_pathout = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/thuc_grk_processed_3.txt"
# write_file(thuc_grk_processed, grk_pathout)


# Thuc Fr translation: no chapters

In [19]:
path_thuc_fr = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/thuc_fr_1863_str.txt"
thuc_fr_lst_raw = load_txt_as_lst(path_thuc_fr)


In [20]:
thuc_fr_lst_raw[:5]

['NOTICE BIOGRAPHIQUE. \n',
 '\n',
 'Les seuls renseignements certains que nous possédions sur la personne de Thucydide se tirent de quelques passages de son livre. Les autres données qui se rencontrent çà et là, notamment dans les deux biographies, dont l’une est attribuée à Marcellinus et l’autre est anonyme, sont d’une date trop récente pour avoir beaucoup d’autorité. Aussi, sans entrer dans des détails d’un intérêt secondaire, nous bornerons-nous à rapporter les circonstances les plus essentielles de la vie de Thucydide, celles qui ont eu quelque influence sur sa carrière d’historien. \n',
 '\n',
 'Thucydide s’est nommé en plusieurs endroits de son ouvrage, comme s’il eût craint que le titre ne se perdit. En tête du livre, il prend la qualité de citoyen d’Athènes; une seule fois (Liv. IV, chap. civ), il ajoute à son nom celui de son père Oloros; c’est lorsqu’il se cite luimême en qualité de fonctionnaire public. \n']

In [21]:
thuc_fr_lst_raw[0] + "»"

'NOTICE BIOGRAPHIQUE. \n»'

In [22]:
thuc_fr_str = " ".join(thuc_fr_lst_raw)
thuc_fr_str = re.sub("« ", "«", thuc_fr_str)
thuc_fr_str = re.sub(" »", "»", thuc_fr_str)

In [27]:
def split_txt(txt_str, lang):
    if lang=='el':
        split_lst = re.split("([;:.])", txt_str)
        delimiters = ";:."
    else:
        # split keeping split char
        split_lst = re.split("([;:])", txt_str)
        # includes punctuation that caused errors: update for different texts/languages
        delimiters = ";:"        
    # add delimiters back to previous token
    delete_idx = []
    for idx, phrase in enumerate(split_lst):
        if phrase in delimiters:
            split_lst[idx-1] = split_lst[idx-1]+split_lst[idx]
            delete_idx.append(idx)
    for index in sorted(delete_idx, reverse=True):
        del split_lst[index]
    
    split_lst_no_newlines = []
    for idx, phrase in enumerate(split_lst):
        split_lst_no_newlines.extend(phrase.split("\n"))
    # return split_lst
    return split_lst_no_newlines

def run_stanza(text_str, lang, model_):
    '''
    returns sentences as list
    '''
    doc = model_(text_str)
    return [sentence.text for sentence in doc.sentences]

def flatten_list(nested_list):
    return list(chain.from_iterable(nested_list))

def segment_series(txt_str, lang, stanza_model):
    if lang == 'el':
        # split on ;:. for el
        return split_txt(txt_str, lang)
    else:
        # run stanza
        series_sents = run_stanza(txt_str, lang, stanza_model)
        # split further on ;:
        series_split = []
        trailing_punct = "»"
        for sent_idx, sent in enumerate(series_sents):
            # if sent in trailing_punct:
            #     print(f"idx is {sent_idx} and sent is {sent}")
            #     print(series_split[-1])
            #     series_split[-1][0] = series_split[-1][0]+sent
            # else:
            new_sent = split_txt(sent, lang)
            series_split.append(new_sent)
        return flatten_list(series_split)

In [28]:
def preprocess_series(txt_str, lang, stanza_model, keep_speaker_label, speaker_label_names):
    # split text into sentences
    series_split = segment_series(txt_str, lang, stanza_model)
    print("segmented str into sentences")
    
    # save as df and change col name
    series_df = pd.DataFrame(series_split)
    series_df.columns = ['text']
    
    # remove whitespace at beginning and end
    series_df['text'] = series_df['text'].str.strip()

    # drop rows with NaN
    series_df.dropna(how='any', inplace=True)

    # drop rows with empty strings
    series_df.drop(series_df.loc[series_df['text']==''].index, inplace=True)

    # send to list
    series_lst = list(series_df['text'])

    return series_lst
    # return series_split

## Load Stanza
To use GPU, from https://github.com/stanfordnlp/stanza/issues/530
(Have not done this myself yet - need new env for python version)

Environment (please complete the following information):
- OS: Ubuntu 20.10
- Python version: Python 3.8.5 [GCC 7.3.0] :: Anaconda, Inc. on linux
- Stanza version: 1.1.1

Additional context
- PyTorch installed successfully with conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch.



In [390]:
# load stanza model for lang
lang_ = "fr"
stanza_model_ = stanza.Pipeline(lang=lang_, processors='tokenize', use_gpu=True)

2023-04-29 15:14:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-04-29 15:14:45 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |

2023-04-29 15:14:45 INFO: Use device: cpu
2023-04-29 15:14:45 INFO: Loading: tokenize
2023-04-29 15:14:45 INFO: Loading: mwt
2023-04-29 15:14:45 INFO: Done loading processors!


In [391]:
thuc_fr_processed = preprocess_series(thuc_fr_str, "fr", stanza_model_,
                                      True, None)

segmented str into sentences


In [392]:
len(thuc_fr_processed)

17267

In [393]:
# add trailing » to previous sentence
delete_idx = []
trailing_punct = "»"
for sent_idx, sent in enumerate(thuc_fr_processed):
    # if » was split into its own sentence
    if sent == trailing_punct:
        thuc_fr_processed[sent_idx-1] = thuc_fr_processed[sent_idx-1]+trailing_punct
        delete_idx.append(sent_idx)
    elif sent.startswith(trailing_punct):
        # if » was split to the beginning of the following sentence (followed by whitespace)
        thuc_fr_processed[sent_idx-1] = thuc_fr_processed[sent_idx-1]+trailing_punct
        thuc_fr_processed[sent_idx] = thuc_fr_processed[sent_idx][1:]
for index in sorted(delete_idx, reverse=True):
    del thuc_fr_processed[index]

In [394]:
len(thuc_fr_processed)

17203

In [189]:
def write_file(input_lst, name_out):
    filename = name_out
    with open(filename, 'w') as file:
        for sentence in input_lst:
            file.write(f"{sentence}\n")


In [389]:
# # 3 is after adding splitting on newlines at end of split_text()
# # 4 is after adding » to preceding token (to avoid splitting onto new line)
# # 5 is after moving » that was at start of a sentence to previous sentence
# path_out = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/thuc_fr_1863_processed_5.txt"
# write_file(thuc_fr_processed, path_out)


# Thuc Fr translation: with chapters

In [38]:
thuc_fr_chap_raw = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/thuc_fr_str_with_chapter.txt"


In [39]:
thuc_fr_chap_lst = load_txt_as_lst(thuc_fr_chap_raw)

In [40]:
thuc_fr_chap_lst[:5]

['NOTICE BIOGRAPHIQUE. \n',
 '\n',
 'Les seuls renseignements certains que nous possédions sur la personne de Thucydide se tirent de quelques passages de son livre. Les autres données qui se rencontrent çà et là, notamment dans les deux biographies, dont l’une est attribuée à Marcellinus et l’autre est anonyme, sont d’une date trop récente pour avoir beaucoup d’autorité. Aussi, sans entrer dans des détails d’un intérêt secondaire, nous bornerons-nous à rapporter les circonstances les plus essentielles de la vie de Thucydide, celles qui ont eu quelque influence sur sa carrière d’historien. \n',
 '\n',
 'Thucydide s’est nommé en plusieurs endroits de son ouvrage, comme s’il eût craint que le titre ne se perdit. En tête du livre, il prend la qualité de citoyen d’Athènes; une seule fois (Liv. IV, chap. civ), il ajoute à son nom celui de son père Oloros; c’est lorsqu’il se cite luimême en qualité de fonctionnaire public. \n']

In [41]:
thuc_fr_chap_str = " ".join(thuc_fr_chap_lst)

In [42]:
thuc_fr_chap_str = re.sub("« ", "«", thuc_fr_chap_str)
thuc_fr_chap_str = re.sub(" »", "»", thuc_fr_chap_str)

## Split by section (including chapters)

In [43]:
thuc_fr_chap_str_split = thuc_fr_chap_str.split("#@$%")

In [44]:
len(thuc_fr_chap_str_split)

1865

In [45]:
# define patterns
book_pattern = re.compile(" book=[0-9] ")
chap_pattern = re.compile(" book=[0-9],chapter=[0-9]+ ")
notes_pattern = re.compile(" section=[0-9] ")
index_pattern = re.compile(" back= ")

## logic
- If after book_pattern: Book summary and table of contents
- If after chap_pattern: Text from chapter number specified in tag
- Before book 1: foreward and table of contents. After book=8,chapter=109: notes, etc

In [268]:
txt_no_markers = []
txt_no_markers.append(thuc_fr_chap_str_split[0])
# counter to track idx of txt_no_markers
txt_no_markers_counter = 0

chap_num_2_idx_thuc_fr_chap_sents = {}
idx_thuc_fr_chap_sents_2_section_name = {}
idx_thuc_fr_chap_sents_2_section_name[0] = "foreword"

book_counter = 0
chapter_counter = 0
notes_book_counter = 0

# we know that thuc_fr_chap_str_split[0] is foreword
for section_idx, section in enumerate(thuc_fr_chap_str_split[1:]):
    # if section is a book marker
    if re.match(book_pattern, section) != None:
        # increment book counter
        book_counter += 1
        # reset chapter_counter
        chapter_counter = 1
        # following section is book introduction and table of contents
        # add to txt_no_markers but do not mark as chapter
        txt_no_markers.append(thuc_fr_chap_str_split[section_idx+2])
        # add to idx_thuc_fr_chap_sents_2_section_name
        txt_no_markers_counter += 1
        idx_thuc_fr_chap_sents_2_section_name[txt_no_markers_counter] = ("book "+str(book_counter)+" introduction")
        
    # if section is a chapter marker
    elif re.match(chap_pattern, section) != None:
        # following section is chapter text
        txt_no_markers.append(thuc_fr_chap_str_split[section_idx+2])
        # add to dict b/c is chapter
        chap_num = str(book_counter)+","+str(chapter_counter)
        chap_num_2_idx_thuc_fr_chap_sents[chap_num] = txt_no_markers_counter
        # increment chapter counter
        chapter_counter += 1
        # update txt_no_markers idx counter
        txt_no_markers_counter += 1
        idx_thuc_fr_chap_sents_2_section_name[txt_no_markers_counter] = chap_num
    
    elif re.match(notes_pattern, section) != None:
        # following section contains notes per chapter
        txt_no_markers.append(thuc_fr_chap_str_split[section_idx+2])
        txt_no_markers_counter += 1
        notes_book_counter += 1
        book_num = str(notes_book_counter)
        idx_thuc_fr_chap_sents_2_section_name[txt_no_markers_counter] = "book "+book_num+" notes"
    
    elif re.match(index_pattern, section) != None:
        # following (and last) section contains index
        txt_no_markers.append(thuc_fr_chap_str_split[section_idx+2])
        txt_no_markers_counter += 1
        idx_thuc_fr_chap_sents_2_section_name[txt_no_markers_counter] = "index"
    
    else:
        pass

In [269]:
# values are idx of chapters
chapter_indices = list(chap_num_2_idx_thuc_fr_chap_sents.values())
chapter_indices = sorted(chapter_indices)

In [270]:
idx_thuc_fr_chap_sents_2_section_name[318]

'3,67'

In [271]:
for sent_idx, sent in enumerate(txt_no_markers):
    txt_no_markers[sent_idx] = txt_no_markers[sent_idx].replace("-·", "- ·")
    # txt_no_markers[sent_idx] = txt_no_markers[sent_idx].replace(".", ". ")
for sent_idx, sent in enumerate(txt_no_markers):
    txt_no_markers[sent_idx] = txt_no_markers[sent_idx].replace("« ", "«")
    txt_no_markers[sent_idx] = txt_no_markers[sent_idx].replace(" »", "»")

#### fix missing whitespace in token 69581

In [272]:
txt_no_markers[318] = ' \n \n LXVII. «Nous sommes entrés dans ces détails, ô Lacédëmoniens, afin de motiver à vos yeux la sentence que vous allez rendre, et de légitimer plus encore aux nôtres la vengeance qui nous anime. Ne vous laissez pas attendrir par l’énumération de leurs anciens services, si tant est qu’ils soient réels. Les bienfaits passés peuvent être un moyen de défense pour les victimes d’une injustice; mais ils doivent attirer une double animadversion sur les auteurs d’actes infâmes, parce que leur crime est un démenti donné à leurs mérites précédents. Que leurs doléances et leurs supplications ne leur soient d’aucun secours, non plus que leurs appels aux sépulcres de vos pères et à leur propre abandon. A notre tour, nous évoquerons notre jeunesse impitoyablement massacrée, elle dont les pères sont morts à Goronée pour entraîner dans votre parti la Béotie, ou, vieux et délaissés dans leurs demeures solitaires, vous supplient bien plus fortement de les venger. La pitié n’est due qu’à l’infortune imméritée; une souffrance aussi juste que la leur doit être au contraire un sujet de joie. \n \n «Pour ce qui est de leur isolement actuel, ils ne doivent  l’imputer qu’à eux-mêmes. Ils ont sciemment repoussé les meilleurs alliés, foulé aux pieds les plus saintes lois par un esprit de haine plutôt que de justice. Même aujourd’hui la satisfaction qu’ils nous auront donnée ne sera pas équivalente à leur crime; elle sera fixée par la loi, car ce n’est point, comme ils le disent, à la suite d’un combat et les mains étendues qu’ils se sont livrés, mais en vertu d’une convention formelle et en se soumettant à un jugement. \n \n «Lacédémoniens, prêtez main forte à la loi des Grecs, qu’ils ont violée; et, comme nous avons souffert de cette violation, récompensez le zèle dont nous avons fait preuve. Qu’il ne soit pas dit que nous avons été supplantés dans votre amitié par la séduction de leurs discours. Montrez’ aux Grecs par un grand exemple qu’à vos yeux le langage ne prévaudra jamais sur les actes : louables, une courte mention leur suffit; coupables, (??)l leur faut de belles phrases pour voile. Mais si des chefs, comme vous aujourd’hui, savent établir contre les coupables des jugements expéditifs, on cherchera moins à pallier des actions criminelles par des discours pompeux.» \n \n '


# Build dict of sent id to chapter name, using running number of tokens

### TODO: newline splitting no longer necessary? (See new, additional splitting on newlines after split_txt)

In [122]:
len(thuc_fr_processed)

17203

In [123]:
thuc_fr_processed[246].split("\n")

['[<i) TILiJe, 11, Γ» 76 cl 612.']

In [124]:
thuc_fr_processed_splitnewline = []
for sent in thuc_fr_processed:
    thuc_fr_processed_splitnewline.extend(sent.split("\n"))

In [125]:
len(thuc_fr_processed_splitnewline)

17203

In [126]:
thuc_fr_processed_splitnewline[0].split()

['NOTICE', 'BIOGRAPHIQUE.']

In [127]:
thuc_fr_processed == thuc_fr_processed_splitnewline

True

### Tokenize both docs (thuc_fr_processed and txt_no_markers)

In [273]:
thuc_sents_tokenized = []
for idx, sent in enumerate(thuc_fr_processed):
    thuc_sents_tokenized.append(sent.split())

In [274]:
len(thuc_sents_tokenized)

17203

In [275]:
thuc_chaps_tokenized = []
for idx, sent in enumerate(txt_no_markers):
    thuc_chaps_tokenized.append(sent.split())

In [276]:
len(thuc_chaps_tokenized)

933

In [277]:
num_tokens_sents = 0
for sent in thuc_sents_tokenized:
    num_tokens_sents += len(sent)

In [278]:
num_tokens_chapts = 0
for sent in thuc_chaps_tokenized:
    num_tokens_chapts += len(sent)

In [279]:
num_tokens_chapts == num_tokens_sents

True

In [280]:
num_tokens_chapts

244871

In [281]:
num_tokens_sents

244871

### Find data errors in tokenized docs: difference in length of tokens = 20 initially

Data errors that I fixed in txt file (to which print out from xml is saved)
- 29475 (book 1, chapter 136) "égal;qu’enfin" --> no whitespace after ;
- book 2, chapter 13: 'Potidée;·', '—', 'dans' --> added whitespace after ;
- ...

All missing whitespaces after ; In one case it was after .

In [282]:
thuc_tokens_from_sents = flatten_list(thuc_sents_tokenized)

In [283]:
len(thuc_tokens_from_sents)

244871

In [284]:
thuc_tokens_from_chapts = flatten_list(thuc_chaps_tokenized)

In [285]:
len(thuc_tokens_from_chapts)

244871

In [286]:
thuc_tokens_from_chapts[113628:113630]

['héros;»', 'eh']

In [287]:
thuc_tokens_from_sents[113628:113630]

['héros;»', 'eh']

In [288]:
for idx, token in enumerate(thuc_tokens_from_chapts):
    if token != thuc_tokens_from_sents[idx]:
        print(idx)

### build dict sent idx to section name

In [289]:
#### OLD VERSION - does not account for sents that pass chapter boundaries ####
#### resulting dict for fr text is the same with this function and new function - sents don't cross chapters? ####

def build_sent_to_section_dict(lst_tokenized_sents, lst_tokenized_chapts,
                               dict_chapter_2_section):
    """
    Build dict of sentence idx to section name
    """
    sent_idx_2_section_name = {}
    token_counter = 0 # per section/chapter
    current_section_idx = 0
    for idx_sent, sent in enumerate(lst_tokenized_sents):
        token_counter += len(sent)
        current_chapter_length = len(lst_tokenized_chapts[current_section_idx])
        if token_counter <= current_chapter_length:
            # add sent to dict
            sent_idx_2_section_name[idx_sent] = dict_chapter_2_section[current_section_idx]
        else:
            # reset token counter and move on to next chapter/section
            token_counter = len(sent)
            current_section_idx += 1
            # add sent to dict
            sent_idx_2_section_name[idx_sent] = dict_chapter_2_section[current_section_idx]
            
    return sent_idx_2_section_name

In [290]:
def build_sent_to_section_dict_2(lst_tokenized_sents, lst_tokenized_chapts,
                               dict_chapter_2_section):
    """
    Build dict of sentence idx to section name
    """
    sent_idx_2_section_name = {}
    token_counter = 0 # per section/chapter
    current_section_idx = 0
    # have_match = False

    for idx_sent, sent in enumerate(lst_tokenized_sents):
        token_counter += len(sent)
        current_chapter_length = len(lst_tokenized_chapts[current_section_idx])
        
        if token_counter < current_chapter_length:
            # add sent to dict
            sent_idx_2_section_name[idx_sent] = dict_chapter_2_section[current_section_idx]
        elif token_counter == current_chapter_length:
            # # change flag
            # have_match = True
            # add sent to dict as part of current section
            sent_idx_2_section_name[idx_sent] = dict_chapter_2_section[current_section_idx]
            # reset token counter and current section idx for next sent iteration
            token_counter = 0
            current_section_idx += 1
        else: # token_counter > current_chapter_length, i.e. we've crossed a section boundary 
            # add sent to current section and next section
            sent_idx_2_section_name[idx_sent] = [
                dict_chapter_2_section[current_section_idx], 
                dict_chapter_2_section[current_section_idx+1]
            ]
            # adjust token counter by only including portion of sent in new section
            token_counter = token_counter - current_chapter_length
            # update current section idx for next sent iteration
            current_section_idx += 1
            
#         else:
#             if have_match == True:
#                 # add sent to current section
#                 sent_idx_2_section_name[idx_sent] = dict_chapter_2_section[current_section_idx]
#                 # reset token counter fully
#                 token_counter = len(sent)
#                 print("+++++++++++++++++++++++++++++++++++")
#             else:
#                 print(f"sent idx is {idx_sent}")
#                 print(f"token counter is {token_counter} and sent length is {len(sent)}")
#                 print(f"current chapter length is {current_chapter_length}")
#                 print(f"current section idx is {current_section_idx}")
#                 # add sent to current section and next section
#                 sent_idx_2_section_name[idx_sent] = [
#                     dict_chapter_2_section[current_section_idx], 
#                     dict_chapter_2_section[current_section_idx+1]
#                 ]
#                 # adjust token counter by including portion of sent in new section only
#                 token_counter = token_counter - current_chapter_length
#             # update current section idx for both cases
#             current_section_idx += 1
#             # reset have_match in case it was True at beginning of if statement
#             have_match = False
        
    return sent_idx_2_section_name

### get dict for fr sents

In [291]:
fr_sent_2_section_name_OLD = build_sent_to_section_dict(
    thuc_sents_tokenized, thuc_chaps_tokenized, idx_thuc_fr_chap_sents_2_section_name)

In [292]:
fr_sent_2_section_name = build_sent_to_section_dict_2(
    thuc_sents_tokenized, thuc_chaps_tokenized, idx_thuc_fr_chap_sents_2_section_name)


In [293]:
fr_sent_2_section_name_OLD == fr_sent_2_section_name

True

In [294]:
len(fr_sent_2_section_name) == len(thuc_sents_tokenized)

True

## get dict for greek sents, using thuc_grk_processed

In [295]:
def tokenize(lst_text):
    tokenized = []
    for idx, sent in enumerate(lst_text):
        tokenized.append(sent.split())
    return tokenized

### tokenize thuc_grk_processed (per sent)

In [339]:
grk_sents_tokenized = tokenize(thuc_grk_processed)

In [340]:
len(thuc_grk_processed) == len(grk_sents_tokenized)

True

In [341]:
len(thuc_grk_processed)

6097

### Get grk by chapter

In [342]:
thuc_grc_df.head()

Unnamed: 0,book,id,loc,seq,text,cites,group,lang,title,translation,wlang,work
0,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.1.1,0,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν\...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.1.2,1,κίνησις γὰρ αὕτη μεγίστη δὴ τοῖς Ἕλλησιν ἐγένε...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.1.3,2,τὰ γὰρ πρὸ αὐτῶν καὶ τὰ ἔτι παλαίτερα σαφῶς μὲ...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
3,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.2.1,3,φαίνεται γὰρ ἡ νῦν Ἑλλὰς καλουμένη οὐ πάλαι βε...,"[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001
4,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2,urn:cts:greekLit:tlg0003.tlg001.perseus-grc2:1...,urn:cts:greekLit:tlg0003.tlg001:1.2.2,4,"τῆς γὰρ ἐμπορίας οὐκ οὔσης, οὐδ’ ἐπιμειγνύντες...","[book, chapter, section]",Thucydides,grc,History of the Peloponnesian War,False,grc,urn:cts:greekLit:tlg0003.tlg001


In [343]:
book_nums = [x for x in range(1, 9)]
chap_nums = [146, 103, 116, 135, 116, 105, 87, 109]
book_to_number_chaps = dict(zip(book_nums, chap_nums))
book_to_number_chaps

{1: 146, 2: 103, 3: 116, 4: 135, 5: 116, 6: 105, 7: 87, 8: 109}

In [344]:
book_to_number_chaps[1] + 1

147

In [345]:
grk_txt_by_chapter = []
grk_chap_idx_2_chap_name = {}
idx_counter = 0

for book_idx in range(1, 9):
    number_chapters = book_to_number_chaps[book_idx] + 1
    book_txt_by_chapter = []
    for chapter_num in range(1, number_chapters):
        loc_tag = "urn:cts:greekLit:tlg0003.tlg001:" + str(book_idx) + "." + str(chapter_num) + "."
        chapter_text = concatenate_txt(thuc_grc_df[thuc_grc_df['loc'].str.startswith(loc_tag)]['text'].replace('\n',' ', regex=True))
        book_txt_by_chapter.append(chapter_text)
        # add to dict. chap name format: "booknum,chapnum"
        chapter_name = str(book_idx) + "," + str(chapter_num)
        grk_chap_idx_2_chap_name[idx_counter] = chapter_name
        idx_counter += 1
    # add chapters to full text list
    grk_txt_by_chapter.extend(book_txt_by_chapter)
        

In [346]:
len(grk_txt_by_chapter) == sum(chap_nums)

True

In [347]:
grk_txt_by_chapter[-1]

'ὁ δὲ Τισσαφέρνης αἰσθόμενος καὶ τοῦτο τῶν Πελοποννησίων τὸ ἔργον καὶ οὐ μόνον τὸ ἐν τῇ Μιλήτῳ καὶ Κνίδῳ ʽκαὶ ἐνταῦθα γὰρ αὐτοῦ ἐξεπεπτώκεσαν οἱ φρουροἴ, διαβεβλῆσθαί τε νομίσας αὐτοῖς σφόδρα καὶ δείσας μὴ καὶ ἄλλο τι ἔτι βλάπτωσι, καὶ ἅμα ἀχθόμενος εἰ Φαρνάβαζος ἐξ ἐλάσσονος χρόνου καὶ δαπάνης δεξάμενος αὐτοὺς κατορθώσει τι μᾶλλον τῶν πρὸς τοὺς Ἀθηναίους, πορεύεσθαι διενοεῖτο πρὸς αὐτοὺς ἐπὶ τοῦ Ἑλλησπόντου, ὅπως μέμψηταί τε τῶν περὶ τὴν Ἄντανδρον γεγενημένων καὶ τὰς διαβολὰς καὶ περὶ τῶν Φοινισσῶν νεῶν καὶ τῶν ἄλλων ὡς εὐπρεπέστατα ἀπολογήσηται. καὶ ἀφικόμενος πρῶτον ἐς Ἔφεσον θυσίαν ἐποιήσατο τῇ Ἀρτέμιδι.  [ὅταν ὁ μετὰ τοῦτο τὸ θέρος χειμὼν τελευτήσῃ, ἓν καὶ εἰκοστὸν ἔτος πληροῦται.] '

In [348]:
len(grk_chap_idx_2_chap_name)

917

In [349]:
len(grk_chap_idx_2_chap_name) == len(grk_txt_by_chapter)

True

### TODO: not necessary anymore? send grk txt by chapter to file, to edit whitespace errors

In [350]:
# grk_txt_to_edit_path = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/grk_text_by_chapter.txt"
# write_file(grk_txt_by_chapter, grk_txt_to_edit_path)

### Tokenize grk text by chapter

In [351]:
# TODO: not necessary anymore?
# # load grk text by chapter from file, where whitespace errors have been corrected
# grk_txt_by_chapter_cleaned = load_txt_as_lst(grk_txt_to_edit_path)

In [352]:
for sent_idx, sent in enumerate(grk_txt_by_chapter):
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(".", ". ")
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(":", ": ")
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(";", "; ")

for sent_idx, sent in enumerate(grk_txt_by_chapter):
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(". ’", ".’")
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(": ’", ":’")
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(". ]", ".]")
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace("\n", " ")
    # grk_txt_by_chapter_cleaned[sent_idx] = re.sub(". ’", ".’", grk_txt_by_chapter_cleaned[sent_idx])
    # grk_txt_by_chapter_cleaned[sent_idx] = re.sub("\n", " ", grk_txt_by_chapter_cleaned[sent_idx])
    # grk_txt_by_chapter_cleaned[sent_idx] = re.sub(". ]", ".]", grk_txt_by_chapter_cleaned[sent_idx])

for sent_idx, sent in enumerate(grk_txt_by_chapter):    
    grk_txt_by_chapter[sent_idx] = grk_txt_by_chapter[sent_idx].replace(" . . .", ".")

In [353]:
# re.sub(". ’", ".’", grk_txt_by_chapter_cleaned[35])

In [354]:
# grk_txt_by_chapter[482]

In [355]:
# grk_chaps_tokenized = tokenize(grk_txt_by_chapter_cleaned)
grk_chaps_tokenized = tokenize(grk_txt_by_chapter)

In [356]:
len(grk_chaps_tokenized) == len(grk_txt_by_chapter)

True

### Find data errors in tokenized docs for Grk: difference in length = 104

All were due to missing whitespace after punctuation

In [357]:
num_tokens_sents = 0
for sent in grk_sents_tokenized:
    num_tokens_sents += len(sent)
num_tokens_sents

150155

In [358]:
num_tokens_chaps = 0
for sent in grk_chaps_tokenized:
    num_tokens_chaps += len(sent)
num_tokens_chaps

150155

In [359]:
grk_tokens_from_sents = flatten_list(grk_sents_tokenized)
len(grk_tokens_from_sents)

150155

In [360]:
grk_tokens_from_chaps = flatten_list(grk_chaps_tokenized)
len(grk_tokens_from_chaps)

150155

In [361]:
grk_tokens_from_chaps[150154:150156]

['πληροῦται.]']

In [362]:
grk_tokens_from_sents[150154:150156]

['πληροῦται.]']

In [363]:
for idx, token in enumerate(grk_tokens_from_chaps):
    if token != grk_tokens_from_sents[idx]:
        print(idx)

### get dict for grk sents

In [364]:
#### Original code to fix bugs in building grk dict. Moved to function, build_sent_to_section_dict_2() #####

# grk_sent_2_section_name = {}
# token_counter = 0 # per section/chapter
# current_section_idx = 0
# have_match = False

# for idx_sent, sent in enumerate(grk_sents_tokenized):
#     token_counter += len(sent)
#     current_chapter_length = len(grk_chaps_tokenized[current_section_idx])
#     if token_counter < current_chapter_length:
#         # add sent to dict
#         grk_sent_2_section_name[idx_sent] = grk_chap_idx_2_chap_name[current_section_idx]
#     elif token_counter == current_chapter_length:
#         grk_sent_2_section_name[idx_sent] = grk_chap_idx_2_chap_name[current_section_idx]
#         # print("+++++++")
#         # print(f"token_counter == current chap length. sent idx is {idx_sent}")
#         # print(f"current section idx is {current_section_idx}")
#         have_match = True
#     else:
#         # print("======")
#         # print(f"moved to new section. sent idx is {idx_sent}")
#         # print(f"section idx was {current_section_idx}")
#         # print(f"token counter was {token_counter} and chap length was {current_chapter_length}")
        
#         if have_match == True:
#             # add sent to next section only
#             grk_sent_2_section_name[idx_sent] = grk_chap_idx_2_chap_name[current_section_idx+1]
#             # reset token counter fully
#             token_counter = len(sent)
            
#         else:
#             # add sent to current section and next section
#             grk_sent_2_section_name[idx_sent] = [
#                 grk_chap_idx_2_chap_name[current_section_idx], 
#                 grk_chap_idx_2_chap_name[current_section_idx+1]
#             ]
#             # adjust token counter by including portion of sent in new section only
#             token_counter = token_counter - current_chapter_length
            
#         # update current section idx for both cases
#         current_section_idx += 1
#         # reset have_match in case it was True at beginning of if statement
#         have_match = False
        
#         # current_chapter_length = len(grk_chaps_tokenized[current_section_idx])
        
#     # print(f"current sent_idx is {idx_sent}")
#     # print(f"sent length is {len(sent)}")
#     # print(f"token counter is {token_counter}")
#     # print(f"section length is {current_chapter_length}")
#     # print(grk_sent_2_section_name[idx_sent])


In [365]:
grk_sent_2_section_name = build_sent_to_section_dict_2(
    grk_sents_tokenized, grk_chaps_tokenized, grk_chap_idx_2_chap_name)

In [366]:
thuc_grk_processed[229]

'τοιαῦτα μὲν οἱ Κερκυραῖοι εἶπον:'

In [367]:
grk_sent_2_section_name[229]

'1,36'

In [368]:
grk_txt_by_chapter[35]

'‘καὶ ὅτῳ τάδε ξυμφέροντα μὲν δοκεῖ λέγεσθαι, φοβεῖται δὲ μὴ δι’ αὐτὰ πειθόμενος τὰς σπονδὰς λύσῃ, γνώτω τὸ μὲν δεδιὸς αὐτοῦ ἰσχὺν ἔχον τοὺς ἐναντίους μᾶλλον φοβῆσον, τὸ δὲ θαρσοῦν μὴ δεξαμένου ἀσθενὲς ὂν πρὸς ἰσχύοντας τοὺς ἐχθροὺς ἀδεέστερον ἐσόμενον, καὶ ἅμα οὐ περὶ τῆς Κερκύρας νῦν τὸ πλέον ἢ καὶ τῶν Ἀθηνῶν βουλευόμενος, καὶ οὐ τὰ κράτιστα αὐταῖς προνοῶν, ὅταν ἐς τὸν μέλλοντα καὶ ὅσον οὐ παρόντα πόλεμον τὸ αὐτίκα περισκοπῶν ἐνδοιάζῃ χωρίον προσλαβεῖν ὃ μετὰ μεγίστων καιρῶν οἰκειοῦταί τε καὶ πολεμοῦται.   τῆς τε γὰρ Ἰταλίας καὶ Σικελίας καλῶς παράπλου κεῖται, ὥστε μήτε ἐκεῖθεν ναυτικὸν ἐᾶσαι Πελοποννησίοις ἐπελθεῖν τό τε ἐνθένδε πρὸς τἀκεῖ παραπέμψαι, καὶ ἐς τἆλλα ξυμφορώτατόν ἐστιν.   βραχυτάτῳ δ’ ἂν κεφαλαίῳ, τοῖς τε ξύμπασι καὶ καθ’ ἕκαστον, τῷδ’ ἂν μὴ προέσθαι ἡμᾶς μάθοιτε:  τρία μὲν ὄντα λόγου ἄξια τοῖς Ἕλλησι ναυτικά, τὸ παρ’ ὑμῖν καὶ τὸ ἡμέτερον καὶ τὸ Κορινθίων:  τούτων δὲ εἰ περιόψεσθε τὰ δύο ἐς ταὐτὸν ἐλθεῖν καὶ Κορίνθιοι ἡμᾶς προκαταλήψονται, Κερκυραίοις τε καὶ Πελοποννησ

In [369]:
grk_chap_idx_2_chap_name[35]

'1,36'

### write sent to section name dicts to json

In [374]:
# path_out = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/grk_sent_2_section_name_dict.json"
# with open(path_out, 'w') as fp:
#     json.dump(grk_sent_2_section_name, fp)

In [373]:
# path_out = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/fr_sent_2_section_name_dict.json"
# with open(path_out, 'w') as fp:
#     json.dump(fr_sent_2_section_name, fp)

### get grk sents that cross chapter boundaries
Sentences here: https://docs.google.com/spreadsheets/d/1cQJ9Ypt_h4cHS-0ABbKfDHAE57ZLU32IONAadqdifaU/edit#gid=1836213835

In [375]:
values = list(grk_sent_2_section_name.values())

In [383]:
num = 5677
for lst in [thuc_grk_processed, grk_sent_2_section_name]:
    print(lst[num])

# get chap idx by searching for sent in grk_txt_by_chapter.txt
chap = 850
print(grk_txt_by_chapter[chap])
print(grk_txt_by_chapter[chap+1])

ἀγανακτῶν δὲ ὁ μὲν Τισσαφέρνης ἀπεχώρησεν ἀπ’ αὐτῶν δι’  ὀργῆς καὶ ἄπρακτος, οἱ δ’ ἐς τὴν Ῥόδον ἐπικηρυκευομένων ἀπὸ τῶν δυνατωτάτων ἀνδρῶν τὴν γνώμην εἶχον πλεῖν, ἐλπίζοντες νῆσόν τε οὐκ ἀδύνατον καὶ ναυβατῶν πλήθει καὶ πεζῷ προσάξεσθαι, καὶ ἅμα ἡγούμενοι αὐτοὶ ἀπὸ τῆς ὑπαρχούσης ξυμμαχίας δυνατοὶ ἔσεσθαι Τισσαφέρνην μὴ αἰτοῦντες χρήματα τρέφειν τὰς ναῦς.
['8,43', '8,44']
οἱ δὲ Ἀθηναῖοι ταῖς ἐκ τῆς Σάμου ναυσὶ πάσαις, ὡς ᾔσθοντο τὰ τῆς ναυμαχίας, πλεύσαντες ἐς τὴν Σύμην καὶ ἐπὶ μὲν τὸ ἐν τῇ Κνίδῳ ναυτικὸν οὐχ ὁρμήσαντες, οὐδ᾽ ἐκεῖνοι ἐπ’ ἐκείνους, λαβόντες δὲ τὰ ἐν τῇ Σύμῃ σκεύη τῶν νεῶν καὶ Λωρύμοις τοῖς ἐν τῇ ἠπείρῳ προσβαλόντες ἀπέπλευσαν ἐς τὴν Σάμον.   ἅπασαι δ’ ἤδη οὖσαι ἅμα ἐν τῇ Κνίδῳ αἱ τῶν Πελοποννησίων νῆες ἐπεσκευάζοντό τε εἴ τι ἔδει καὶ πρὸς τὸν Τισσαφέρνην (παρεγένετο γάρ) λόγους ἐποιοῦντο οἱ ἕνδεκα ἄνδρες τῶν Λακεδαιμονίων περί τε τῶν ἤδη πεπραγμένων, εἴ τι μὴ ἤρεσκεν αὐτοῖς, καὶ περὶ τοῦ μέλλοντος πολέμου, ὅτῳ τρόπῳ ἄριστα καὶ ξυμφορώτατα ἀμφοτέροις πολεμήσεται.   μάλι

In [380]:
# idx of Grk sents that cross chapter boundaries (verified)
for idx, value in enumerate(values):
    if isinstance(value,list):
        print(idx)

155
162
2257
3577
5677


In [None]:
def read_alignments(fin):
    """
    function built by vecalign. see:
    https://github.com/caro28/vecalign/blob/master/dp_utils.py
    """
    
    alignments = []
    with open(fin, 'rt', encoding="utf-8") as infile:
        for line in infile:
            fields = [x.strip() for x in line.split(':') if len(x.strip())]
            if len(fields) < 2:
                raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except:
                raise Exception('Failed to parse line "%s"' % line.strip())
            alignments.append((src, tgt))

    # I know bluealign files have a few entries entries missing,
    #   but I don't fix them in order to be consistent previous reported scores
    return alignments

In [None]:
# get vecalign results
vec_rslts_path = "/home/craig.car/spring2023/data/align_noisy_data/thucydides/el2fr_rslts_0418.txt"
el_2_fr_vec_rslts = read_alignments(vec_rslts_path)

In [None]:
el_2_fr_vec_rslts[:5]

In [None]:
type(grk_sent_2_section_name[5449])

In [None]:
fr_sent_2_section_name[13809]

In [None]:
fr_sent_2_section_name[el_2_fr_vec_rslts[0][1][0]]

In [None]:
import numpy as np

In [None]:
# fr_section_names = np.unique(np.array(list(fr_sent_2_section_name.values())))

In [None]:
fr_extraneous_sections = ['book 1 introduction', 'book 1 notes', 'book 2 introduction',
       'book 2 notes', 'book 3 introduction', 'book 3 notes',
       'book 4 introduction', 'book 4 notes', 'book 5 introduction',
       'book 5 notes', 'book 6 introduction', 'book 6 notes',
       'book 7 introduction', 'book 7 notes', 'book 8 introduction',
       'book 8 notes', 'foreword', 'index']

## TODO: check rslts are correct

In [None]:
tp_strict = 0 # +1 per alignment if there's an exact match
tp_lax = 0 # +1 per alignment if there's any overlap
overlaps = []
errors = []
fr_extraneous_sections = ['book 1 introduction', 'book 1 notes', 'book 2 introduction',
       'book 2 notes', 'book 3 introduction', 'book 3 notes',
       'book 4 introduction', 'book 4 notes', 'book 5 introduction',
       'book 5 notes', 'book 6 introduction', 'book 6 notes',
       'book 7 introduction', 'book 7 notes', 'book 8 introduction',
       'book 8 notes', 'foreword', 'index']
correct_nulls = 0

for idx_align, alignment in enumerate(el_2_fr_vec_rslts):
    src_sents = alignment[0]
    tgt_sents = alignment[1]
    # get set of chapters from src, then from tgt
    chapters_from_src = set()
    chapters_from_tgt = set()
    # if alignment is null on src side, then chapters_from_src remains empty set
    if src_sents != []:
        for src_id in src_sents:
            if isinstance(grk_sent_2_section_name[src_id], list):
                for section_name in grk_sent_2_section_name[src_id]:
                    chapters_from_src.add(section_name)
            else:
                chapters_from_src.add(grk_sent_2_section_name[src_id])
    # if alignment is null on tgt side, then chapters_from_tgt remains empty set
    if tgt_sents != []:
        for tgt_id in tgt_sents:
            if isinstance(fr_sent_2_section_name[tgt_id], list):
                for section_name_ in fr_sent_2_section_name[tgt_id]:
                    chapters_from_tgt.add(section_name_)
            else:
                chapters_from_tgt.add(fr_sent_2_section_name[tgt_id])
                
    # compare the sets, get tp strict and lax
    if chapters_from_src == chapters_from_tgt:
        tp_strict += 1
        print("have exact match")
    
    # account for correct null : fr extraneous sections 
    elif chapters_from_src == set():
        tgt_counter = 0
        for chapter in chapters_from_tgt:
            if chapter in fr_extraneous_sections:
                tgt_counter += 1
        # tp_strict if all tgt chapters are extraneous
        if tgt_counter == len(chapters_from_tgt):
            tp_strict += 1
            correct_nulls += 1
            print("have correct null")
        
    else:
        overlap = chapters_from_src.intersection(chapters_from_tgt)
        if len(overlap) != 0:
            tp_lax += 1
            overlaps.append(alignment)
        else:
            # save errors
            error_dict = {}
            error_dict["alignment"] = alignment
            error_dict["alignmnent_idx"] = idx_align
            error_dict["src_chapters"] = chapters_from_src
            error_dict["tgt_chapters"] = chapters_from_tgt
            errors.append(error_dict)
    
    
    

In [None]:
def score_vec_rslts_chapter_level(vr_rslts_lst, el_sent2section_dict,
                                 fr_sent2section_dict, fr_extra_section_names):

    tp_strict = 0 # +1 per alignment if there's an exact match
    tp_lax = 0 # +1 per alignment if there's any overlap
    overlaps = []
    errors = []
    correct_nulls = 0

    for idx_align, alignment in enumerate(vr_rslts_lst):
        # skip alignments null on both sides
        if alignment == ([],[]):
            continue
        else:
            src_sents = alignment[0]
            tgt_sents = alignment[1]
            # get set of chapters from src, then from tgt
            chapters_from_src = set()
            chapters_from_tgt = set()
            # if alignment is null on src side, then chapters_from_src remains empty set
            if src_sents != []:
                for src_id in src_sents:
                    if isinstance(el_sent2section_dict[src_id], list):
                        for section_name in el_sent2section_dict[src_id]:
                            chapters_from_src.add(section_name)
                    else:
                        chapters_from_src.add(el_sent2section_dict[src_id])
            # if alignment is null on tgt side, then chapters_from_tgt remains empty set
            if tgt_sents != []:
                for tgt_id in tgt_sents:
                    if isinstance(fr_sent2section_dict[tgt_id], list):
                        for section_name_ in fr_sent2section_dict[tgt_id]:
                            chapters_from_tgt.add(section_name_)
                    else:
                        chapters_from_tgt.add(fr_sent2section_dict[tgt_id])

            # compare the sets, get tp strict and lax
            if chapters_from_src == chapters_from_tgt:
                tp_strict += 1
                print("have exact match")

            # account for correct null : afr extraneous sections 
            elif chapters_from_src == set():
                tgt_counter = 0
                for chapter in chapters_from_tgt:
                    if chapter in fr_extra_section_names:
                        tgt_counter += 1
                # tp_strict if all tgt chapters are extraneous
                if tgt_counter == len(chapters_from_tgt):
                    tp_strict += 1
                    correct_nulls += 1
                    print("have correct null")

            else:
                overlap = chapters_from_src.intersection(chapters_from_tgt)
                if len(overlap) != 0:
                    tp_lax += 1
                    overlaps.append(alignment)
                else:
                    # save errors
                    error_dict = {}
                    error_dict["alignment"] = alignment
                    error_dict["alignmnent_idx"] = idx_align
                    error_dict["src_chapters"] = chapters_from_src
                    error_dict["tgt_chapters"] = chapters_from_tgt
                    errors.append(error_dict)
        
    return tp_strict, tp_lax, overlaps, errors, correct_nulls



In [None]:
vec_rslts = [
            ([], [13809]),
            ([2], [154]),
            ([954], [1833, 1846, 1880]),
            ([2861], [11486]),
            ([4201], [14187]),
            ([6094], [16125]),
            ([], [])
        ]


In [None]:
fr_extraneous_sections = ['book 1 introduction', 'book 1 notes', 'book 2 introduction',
           'book 2 notes', 'book 3 introduction', 'book 3 notes',
           'book 4 introduction', 'book 4 notes', 'book 5 introduction',
           'book 5 notes', 'book 6 introduction', 'book 6 notes',
           'book 7 introduction', 'book 7 notes', 'book 8 introduction',
           'book 8 notes', 'foreword', 'index']

tp_strict_, tp_lax_, overlaps_, errors_, correct_nulls_ = score_vec_rslts_chapter_level(
    el_2_fr_vec_rslts, grk_sent_2_section_name, fr_sent_2_section_name, fr_extraneous_sections)

In [None]:
tp_strict_ - correct_nulls_

In [None]:
tp_strict_

In [None]:
correct_nulls_

In [None]:
tp_strict/len(el_2_fr_vec_rslts)

In [None]:
(tp_strict+65)/len(el_2_fr_vec_rslts)

In [None]:
tp_lax/len(el_2_fr_vec_rslts)

In [None]:
len(errors)/len(el_2_fr_vec_rslts)

##### len(errors)

In [None]:
grk_sent_2_section_name[6097]

In [None]:
fr_sent_2_section_name[16839]

In [None]:
correct_nulls

In [None]:
tp_strict

In [None]:
# thuc_fr_chap_sents = []
# for section in txt_no_markers:
#     segmented_section = preprocess_series(section, "fr", True, None)
#     thuc_fr_chap_sents.append(segmented_section) 

In [None]:
# num_sents = 0
# rslt_chaps = []
# for chap in thuc_fr_chap_sents:
#     rslt_sents = []
#     for sent in chap:
#         rslt_sents.extend(sent.split("\n"))
#     num_sents += len(rslt_sents)
#     rslt_chaps.append(rslt_sents)

In [None]:
# thuc_fr_chap_sents[39]

In [None]:
# rslt_chaps[-1][-5:]

In [None]:
# fullstring_split = load_txt_as_lst("/home/craig.car/spring2023/data/align_noisy_data/thuc_fr_1863_processed_2.txt")

In [None]:
# chap_newline_sents = flatten_list(rslt_chaps)

In [None]:
# for i, row in enumerate(zip(fullstring_split, chap_newline_sents)):
#     full_sent = row[0].strip()
#     chap_newline_sent = row[1].strip()
#     if full_sent != chap_newline_sent:
#         print(i)
#         print("full ================ sent")
#         print(full_sent)
#         print("chapter ============= sent")
#         print(chap_newline_sent)
#         break

In [None]:
# chap_newline_sents[2994]

In [None]:
# # 1+109+87+105+116+134+116+103+145+8+8+1 = 933 (counting sections from end)
# len(thuc_fr_chap_sents)

In [None]:
# thuc_fr_chap_sents[5]

## get num sentences per section

In [None]:
# # chap_num_2_length_chap = {}
# idx_thuc_fr_chap_sents_2_section_length = {}
# for idx, section in enumerate(thuc_fr_chap_sents):
#     idx_thuc_fr_chap_sents_2_section_length[idx] = len(section)

In [None]:
# idx_thuc_fr_chap_sents_2_section_length[932]

In [None]:
# len(thuc_fr_chap_sents[932])

In [None]:
# len(idx_thuc_fr_chap_sents_2_section_name)

In [None]:
# section_name_2_section_length = {}
# for key, value in idx_thuc_fr_chap_sents_2_section_name.items():
#     section_name_2_section_length[value] = idx_thuc_fr_chap_sents_2_section_length[key]

In [None]:
# # check values match across dicts
# list(idx_thuc_fr_chap_sents_2_section_length.values()) == list(section_name_2_section_length.values())


In [None]:
# len(section_name_2_section_length)

## TODO: doesn't match length of processed txt doc

In [None]:
# lengths = list(idx_thuc_fr_chap_sents_2_section_length.values())

In [None]:
# sum(lengths)

In [None]:
# # newline in sent 3 below isn't creating a line break, but it is in the processed text written to file
# thuc_fr_chap_sents[59]

In [None]:
# idx_thuc_fr_chap_sents_2_section_length[59]

## build dict of sent_idx_2_section_name using section_name_2_section_length

In [None]:
# fr_sent_id_2_section_name = {}
# last_num = -1
# # e.g. key is "1,1" and value is 6 (num of sents in 1,1)
# for key, value in section_name_2_section_length.items():
#     for num in range(1, value+1):
#         sent_idx = num+last_num
#         fr_sent_id_2_section_name[sent_idx] = key
#     last_num += value+1


In [None]:
# len(fr_sent_id_2_section_name)

In [None]:
# fr_sent_id_2_section_name