In [1]:
from ast import literal_eval
from functions import load_txt_as_lst
import pandas as pd
from collections import defaultdict

In [2]:

#### vecalign function ####
def read_alignments(fin):
    alignments = []
    with open(fin, 'rt', encoding="utf-8") as infile:
        for line in infile:
            fields = [x.strip() for x in line.split(':') if len(x.strip())]
            if len(fields) < 2:
                raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except:
                raise Exception('Failed to parse line "%s"' % line.strip())
            alignments.append((src, tgt))

    # I know bluealign files have a few entries entries missing,
    #   but I don't fix them in order to be consistent previous reported scores
    return alignments

In [3]:
th1_rslts = read_alignments("/home/craig.car/repos/chiron/chironata/data/alignments_rslts/thucydides_1863.rslts")
th2_rslts = read_alignments("/home/craig.car/repos/chiron/chironata/data/alignments_rslts/thucydides_1_1852.rslts")
el_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/src_data/urn:cts:greekLit:tlg0003.tlg001.sents")
th1_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1863.sents")
th2_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1_1852.sents")

In [4]:
def build_src_2_tgt_dict(alignments_lst):
    '''
    If alignment is null on one side, inserts "null" 
    '''
    src_id_to_tgt_ids = defaultdict(set)
    for src, tgt in alignments_lst:
        if src == []:
            src = ["null"]
        if tgt == []:
            tgt = ["null"]
        for src_id in src:
            for tgt_id in tgt:
                src_id_to_tgt_ids[src_id].add(tgt_id)
    return src_id_to_tgt_ids

def build_tgt_2_src_dict(alignments_lst):
    '''
    If alignment is null on one side, inserts "null" 
    '''
    # tgt_id_to_src_ids = defaultdict(set)
    tgt_id_to_src_ids = {}
    for src, tgt in alignments_lst:
        if src == []:
            src = ["null"]
        if tgt == []:
            tgt = ["null"]
        for tgt_id in tgt:
            for src_id in src:
                if tgt_id in tgt_id_to_src_ids.keys():
                    tgt_id_to_src_ids[tgt_id].add(src_id)
                else:
                    tgt_id_to_src_ids[tgt_id] = {src_id}
                # tgt_id_to_src_ids[tgt_id].add(src_id)
    return tgt_id_to_src_ids

In [5]:
el2th1 = build_src_2_tgt_dict(th1_rslts)
el2th2 = build_src_2_tgt_dict(th2_rslts)

th1_2_el = build_tgt_2_src_dict(th1_rslts)
th2_2_el = build_tgt_2_src_dict(th2_rslts)

## TODO: problem with iterating through alignments: how to capture []:[tgt] in th2?

In [9]:
#### iterate through el indices #### 

# for sent_idx in range(len(el2th1)):
el_series = []
th1_series = []
th2_series = []

building_src = set()
building_tgt_th1 = set()
building_tgt_th2 = set()

for sent_idx in range(35,39):
    # get tgt preds for src idx across all translations
    el_2_th1_tgt_preds = el2th1[sent_idx]
    el_2_th2_tgt_preds = el2th2[sent_idx]
    
    # get src preds for all tgt preds across all translations
    el_2_th1_src_preds = set()
    for tgt_pred in el_2_th1_tgt_preds:
        src_preds = th1_2_el[tgt_pred]
        el_2_th1_src_preds.update(src_preds)
    
    el_2_th2_src_preds = set()
    for tgt_pred in el_2_th2_tgt_preds:
        src_preds = th2_2_el[tgt_pred]
        el_2_th2_src_preds.update(src_preds)
    
    # check that no tgt preds are missing
    for src_pred in el_2_th1_src_preds:
        tgt_preds = el2th1[src_pred]
        el_2_th1_tgt_preds.update(tgt_preds)
    
    for src_pred in el_2_th2_src_preds:
        tgt_preds = el2th2[src_pred]
        el_2_th2_tgt_preds.update(tgt_preds)
    src_max = max(len(el_2_th1_src_preds), len(el_2_th2_src_preds))
    tgt_max = max(len(el_2_th1_tgt_preds), len(el_2_th2_tgt_preds))
    
    building_src.update(el_2_th1_src_preds)
    building_src.update(el_2_th2_src_preds)
    building_tgt_th1.update(el_2_th1_tgt_preds)
    building_tgt_th2.update(el_2_th2_tgt_preds)
    

print(building_src)
print(building_tgt_th1)
print(building_tgt_th2)


{35, 36, 37, 38}
{110, 111}
{89, 90}


In [11]:
### iterate through pred alignments of 1st translation ###

el_series_idx = []
th1_series_idx = []
th2_series_idx = []

building_src = set()

visited_el = set()

for alignment in th1_rslts:
    src_preds_th1 = set(alignment[0])
    tgt_preds_th1 = set(alignment[1])
    
    if src_preds_th1 == set():
        el_series_idx.append(set())
        th1_series_idx.append(tgt_preds_th1)
    else:
    
        for idx in src_preds_th1:
            if idx in visited_el:
                # print(idx)
                continue
            else:
                building_src.update(src_preds_th1)
                # get tgt from th2
                tgt_preds_th2 = set()
                for src_pred in building_src:
                    tgt_preds = el2th2[src_pred]
                    tgt_preds_th2.update(tgt_preds)
                # check for any missing src
                for tgt_pred in tgt_preds_th1:
                    src_preds = th1_2_el[tgt_pred]
                    building_src.update(src_preds)

                for tgt_pred in tgt_preds_th2:
                    src_preds = th2_2_el[tgt_pred]
                    building_src.update(src_preds)

                # get tgt for any new src:
                for src in building_src:
                    tgt_preds_th1.update(el2th1[src])
                    tgt_preds_th2.update(el2th2[src])
                # print(f"%%{building_src}, {tgt_preds_th1}, {tgt_preds_th2}")

                # get src for any new tgt:
                for tgt in tgt_preds_th1:
                    building_src.update(th1_2_el[tgt])
                for tgt in tgt_preds_th2:
                    building_src.update(th2_2_el[tgt])
                # print(f"%%{building_src}, {tgt_preds_th1}, {tgt_preds_th2}")

                # get tgt for any new src:
                for src in building_src:
                    tgt_preds_th1.update(el2th1[src])
                    tgt_preds_th2.update(el2th2[src])
                # print(f"%%{building_src}, {tgt_preds_th1}, {tgt_preds_th2}")

                # add to visited_el
                visited_el.update(building_src)

                # add to series
                el_series_idx.append(building_src)
                th1_series_idx.append(tgt_preds_th1)
                th2_series_idx.append(tgt_preds_th2)
                building_src = set()
    
# print(el_series)
# print(th1_series)
# print(th2_series)


In [12]:
el_series = []
for idx_set in el_series_idx:
    if idx_set == set():
        el_series.append("")
    else:
        row = ""
        for idx in idx_set:
            sent = el_sents[idx]
            row += f"[{idx}] {sent}\n"
        el_series.append(row)

In [13]:
len(el_series)

4631

In [14]:
len(th1_series_idx)

4631

In [16]:
#### probably not capturing []:[tgt] in th2

len(th2_series_idx)

4326

In [18]:
# ### With HTML ###

# import pandas as pd
# from IPython.display import display, HTML

# '''
# https://stackoverflow.com/questions/40990700/pandas-dataframes-in-jupyter-columns-of-equal-width-and-centered
# https://stackoverflow.com/questions/34322448/pretty-printing-newlines-inside-a-string-in-a-pandas-dataframe
# '''

# data = [
#        {'Greek': "ὃς δ᾽ ἂν ὑμῶν παραμείνῃ, ὁρῶν ὃν τρόπον ἡμεῖς τάς τε δίκας δικάζομεν καὶ τἆλλα τὴν πόλιν διοικοῦμεν, ἤδη φαμὲν τοῦτον ὡμολογηκέναι ἔργῳ ἡμῖν ἃ ἂν ἡμεῖς κελεύωμεν ποιήσειν ταῦτα, καὶ τὸν μὴ πειθόμενον τριχῇ φαμεν ἀδικεῖν, ὅτι τε γεννηταῖς οὖσιν ἡμῖν οὐ πείθεται, καὶ ὅτι τροφεῦσι, καὶ ὅτι ὁμολογήσας ἡμῖν πείσεσθαι οὔτε πείθεται οὔτε πείθει ἡμᾶς, εἰ μὴ καλῶς τι ποιοῦμεν, προτιθέντων ἡμῶν καὶ οὐκ ἀγρίως ἐπιταττόντων ποιεῖν ἃ ἂν κελεύωμεν, ἀλλὰ ἐφιέντων δυοῖν θάτερα, ἢ πείθειν ἡμᾶς ἢ ποιεῖν, τούτων οὐδέτερα ποιεῖ.", 'Fowler': '[0] But we say that whoever of you stays here, seeing how we administer justice and how we govern the state in other respects, has thereby entered into an agreement with us to do what we command;\n[1] and we say that he who does not obey does threefold wrong, because he disobeys us who are his parents, because he disobeys us who nurtured him, and because after agreeing to obey us he neither obeys us nor convinces us that we are wrong, though we give him the opportunity and do not roughly order him to do what we command, but when we allow him a choice of two things, either to convince us of error or to do our bidding, he does neither of these things.”', 'Jowett': '[0] But he who has experience of the manner in which we order justice and administer the State, and still remains, has entered into an implied contract that he will do as we command him.\n[1] And he who disobeys us is, as we maintain, thrice wrong:\n[2] first, because in disobeying us he is disobeying his parents;\n[3] secondly, because we are the authors of his education;\n[4] thirdly, because he has made an agreement with us that he will duly obey our commands;\n[5] and he neither obeys them nor convinces us that our commands are wrong;\n[6] and we do not rudely impose them, but give him the alternative of obeying or convincing us;'},
#     {'Greek': 'ταύταις δή φαμεν καὶ σέ, ὦ Σώκρατες, ταῖς αἰτίαις ἐνέξεσθαι, εἴπερ ποιήσεις ἃ ἐπινοεῖς, καὶ οὐχ ἥκιστα Ἀθηναίων σέ, ἀλλ᾽ ἐν τοῖς μάλιστα.', 'Fowler':'[2] and we say that he who does not obey does threefold wrong, because he disobeys us who are his parents, because he disobeys us who nurtured him, and because after agreeing to obey us he neither obeys us nor convinces us that we are wrong, though we give him the opportunity and do not roughly order him to do what we command, but when we allow him a choice of two things, either to convince us of error or to do our bidding, he does neither of these things.”', 'Jowett':'[7] that is what we offer and he does neither.\n[8] These are the sort of accusations to which, as we were saying, you, Socrates, will be exposed if you accomplish your intentions;\n[9] you, above all other Athenians."'}
#        ]
# df = pd.DataFrame(data)

# d = dict(selector="th",
#     props=[('text-align', 'center')])
# df.style.set_properties(**{'width':'30em', 'text-align':'left', 'white-space': 'pre-wrap'})\
#         .set_table_styles([d])