In [6]:
import json, os, glob
from ast import literal_eval
from functions import read_alignments, load_txt_as_lst
from collections import defaultdict

In [7]:
def build__src_2_tgt_dict(alignments_lst):
    '''
    Map each source sentence id to the target sentence ids aligned with it.

    alignments_lst: iterable of (src_ids, tgt_ids) pairs; each side is a
        (possibly empty) sequence of sentence ids.

    If alignment is null on one side, inserts "null" for that side; pairs
    that are empty on both sides are skipped.

    Returns the defaultdict that was filled in, with every value converted
    to a list so it can be written to json. Values are ordered
    deterministically (non-string ids ascending, then strings such as
    "null") so repeated runs produce identical output.

    NOTE: the returned object is still a defaultdict(set), so indexing a key
    that was never aligned returns (and inserts) an empty set instead of
    raising KeyError.
    '''
    src_id_to_tgt_ids = defaultdict(set)
    for src, tgt in alignments_lst:
        if not src and not tgt:
            # nothing aligned on either side
            continue
        if not src:
            src = ["null"]
        if not tgt:
            tgt = ["null"]
        for src_id in src:
            src_id_to_tgt_ids[src_id].update(tgt)

    # convert vals to list for writing to json; only existing keys are
    # reassigned, so the dict does not change size while being iterated
    for src_id, tgt_ids in src_id_to_tgt_ids.items():
        try:
            # ints and the "null" string cannot be compared directly, so
            # sort non-strings first and strings last
            ordered = sorted(tgt_ids, key=lambda tgt_id: (isinstance(tgt_id, str), tgt_id))
        except TypeError:
            # ids of some other unorderable type: keep set order as before
            ordered = list(tgt_ids)
        src_id_to_tgt_ids[src_id] = ordered
    return src_id_to_tgt_ids

def initialize_text_dict(src_sents_path):
    '''
    Create the top-level dict for one source text.

    Loads the source sentences from src_sents_path with load_txt_as_lst,
    strips the trailing newline from each one and stores them under the
    "src_text" key as {sentence index: sentence}, with indices starting at 0.
    '''
    raw_lines = load_txt_as_lst(src_sents_path)
    idx_to_sent = {idx: line.rstrip("\n") for idx, line in enumerate(raw_lines)}
    return {"src_text": idx_to_sent}

def build_translation_dict(aligns_rslts_path, tgt_sents_path):
    '''
    Build the dict describing one translation of a source text.

    aligns_rslts_path: alignment results file, read with read_alignments.
    tgt_sents_path: file holding the translation's sentences.

    Returns a dict with two entries:
      "aligns_idx": src idx -> tgt idx mapping from build__src_2_tgt_dict
      "tgt_text":   {tgt sentence index: sentence}, trailing newlines removed
    '''
    alignment_pairs = read_alignments(aligns_rslts_path)
    src_to_tgt = build__src_2_tgt_dict(alignment_pairs)

    # the sentence file keeps a trailing \n on each line; strip it and key
    # every sentence by its position in the file
    raw_lines = load_txt_as_lst(tgt_sents_path)
    idx_to_sent = {pos: line.rstrip("\n") for pos, line in enumerate(raw_lines)}

    return {"aligns_idx": src_to_tgt, "tgt_text": idx_to_sent}

# Build logic

In [None]:
# thuc1_aligns_rslts = "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/urn:cts:greekLit:tlg0003.tlg001_thucydides_1_1852.rslts"
# thuc2_aligns_rslts = "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/urn:cts:greekLit:tlg0003.tlg001_thucydides_1863.rslts"

In [None]:
# thuc1_aligns = read_alignments(thuc1_aligns_rslts)
# thuc2_aligns = read_alignments(thuc2_aligns_rslts)

In [None]:
# # top level: filename would be "urn:cts:greekLit:tlg0003.tlg001.json"
# thucydides = {}

# # dict for thuc1 translation
# thucydides_1_1852 = {}

In [None]:
# thucydides_1_1852["aligns_idx"] = build__src_2_tgt_dict(thuc1_aligns)

In [None]:
# thucydides_1_1852["aligns_idx"][0]

In [None]:
# thucydides["thucydides_1_1852"] = thucydides_1_1852

In [None]:
# thucydides_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/src_data/urn:cts:greekLit:tlg0003.tlg001.sents")
# thuc1_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1_1852.sents")

In [None]:
# # sending to file added newlines to end of each line - strip
# thucydides_sents = [sent.rstrip("\n") for sent in thucydides_sents]
# thuc1_sents = [sent.rstrip("\n") for sent in thuc1_sents]

In [None]:
# src_txt_keys = [num for num in range(len(thucydides_sents))]
# src_txt = dict(zip(src_txt_keys, thucydides_sents))

In [None]:
# tgt_txt_keys = [num for num in range(len(thuc1_sents))]
# print(len(tgt_txt_keys))
# tgt_txt = dict(zip(tgt_txt_keys, thuc1_sents))
# print(len(thuc1_sents))

In [None]:
# thucydides["src_text"] = src_txt
# thucydides["thucydides_1_1852"]["tgt_text"] = tgt_txt

# Build from functions

In [None]:
# Build the Thucydides dict with the helper functions instead of by hand.
# Input files: source sentences, the 1852 French translation's sentences,
# and the alignment results between the two.
thuc_sents_path = (
    "/home/craig.car/repos/chiron/chironata/data/src_data/"
    "urn:cts:greekLit:tlg0003.tlg001.sents"
)
thuc1_sents_path = (
    "/home/craig.car/repos/chiron/chironata/data/french_trans-dev/"
    "thucydides_1_1852.sents"
)
thuc1_aligns_rslts = (
    "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/"
    "urn:cts:greekLit:tlg0003.tlg001_thucydides_1_1852.rslts"
)

# top level holds "src_text"; each translation is added under its own name
thucydides = initialize_text_dict(thuc_sents_path)
thucydides["thucydides_1_1852"] = build_translation_dict(thuc1_aligns_rslts, thuc1_sents_path)

In [None]:
# Top-level keys: "src_text" plus one key per translation added so far.
thucydides.keys()

In [None]:
# Keys of a single translation entry, as set by build_translation_dict
# ("aligns_idx" and "tgt_text").
thucydides['thucydides_1_1852'].keys()

In [None]:
# Add a second translation (1863) to the same dict to check that several
# translations can sit side by side under one source text.
thuc2_sents_path = (
    "/home/craig.car/repos/chiron/chironata/data/french_trans-dev/"
    "thucydides_1863.sents"
)
thuc2_aligns_rslts = (
    "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/"
    "urn:cts:greekLit:tlg0003.tlg001_thucydides_1863.rslts"
)
thucydides["thucydides_1863"] = build_translation_dict(thuc2_aligns_rslts, thuc2_sents_path)

In [None]:
# Top-level keys again; the second translation's key should now be present.
thucydides.keys()

In [None]:
# Keys of the second translation's entry.
thucydides["thucydides_1863"].keys()

In [None]:
# Number of keys in the src idx -> tgt idx mapping for the 1863 translation.
len(thucydides["thucydides_1863"]['aligns_idx'].keys())

In [None]:
# Number of target sentences stored for the 1863 translation.
len(thucydides["thucydides_1863"]['tgt_text'].keys())

In [None]:
# The last value is empty (also in the src2tgt dict), yet no error is raised.
# NOTE: aligns_idx is the defaultdict(set) returned by build__src_2_tgt_dict,
# so indexing a key that was never aligned does not raise KeyError: it
# returns an empty set() and inserts that key as a side effect.
# NOTE(review): presumably that is why an empty value shows up here - confirm.
print(thucydides["thucydides_1863"]['aligns_idx'][4418])
print(thucydides["thucydides_1863"]['aligns_idx'][4419])

In [None]:
# Source sentence stored at index 4418.
thucydides["src_text"][4418]

In [None]:
# Load the raw alignment results file as a list to inspect its contents
# (presumably one item per line - verify against load_txt_as_lst).
test = load_txt_as_lst(thuc2_aligns_rslts)

In [None]:
# Last item of the raw alignment results.
test[-1]

In [None]:
# Number of items in the raw alignment results.
print(len(test))

# Build for all thucydides translations
To scale up: iterate over every `.sents` file in `src_data` (the source sentences) and build one JSON file per source text.

In [8]:
# Root of the chironata checkout. Defaults to the original development
# location; set the CHIRONATA_DIR environment variable to run the notebook
# against a checkout somewhere else.
chironata_dir = os.environ.get("CHIRONATA_DIR", "/home/craig.car/repos/chiron/chironata")
data_dir = os.path.join(chironata_dir, "data")

# load lookup table: maps a CTS URN to the list of its translation names
lookup_path = os.path.join(data_dir, "cts_lookup_table.json")
# explicit encoding so reading does not depend on the platform default
with open(lookup_path, encoding="utf-8") as f:
    lookup = json.load(f)

# get dir names
# The empty last component makes os.path.join append a trailing separator,
# so paths built from these by plain string concatenation stay valid.
align_rslts_dir = os.path.join(data_dir, "alignments_rslts", "")
src_data_dir = os.path.join(data_dir, "src_data", "")
fr_dir = os.path.join(data_dir, "french_trans-dev", "")
de_dir = os.path.join(data_dir, "german_trans-dev", "")
it_dir = os.path.join(data_dir, "italian_trans-dev", "")
en_dir = os.path.join(data_dir, "english_trans-dev", "")
# output directory for the per-text JSON files
pathout = os.path.join(chironata_dir, "sentence_aligned_texts", "")

In [9]:
# Sanity check of the lookup table: translation names listed for one CTS URN.
lookup["urn:cts:greekLit:tlg0551.tlg013"]

['appian_1832_10', 'appian_1830_5', 'Appianus_1828']

In [10]:
# thucydides = initialize_text_dict(thuc_sents_path)
# thuc1_sents_path = "/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1_1852.sents"
# thucydides["thucydides_1_1852"] = build_translation_dict(thuc1_aligns_rslts, thuc1_sents_path)

In [21]:
# Build one JSON file per source text. Each file is named after the text's
# CTS URN and holds the source sentences plus, for every translation that has
# alignment results, the src->tgt index mapping and the target sentences.

# Directories searched, in this order, for a translation's .sents file.
transl_dirs = (de_dir, en_dir, it_dir, fr_dir)

for src_sents_path in glob.iglob(os.path.join(src_data_dir, "*.sents")):
    # get cts urn: the file name without its .sents extension
    ctsurn = os.path.splitext(os.path.basename(src_sents_path))[0]
    path_out = os.path.join(pathout, ctsurn + ".json")
    if os.path.isfile(path_out):
        # already built on an earlier run
        continue
    if ctsurn not in lookup:
        # without a lookup entry there are no translations to attach; skip
        # this text instead of aborting the whole batch with a KeyError
        print(f"skipping {ctsurn}: no entry in lookup table")
        continue

    print(f"working on {ctsurn}")
    # initialize dict; will use cts as filename when write to json file
    text_dict = initialize_text_dict(src_sents_path)
    # get translations using lookup table
    translations = lookup[ctsurn]
    for translation in translations:
        # get alignment results; translations without them are left out
        align_rslt_filename = os.path.join(align_rslts_dir, ctsurn + "_" + translation + ".rslts")
        if not os.path.isfile(align_rslt_filename):
            continue

        # get tgt sents: take the first language directory that has the file.
        # Reset for every translation so a path found for a previous
        # translation can never be reused by mistake.
        transl_sents_filename = translation + ".sents"
        tranls_sents_path = None
        for transl_dir in transl_dirs:
            candidate = os.path.join(transl_dir, transl_sents_filename)
            if os.path.isfile(candidate):
                tranls_sents_path = candidate
                break
        if tranls_sents_path is None:
            print(f"skipping {translation}: {transl_sents_filename} not found in any translation dir")
            continue

        transl_dict = build_translation_dict(align_rslt_filename, tranls_sents_path)
        text_dict[translation] = transl_dict

    # write to json; ensure_ascii=False emits non-ASCII characters as-is, so
    # the file must be opened as UTF-8 regardless of the platform default
    with open(path_out, 'w', encoding="utf-8") as fp:
        json.dump(text_dict, fp, ensure_ascii=False)

In [67]:
# Keys of text_dict as left by the build loop
# (presumably the last source text it processed - confirm execution order).
text_dict.keys()

dict_keys(['src_text', 'thucydides_1_1852'])

In [68]:
# Keys of one translation entry inside text_dict.
text_dict['thucydides_1_1852'].keys()

dict_keys(['aligns_idx', 'tgt_text'])

In [49]:
# Target sentences of the 1852 translation, keyed by sentence index.
# NOTE(review): this displays the whole mapping; consider previewing only a
# few entries to keep the notebook output small.
text_dict['thucydides_1_1852']['tgt_text']

{0: 'I.',
 1: 'L’Athénien Thucydide a écrit l’histoire de la guerre entre les Péloponnésiens et les Athéniens et suivi toutes les phases de cette lutte.',
 2: 'Il a commencé son œuvre au début même des hostilités, prévoyant dès lors combien cette guerre serait importante, combien plus mémorable que celles qui avaient précédé :',
 3: 'il en avait pour preuve les immenses ressources de tout genre avec lesquelles les deux peuples allaient s’entre-choquer, et les dispositions des autres États de la Grèce qu’il voyait ou prendre parti immédiatement, ou méditer dès lors de le faire.',
 4: 'C’est là, en effet, le plus vaste mouvement qui jamais se soit produit chez les Grecs ;',
 5: 'il embrassa une partie des barbares1, et ébranla pour ainsi dire au loin l’univers.',
 6: 'Les événements qui ont immédiatement précédé2 et ceux qui appartiennent à une époque plus reculée3 ne pou vaient, dans l’éloignement, être exactement connus ;',
 7: 'toutefois, à en croire des indices qui m’ont paru certain