In [1]:
import json, os, glob
from ast import literal_eval
from functions import read_alignments, load_txt_as_lst
from collections import defaultdict

In [2]:
def build__src_2_tgt_dict(alignments_lst):
    '''
    Map every source id to the list of target ids it is aligned with.

    Parameters
    ----------
    alignments_lst : iterable of (src, tgt) pairs
        Each pair holds a list of source ids and a list of target ids.

    Returns
    -------
    dict
        Plain dict of src id -> list of tgt ids (lists, so the result can be
        written with json.dump). The order inside each list is unspecified.

    Notes
    -----
    * A pair that is empty on both sides is skipped.
    * If only one side is empty, the string "null" stands in for it.
    * A plain dict is returned on purpose: a defaultdict would silently insert
      an empty set whenever a missing id is looked up, and a set cannot be
      serialized to JSON.
    '''
    src_id_to_tgt_ids = {}
    for src, tgt in alignments_lst:
        if src == [] and tgt == []:
            continue
        src_ids = ["null"] if src == [] else src
        tgt_ids = ["null"] if tgt == [] else tgt
        for src_id in src_ids:
            src_id_to_tgt_ids.setdefault(src_id, set()).update(tgt_ids)
    # sets deduplicate repeated pairs; convert to lists for writing to json
    return {src_id: list(tgt_ids) for src_id, tgt_ids in src_id_to_tgt_ids.items()}

def build_tgt_2_src_dict(alignments_lst):
    '''
    Map every target id to the list of source ids it is aligned with.

    Parameters
    ----------
    alignments_lst : iterable of (src, tgt) pairs
        Each pair holds a list of source ids and a list of target ids.

    Returns
    -------
    dict
        Plain dict of tgt id -> list of src ids (lists, so the result can be
        written with json.dump). The order inside each list is unspecified.

    Notes
    -----
    * A pair that is empty on both sides is skipped.
    * If only one side is empty, the string "null" stands in for it.
    * A plain dict is returned on purpose: a defaultdict would silently insert
      an empty set whenever a missing id is looked up, and a set cannot be
      serialized to JSON.
    '''
    tgt_id_to_src_ids = {}
    for src, tgt in alignments_lst:
        if src == [] and tgt == []:
            continue
        src_ids = ["null"] if src == [] else src
        tgt_ids = ["null"] if tgt == [] else tgt
        for tgt_id in tgt_ids:
            tgt_id_to_src_ids.setdefault(tgt_id, set()).update(src_ids)
    # sets deduplicate repeated pairs; convert to lists for writing to json
    return {tgt_id: list(src_ids) for tgt_id, src_ids in tgt_id_to_src_ids.items()}

def initialize_text_dict(src_sents_path):
    '''
    Create the top-level dict for one source text.

    Loads the sentences at src_sents_path with load_txt_as_lst, strips the
    trailing newline from each one, and stores them under the "src_text" key
    as a mapping of 0-based position -> sentence.
    '''
    idx_to_sent = {}
    for idx, raw_sent in enumerate(load_txt_as_lst(src_sents_path)):
        idx_to_sent[idx] = raw_sent.rstrip("\n")
    return {"src_text": idx_to_sent}

def build_translation_dict(aligns_rslts_path, tgt_sents_path, lang):
    '''
    Assemble the dict describing one translation of a source text.

    The result has four keys:
      "src2tgt_aligns": src idx -> tgt idxs (from build__src_2_tgt_dict)
      "tgt2src_aligns": tgt idx -> src idxs (from build_tgt_2_src_dict)
      "tgt_text":       0-based tgt idx -> tgt sentence, trailing newline removed
      "tgt_lang":       the lang argument, stored unchanged
    '''
    alignments = read_alignments(aligns_rslts_path)
    src_to_tgt = build__src_2_tgt_dict(alignments)
    tgt_to_src = build_tgt_2_src_dict(alignments)

    # the sentence files carry a trailing "\n" on each line; drop it
    idx_to_tgt_sent = {
        idx: raw_sent.rstrip("\n")
        for idx, raw_sent in enumerate(load_txt_as_lst(tgt_sents_path))
    }

    return {
        "src2tgt_aligns": src_to_tgt,
        "tgt2src_aligns": tgt_to_src,
        "tgt_text": idx_to_tgt_sent,
        "tgt_lang": lang,
    }

# Build logic

In [None]:
# thuc1_aligns_rslts = "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/urn:cts:greekLit:tlg0003.tlg001_thucydides_1_1852.rslts"
# thuc2_aligns_rslts = "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/urn:cts:greekLit:tlg0003.tlg001_thucydides_1863.rslts"

In [None]:
# thuc1_aligns = read_alignments(thuc1_aligns_rslts)
# thuc2_aligns = read_alignments(thuc2_aligns_rslts)

In [None]:
# # top level: filename would be "urn:cts:greekLit:tlg0003.tlg001.json"
# thucydides = {}

# # dict for thuc1 translation
# thucydides_1_1852 = {}

In [None]:
# thucydides_1_1852["aligns_idx"] = build__src_2_tgt_dict(thuc1_aligns)

In [None]:
# thucydides_1_1852["aligns_idx"][0]

In [None]:
# thucydides["thucydides_1_1852"] = thucydides_1_1852

In [None]:
# thucydides_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/src_data/urn:cts:greekLit:tlg0003.tlg001.sents")
# thuc1_sents = load_txt_as_lst("/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1_1852.sents")

In [None]:
# # sending to file added newlines to end of each line - strip
# thucydides_sents = [sent.rstrip("\n") for sent in thucydides_sents]
# thuc1_sents = [sent.rstrip("\n") for sent in thuc1_sents]

In [None]:
# src_txt_keys = [num for num in range(len(thucydides_sents))]
# src_txt = dict(zip(src_txt_keys, thucydides_sents))

In [None]:
# tgt_txt_keys = [num for num in range(len(thuc1_sents))]
# print(len(tgt_txt_keys))
# tgt_txt = dict(zip(tgt_txt_keys, thuc1_sents))
# print(len(thuc1_sents))

In [None]:
# thucydides["src_text"] = src_txt
# thucydides["thucydides_1_1852"]["tgt_text"] = tgt_txt

# Build from functions

In [None]:
# build from functions
thuc1_aligns_rslts = "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/urn:cts:greekLit:tlg0003.tlg001_thucydides_1_1852.rslts"
thuc_sents_path = "/home/craig.car/repos/chiron/chironata/data/src_data/urn:cts:greekLit:tlg0003.tlg001.sents"
thucydides = initialize_text_dict(thuc_sents_path)
thuc1_sents_path = "/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1_1852.sents"
thucydides["thucydides_1_1852"] = build_translation_dict(thuc1_aligns_rslts, thuc1_sents_path)

In [None]:
thucydides.keys()

In [None]:
thucydides['thucydides_1_1852'].keys()

In [None]:
# add another translation to test
thuc2_aligns_rslts = "/home/craig.car/repos/chiron/chironata/data/alignments_rslts/urn:cts:greekLit:tlg0003.tlg001_thucydides_1863.rslts"
thuc2_sents_path = "/home/craig.car/repos/chiron/chironata/data/french_trans-dev/thucydides_1863.sents"
thucydides["thucydides_1863"] = build_translation_dict(thuc2_aligns_rslts, thuc2_sents_path)

In [None]:
thucydides.keys()

In [None]:
thucydides["thucydides_1863"].keys()

In [None]:
len(thucydides["thucydides_1863"]['aligns_idx'].keys())

In [None]:
len(thucydides["thucydides_1863"]['tgt_text'].keys())

In [None]:
# last value is empty. also in src2tgt dict. but doesn't seem to cause errors?
print(thucydides["thucydides_1863"]['aligns_idx'][4418])
print(thucydides["thucydides_1863"]['aligns_idx'][4419])

In [None]:
thucydides["src_text"][4418]

In [None]:
test = load_txt_as_lst(thuc2_aligns_rslts)

In [None]:
test[-1]

In [None]:
print(len(test))

# Build for all src texts

In [3]:
# Root of the local chironata checkout. Every path below is derived from it,
# so this is the only line to edit when running on another machine.
repo_dir = "/home/craig.car/repos/chiron/chironata/"
data_dir = repo_dir + "data/"

# load lookup table (CTS URN -> list of translation names)
lookup_path = data_dir + "cts_lookup_table.json"
# explicit encoding so the read does not depend on the machine's locale
with open(lookup_path, encoding="utf-8") as f:
    lookup = json.load(f)

# get dir names; the trailing "/" matters because file names are appended
# to these strings by plain concatenation
align_rslts_dir = data_dir + "alignments_rslts/"
src_data_dir = data_dir + "src_data/"
fr_dir = data_dir + "french_trans-dev/"
de_dir = data_dir + "german_trans-dev/"
it_dir = data_dir + "italian_trans-dev/"
en_dir = data_dir + "english_trans-dev/"
pathout = repo_dir + "sentence_aligned_texts/"

In [4]:
# Spot-check one entry: the translation names the lookup table lists for this CTS URN.
lookup["urn:cts:greekLit:tlg0019.tlg004"]

['aristophanes_1873',
 'aristophanes_1881_1',
 'aristophanes_1_1915',
 'aristophanes_2_1915',
 'aristophanes_1907_1',
 'tlg0019.tlg004.ogl-eng2.xml',
 'aristophanes_1_1830',
 'aristophanes_1_1858',
 'aristophanes_3_1830']

In [7]:
# Build one JSON data dict per source text found in src_data_dir.
# Texts whose output file already exists are skipped, so re-running the cell
# only fills in what is missing.

# Language directories are probed in this order; the first one that contains
# the translation's .sents file supplies both the path and the language code.
lang_dirs = (("de", de_dir), ("en", en_dir), ("it", it_dir), ("fr", fr_dir))

# To rebuild a single text, glob over one explicit path instead:
# path_tofix = "/home/craig.car/repos/chiron/chironata/data/src_data/urn:cts:greekLit:tlg0554.tlg001.sents"
# for src_sents_path in glob.iglob(path_tofix):
for src_sents_path in glob.iglob(src_data_dir+"*.sents"):
    # get cts urn; it doubles as the output file name
    ctsurn = os.path.splitext(os.path.basename(src_sents_path))[0]
    path_out = pathout+ctsurn+".json"
    if os.path.isfile(path_out):
        continue
    print(f"working on {ctsurn}")
    # initialize dict; will use cts as filename when write to json file
    text_dict = initialize_text_dict(src_sents_path)
    # get translations using lookup table
    translations = lookup[ctsurn]
    print(translations)
    for translation in translations:
        if translation.endswith(".xml"):
            translation = translation.split(".xml")[0]
        # get alignment results; translations without a results file are skipped
        align_rslt_filename = align_rslts_dir+ctsurn+"_"+translation+".rslts"
        if not os.path.isfile(align_rslt_filename):
            continue
        # get tgt sents and lang; reset for every translation so a missing
        # .sents file can never reuse the previous translation's path/lang
        transl_sents_filename = translation+".sents"
        tranls_sents_path = None
        trans_lang = None
        for lang_code, lang_dir in lang_dirs:
            if os.path.isfile(lang_dir+transl_sents_filename):
                tranls_sents_path = lang_dir+transl_sents_filename
                trans_lang = lang_code
                break
        if tranls_sents_path is None:
            print(f"no .sents file found for {translation}; skipping")
            continue
        text_dict[translation] = build_translation_dict(align_rslt_filename, tranls_sents_path, trans_lang)
    # write to json once per text; as before, nothing is written when the
    # lookup table lists no translations at all for this text
    if translations:
        # ensure_ascii=False emits raw non-ASCII text, so fix the encoding
        with open(path_out, 'w', encoding="utf-8") as fp:
            json.dump(text_dict, fp, ensure_ascii=False)

working on urn:cts:greekLit:tlg0099.tlg001
['strabo_1858', 'strabo_4_1890', 'strabo_1856']
working on urn:cts:greekLit:tlg0554.tlg001
['chariton_1764', 'chariton_1913']


In [16]:
# Top-level keys of whichever text_dict the build loop last left in the kernel:
# "src_text" plus one key per translation that was added.
text_dict.keys()

dict_keys(['src_text', 'aristophanes_1_1858', 'aristophanes_1_1915', 'aristophanes_2_1915', 'aristophanes_3_1830', 'aristophanes_1881_1', 'aristophanes_1_1830', 'aristophanes_1873', 'tlg0019.tlg004.ogl-eng2', 'aristophanes_1907_1'])

In [20]:
# Sanity check: number of target ids that have an entry in the tgt -> src alignments.
len(text_dict['aristophanes_1_1858']['tgt2src_aligns'].keys())

10625

In [21]:
# Sanity check: number of target sentences stored, for comparison with the alignment count.
len(text_dict['aristophanes_1_1858']['tgt_text'].keys())

10625

# Build for individual concatenated src text runs

In [19]:
def build_data_dict_concatenated(src_sents_path, path_out, align_rslts_path,
                                 tgt_sents_path, transl_lang,
                                 transl_name="plautus_1862_1"):
    '''
    Build the data dict for one concatenated source text with a single
    translation and write it to path_out as JSON.

    Parameters
    ----------
    src_sents_path : path of the source sentence file
    path_out : path of the JSON file to write
    align_rslts_path : path of the alignment results file
    tgt_sents_path : path of the translation sentence file
    transl_lang : language code stored under "tgt_lang"
    transl_name : key under which the translation is stored in the output;
        defaults to "plautus_1862_1", the name this function used to hard-code
    '''
    # initialize dict with the source sentences
    text_dict = initialize_text_dict(src_sents_path)
    text_dict[transl_name] = build_translation_dict(align_rslts_path, tgt_sents_path, transl_lang)
    # write to json; ensure_ascii=False emits raw non-ASCII text, so the file
    # encoding is fixed rather than left to the machine's locale
    with open(path_out, 'w', encoding="utf-8") as fp:
        json.dump(text_dict, fp, ensure_ascii=False)

In [18]:
# Plautus, concatenated volume 1, aligned against the German translation
# plautus_1862_1; the resulting JSON is written next to the source data.
build_data_dict_concatenated(
    src_sents_path="/home/craig.car/repos/chiron/chironata/data/concatenated_src_data/src_plautus_vol1.sents",
    path_out="/home/craig.car/repos/chiron/chironata/data/concatenated_src_data/datadict.json",
    align_rslts_path="/home/craig.car/repos/chiron/chironata/data/concatenated_src_data/src_plautus_vol1.rslts",
    tgt_sents_path="/home/craig.car/repos/chiron/chironata/data/german_trans-dev/plautus_1862_1.sents",
    transl_lang="de",
)