In [161]:
import json,re 
import pandas as pd 
import os

# Build the in-memory cache of known translations (source sentence -> English)
# by merging every JSON file in the translations directory.
all_translations = {}
translations_dir = '../assets/translations'
# os.listdir returns entries in arbitrary order; sorting makes the merge order
# (and therefore which file wins on duplicate keys) reproducible.
for fp in sorted(os.listdir(translations_dir)):
    # Skip anything that is not a translation file (hidden files, checkpoint
    # directories, ...), which would otherwise make open()/json.load fail.
    if not fp.endswith('.json'): continue
    with open(f"{translations_dir}/{fp}", encoding="utf-8") as file:
        all_translations.update(json.load(file))
print(len(all_translations),"known translations")

def combine_punc_with_text(segment):
    """Normalise the spacing around punctuation in ``segment``.

    Three substitutions are applied in order:
      1. whitespace directly before one of ``, . ? ! ; : )`` is removed;
      2. whitespace directly after ``(`` is removed;
      3. every remaining run of whitespace becomes a single space.

    Leading/trailing whitespace is collapsed but not stripped.
    Returns the rewritten string.
    """
    substitutions = (
        (r'\s+([,.?!;:)])', r'\1'),
        (r'([(])\s+', r'\1'),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        segment = re.sub(pattern, replacement, segment)
    return segment

def get_foreign(era,prefix,get="missing"):
    """Load one sub-segment file and list its foreign-language sentences.

    Reads ``../assets/processed/{era}/sub-segments/{prefix}.json``, which must
    hold three items: the sub-segment ids, the standardized texts (same order
    as the ids) and the foreign-word sub-chunks keyed by sub-segment id string.

    Parameters
    ----------
    era, prefix : used only to build the file path.
    get : when equal to ``"missing"`` (default) sentences already present in
        ``all_translations`` are left out; any other value keeps them.

    Returns
    -------
    (parts, foreign, fw_subchunks) where ``parts`` maps each id string to its
    punctuation-normalised text, ``foreign`` is a list of ``(id, text)`` tuples
    with each distinct text listed once (first occurrence wins), and
    ``fw_subchunks`` is returned exactly as loaded.
    """
    source_path = f"../assets/processed/{era}/sub-segments/{prefix}.json"
    with open(source_path,'r') as file:
        s_ids, standardized, fw_subchunks = json.load(file)

    # Key format: str() of [[<stringified id components>], <stringified index>].
    parts = {}
    for position, text in enumerate(standardized):
        raw_id = s_ids[position]
        key = str([list(map(str, raw_id[0])), str(raw_id[1])])
        parts[key] = combine_punc_with_text(text)

    only_missing = get == "missing"
    seen_texts = set()
    foreign = []
    for fid in fw_subchunks:
        sentence = parts[fid]
        if sentence in seen_texts:
            continue
        seen_texts.add(sentence)
        if only_missing and sentence in all_translations:
            continue
        foreign.append((fid, sentence))
    print(len(foreign),"sentences to translate")
    return parts, foreign, fw_subchunks

# Read the OpenAI credentials from a dotenv file and build the API client.
# Statement order is kept as-is: the environment is loaded before `openai` is imported.
from dotenv import load_dotenv
env_path = '../../DH/openai.env'  # relative to the working directory; presumably defines SECRET_KEY — verify
load_dotenv(dotenv_path=env_path)
OPENAI_API_KEY = os.getenv('SECRET_KEY')  # None when SECRET_KEY is not set in the environment

from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)  # module-level client used by translate()

def translate(sentences):
    """Send the not-yet-translated sentences to the chat completion API.

    ``sentences`` is an iterable of ``(fid, text)`` pairs. Pairs whose text is
    already a key of ``all_translations`` are dropped; the rest are formatted
    as ``"<fid>: <text>"`` lines and sent in a single request.

    Returns ``None`` when nothing is left to translate, otherwise the raw
    response object returned by ``client.chat.completions.create``.
    """
    pending = []
    for fid, s in sentences:
        if s in all_translations:
            continue
        pending.append(f"{fid}: {s}")
    if not pending:
        return None

    # System prompt: fixes the task and the expected "<id> <translation>" output shape.
    system_prompt = "You are an assistant that translates Latin text from Early Modern English sermons. The input sentences are separated by newlines and preceded by their unique four-part id (containing underscores). Only output the translation of the Latin and ignore the English. Be as continuous as possible and precede the translation with its corresponding id. Do not repeat my input."
    # User prompt: the instruction followed by one "<fid>: <text>" line per sentence.
    user_prompt = "Translate to English if most of the sentence is in Latin: " + "\n".join(pending)

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )

299 known translations


In [None]:
# start, end = 0,21 # preE B: 0:20, 20:50
# NOTE: `start`, `end`, `era` and `prefix` are not assigned in this cell; they
# must already exist in the kernel (e.g. uncomment the line above) before it runs.
response = translate(foreign[start:end])
if response is None:
    # translate() returns None when every sentence in the batch is already known.
    print("Nothing to translate in this batch")
    translations = ""
else:
    translations = response.choices[0].message.content or ""
for t in translations.split("\n"):
    if ":" not in t: continue
    # Split on the first colon only, so a translation containing ':' stays whole.
    raw_id, translated = t.split(":", 1)
    raw_id = raw_id.strip()
    if raw_id in parts:
        # translate() prefixes each sentence with its id exactly as stored in
        # `foreign`, and those ids are keys of `parts`.
        key = raw_id
    else:
        # Fallback for underscore-separated ids ("<a>_<b>_<c>_<p>"), the form
        # this cell originally parsed.
        try:
            fid = raw_id.split("_")
            sid = fid[:3]
            sid[1],sid[2] = int(sid[1]), int(sid[2])
            pid = int(fid[-1])
        except (IndexError, ValueError):
            # Model output is free text; do not let one bad line abort the batch.
            print(f"Skipping line with unparseable id: {t!r}")
            continue
        key = str([sid,pid])
        if key not in parts:
            print(f"Skipping line with unknown id: {t!r}")
            continue
    all_translations[parts[key]] = translated
    print(parts[key],f"\n\t{translated}\n")
if response is not None:
    # Persist the whole cache (not only this batch) under the current era/prefix.
    with open(f"../assets/translations/{era}_{prefix}.json","w+") as file:
        json.dump(all_translations,file)

# Stats

In [162]:
# Accumulators filled from every (era, prefix) sub-segment file.
parts = {}
foreign = []
fw_subchunks = {}

with open('../assets/corpora.json','r') as file:
    corpora = json.load(file)

# corpora maps era -> {prefix: list of ids}; prefixes with an empty list are skipped.
for era, prefixes in corpora.items():
    for prefix, id_list in prefixes.items():
        if len(id_list) > 0:
            print(era,prefix)
            # 'All' keeps sentences that already have a translation.
            p,f,fw = get_foreign(era,prefix,'All')
            parts.update(p)
            foreign.extend(f)
            fw_subchunks.update(fw)

pre-Elizabethan B
300 sentences to translate
pre-Elizabethan A0
1874 sentences to translate
pre-Elizabethan A1
716 sentences to translate
pre-Elizabethan A2
1391 sentences to translate
pre-Elizabethan A6
70 sentences to translate
pre-Elizabethan A7
46 sentences to translate
Elizabethan B
626 sentences to translate
Elizabethan A0
5059 sentences to translate
Elizabethan A1
3994 sentences to translate
Elizabethan A2
283 sentences to translate
Elizabethan A6
9404 sentences to translate
Elizabethan A7
106 sentences to translate
Jacobean B
1595 sentences to translate
Jacobean A0
16775 sentences to translate
Jacobean A1
9412 sentences to translate
Jacobean A2
1412 sentences to translate
Jacobean A6
758 sentences to translate
Jacobean A7
833 sentences to translate
Jacobean A8
6 sentences to translate
Carolinian B
3040 sentences to translate
Carolinian A0
15010 sentences to translate
Carolinian A1
13247 sentences to translate
Carolinian A2
2392 sentences to translate
Carolinian A3
92 sentences 

In [182]:
import numpy as np

# For each sub-segment, compare the number of foreign-word entries with the
# number of space-separated tokens in its text; keep those above the threshold.
counts = []      # foreign/token ratio of every kept sub-segment
condensed = []   # character length of every kept sub-segment
for fid,fw in fw_subchunks.items():
    segment_text = parts[fid]
    foreign_ratio = len(fw)/len(segment_text.split(" "))
    if foreign_ratio <= 0.32: # 25th percentile
        continue
    condensed.append(len(segment_text))
    counts.append(foreign_ratio)

print(len(condensed),"sub-segments to translate")
print(np.mean(condensed),"average sub-segment length")

# Distribution of the kept ratios.
percents = [5,10,25,30,40,50,75,90,99]
for p in percents:
    percentile_value = np.percentile(counts,p)
    print(f"{p}-th percentile of foreign phrase length: {percentile_value}")

149530 sub-segments to translate
83.20904835150137 average sub-segment length
5-th percentile of foreign phrase length: 0.38461538461538464
10-th percentile of foreign phrase length: 0.47058823529411764
25-th percentile of foreign phrase length: 0.6363636363636364
30-th percentile of foreign phrase length: 0.6666666666666666
40-th percentile of foreign phrase length: 0.75
50-th percentile of foreign phrase length: 0.782608695652174
75-th percentile of foreign phrase length: 0.8888888888888888
90-th percentile of foreign phrase length: 1.0
99-th percentile of foreign phrase length: 1.0
