In [None]:
import json,math,sys,re,os
from tqdm import tqdm 
sys.path.append('../')

from lib.spelling import standard,standardizer 
# Sanity check that the project's table of known spellings was imported.
print(len(standard),"known spellings")

# NOTE(review): presumably the leading characters of document identifiers in the
# corpus -- confirm. Not referenced in the visible cells of this notebook.
prefixes = ['B','A0','A1','A2','A3','A4','A5','A6','A7','A8','A9']
# Era labels. NOTE(review): the list is not in chronological order and is not
# referenced in the visible cells; the corpus walk below lists the directories
# under ../assets/processed instead.
eras = ["pre-Elizabethan","Elizabethan","Carolinian","Jacobean","CivilWar",
        "Interregnum","JamesII","CharlesII","WilliamAndMary"]
         
def _keep_for_target(pos, capitalised, target_pos):
    """Decide whether a token with tag ``pos`` is counted for ``target_pos``.

    Tags are matched by substring, exactly as the tag strings appear in the data.
    ``capitalised`` says whether the standardised form began with an upper-case
    letter before it was lower-cased.
    """
    is_foreign = 'fw' in pos
    if target_pos == "all":
        # Everything except 'fw' / 'crd' tagged tokens ...
        if not is_foreign and 'crd' not in pos:
            return True
        # ... plus 'fw' tagged tokens when they are capitalised.
        return capitalised and is_foreign
    if target_pos == "verbs":
        return "v" in pos
    if target_pos == "nouns":
        if "np" in pos:
            return True
        return capitalised and ('n' in pos or is_foreign)
    # Unknown target: count nothing.
    return False


def get_words(data,type,target_pos="all"): 
    """Count standardised forms of proper nouns / abbreviations in one parsed file.

    Parameters
    ----------
    data : dict
        Parsed JSON. For ``type == "margin"`` every value is iterated directly
        and the second element of each entry is used as the token list; for any
        other ``type`` every value is a mapping and its values are the token
        lists. A token list holds ``(token, pos, standardised)`` triples.
    type : str
        ``"margin"`` or anything else (see ``data``). The name shadows the
        builtin but is kept so existing callers keep working.
    target_pos : str
        ``"all"``, ``"verbs"`` or ``"nouns"``; any other value counts nothing.

    Returns
    -------
    dict
        Lower-cased standardised form -> number of occurrences.

    A triple is skipped when, after stripping leading/trailing ".", the token
    equals its tag (punctuation), any field is empty, the standardised form
    starts with a lower-case letter or contains a digit, or the token contains
    "NOTE" / "NONLATINALPHABET".
    """
    standardized = {}

    for item in data.values():
        entries = item if type == "margin" else item.items()
        for entry in entries:
            for token, pos, s in entry[1]:
                token = token.strip(".")
                pos = pos.strip(".")
                s = s.strip(".")

                # proper nouns, abbreviations
                if token == pos: continue # no punctuation
                if not (s and pos and token): continue
                if s[0].islower(): continue
                if re.search(r"\d", s): continue
                if "NOTE" in token or "NONLATINALPHABET" in token: continue

                # Record capitalisation BEFORE lower-casing: testing
                # s[0].isupper() on the lower-cased form can never succeed.
                capitalised = s[0].isupper()
                s = s.lower()

                if _keep_for_target(pos, capitalised, target_pos):
                    standardized[s] = standardized.get(s, 0) + 1
    return standardized

def add_to_dict(old,new): 
    """Fold the counts of ``new`` into ``old`` (in place) and return ``old``.

    Keys shorter than two characters are ignored.
    """
    for word in new:
        if len(word) >= 2:
            if word in old:
                old[word] += new[word]
            else:
                old[word] = new[word]
    return old

In [None]:
# Walk every era directory under ../assets/processed and accumulate the word
# counts of all "_marginalia" and "_text" JSON files into `standardized`.
standardized = {}
pos = "all"
for era in os.listdir("../assets/processed"):
    if era != ".DS_Store":
        print(era)
        json_dir = f"../assets/processed/{era}/json"
        for prefix in os.listdir(json_dir):
            # Skip macOS metadata and the "_info" files.
            if prefix == ".DS_Store" or "_info" in prefix:
                continue
            print(prefix)
            with open(f"{json_dir}/{prefix}","r") as file:
                data = json.load(file)
            # The file name decides which layout get_words should expect.
            if "_marginalia" in prefix:
                layout = "margin"
            elif "_text" in prefix:
                layout = "text"
            else:
                layout = None
            if layout is not None:
                file_counts = get_words(data,layout,pos)
                standardized = add_to_dict(standardized,file_counts)

In [None]:
# `vocab_all`: every counted word, most frequent first, that
#   * is not already a standard spelling, also after dropping a trailing
#     "s" / "'s" and any "!", "?" or ":" characters, and
#   * is mostly legible, i.e. fewer than half of its characters are "^".
# (One-letter words were already dropped when the counts were merged.)
# `vocab`: the subset of `vocab_all` that occurs more than once.
vocab_all = [
    (word, freq)
    for word, freq in sorted(standardized.items(), key=lambda x:x[1], reverse=True)
    if freq > 0
    and word not in standard
    # str.count avoids the invalid "\^" escape of the non-raw regex literal.
    and word.count("^") < len(word) // 2
    and re.sub(r"s$|\'s$|\!|\?|\:","",word) not in standard
]
vocab = [word for word, freq in vocab_all if freq > 1]
print(len(vocab),"words to standardize out of",len(vocab_all))

# The full list (with counts) is saved; later cells re-apply the count filter.
with open(f"../assets/vocab/{pos}.json","w+") as file: 
    json.dump(vocab_all,file)  

In [None]:
import numpy as np

# Distribution of occurrence counts among words seen more than once.
counts = [freq for word, freq in vocab_all if freq > 1]
for p in (10, 25, 50, 75):
    print(p,": ", np.percentile(counts,p))

In [None]:
import os,json
from dotenv import load_dotenv
# dotenv file with the OpenAI credentials, relative to the notebook's working
# directory (it lives outside this repository).
env_path = '../../DH/openai.env'
load_dotenv(dotenv_path=env_path)
# The API key is read from the SECRET_KEY variable; os.getenv returns None
# when that variable is not set.
OPENAI_API_KEY = os.getenv('SECRET_KEY')

from openai import OpenAI
# Module-level client used by standardize_spellings below.
client = OpenAI(api_key=OPENAI_API_KEY)

def read_words_from_file(file_path):
    """Return the JSON content stored at ``file_path``."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

def standardize_spellings(words, model="gpt-3.5-turbo"):
    """Ask the chat model for modern spellings of ``words``.

    Parameters
    ----------
    words : list of str
        Words to translate; they are sent newline-separated in one request.
    model : str
        Chat model name passed to the API (defaults to the model this
        notebook has always used).

    Returns
    -------
    The raw chat-completion response object. The caller is expected to read
    ``response.choices[0].message.content``, which the prompt asks to be
    ``original:translation`` pairs, one per line.
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant that converts words from Early Modern English sermons to their modern standardized spellings."
    }

    # The worked example must itself follow the requested format exactly:
    # one "original:translation" pair per line ("\n" between every pair).
    instructions = (
        "Translate the following words to English in the format of original:translation separated by newlines. "
        "Do not add extra white spaces, and keep carets ^ in the original if it occurs. "
        "For example, when I input 'Deut^\nEphes\nThes^\nPetecost\nWollebii\nGr^tius', "
        "I should get 'Deut^:Deuteronomy\nEphes:Ephesians\nThes^:Thessalonians\nPetecost:Pentecost\nWollebii:Wolleb\nGr^tius:Grotius' as my output. "
        "Translate the following words to English: "
    )
    user_message = {
        "role": "user",
        "content": instructions + "\n".join(words)
    }

    # Return the response untouched; parsing is left to the caller.
    return client.chat.completions.create(
        model=model,
        messages=[system_message, user_message]
    )


In [None]:
fname = "all.json"
# Saved (word, count) pairs from the vocabulary cell.
vocab_counts = read_words_from_file(f"../assets/vocab/{fname}")
# Keep words seen more than once that are not standard spellings and in which
# fewer than half of the characters are "^"; capitalise them for the prompt.
words = [
    w.capitalize()
    for w, freq in vocab_counts
    if freq > 1 and w not in standard and w.count("^") < len(w) // 2
]
len(words)

In [None]:
start = 20000 # index into `words` to resume from; 0 starts a fresh output file
outputfname = "all_greater_than_4.json"
batch_size = 500
checkpoint_path = f"../assets/vocab/standard_{outputfname}"

# When resuming, load the saved mapping once. The in-memory dict is then kept
# up to date across batches, so it does not need re-reading every iteration.
# A missing checkpoint simply means there is nothing to resume from.
if start > 0 and os.path.exists(checkpoint_path):
    with open(checkpoint_path,"r") as file:
        new_standard = json.load(file)
else:
    new_standard = {}

while start < len(words):
    end = start + batch_size
    print(start,end,words[start])

    response = standardize_spellings(words[start:end])
    # The API may return no text content; treat that as an empty batch.
    standardized_words = response.choices[0].message.content or ""
    print(standardized_words)
    for pair in standardized_words.split("\n"):
        fields = pair.split(":")
        # Lines without a colon are not "original:translation" pairs.
        if len(fields) > 1:
            new_standard[fields[0]] = fields[1]
    # Write after every batch so an interrupted run can be resumed from the
    # last printed `start`.
    with open(checkpoint_path,"w+") as file:
        json.dump(new_standard,file)
    start = end

# Biblical Entities

For each known biblical entity name, find the vocabulary words closest to it by Levenshtein edit distance.

In [None]:
from fuzzywuzzy import process
from Levenshtein import distance 
def dist_fn(s1, s2):
    """Scorer for fuzzywuzzy: the Levenshtein distance between the two strings."""
    edit_distance = distance(s1, s2)
    return edit_distance

def find_similar_words(target_word, word_list, threshold,k=20):
    """Return up to ``k`` ``(word, distance)`` pairs from ``word_list`` whose
    distance to ``target_word`` is at most ``threshold``, nearest first.
    """
    scored = process.extract(target_word, word_list, limit=None,scorer=dist_fn)
    within_threshold = []
    for word, dist in scored:
        if dist <= threshold:
            within_threshold.append((word, dist))
    # NOTE(review): taking the tail assumes process.extract returns its results
    # ordered by score from highest to lowest, so the last k entries are the
    # smallest distances -- confirm against the installed fuzzywuzzy version.
    nearest = within_threshold[-k:]
    nearest.sort(key=lambda pair: pair[1])
    return nearest

def find_match(target_word, word_list):
    """Return True when some entry of ``word_list`` scores an edit distance of 0
    against ``target_word``, otherwise False.

    NOTE(review): scoring goes through fuzzywuzzy, which applies its default
    string processor to both sides before calling ``dist_fn`` -- confirm that
    matching on the processed strings (rather than the raw ones) is intended.
    """
    # extractWithoutOrder yields (choice, score) pairs lazily, so any() stops at
    # the first exact match instead of scoring and sorting the whole list.
    scored = process.extractWithoutOrder(target_word, word_list, scorer=dist_fn)
    return any(dist == 0 for _, dist in scored)

In [None]:
from lib.spelling import entities

In [None]:
def similar_spelling(hits):
    """Print, for each name in ``hits``, its nearest spellings in ``words``.

    At most 10 neighbours are shown per name, and a neighbour may differ by
    at most half the length of the name.
    """
    for target in hits:
        max_distance = len(target)/2
        neighbours = find_similar_words(target,words,max_distance,10)
        print(f"Words similar to '{target}':")
        for candidate, dist in neighbours:
            print(f"{candidate} (Distance: {dist})")
        print()

# Preview the first ten entities only.
similar_spelling(entities[:10])