In [1]:
import json,math,sys,re,os
from tqdm import tqdm 
sys.path.append('../')

from lib.spelling import standard 
prefixes = ['B','A0','A1','A2','A3','A4','A5','A6','A7','A8','A9']
eras = ["pre-Elizabethan","Elizabethan","Carolinian","Jacobean","CivilWar",
        "Interregnum","JamesII","CharlesII","WilliamAndMary"]

def get_words(data,type,target_pos="all"): 
    standardized = {}

    for item in data.values(): 
        if type != "margin": 
            item = item.items()
        for i in item: 
            encodings = i[1]
            for token, pos, s in encodings:
                token = token.strip(".")
                pos = pos.strip(".")
                s = s.strip(".")

                # proper nouns, abbreviations 
                if token == pos: continue # no punctuation 
                if len(s) == 0: continue
                if len(pos) == 0: continue 
                if len(token) == 0: continue
                if s[0].islower(): continue
                if re.search("\d",s): continue
                if "NOTE" in token or "NONLATINALPHABET" in token: continue
                s = s.lower()

                def add_to_standard():
                    if s not in standardized: standardized[s] = 0
                    standardized[s] += 1
                
                if target_pos == "all": 
                    if 'fw' not in pos and 'crd' not in pos: 
                        add_to_standard()
                    elif s[0].isupper():
                        if ('fw' in pos): add_to_standard() 
                elif target_pos == "verbs": 
                    if "v" in pos:add_to_standard()
                elif target_pos == "nouns":
                    if "n" in pos:add_to_standard() 
                    elif s[0].isupper():
                        if ('fw' in pos): add_to_standard() 
    return standardized

def add_to_dict(old,new): 
    for word, freq in new.items(): 
        if word not in old: old[word] = freq
        else: old[word] += freq
    return old


def get_vocab(target_era,pos="all"): 
    standardized = {}
    for era in os.listdir(f"../assets/processed"):
        if era == ".DS_Store": continue 
        if era != target_era: continue 
        print(target_era)
        for prefix in os.listdir(f"../assets/processed/{era}/json"):
            if prefix == ".DS_Store": continue 
            if "_info" in prefix: continue
            print(prefix)
            with open(f"../assets/processed/{era}/json/{prefix}","r") as file: 
                data = json.load(file)
            if "_marginalia" in prefix: 
                l = get_words(data,"margin",pos)
                standardized = add_to_dict(standardized,l) 
            elif "_text": 
                l = get_words(data,"text",pos)
                standardized = add_to_dict(standardized,l)    

        standardized = sorted(standardized.items(), key=lambda x:x[1], reverse=True)
        # more than 1 letter; not already a standard spelling; more than half is legible 
        vocab = [x[0] for x in standardized if len(x[0]) > 1 and x[0] not in standard and (len(re.findall("^",x[0])) < math.floor(len(x[0])/2))]
        print(len(vocab),"words")

        with open(f"../assets/vocab/{target_era}_{pos}.json","w+") as file: 
            json.dump(vocab,file)

271150 known spellings


In [None]:
get_vocab("pre-Elizabethan")

In [23]:
get_vocab("Elizabethan","nouns") # 37268

Elizabethan
A2_marginalia.json
A1_texts.json
A6_texts.json
B_texts.json
B_marginalia.json
A0_texts.json
A6_marginalia.json
A1_marginalia.json
A2_texts.json
A0_marginalia.json
A7_marginalia.json
A7_texts.json
36854 words


In [4]:
import os,json
from dotenv import load_dotenv
env_path = '/Users/amycweng/DH/openai.env'
load_dotenv(dotenv_path=env_path)
OPENAI_API_KEY = os.getenv('SECRET_KEY')

from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

def read_words_from_file(file_path):
    with open(file_path, 'r') as file:
        words = json.load(file)
    return words

def standardize_spellings(words):
    # Prepare the system message
    system_message = {
        "role": "system",
        "content": "You are an assistant that converts words from Early Modern English sermons to their modern standardized spellings. The input words are separated by newlines. Give the output in the format of original:corrected separated by newlines. Do not add extra white spaces. For example, when I input 'Deut\nEphes\nLuk\nPetecost', I should get 'Deut:Deuteronomy\nEphes:Ephesians\nLuk:Luke\nPetecost:Pentecost' as my output. If the word is in Latin, convert it to English."
    }

    # Prepare the user message with the list of words
    user_message = {
        "role": "user",
        "content": "Correct the following words and output the original followed by a colon and the corrected words: " + "\n".join(words)
    }

    # Call the OpenAI API with the messages
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message]
    )

    # Extract the result from the response
    return response


In [None]:
standard = {}
words = read_words_from_file("../assets/vocab/pre-Elizabethan_nouns.json")
words = [w.capitalize() for w in words]
len(words)

In [None]:
start,end = 5000,5557 # 0:100,100:200,200:500,500:1000,1k:1.5k
# 1.5k-2k, 2k-2.5k, 2.5k-3k, 3k-3.5k,3.5k-4k,4k-4.5k, 4.5k-5k,5k-finish
# I'll -> Ale, Co^ -> Coe, Oxigene -> Oxygen, 
# Demose -> Demons 
# 500 words take half a minute to process
# Took 10 cents in total    
print(len(words[start:end]))
standardized_words = standardize_spellings(words[start:end])
standardized_words = standardized_words.choices[0].message.content
print(standardized_words)
standard.update({pair.split(":")[0]:pair.split(":")[1] for pair in standardized_words.split("\n") if ":" in pair})
with open("../assets/vocab/standard_pre-Elizabethan_nouns.json","w+") as file: 
    json.dump(standard,file)

In [5]:
with open('../assets/vocab/standard_pre-Elizabethan_all.json',"r") as file:
    standard = json.load(file)
fname = "pre-Elizabethan_all"
words = read_words_from_file(f"../assets/vocab/{fname}.json")
words = [w.capitalize() for w in words]
len(words)

7

In [6]:
standardized_words = standardize_spellings(words)
standardized_words = standardized_words.choices[0].message.content
print(standardized_words)
standard.update({pair.split(":")[0]:pair.split(":")[1] for pair in standardized_words.split("\n") if ":" in pair})
with open(f"../assets/vocab/standard_{fname}.json","w+") as file: 
    json.dump(standard,file)

Thyrdelye:Thirdly
Fourthelye:Fourthly
Thirdlye:Thirdly
Tractent:Treatise
^y^^:holy
^^^dyous:gracious
Diuinit:Divinity


In [25]:
# standardized with GPT 3.5 
for fp in os.listdir('../assets/vocab'):
    if "standard" not in fp: continue
    with open(f"../assets/vocab/{fp}") as file: 
        new_standard = json.load(file)
    new_standard = {n:k for n,k in new_standard.items() if len(re.findall("^",n)) < math.floor(len(n)/2)}
    new_standard = {n:k for n,k in new_standard.items() if n.lower() not in entities and n.lower() not in wordnet_words}
    print(len(new_standard))
    with open(f"../assets/vocab/{fp}","w+") as file: 
        json.dump(new_standard, file)

4969
432
457


In [None]:
prefixes = {}
for word in words.keys(): 
    if word[:1] not in prefixes: 
        prefixes[word[:1]] = []
    prefixes[word[:1]].append(word)

In [None]:
m_hits = {}
for name in entities:
    if name in words: m_hits[name] = words[name]

In [None]:
text_prefixes = {}
for word in text_words: 
    if word[:1] not in text_prefixes: 
        text_prefixes[word[:1]] = []
    text_prefixes[word[:1]].append(word)

In [None]:
t_hits = {}
for name in entities:
    if name in text_words: t_hits[name] = text_words[name]

In [None]:
len(t_hits)

In [None]:
a_hits_margin = {}
for auts in author_ids.values(): 
    for aut in auts: 
        aut = aut.split(" ")
        for a in aut: 
            if len(a.strip(".")) < 3: continue
            if a in words: 
                a_hits_margin[a] = words[a]

print(len(a_hits_margin))
list(sorted(a_hits_margin.items(),key=lambda x:x[1],reverse=True))[:10]

In [None]:
print(len(author_ids))

In [None]:
a_hits_text = {}
for auts in author_ids.values(): 
    for aut in auts: 
        aut = aut.split(" ")
        for a in aut: 
            if len(a.strip(".")) < 3: continue
            if a in text_words: 
                a_hits_text[a] = text_words[a]
print(len(a_hits_text))
list(sorted(a_hits_text.items(),key=lambda x:x[1],reverse=True))[:10]

In [None]:
with open('../assets/bible_hits.json','w+') as file: 
    json.dump({'marginal':m_hits, 'in-text':t_hits},file)

In [None]:
with open('../assets/author_hits.json','w+') as file: 
    json.dump({'marginal':a_hits_margin, 'in-text':a_hits_text},file)

Get the most similar words by edit distance to the canonical and biblical hits 

In [None]:
from fuzzywuzzy import process
from Levenshtein import distance 
def dist_fn(s1, s2):
    return distance(s1, s2)

def find_similar_words(target_word, word_list, threshold,k=20):
    similar_words = process.extract(target_word, word_list, limit=None,scorer=dist_fn)
    similar_words = [(word, dist) for word, dist in similar_words if dist <= threshold][-k:]
    similar_words = sorted(similar_words, key=lambda x:x[1])
    return similar_words

def find_match(target_word, word_list):
    match = process.extract(target_word, word_list, limit=None,scorer=dist_fn)
    match = [(word, dist) for word, dist in match if dist == 0]
    if len(match) > 0: 
        return True 
    return False

In [None]:
with open('../assets/author_hits.json','r') as file: 
    a_hits = json.load(file)

with open('../assets/bible_hits.json','r') as file: 
    b_hits = json.load(file)

a_hits = a_hits['marginal']
b_hits = b_hits['marginal']
len(a_hits),len(b_hits)

In [None]:
def similar_spelling(hits):  
    for target in hits: 
        similar_words = find_similar_words(target, text_words.keys(),len(target)/2,10)
        print(f"Words similar to '{target}':")
        for word, dist in similar_words:
            print(f"{word} (Distance: {dist})")
        print()
similar_spelling(b_hits)