In [1]:
from bs4 import BeautifulSoup, SoupStrainer
'''
Obtain all words that have a part of speech of 'ab' (abbreviation) or 'nn1' (singular proper noun) in EP XML sermons
'''
import os,json,sys 
sys.path.append('../')
from lib.standardization import * 
# Tally table: lemma -> {variant spelling: occurrence count}.
# Seeded from `abbrev` so that every listed variant starts with a count of zero.
lemma_dict = {
    lemma: dict.fromkeys(variants, 0)
    for lemma, variants in abbrev.items()
}

def lemmas_nouns_and_abbrev(filepath):
    """Tally the spellings of every 'ab' / 'nn1' word in one XML file.

    The file at `filepath` is read as UTF-8 and only its <w> elements are
    parsed. For each <w> whose `pos` attribute is 'ab' or 'nn1', the cleaned
    `lemma` attribute selects an entry of the module-level `lemma_dict`
    (after being replaced by `abbrev_to_book[lemma]` when the lemma is a key
    of that mapping), and the count of the cleaned element text under that
    entry is increased by one.

    Nothing is returned; `lemma_dict` is updated in place.
    """
    with open(filepath, 'r', encoding='utf-8') as xml_file:
        markup = xml_file.read()

    # The strainer discards everything except <w> elements while parsing,
    # which keeps the tree small; lxml's XML parser is used for speed.
    only_words = SoupStrainer("w")
    tree = BeautifulSoup(markup, features="lxml-xml", parse_only=only_words)

    for token in tree.find_all('w'):
        if token.get("pos") not in ('ab', 'nn1'):
            continue

        lemma = clean_word(token.get("lemma"))
        if lemma in abbrev_to_book:
            # known abbreviation: file the word under its standardized form
            lemma = abbrev_to_book[lemma]
        elif lemma not in lemma_dict:
            # never seen as either a key or a known abbreviation: start a new entry
            lemma_dict[lemma] = {}

        variant = clean_word(token.text)
        counts = lemma_dict[lemma]
        counts[variant] = counts.get(variant, 0) + 1

In [74]:
import os,json
# Run every listed text through lemmas_nouns_and_abbrev and save the tallies.
# The full run took 164 minutes.
# NOTE(review): absolute, machine-specific corpus location; adjust before running elsewhere.
ep = "/Users/amycweng/Digital Humanities/sermonsEP"
with open("../assets/sermons_ep.json","r") as file: 
    sermons_ep = json.load(file)

count = 0  # stays 0 when sermons_ep is empty
# `count` is the 1-based number of texts processed so far.
for count, filename in enumerate(sermons_ep, start=1):
    filepath = os.path.join(ep, filename)
    lemmas_nouns_and_abbrev(filepath)
    if count % 100 == 0:
        print(f"Processed {count} texts")
    if count % 500 == 0:
        # Periodic checkpoint in case the long run is interrupted.
        # The handle has its own name so it cannot clobber the loop variable.
        with open(f"../assets/lemmas_{count}.json","w") as checkpoint:
            json.dump(lemma_dict, checkpoint)

# Final result for the whole corpus.
with open("../assets/lemmas.json","w") as file:
    json.dump(lemma_dict,file)

Processed 100 texts
Processed 200 texts
Processed 300 texts
Processed 400 texts
Processed 500 texts
Processed 600 texts
Processed 700 texts
Processed 800 texts
Processed 900 texts
Processed 1000 texts
Processed 1100 texts
Processed 1200 texts
Processed 1300 texts
Processed 1400 texts
Processed 1500 texts
Processed 1600 texts
Processed 1700 texts
Processed 1800 texts
Processed 1900 texts
Processed 2000 texts
Processed 2100 texts
Processed 2200 texts
Processed 2300 texts
Processed 2400 texts
Processed 2500 texts
Processed 2600 texts
Processed 2700 texts
Processed 2800 texts
Processed 2900 texts
Processed 3000 texts
Processed 3100 texts
Processed 3200 texts
Processed 3300 texts
Processed 3400 texts
Processed 3500 texts
Processed 3600 texts
Processed 3700 texts
Processed 3800 texts
Processed 3900 texts
Processed 4000 texts
Processed 4100 texts
Processed 4200 texts
Processed 4300 texts
Processed 4400 texts
Processed 4500 texts
Processed 4600 texts
Processed 4700 texts
Processed 4800 texts
P

In [75]:
# Write the tallies as a list of [lemma, variants] pairs ordered by lemma.
with open("../assets/lemmas_sorted.json", "w+") as file:
    lemma_pairs = sorted(lemma_dict.items())
    json.dump(lemma_pairs, file)

In [2]:
from difflib import SequenceMatcher

# Minimum SequenceMatcher ratio for a lemma to be recorded as a possible
# variant of a known abbreviation.
SIMILARITY_THRESHOLD = 0.8

with open("../assets/lemmas.json","r") as file:
    lemma_dict = json.load(file)

# book -> list of (similarity, matched abbreviation, {variant: count}) tuples
possible_matches = {k:[] for k in abbrev}

for lemma in sorted(lemma_dict):
    if lemma in abbrev:
        continue
    # SequenceMatcher caches its analysis of the second sequence, so the lemma
    # is set once and only the abbreviation (first sequence) changes per pair.
    matcher = SequenceMatcher()
    matcher.set_seq2(lemma)
    # finding possible matches to any of the known abbreviations for each book
    for ab, book in abbrev_to_book.items():
        matcher.set_seq1(ab)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); a pair that fails either one cannot reach the threshold.
        if matcher.real_quick_ratio() < SIMILARITY_THRESHOLD:
            continue
        if matcher.quick_ratio() < SIMILARITY_THRESHOLD:
            continue
        similarity = matcher.ratio()
        if similarity >= SIMILARITY_THRESHOLD:
            possible_matches[book].append((similarity,ab,lemma_dict[lemma]))

In [5]:
import re  # re.sub is used below; imported here so the cell does not depend on a later cell

# book -> {cleaned variant spelling: total frequency}
b_to_m = {k:{} for k in abbrev}
for book, matches in possible_matches.items():
    tallies = b_to_m[book]
    # One lemma can resemble several abbreviations of the same book, in which
    # case the very same variants dict appears more than once in `matches`.
    # Track dict identity so each lemma's counts are added only once per book.
    counted = set()
    for m in matches:
        variants = m[2]
        if id(variants) in counted:
            continue
        counted.add(id(variants))
        for v,freq in variants.items():
            # Colons and other punctuation were not stripped when the counts
            # were collected, so keep only a-z and '*' and merge the
            # frequencies of spellings that become identical.
            v = re.sub(r"[^a-z*]","",v)
            tallies[v] = tallies.get(v, 0) + freq

with open("../assets/variant_dict.json","w") as file:
    json.dump(b_to_m,file)

In [16]:
import csv, re

# newline='' is what the csv module expects of the file object it reads from
with open('../assets/sermons_marginalia.csv', 'r', newline='') as file:
    notes = csv.reader(file, delimiter=',')
    # find instances of possible numbered books
    #   known:   book -> {word as written in the note: count}
    #   unknown: cleaned word -> count, for words that are not recognised
    known, unknown = {}, {}
    for entry in notes:
        if not entry:
            # csv.reader yields [] for a blank line; there is no note text to read
            continue
        # get note text (last column)
        n = entry[-1]
        # replace all periods with spaces and convert to lower case
        n = re.sub(r'(\.)',r' ',n).lower()
        # collapse every run of whitespace into a single space
        n = re.sub(r'\s+',' ',n)
        # a word followed by two number-like tokens, e.g. "<word> 3 16";
        # only the word is captured
        possible = re.findall(r'\b([a-z•]+)\b \b[0-9•]+\b \b[0-9•v]+\b', n)
        for orig_p in possible:
            p = clean_word(orig_p)
            if p in abbrev_to_book:
                book = abbrev_to_book[p]
            elif p in abbrev:
                # p is itself a key of `abbrev`, so file it under its own name
                # instead of reusing `book` from an earlier iteration
                book = p
            else:
                unknown[p] = unknown.get(p, 0) + 1
                continue
            spellings = known.setdefault(book, {})
            spellings[orig_p] = spellings.get(orig_p, 0) + 1

In [17]:
# For every unrecognised word, find the first book (in b_to_m order) whose name
# equals the word or whose variants contain it, and record the word's count there.
matches = {book: {} for book in abbrev}
for word in sorted(unknown):
    book = next(
        (b for b, variants in b_to_m.items() if word == b or word in variants),
        None,
    )
    if book is not None:
        matches[book][word] = unknown[word]

# One "book: variant count" line per match, with a blank line after each book.
clean_format = []
for book, variants in matches.items():
    clean_format.extend(f"{book}: {v} {freq}\n" for v, freq in variants.items())
    clean_format.append("\n")

with open("../assets/possible_variants.txt", "w+") as file:
    file.writelines(clean_format)