In [5]:
from bibleMarginalia import * 

def get_marginal_notes(filepath):
    """Collect the marginal notes (<note place="marg">) of one TCP XML file.

    Only the content of <div1> elements is parsed. The markup of every marginal
    note is flattened to plain text: each <gap> element is replaced by the value
    of its "disp" attribute, all remaining tags are removed, and newlines are
    turned into spaces.

    Parameters
    ----------
    filepath : str
        Path of the XML file. The part of the file name before its first "."
        is used as the TCP id.

    Returns
    -------
    list of dict
        One dict per marginal note, in the order returned by ``find_all``, with
        the keys "tcp_id", "div_path" (the "type" attributes of the note's
        ancestors, innermost first, joined by "; "), "sourceline", "sourcepos"
        (the position recorded by the parser) and "note" (the flattened text).
    """
    # Local import so the function does not depend on `os` being imported by another cell.
    import os

    # basename() honours the platform's path separator, unlike split("/").
    tcp_id = os.path.basename(filepath).split(".")[0]

    # Read the input XML file.
    # NOTE(review): assumes the TCP files are UTF-8 encoded - confirm. Being explicit
    # avoids depending on the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as xml_file:
        data = xml_file.read()

    # Use a SoupStrainer to only parse the main body.
    body_only = SoupStrainer("div1")

    # html.parser is used because it keeps track of line numbers (sourceline / sourcepos).
    soup = BeautifulSoup(data, features="html.parser", parse_only=body_only)

    notes_info = []
    for note in soup.find_all("note"):
        # Only marginal notes are of interest.
        if note.get("place") != "marg":
            continue

        # Walk up the tree and record the "type" attribute of every ancestor that has one.
        div_path = []
        for ancestor in note.parents:
            div_type = ancestor.get("type")
            if div_type is not None:
                div_path.append(div_type)

        # Replace illegible parts with their display characters. A literal
        # str.replace is required here: the serialized <gap> tag is not a valid
        # regular expression (a "*" or a bracket in "disp" changes its meaning),
        # so using it as a pattern could silently fail to match or raise.
        text = str(note)
        for gap in note.find_all("gap"):
            # A <gap> without "disp" is simply dropped instead of raising KeyError.
            text = text.replace(str(gap), gap.get("disp") or "")

        # Strip out all embedded tags.
        text = re.sub(r"<.*?>", "", text)
        # Remove newlines.
        text = text.replace("\n", " ")

        notes_info.append({"tcp_id": tcp_id,
                           "div_path": "; ".join(div_path),
                           "sourceline": note.sourceline,
                           "sourcepos": note.sourcepos,
                           "note": text})
    return notes_info

In [9]:
import pandas as pd 
import os,csv
# The full run took about 12 minutes when it was recorded.
sermons = pd.read_csv("sermons.csv")["id"]  # not referenced again in this cell
# Machine-specific location of the TCP XML files; adjust when running elsewhere.
tcp = '/Users/amycweng/Digital Humanities/sermonsTCP'

no_notes = []  # TCP ids of the texts in which no marginal note was found
count = 0      # number of texts processed so far

# `with` closes the CSV even when a file fails to parse. newline="" is what the
# csv module asks for so that it controls the line endings itself, and the
# explicit encoding keeps the output independent of the platform default.
with open("marginalia.csv", "w", newline="", encoding="utf-8") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["tcp_id","div_path","sourceline","sourcepos","note"])
    writer.writeheader()
    # os.listdir returns entries in arbitrary order; sorting makes the CSV reproducible.
    for filename in sorted(os.listdir(tcp)):
        filepath = os.path.join(tcp, filename)
        # Skip hidden entries (e.g. ".DS_Store") and sub-directories: they are not texts.
        if filename.startswith(".") or not os.path.isfile(filepath):
            continue
        notes = get_marginal_notes(filepath)
        if notes:
            writer.writerows(notes)
        else:
            no_notes.append(filename.split(".")[0])
        count += 1
        if count % 100 == 0:
            print(f"Processed {count} texts")

Processed 100 texts
Processed 200 texts
Processed 300 texts
Processed 400 texts
Processed 500 texts
Processed 600 texts
Processed 700 texts
Processed 800 texts
Processed 900 texts
Processed 1000 texts
Processed 1100 texts
Processed 1200 texts
Processed 1300 texts
Processed 1400 texts
Processed 1500 texts
Processed 1600 texts
Processed 1700 texts
Processed 1800 texts
Processed 1900 texts
Processed 2000 texts
Processed 2100 texts
Processed 2200 texts
Processed 2300 texts
Processed 2400 texts
Processed 2500 texts
Processed 2600 texts
Processed 2700 texts
Processed 2800 texts
Processed 2900 texts
Processed 3000 texts
Processed 3100 texts
Processed 3200 texts
Processed 3300 texts
Processed 3400 texts
Processed 3500 texts
Processed 3600 texts
Processed 3700 texts
Processed 3800 texts
Processed 3900 texts
Processed 4000 texts
Processed 4100 texts
Processed 4200 texts
Processed 4300 texts
Processed 4400 texts
Processed 4500 texts
Processed 4600 texts
Processed 4700 texts
Processed 4800 texts
P

In [10]:
import json

# no_notes holds the ids of the texts without any marginal note. The recorded
# run printed 1435, and the original author's note puts the number of texts
# with marginalia at 4000.
print(len(no_notes))

# Persist the ids as a JSON list.
with open("no_notes.json","w+") as file:
    file.write(json.dumps(no_notes))

1435


In [None]:
# # replace all periods with spaces. This is to make sure that all citations are 
# # in the format of "<book> <chapter> <verse>", i.e., "Ecclesiastes 9 4". 
# # Some citations are originally inconsistently formatted as "<book> <chapter>.<verse>" at times 
# # and "<book> <chapter>. <verse>." at other times, so replacing periods with spaces is the first step to standardizing all the citation formats 
# n = re.sub(r'(\.)',r' ',n).lower()
# # remove everything that is not a lowercase letter, digit, comma, ampersand, hyphen, em dash, asterisk (presumably the illegible-character marker), or space
# n = re.sub(r'[^a-z0-9\,\&\-\—\* ]','',n)
# # replace all instances of "and" with ampersands 
# n = re.sub(rf"\band\b|&ampc|&amp",'&', n)
# # next, replace all instances of two or more spaces with a single space. 
# n = re.sub(r'\s+',' ',n).strip()