In [1]:
import csv, os, re, shutil,json, sys
import pandas as pd 
sys.path.append('../')

TCP = '../../TCP'
metadataFolder = '../../ECBC-Data-2022/TCP metadata'

def findTextTCP(id):
    if re.match('B1|B4',id[0:2]):
        path = f'{TCP}/P2{id[0:2]}/{id}.P4.xml'
    else: 
        if f'{id}.P4.xml' in os.listdir(f'{TCP}/P1{id[0:2]}'):
            path = f'{TCP}/P1{id[0:2]}/{id}.P4.xml'
        elif f'{id}.P4.xml' in os.listdir(f'{TCP}/P2{id[0:2]}'): 
            path = f'{TCP}/P2{id[0:2]}/{id}.P4.xml'
    return path 

## Find texts that contain sermons (DIV tag with attribute "TYPE"='sermon' or containing key terms in the title or subject headings)

In [84]:
from bs4 import BeautifulSoup, SoupStrainer
def is_sermon(filepath):
    # read the input XML file 
    with open(filepath,'r') as file: 
        data = file.read()
    # use soupstrainer to only parse the main body
    tag = SoupStrainer("DIV1")
    soup = BeautifulSoup(data,features="xml",parse_only=tag)
    sermons = soup.findAll(attrs={"TYPE": re.compile("^(?!list|reply|catalogue|calendar)(sermon)")})
    if len(sermons) > 0:      
        return True
    return False

def get_lang(filepath): 
    # read the input XML file 
    with open(filepath,'r') as file: 
        data = file.read()
    # use soupstrainer to only parse the main body
    tag = SoupStrainer("LANGUSAGE")
    soup = BeautifulSoup(data,features="xml",parse_only=tag)
    return soup.text

In [None]:
sermons = []
for csvFile in os.listdir(metadataFolder):
    data = pd.read_csv(os.path.join(metadataFolder,csvFile))
    for idx,tcpID in enumerate(data['id']):
        if tcpID == "id": continue
        filepath = findTextTCP(tcpID)
        subject_headings = data['keywords'][idx].replace(" -- ","; ").replace("  "," ")
        if is_sermon(filepath): 
            sermons.append( {"id": tcpID, 
                            "estc":data['estc'][idx],
                            "stc":data['stc'][idx],
                            "title": data['title'][idx],
                            "authors": "; ".join(set(data['author'][idx].split("; "))),
                            "publisher": data['publisher'][idx],
                            "pubplace":data['pubplace'][idx],
                            "subject_headings":subject_headings,
                            "date":data['date'][idx]
                            }
            )
    print(csvFile)

In [None]:
for csvFile in os.listdir(metadataFolder):
    data = pd.read_csv(os.path.join(metadataFolder,csvFile))
    for idx,tcpID in enumerate(data['id']):
        if tcpID == "id": continue
        filepath = findTextTCP(tcpID)
        subject_headings = data['keywords'][idx].replace(" -- ","; ").replace("  "," ")
        if re.search('sermon', subject_headings.lower()): 
            if is_sermon(filepath):
                sermons.append( {"id": tcpID, 
                                "estc":data['estc'][idx],
                                "stc":data['stc'][idx],
                                "title": data['title'][idx],
                                "authors": "; ".join(set(data['author'][idx].split("; "))),
                                "publisher": data['publisher'][idx],
                                "pubplace":data['pubplace'][idx],
                                "subject_headings":subject_headings,
                                "date":data['date'][idx]
                                }
                )
    print(csvFile)

In [None]:
sermons = pd.DataFrame(sermons)
sermons.to_csv("../assets/sermons.csv",index=False)
print(f"{len(sermons)} TCP XML files contain sermons.") 
# 399 without sermons in their subject headings 
# 4253 total TCP XML files contain sermons 

## Read sermon metadata files 

In [4]:
sermons_metadata = pd.read_csv("../assets/sermons.csv")
sermons = sermons_metadata.to_dict(orient='records')
sermons = {entry['id']: entry for entry in sermons}
sermons_missing = pd.read_csv("../assets/sermons_missing.csv")
sermons_missing = sermons_missing.to_dict(orient='records')
sermons_missing = {s['id']: s for s in sermons_missing}
len(sermons),len(sermons_missing)

(4226, 1503)

## Find missing sermons but exclude foreign language texts

In [None]:
# Find the texts that most likely contain sermons but do not have the proper div tag in the XML 
sermons_missing = {}

by_subj, by_title, by_both = [],[],[]
for csvFile in os.listdir(metadataFolder):
    data = pd.read_csv(os.path.join(metadataFolder,csvFile))
    for idx,tcpID in enumerate(data['id']):
        if tcpID not in sermons: 
            title = data['title'][idx]
            clean_title = title.lower().replace("'",'')
            subject_headings = data['keywords'][idx]
            unique_subjects = " -- ".join(set(subject_headings.split(" -- ")))
            subject_hit, title_hit = False, False
            

            if re.search('sermon', subject_headings.lower()): 
                subject_hit = True 
            if re.search(r'sermon|preached|preacht|preachd', clean_title): 
                title_hit = True
            if subject_hit and title_hit: 
                by_both.append(tcpID)
            elif subject_hit: 
                by_subj.append(tcpID)
            elif title_hit: 
                by_title.append(tcpID)
            if subject_hit or title_hit: 
                sermons_missing[tcpID] =  {"id": tcpID, 
                                "estc":data['estc'][idx],
                                "stc":data['stc'][idx],
                                "title": data['title'][idx],
                                "authors": "; ".join(set(data['author'][idx].split("; "))),
                                "publisher": data['publisher'][idx],
                                "pubplace":data['pubplace'][idx],
                                "subject_headings":unique_subjects,
                                "date":data['date'][idx]
                                }
print(len(sermons_missing))

1793


In [214]:
len(by_both), len(by_subj), len(by_title)

(1085, 316, 390)

In [None]:
foreign = {}
foreign_missing = {}
for csvFile in os.listdir(metadataFolder):
    data = pd.read_csv(os.path.join(metadataFolder,csvFile))
    for idx,tcpID in enumerate(data['id']):
        if tcpID in sermons: 
            # continue
            filepath = findTextTCP(tcpID)
            tcpIDlang = get_lang(filepath)
            if tcpIDlang != 'eng': 
                foreign[tcpID] = tcpIDlang
        elif tcpID in sermons_missing: # texts
            filepath = findTextTCP(tcpID)
            tcpIDlang = get_lang(filepath)
            if tcpIDlang != 'eng': 
                foreign_missing[tcpID] = tcpIDlang
    print(csvFile)

## Items to exclude

In [8]:
import sys 
sys.path.append('../') 
from lib.dictionaries.sermon_annotations import * 
from collections import Counter

In [4]:
len(exclude_annotated), len(exclude_foreign)

(270, 42)

In [16]:
len(sermon_subsections), len(custom_exceptions)

(158, 13)

In [113]:
c_sections_list = []
for slist in custom.values(): 
    if isinstance(slist,str): slist=[slist]
    c_sections_list.extend(slist)
print(len(custom), Counter(list(c_sections_list)))

204 Counter({'text': 74, 'treatise': 16, 'part': 16, 'tract': 9, 'chapter': 7, 'book': 6, 'section': 5, 'religious_tract': 5, 'discourse': 5, 'letter': 3, 'biblical_commentary': 3, 'speech': 3, 'lecture': 3, 'class': 2, 'verse': 2, 'funeral_sermon': 2, 'essay': 2, 'doctrine': 2, 'exposition': 2, 'commentary': 2, 'dialogue': 2, 'prefatory_letter': 1, 'address': 1, 'commentary_on_job': 1, 'lamentation': 1, 'decade': 1, 'religious_treatise': 1, 'funeral_oration': 1, 'commentary_on_verse': 1, 'colophon': 1, 'commentary_on_colossians': 1, 'extracts_from_sermon': 1, 'oration_and_sermon': 1, 'commentary_on_ruth': 1, 'moral_treatise': 1, 'catechism': 1, 'panegyric': 1, 'poem': 1, 'homiletic_tract': 1, 'funeral_speech': 1, 'exposition_of_job': 1, 'scaffold_speech': 1, 'treatises': 1, 'theological_discourse': 1, 'exegesis': 1, 'commentary_on_acts_8': 1, 'polemic': 1, 'preface': 1, 'consolatio': 1, 'collection': 1, 'application': 1, 'commandment': 1, 'subpoena': 1, 'abstract': 1, 'commentary_on_l

In [None]:
for item in exclude_annotated: 
    if item not in sermons_missing.keys(): 
        print(item)

In [109]:
missing_df = []
sermons_df = []

for tcpID, entry in sermons.items(): 
    if tcpID in exclude_foreign: 
        continue
    elif tcpID not in ['A28579','A50799']: 
        sermons_df.append(entry)

for tcpID, entry in sermons_missing.items():
    if tcpID in exclude_foreign: 
        continue
    elif tcpID in exclude_annotated: 
        continue
    else: 
        missing_df.append(entry)

missing_df = pd.DataFrame(missing_df)
missing_df.to_csv("../assets/sermons_missing.csv",index=False)
sermons_df = pd.DataFrame(sermons_df)
sermons_df.to_csv("../assets/sermons.csv", index=False)

print(len(sermons_df)+ len(missing_df))
len(sermons_df), len(missing_df)

5729


(4226, 1503)

## Examine section names 

In [None]:
import os,re
import pandas as pd 
from collections import Counter 


sermons_missing = pd.read_csv("../assets/sermons_missing.csv")
sermons_missing = sermons_missing.to_dict(orient='records')
sermons_missing = {s['id']: s for s in sermons_missing}

num_oral = 0 
sections = [] # tcpID to section to index 
info_dict = sermons_missing
# info_dict = sermons 

for fp in os.listdir("../assets/plain_all"): 
    if fp == ".DS_Store": continue 
    tcpID = fp.split(".")[0]
    if tcpID not in sermons and tcpID not in sermons_missing: 
        continue 
    if tcpID in sermons: 
        continue
    # if tcpID in sermons_missing: 
    #     continue 
     
    with open(f"../assets/plain_body/{fp}","r") as file: 
        text = file.read() 
    s = re.findall(r'(\bDIV[\d+\_\w+\^]+)\s',text)
    
    tcpID_sections = {}
    title = info_dict[tcpID]['title']
    subjects = info_dict[tcpID]['subject_headings']
      
    if tcpID in custom or tcpID in custom_exceptions or tcpID in custom_subsections: 
        continue 
    elif tcpID in exclude_foreign or tcpID in exclude_annotated: 
        continue 
    elif re.search('Sermons|Funeral sermons|Thanksgiving sermons|Fast-day sermons|Christmas sermons|Visitation sermons',subjects):
        continue
    
    skip = False 
    for name in s: 
        if re.search(r"\^sermon\^",name):
            sections.append(name)
            skip = True 
        if name not in tcpID_sections: 
            tcpID_sections[name] = 1 
        else: 
            tcpID_sections[name] += 1 
    if skip: continue 
    isStandard = False 
    for item in wanted_sections: 
        if item in tcpID_sections: 
            isStandard = True
    if not isStandard: 
        print(f"\'{tcpID}\':", tcpID_sections) 

    print(tcpID, "-----------",
          sermons_missing[tcpID]['authors'],"-----------",
          sermons_missing[tcpID]['title'],"-----------",
          sermons_missing[tcpID]['subject_headings'])

In [None]:
for name, _ in Counter(sections).items(): 
    # if not re.search(r"^sermon",name): 
    #     if name not in wanted_sections: 
            print(name, _)


In [52]:
all_sermons = sermons.copy()
sermons_missing = pd.read_csv("../assets/sermons_missing.csv")
sermons_missing = sermons_missing.to_dict(orient='records')
sermons_missing = {s['id']: s for s in sermons_missing}
all_sermons.update(sermons_missing)
len(all_sermons)

5757

In [None]:
missing = []
found = []
for tcpID in missing: 
    print(all_sermons[tcpID]['authors'])
    print(all_sermons[tcpID]['title'])
    print(all_sermons[tcpID]['subject_headings'])
    print(all_sermons[tcpID]['date'])
    if f"{tcpID}.txt" not in os.listdir("../assets/plain_body"): continue 
    with open(f"../assets/plain_body/{tcpID}.txt","r") as file: 
        text = file.read() 
        s = re.findall(r'(\bDIV[\d+\_\w+\^]+)\s',text)
        items = []
        for item in s: 
            # if "sermon" in item: 
                items.append(item)
        
        print(tcpID, Counter(items),'\n\n')


In [None]:
for fp in os.listdir(f"../assets/plain_body"):
    tcpID = fp.split(".txt")[0]
    if tcpID not in sermons and tcpID not in sermons_missing: 
        print(tcpID)
        os.remove(f"../assets/plain_body/{tcpID}.txt")

# There are 5,729 XML files in this corpus
- 15179 sermon-related sections (DIV1-7)
- 98092 instances of foreign language gaps ("< in non-Latin alphabet >)

In [18]:
import os,re
from collections import Counter 
from tqdm import tqdm 
folder = f"/Users/amycweng/DH/Early-Modern-Sermons/assets/plain_body"
tcpIDs = {fp.split(".txt")[0]:None for fp in os.listdir(folder)}
len(tcpIDs)

5729

In [19]:
total = {}
for tcpID in tqdm(tcpIDs): 
    with open(f"{folder}/{tcpID}.txt","r") as file: 
        text = file.read() 
        s = re.findall("NONLATINALPHABET",text)
        # if len(s) > 0: 
        #     print(tcpID, Counter(s))
        total[tcpID] = len(s)
print(sum(total.values()))

100%|██████████| 5729/5729 [00:07<00:00, 792.95it/s]

98092





In [None]:
for tcpID, freq in Counter(total).most_common(n=10):
    print("###############")
    print(tcpID,freq)
    print(all_sermons[tcpID]['title'])


In [None]:
import os,re
from collections import Counter 
from tqdm import tqdm 
tcpIDs = {fp.split(".txt")[0]:None for fp in os.listdir(folder)}
total = []
for tcpID in tqdm(tcpIDs): 
    with open(f"{folder}/{tcpID}.txt","r") as file: 
        text = file.read().split() 
    tcpIDs[tcpID] = ([],[])
    for idx, s in enumerate(text): 
        if re.search(r"^DIV\d+\^",s): 
            tcpIDs[tcpID][0].append(s)
            label = True  
            if (idx+1) < len(text): 
                if re.search(r"^DIV\d+\^",text[idx+1]): 
                    label = False 
            else: 
                label = False 
            tcpIDs[tcpID][1].append(label)

  7%|▋         | 396/5729 [00:10<01:40, 53.03it/s]

In [120]:
total_oral= []
total_else = []
oral_tcpIDs = []
for tcpID,items in tqdm(tcpIDs.items()): 
    sections,labels = items[0], items[1] 
    print(labels)
    for item,label in zip(items[0],items[1]):
        section_name = re.findall(r"^DIV\d+\^(.*?)\^[\d\w]*$",item)
        section_name = re.sub(r"\\\^","_",section_name[0])
        if "^" in section_name: 
            section_name = "_".join(section_name.split("^")[:-1])
        if label is True: 
            print(tcpID)
            oral = False 
            if re.search("^(?!list|reply|catalogue|calendar)(sermon|homily|speech|lecture)",section_name): 
                total_oral.append(section_name)
                oral = True 
            else: 
                total_else.append(section_name)
            if oral: oral_tcpIDs.append(tcpID)

 40%|████      | 2318/5729 [00:00<00:00, 11926.70it/s]

[False, False, False]
[False, False, False, False, False, False]
[False, False, False, False, False, False]
[False, False, False, False]
[False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[False, False]
[False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False]
[False, False, False]
[False, False, False]
[False, False, False]
[False, False, False]
[False, False, False, False, False]
[False]
[False, False, False, False, False, False]
[False, False, False]
[False, False, False, False, False, False, False]
[False, False]
[False, False, False]
[False, False, False]
[False, False, False, False, False, False, Fals

100%|██████████| 5729/5729 [00:00<00:00, 15028.62it/s]

[False, False, False]
[False, False, False]
[False, False, False]
[False, False]
[False, False, False]
[False, False, False]
[False, False, False, False, False]
[False, False, False]
[False, False, False]
[False, False, False]
[False, False]
[False, False, False, False, False, False, False, False, False, False]
[False, False]
[False, False, False, False]
[False, False, False, False]
[False, False, False, False, False, False, False, False, False, False]
[False, False, False]
[False, False, False, False]
[False, False, False]
[False, False, False]
[False, False, False, False]
[False, False, False]
[False, False]
[False, False, False, False, False]
[False, False, False]
[False, False, False, False]
[False, False, False, False]
[False, False]
[False, False]
[False, False, False]
[False, False, False]
[False, False, False, False]
[False, False]
[False, False, False, False]
[False, False, False, False]
[False, False, False, False, False]
[False, False, False]
[False, False, False, False, Fal




In [118]:
len(set(oral_tcpIDs)), len(tcpIDs)-len(set(oral_tcpIDs))

(0, 5729)

In [109]:
print(len(total_oral), Counter(total_oral).most_common(n=10))

16215 [('sermon', 14664), ('lecture', 912), ('speech', 210), ('sermons', 151), ('homily', 115), ('sermon_proper', 32), ('sermon_on_luke', 7), ('sermon_on_psalms', 7), ('sermon_on_john', 6), ('sermon_on_acts', 5)]


In [114]:
print(len(total_else), Counter(total_else))



In [28]:
from graphviz import Digraph
import re

# Your example data
example_output = [
    'DIV1^title\\^page^', 'DIV1^dedication^', 'DIV1^sermon^', 'DIV2^part^', 'DIV2^part^', 
    'DIV2^part^', 'DIV2^part^', 'DIV2^part^', 'DIV2^part^', 'DIV2^part^', 'DIV1^sermon^', 
    'DIV2^part^', 'DIV2^part^', 'DIV2^part^', 'DIV1^sermon^2', 'DIV2^observation^1', 
    'DIV2^observation^2', 'DIV2^observation^3', 'DIV1^sermon^3', 'DIV2^consideration^1', 
    'DIV2^consideration^2', 'DIV2^consideration^3', 'DIV1^sermon^4', 'DIV2^part^1', 
    'DIV2^part^2', 'DIV1^sermon^', 'DIV2^part^', 'DIV2^part^', 'DIV2^part^', 'DIV2^part^', 
    'DIV2^part^', 'DIV2^part^', 'DIV2^application^', 'DIV1^sermon^', 'DIV2^part^', 
    'DIV2^part^', 'DIV2^part^', 'DIV1^sermon^', 'DIV2^part^', 'DIV2^proposal^1', 
    'DIV2^proposal^2', 'DIV2^proposal^3', 'DIV2^part^', 'DIV2^conclusion^'
]

# Create a directed graph
dot = Digraph(comment='Document Structure', format='png')
dot.attr(rankdir='TB')  # Top to Bottom layout
dot.attr('node', shape='box', style='filled', fillcolor='lightgrey')

# Process the example data
current_div1 = None
div1_nodes = set()
div2_nodes = set()

for item in example_output:
    if item.startswith('DIV1^'):
        # Extract DIV1 type (clean the name)
        div1_type = re.sub(r'\^\d*$', '', item.split('^')[1])
        div1_node = f"DIV1: {div1_type}"
        dot.node(div1_node, div1_type, shape='ellipse', fillcolor='lightblue')
        current_div1 = div1_node
        div1_nodes.add(div1_node)
    elif item.startswith('DIV2^'):
        if current_div1 is None:
            continue  # Skip if no DIV1 parent
        
        # Extract DIV2 type (clean the name)
        div2_type = re.sub(r'\^\d*$', '', item.split('^')[1])
        div2_node = f"DIV2: {div2_type}"
        
        # Only add each DIV2 node once
        if div2_node not in div2_nodes:
            dot.node(div2_node, div2_type)
            div2_nodes.add(div2_node)
        
        # Create edge from current DIV1 to this DIV2
        dot.edge(current_div1, div2_node)

# Save and render the graph
dot.render('document_structure', view=True)
print("Graph saved as document_structure.png")

ModuleNotFoundError: No module named 'graphviz'

In [29]:
from tqdm import tqdm 

progress = tqdm(sorted(tcpIDs))
special = []
for tcpID in progress: 
    with open(f"{folder}/{tcpID}.txt","r") as file: 
        text = file.read() 
    # special.extend(re.findall(r'[^\d\w\s\'\"\.\,\?\!\:\;\(\)\[\]\&\-\•\◊\—\§\^]',text))
    text = re.sub(r"\s*[\∣\¦\|\‖]\s*","",text)
    with open(f"{folder}/{tcpID}.txt","w+") as file:
        file.writelines(text) # write as one long string  
    progress.set_description(tcpID)

  0%|          | 0/5729 [00:00<?, ?it/s]

B43856: 100%|██████████| 5729/5729 [06:56<00:00, 13.74it/s]
