In [3]:
import csv, os, re, shutil,json, sys
import pandas as pd 
# Root folder of the EarlyPrint (EP) XML texts. findTextEP looks inside
# sub-folders of this root named after the first three characters of a TCP id.
EP = '/Users/amycweng/Digital Humanities/eebotcp/texts'
# Root folder of the TCP XML texts. Not referenced directly in the cells visible
# here; presumably consumed by helpers defined elsewhere (TODO confirm).
TCP = '/Users/amycweng/Digital Humanities/TCP'
sys.path.append('../')
from lib.citation_info import * 

Find texts that contain sermons, i.e. XML files with at least one `DIV1` element whose `TYPE` attribute is `"sermon"`.

In [27]:
from bs4 import BeautifulSoup, SoupStrainer
def is_sermon(filepath):
    """Report whether an XML file contains at least one sermon division.

    A sermon division is a ``DIV1`` element whose ``TYPE`` attribute equals
    ``"sermon"``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the XML file to inspect.

    Returns
    -------
    bool
        True if at least one matching ``DIV1`` element is found, else False.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default locale encoding (assumes the XML is UTF-8 -- TODO confirm).
    with open(filepath, 'r', encoding='utf-8') as file:
        data = file.read()
    # Only build a tree for DIV1 elements typed as sermons; everything else
    # in the document is skipped by the strainer.
    tag = SoupStrainer("DIV1", attrs={"TYPE": "sermon"})
    soup = BeautifulSoup(data, features="xml", parse_only=tag)
    # A single hit is enough; there is no need to collect every match.
    return soup.find("DIV1") is not None

In [28]:
# Scan every TCP metadata CSV and collect one metadata record per text whose
# XML contains a sermon division.
metadataFolder = '/Users/amycweng/Digital Humanities/ECBC-Data-2022/TCP metadata'
sermons = []
for csvFile in os.listdir(metadataFolder):
    # Ignore anything in the folder that is not a CSV (e.g. hidden OS files),
    # which pd.read_csv would otherwise choke on.
    if not csvFile.lower().endswith('.csv'):
        continue
    data = pd.read_csv(os.path.join(metadataFolder,csvFile))
    for idx,tcpID in enumerate(data['id']):
        # Skip rows whose id cell is literally "id" (presumably a repeated
        # header row -- TODO confirm).
        if tcpID == "id": continue
        filepath = findTextTCP(tcpID)
        if is_sermon(filepath):
            # Normalise the " -- " separators to "; " and collapse double spaces.
            subject_headings = data['keywords'][idx].replace(" -- ","; ").replace("  "," ")
            # De-duplicate authors while keeping their original order.
            # dict.fromkeys is order-preserving, whereas a set of strings
            # iterates in a different order on every kernel start.
            authors = "; ".join(dict.fromkeys(data['author'][idx].split("; ")))
            sermons.append( {"id": tcpID,
                            "estc":data['estc'][idx],
                            "stc":data['stc'][idx],
                            "title": data['title'][idx],
                            "authors": authors,
                            "publisher": data['publisher'][idx],
                            "pubplace":data['pubplace'][idx],
                            "subject_headings":subject_headings,
                            "date":data['date'][idx]
                            }
            )
    # Progress marker: one line per metadata file processed.
    print(csvFile)

P1B0.csv
P1A9.csv
P1A8.csv
P2B2.csv
P2B3.csv
P2B1.csv
P2B0.csv
P2B4.csv
P2A8.csv
P2A9.csv
P2A7.csv
P2A6.csv
P2A4.csv
P2A5.csv
P2A1.csv
P2A0.csv
P2A2.csv
P2A3.csv
P1A6.csv
P1A7.csv
P1A5.csv
P1A4.csv
P1A0.csv
P1A1.csv
P1A3.csv
P1A2.csv


In [31]:
# store relevant metadata in a CSV file
# newline="" is required by the csv module so that it controls line endings
# itself (otherwise rows can be separated by blank lines on some platforms and
# embedded newlines are mishandled). The encoding is pinned so the file does
# not depend on the platform's locale.
with open("../assets/sermons.csv", "w", newline="", encoding="utf-8") as outfile:
    # Column order follows the key order of the first collected record.
    writer = csv.DictWriter(outfile, fieldnames=list(sermons[0].keys()))
    writer.writeheader()
    writer.writerows(sermons)

print(f"{len(sermons)} TCP XML files contain sermons.") 

3991 TCP XML files contain sermons.


Copy all the TCP XML files of the texts to a separate folder for easy browsing. 

In [32]:
# Load the saved sermon metadata back from disk and keep the column of TCP ids
# under the name `sermons` for the copy steps that follow.
sermons_metadata = pd.read_csv("../assets/sermons.csv")
sermons = sermons_metadata.loc[:, "id"]

In [33]:
# Copy the TCP XML file of every sermon text into one folder.
sermonsTCP_folder = '/Users/amycweng/Digital Humanities/sermonsTCP'
# Make sure the destination is a directory. If it does not exist, shutil.copy
# treats the path as a file name and every copy overwrites that single file.
os.makedirs(sermonsTCP_folder, exist_ok=True)
for s in sermons: 
    path = findTextTCP(s)
    shutil.copy(path, sermonsTCP_folder)

Copy all the EP (EarlyPrint) XML files of the texts to a separate folder for easy browsing. 

In [34]:
# Maps a TCP id to the paths of its EP part files (file names containing '_').
underscores = {}
def findTextEP(tcpID):
    """Locate the single EP XML file that corresponds to a TCP id.

    The search looks in ``{EP}/{first three characters of tcpID}`` for a file
    whose name contains ``tcpID``.

    Side effects: every matching file whose name contains an underscore is
    printed and recorded in the module-level ``underscores`` dict (a TCP text
    split into several EP files), and is not returned as the path.

    Parameters
    ----------
    tcpID : str
        TCP identifier of the text.

    Returns
    -------
    str or None
        Path of the matching un-split EP file, or None when the id starts
        with "B43", the prefix folder does not exist, or no un-split file
        matches.
    """
    # there is no folder in the EP texts that starts with B43
    # (check the argument itself rather than any variable of the caller)
    if tcpID.startswith("B43"): return None

    folder = f'{EP}/{tcpID[0:3]}'
    # A missing prefix folder simply means the text is not in EP.
    if not os.path.isdir(folder): return None

    path = None
    for file in os.listdir(folder):
        if tcpID in file: 
            if '_' in file: 
                print(file)
                # account for the fact that some individual TCP files have been 
                # sectioned into multiple EP files due to size 
                parts = underscores.setdefault(tcpID, [])
                part_path = f'{folder}/{file}'
                # avoid duplicate entries when the lookup is repeated
                if part_path not in parts:
                    parts.append(part_path)
            else: 
                path = f'{folder}/{file}' 
                break   
    return path

# Copy the EP XML file of every sermon text into one folder, and remember the
# TCP ids for which no EP file was found.
missing = []
sermonsEP_folder = '/Users/amycweng/Digital Humanities/sermonsEP'
# Make sure the destination is a directory. If it does not exist, shutil.copy
# treats the path as a file name and every copy overwrites that single file.
os.makedirs(sermonsEP_folder, exist_ok=True)
for s in sermons: 
    path = findTextEP(s)
    if not path: 
        missing.append(s)
    else: 
        shutil.copy(path, sermonsEP_folder)


In [35]:
# Summarise the EP lookup: the split-file map, how many sermon texts were
# found versus not found, and the ids that were not found.
n_missing = len(missing)
n_found = len(sermons) - n_missing
print(underscores)
print(f"There are {n_found} sermons in EP. {n_missing} TCP sermons are missing from EP.")
print("The TCP ids of the missing texts: ", missing)

{}
There are 3661 sermons in EP. 330 TCP sermons are missing from EP.
The TCP ids of the missing texts:  ['A95346', 'B26461', 'B26318', 'B29175', 'B25291', 'B23227', 'B26965', 'B26662', 'B29151', 'B23303', 'B29077', 'B23225', 'B25935', 'B24387', 'B27750', 'B26367', 'B24299', 'B27584', 'B29537', 'B22572', 'B28835', 'B22620', 'B26664', 'B22604', 'B21648', 'B29161', 'B21644', 'B28285', 'B26787', 'B23952', 'B20800', 'B26345', 'B26249', 'B23004', 'B21561', 'B23636', 'B26742', 'B21646', 'B27417', 'B27566', 'B27684', 'B26659', 'B22979', 'B29538', 'B23961', 'B23001', 'B28382', 'B29195', 'B23013', 'B20167', 'B28834', 'B26321', 'B26180', 'B23007', 'B27952', 'B21317', 'B27215', 'B26677', 'B22621', 'B24300', 'B22963', 'B20883', 'B21647', 'B26839', 'B28085', 'B25756', 'B24496', 'B20173', 'B31833', 'B34542', 'B14422', 'B14338', 'B13906', 'B13601', 'B15275', 'B14200', 'B14420', 'B11014', 'B15678', 'B15392', 'B12105', 'B17542', 'B17696', 'B14434', 'B17254', 'B10006', 'B15274', 'B11791', 'B13612', 'B11

In [36]:
from collections import Counter

# Tally how often each subject heading occurs across the sermon texts.
subjects = sermons_metadata["subject_headings"]
# Split each "; "-separated entry into headings and strip any periods from
# either end of a heading. Empty cells (read back from the CSV as NaN) are
# skipped instead of crashing on .split, and the comprehension keeps its loop
# variables out of the notebook's global namespace.
all_subjects = [
    heading.strip('.')
    for entry in subjects.dropna()
    for heading in entry.split("; ")
]
Counter(all_subjects)


Counter({'17th century': 3147,
         'Sermons, English': 3100,
         'Sermons': 2875,
         'Bible': 1776,
         'Early works to 1800': 1573,
         'N.T': 880,
         'O.T': 880,
         'Funeral sermons': 454,
         'Church of England': 312,
         'Great Britain': 199,
         'Fast-day sermons': 198,
         'History': 183,
         '16th century': 169,
         'England': 150,
         'Controversial literature': 102,
         'Christian life': 85,
         'Religious aspects': 73,
         'Charles': 73,
         'II,': 63,
         'Civil War, 1642-1649': 60,
         'Catholic Church': 59,
         'Jesus Christ': 51,
         'I,': 50,
         'God': 47,
         'Sermons, American': 47,
         'Commentaries': 44,
         'No Keywords': 43,
         'King of England, 1600-1649': 39,
         'Christianity': 37,
         "Lord's Supper": 37,
         'Visitation sermons': 35,
         'King of England, 1630-1685': 34,
         'Church and state': 33,