In [4]:
import csv, os, re, shutil,json
import pandas as pd 
# Root of the EP (EarlyPrint) texts; findTextEP looks inside sub-folders named
# after the first three characters of a TCP ID (e.g. `<EP>/A00/`).
EP = '/Users/amycweng/Digital Humanities/eebotcp/texts'
# Root of the TCP XML files; findTextTCP looks inside sub-folders named
# `P1<xx>` / `P2<xx>`, where `<xx>` is the first two characters of a TCP ID.
TCP = '/Users/amycweng/Digital Humanities/TCP'

Identify texts that contain sermons in EEBO-TCP using metadata.

In [3]:
metadataFolder = '/Users/amycweng/Digital Humanities/ECBC-Data-2022/TCP metadata'
sermons = []                # one metadata dict per text that likely contains sermons
count_subjectheadings = []  # TCP IDs that were matched through their subject headings


def _as_text(value):
    """Return a metadata cell as a string, mapping missing values (NaN/None) to ''."""
    return '' if pd.isna(value) else str(value)


# Sort the listing so the order of `sermons` is reproducible between runs, and
# skip anything that is not a CSV (e.g. `.DS_Store`), which read_csv cannot parse.
for csvFile in sorted(os.listdir(metadataFolder)):
    if not csvFile.lower().endswith('.csv'):
        continue
    data = pd.read_csv(os.path.join(metadataFolder, csvFile))

    # Columns of the TCP metadata CSVs as documented by the original author:
    # id,stc,estc,title,author,publisher,pubplace,subject_headings,date
    # NOTE(review): the original code read the headings from a `keywords` column
    # instead; accept either name until the real schema is confirmed.
    headings_column = 'keywords' if 'keywords' in data.columns else 'subject_headings'

    for record in data.to_dict('records'):
        tcpID = record['id']
        orig_title = _as_text(record['title'])
        # Lower-case and drop apostrophes so spellings such as "preach'd"
        # do not hide the keywords searched for below.
        title = orig_title.lower().replace("'", '')

        # De-duplicate authors while keeping their first-seen order
        # (a plain set() would shuffle them differently on every run).
        authors = "; ".join(dict.fromkeys(_as_text(record['author']).split("; ")))

        subject_headings = (
            _as_text(record[headings_column]).replace(" -- ", "; ").replace("  ", " ")
        )

        # First look in the subject headings ...
        in_headings = 'sermon' in subject_headings.lower()
        if in_headings:
            count_subjectheadings.append(tcpID)
        # ... and fall back to the title only when the headings did not match.
        hit = in_headings or re.search(r'preached|preacht', title) is not None

        if hit:
            sermons.append({
                "id": tcpID,
                "estc": record['estc'],
                "stc": record['stc'],
                "title": orig_title,
                "authors": authors,
                "publisher": record['publisher'],
                "pubplace": record['pubplace'],
                "subject_headings": subject_headings,
                "date": f"{record['date']}",
            })

In [41]:
# Store the relevant metadata in a CSV file.
# Fall back to the known column list when no sermon was found, so that an
# empty result still produces a valid, header-only CSV instead of an IndexError.
sermon_fields = list(sermons[0].keys()) if sermons else [
    "id", "estc", "stc", "title", "authors",
    "publisher", "pubplace", "subject_headings", "date",
]
# The csv module requires newline="" on the file object (otherwise blank rows
# appear on some platforms); the explicit encoding keeps non-ASCII titles intact.
with open("sermons.csv", "w", newline="", encoding="utf-8") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=sermon_fields)
    writer.writeheader()
    writer.writerows(sermons)

# Store the TCP IDs that were identified through their subject headings.
with open("sermons_subjectheadings.json", "w", encoding="utf-8") as outfile:
    json.dump(count_subjectheadings, outfile)

print(f"{len(sermons)} TCP XML files are likely to contain sermons.") 
print(f"{len(count_subjectheadings)} texts containing sermons are identified by the Library of Congress' subject headings")
print(f"{len(sermons)-len(count_subjectheadings)} texts are not identified by subject heading as a sermon but mention 'preached' or 'preacht' in the title")

5535 TCP XML files are likely to contain sermons.
5241 texts containing sermons are identified by the Library of Congress' subject headings
294 texts are not identified by subject heading as a sermon but mention 'preached' or 'preacht' in the title


Copy all the TCP XML files of the texts to a separate folder for easy browsing. 

In [None]:
# Reload only the TCP IDs from the CSV written earlier, so this cell can run
# without repeating the metadata scan. Note that `sermons` is rebound here: it
# was a list of metadata dicts above and is a pandas Series of ID strings from now on.
sermons = pd.read_csv("sermons.csv")["id"]

def findTextTCP(id):
    """Return the path of the TCP XML file (``<id>.P4.xml``) for a TCP ID.

    The file is looked up under the module-level ``TCP`` root, in a folder
    named ``P1<xx>`` or ``P2<xx>`` where ``<xx>`` is the first two characters
    of the ID.

    * IDs starting with ``B1`` or ``B4`` are always mapped to the ``P2``
      folder; as in the original code, that path is returned without checking
      that the file exists.
    * Every other ID is searched for in the ``P1`` folder first, then in ``P2``.

    Parameters
    ----------
    id : str
        TCP identifier, e.g. ``'A00001'``.

    Returns
    -------
    str
        Path of the XML file.

    Raises
    ------
    FileNotFoundError
        If the file is in neither the ``P1`` nor the ``P2`` folder (the
        original code failed here with an unrelated ``UnboundLocalError``).
    """
    prefix = id[0:2]
    filename = f'{id}.P4.xml'

    if re.match('B1|B4', prefix):
        return f'{TCP}/P2{prefix}/{filename}'

    # Test for the file directly instead of listing the whole folder on every
    # call; this also tolerates a missing P1 folder.
    for folder in (f'P1{prefix}', f'P2{prefix}'):
        candidate = f'{TCP}/{folder}/{filename}'
        if os.path.isfile(candidate):
            return candidate

    raise FileNotFoundError(
        f'{filename} was not found in {TCP}/P1{prefix} or {TCP}/P2{prefix}'
    )

# Copy the TCP XML file of every sermon text into one folder.
tcp_destination = '/Users/amycweng/Digital Humanities/sermonsTCP'
# shutil.copy treats a destination that does not exist as a *file name*, so
# without this every iteration would overwrite one file called "sermonsTCP".
os.makedirs(tcp_destination, exist_ok=True)
for s in sermons: 
    path = findTextTCP(s)
    shutil.copy(path, tcp_destination)

Copy all the EP (EarlyPrint) XML files of the texts to a separate folder for easy browsing. 

In [45]:
# TCP ID -> list of EP file paths, for texts whose EP version is split across
# several files (file names containing an underscore).
underscores = {}
def findTextEP(tcpID):
    """Return the path of the single EP (EarlyPrint) XML file for a TCP ID.

    Files are searched for in ``<EP>/<first three characters of tcpID>/``,
    where ``EP`` is the module-level root folder. A file matches when its name
    contains ``tcpID``.

    Matching files whose name contains an underscore are treated as parts of a
    text that was split into several EP files: their paths are appended to the
    module-level ``underscores`` dict (and the name is printed) rather than
    returned.

    Parameters
    ----------
    tcpID : str
        TCP identifier, e.g. ``'A00001'``.

    Returns
    -------
    str or None
        Path of the first matching file without an underscore, or ``None``
        when there is no such file, when the ID starts with ``B43``, or when
        the prefix folder does not exist.
    """
    # The original guard tested the global loop variable `s` instead of the
    # argument, so it only worked when called from that particular loop.
    if tcpID.startswith("B43"): return None # there is no folder in the EP texts that starts with B43 

    folder = f'{EP}/{tcpID[0:3]}'
    if not os.path.isdir(folder):
        # Same outcome as the B43 case, for any other prefix without a folder.
        return None

    path = None
    # Sorted so the result does not depend on the file system's listing order.
    for file in sorted(os.listdir(folder)):
        if tcpID in file: 
            if '_' in file: 
                print(file)
                # account for the fact that some individual TCP files have been 
                # sectioned into multiple EP files due to size 
                underscores.setdefault(tcpID, []).append(f'{folder}/{file}')
            else: 
                path = f'{folder}/{file}' 
                break   
    return path

# Copy the EP XML file of every sermon text into one folder and remember the
# TCP IDs for which findTextEP returned no single file.
ep_destination = '/Users/amycweng/Digital Humanities/sermonsEP'
# shutil.copy treats a destination that does not exist as a *file name*, so
# without this every iteration would overwrite one file called "sermonsEP".
os.makedirs(ep_destination, exist_ok=True)
missing = []
for s in sermons: 
    path = findTextEP(s)
    if not path: 
        missing.append(s)
    else: 
        shutil.copy(path, ep_destination)


In [47]:
# Report the outcome of the EP lookup: the texts split across several EP files,
# how many sermons were found / not found, and the IDs that were not found.
missing_count = len(missing)
found_count = len(sermons) - missing_count
print(underscores)
print(f"There are {found_count} sermons in EP. {missing_count} TCP sermons are missing from EP.")
print("The TCP ids of the missing texts: ", missing)

{}
There are 5108 sermons in EP. 427 TCP sermons are missing from EP.
The TCP ids of the missing texts:  ['A95720', 'B25417', 'B26318', 'B29175', 'B25291', 'B23227', 'B26965', 'B29289', 'B26662', 'B29151', 'B23303', 'B29077', 'B23225', 'B25935', 'B24387', 'B26367', 'B24299', 'B27584', 'B29537', 'B22572', 'B24122', 'B28835', 'B22620', 'B26664', 'B22604', 'B21648', 'B21644', 'B25240', 'B23299', 'B23761', 'B28285', 'B20731', 'B26787', 'B27727', 'B23952', 'B20800', 'B26345', 'B26249', 'B23004', 'B21561', 'B23636', 'B26742', 'B21646', 'B27417', 'B27684', 'B26659', 'B22979', 'B28836', 'B29538', 'B23961', 'B23001', 'B23750', 'B28382', 'B29195', 'B23013', 'B20167', 'B28834', 'B26321', 'B26180', 'B26466', 'B23007', 'B27952', 'B21317', 'B26784', 'B27215', 'B26622', 'B26677', 'B22621', 'B26714', 'B24300', 'B22963', 'B20883', 'B21647', 'B22971', 'B26839', 'B28085', 'B25756', 'B24496', 'B20173', 'B31833', 'B14422', 'B14334', 'B15398', 'B14338', 'B13906', 'B13601', 'B15555', 'B15275', 'B14200', 'B14