In [1]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup as bs
from lxml import etree


In [2]:
# Load the per-document metadata table. Later cells in this notebook read the
# 'plain_txt_filename' and 'source' columns (plus the title/author/publisher
# fields) from each row of this frame.
metadata = pd.read_csv('metadata.csv')

In [3]:
# Convert Windows-style backslash separators in the stored file paths to
# forward slashes. regex=False forces a literal match on every pandas version:
# a lone backslash is not a valid regular expression, and the default value of
# `regex` differs between pandas 1.x and 2.x.
metadata['plain_txt_filename'] = metadata['plain_txt_filename'].str.replace('\\', '/', regex=False)

# Write the normalised paths back to the same CSV (this overwrites the input file).
metadata.to_csv('metadata.csv', index=False)

In [4]:
def page_numbering_hathi(doc):
    """Read a Hathi Trust plain-text file and return it with numbered page breaks.

    Parameters
    ----------
    doc : mapping
        Anything indexable by ``'plain_txt_filename'`` (e.g. a metadata row);
        the value is the path of the UTF-8 text file to read.

    Returns
    -------
    str
        The file content with everything up to and including the first
        ``## p. (#1) ##...`` marker removed (when present), form feeds
        stripped, empty pages dropped, and each remaining page prefixed with
        ``<pb n="K"/>`` where K counts from 1.
    """
    # hathi trust cleaning
    with open(doc['plain_txt_filename'], 'r', encoding='utf-8') as f:
        text = f.read()

    # Get rid of the hathi trust blurb if it exists. The marker is matched
    # with a variable-length run of '#' so that the presence check and the
    # split can never disagree (a fixed-width split used to raise IndexError
    # or leave stray '#' behind), and maxsplit=1 keeps everything after the
    # first marker instead of only the text up to a second one.
    head_and_rest = re.split(r'## p\. \(#1\) #{2,}', text, maxsplit=1)
    if len(head_and_rest) == 2:
        text = head_and_rest[1]

    # add a page one
    text = '<pb/>\n' + text

    # remove form feeds
    text = text.replace('\f', '')

    # mark page breaks
    text = re.sub(r'## p\. .*?\(#\d+\) #+\n*', '<pb/>\n', text)

    # remove empty pages
    text = re.sub(r'<pb/>\s+(?=<pb/>)', '', text)

    # number the pages; pages[0] is whatever precedes the first break
    pages = text.split('<pb/>')
    numbered_pages = (
        f'<pb n="{number}"/>{content}'
        for number, content in enumerate(pages[1:], start=1)
    )
    return pages[0] + ''.join(numbered_pages)



In [9]:
def get_tei_skeleton(series, text_given=None):
    """Build a minimal TEI document for one metadata row and write it to disk.

    Parameters
    ----------
    series : mapping
        A metadata row. The keys read here are 'title_name', 'author_name',
        'publisher_name', 'year_of_publication', 'plain_txt_filename',
        'genre_or_doc_type', 'source', 'original_file_format' and
        'rights_agreement'.
    text_given : str, optional
        Text already containing ``<pb n="K"/>`` markers. When provided (and
        non-empty) each marker becomes a real ``<pb>`` element and the text
        after it becomes that element's tail. Otherwise the file named by
        ``series['plain_txt_filename']`` is read and used as the body text.

    Returns
    -------
    None
        The XML is written under ``tei_formatted/``, mirroring the input path
        with ``plain_text/`` and ``.txt`` removed and ``.xml`` appended.
    """
    # Header creation with metadata
    root = etree.Element("TEI")

    # create the header
    header = etree.SubElement(root, "teiHeader")
    fileDesc = etree.SubElement(header, "fileDesc")

    # title (coerced with str() like the author below, so a missing/NaN value
    # does not make lxml raise TypeError)
    title = etree.SubElement(fileDesc, "title")
    title.text = str(series['title_name'])

    # author
    author = etree.SubElement(fileDesc, "author")
    author.text = str(series['author_name'])

    # publisher
    # NOTE: the f-strings use double quotes outside so the single-quoted keys
    # inside the braces are valid on Python versions before 3.12 as well.
    publisher = etree.SubElement(fileDesc, "publicationStmt")
    publisher.text = (
        f"Published by {series['publisher_name']} "
        f"in the year {series['year_of_publication']}"
    )

    # source
    source = etree.SubElement(fileDesc, "sourceDesc")
    source.text = (
        f"This file can be found at {series['plain_txt_filename']}. "
        f"It is a {series['genre_or_doc_type']} file from {series['source']}. "
        f"The original file format was a {series['original_file_format']} file "
        f"and this document was available to the {series['rights_agreement']}"
    )

    # add text body
    text_element = etree.SubElement(root, "text")
    body = etree.SubElement(text_element, "body")

    if text_given:
        # re.split with one capture group yields
        # [leading, n1, content1, n2, content2, ...]
        pages = re.split(r'<pb n="(\d+)"/>', text_given)

        if pages[0].strip():
            body.text = pages[0]

        # Pair every page number with the content that follows it
        for page_num, page_content in zip(pages[1::2], pages[2::2]):
            # Create actual <pb> element
            pb = etree.SubElement(body, "pb")
            pb.set("n", page_num)
            pb.tail = page_content
    else:
        with open(series['plain_txt_filename'], 'r', encoding='utf-8') as f:
            raw_text = f.read()
        body.text = raw_text

    tree = etree.ElementTree(root)

    # xml file name
    xml_file_name = 'tei_formatted/' + series['plain_txt_filename'].replace("plain_text/","").replace('.txt', '') + '.xml'

    # Make sure the target folder (including any sub-folders carried over from
    # the input path) exists before writing.
    os.makedirs(os.path.dirname(xml_file_name), exist_ok=True)

    tree.write(xml_file_name, encoding='utf-8', xml_declaration=True, pretty_print=True)

In [10]:
# Plain text cleaning: write one TEI file per metadata row.
# Rows whose source is 'Hathi Trust' are page-numbered first; every other row
# passes None, so get_tei_skeleton reads the plain-text file itself.
for index, doc in metadata.iterrows():
    from_hathi = doc['source'] == 'Hathi Trust'
    get_tei_skeleton(doc, page_numbering_hathi(doc) if from_hathi else None)

# Markup
- Large divs: foreword, intro, chapter (`<div type="chapter">`, etc.)
- Headings: titles, section headings (use the `<head>` tag)
- Paragraphs: use the `<p>` tag
- Page breaks: `<pb n="1"/>`