Takes the output of FineReader (FR) hot-folder OCR processing, cleans up page headers and footers, and saves the files under the filename template below, drawing on the pdfminer-derived CSV ledger.

01_06_NNE-_0624_The_voyage_of_Bertus_into_Ireland_Anno_684_pp.010-010

basic structure:
- fn process text: take in path and return text sans following:
    - headers & footers
    - ~~word-break hyphens~~
    - ~~linebreaks~~
    - ~~??extra spaces??~~
- fn create filename: take in path (and maybe vol) and return formatted filename drawing on CSV
- for vols 1-11:
    - for chapter entries out of all txt in vol folder:
        - process text
        - create filename
        - save text in new folder

In [1]:
import os 
import csv

In [2]:
def _is_footer(line):
    '''
    tell whether a line belongs to a page footer
    parameters: line: one line of OCR text
    returns: True if the line contains either of the two footer markers
    '''
    return 'https://' in line or 'The material originally positioned' in line


def process_text(path):
    '''
    clean up a text file from FineReader
    parameters: path: text file path
    returns: text as string cleaned of headers & footers based on Cambridge Hakluyt edition

    The file is treated as a sequence of pages, each laid out as
        [up to 4 short header lines] [body lines] [footer lines]
    - header: a line shorter than 35 characters (newline included) at the
      start of the file or directly after a footer
    - footer: a line containing 'https://' or
      'The material originally positioned'
    - page number: a line shorter than 4 characters (newline included) that
      is the last body line of a page before its footer

    Every skipped line is printed so the heuristics can be checked by eye.
    Kept lines retain their own trailing newline and are then joined with
    '\n', so the returned text has a blank line between consecutive lines.
    '''
    header_labels = ('1st', '2nd', '3rd', '4th')
    # utf-8-sig reads plain UTF-8 too, but drops a leading byte-order mark so
    # it can neither end up in the clean text nor distort the length test
    with open(path, 'r', encoding="utf-8-sig") as f:
        print(path)
        text = f.readlines()

    clean_lines = []
    total = len(text)
    i = 0
    while i < total:
        #skip up to 4 short lines - suspects for header remnants
        for label in header_labels:
            if i >= total or len(text[i]) >= 35:
                break
            print('skipped as ' + label + ' header ', text[i])
            i += 1
        # after header, collect body text until footer start
        page_start = len(clean_lines)
        while i < total and not _is_footer(text[i]):
            clean_lines.append(text[i])
            i += 1
        #once footer starts, skip footer lines
        footer_seen = False
        while i < total and _is_footer(text[i]):
            print('skipped footer ', text[i])
            footer_seen = True
            i += 1
        #check for page number that occasionally precedes footer -- now in clean text
        #only lines collected for the current page are candidates, so a page
        #without body text can neither crash nor eat into the previous page
        if footer_seen and len(clean_lines) > page_start and len(clean_lines[-1]) < 4:
            print('skipped page number ', clean_lines[-1])
            clean_lines.pop()
    #returns text as string with newlines preserving line breaks just in case
    return '\n'.join(clean_lines)

def create_filename(vol, file_name, ledger_path='text-data/ledger.csv'):
    '''
    create file name based on format set in pdfminer extract, including date, drawing on CSV ledger
    parameters:
        vol: number of volume as integer
        file_name: text file name; the chapter number is read from the
            characters that follow the 3-character prefix (e.g. '06.') up to
            the first underscore
        ledger_path: path of the CSV ledger; defaults to the project ledger,
            resolved against the current working directory
        returns: new filename as string
    raises:
        ValueError: if no chapter number can be read from file_name, or the
            ledger has no row for this volume and chapter
    input: 06.16_pp_55_94_Libellus_historicus_Johannis_de_Plano_Carpini.txt
    output: 01_16_NNE-_1246_Libellus_historicus_Johannis_de_Plano_Carpini_pp.055-094.txt
    csv headers: vol chap geog date title pages
    '''
    #format vol to match ledger entries
    vol = str(vol).zfill(2)
    #take chapter portion of filename: everything between the prefix and the
    #first underscore, so chapter numbers of any width are read whole
    chap = file_name[3:].split('_', 1)[0]
    if not chap:
        raise ValueError('no chapter number found in file name: ' + repr(file_name))
    #correct for chaps 0 through 9
    chap = chap.zfill(2)
    match = None
    with open(ledger_path, mode='r', newline='') as ledger:
        #iterate over csv rows until find relevant one
        for row in csv.DictReader(ledger):
            if row['vol'] == vol and row['chap'] == chap:
                match = row
                break
    if match is None:
        raise ValueError('no ledger entry for vol ' + vol + ' chap ' + chap
                         + ' (file ' + repr(file_name) + ')')
    name = '_'.join([vol, chap, match['geog'], match['date'], match['title']])
    return name + '_pp.' + match['pages'] + '.txt'

In [5]:
#root folder holding one sub-folder of FineReader text files per volume;
#addressed by path rather than os.chdir so that other relative paths used in
#this notebook (e.g. the ledger opened by create_filename) keep resolving
#against the original working directory
ocr_root = os.path.join('text-data', 'Cambridge MacLehose FineReader OCR')
#iterate through volumes
for vol in range(1, 12):
    #body chapters carry the file prefix '06' in volume 1 and '04' in later volumes
    body_prefix = '06' if vol == 1 else '04'
    #context manager closes the directory iterator once the volume is done
    with os.scandir(os.path.join(ocr_root, str(vol))) as filelist:
        for entry in filelist:
            #identify body chapters
            if not entry.is_file():
                continue
            #skip other sections and names whose 4th character is '0'
            #(presumably the chapter-0 entry of a section -- confirm against the data)
            if not entry.name.startswith(body_prefix) or entry.name[3] == '0':
                continue
            # get clean text and desired filename
            text = process_text(entry.path)
            filename = create_filename(vol, entry.name)
            # create text file directly inside the OCR root folder
            with open(os.path.join(ocr_root, filename), 'w', encoding="utf8") as f:
                f.write(text)

C:\Users\apovzner\Documents\Hakluyt\Cambridge MacLehose FineReader OCR/1\06.10_pp_16_23_The_voyage_of_King_Edgar_with_4000_shippes_round_about_his_large_Monarchie_Anno_973.txt
skipped as 1st header  ﻿a.d.	THE ENGLISH VOYAGES

skipped as 2nd header  c. 890.

skipped footer  Downloaded from https://www.cambridge.org/core. University of Notre Dame Law Library, on 30 Aug 2021 at 13:52:43, subject to

skipped footer  the Cambridge Core terms of use, available at https://www.cambridge.org/core/terms.

skipped footer  https://doi.org/10.1017/CBO9781107286245.018

skipped as 1st header  KING EDGAR	a.d.

skipped as 2nd header  .	. .	973-

skipped footer  Downloaded from https://www.cambridge.org/core. University of Notre Dame Law Library, on 30 Aug 2021 at 13:52:43, subject to

skipped footer  the Cambridge Core terms of use, available at https://www.cambridge.org/core/terms.

skipped footer  https://doi.org/10.1017/CBO9781107286245.018

skipped as 1st header  A.D.

skipped as 2nd header  973-


In [88]:
# smoke test for create_filename: volume 1, chapter 20
sample_name = '06.20_pp_297_299_The_rolle_of_the_huge_Fleete_of_Edward_the_thirde_before_Caleis.txt'
new_name = create_filename(1, sample_name)
print(new_name)

01_20_NNE-_1345_The_rolle_of_the_huge_Fleete_of_Edward_the_thirde_before_Caleis_pp.297-299.txt


In [69]:
# test process_text function on every file of volume 1
# (context manager closes the directory iterator; sub-folders are skipped
# because process_text can only open regular files)
with os.scandir('C:/Users/apovzner/Documents/Hakluyt/Cambridge MacLehose FineReader OCR/1') as filelist:
    for file in filelist:
        if file.is_file():
            print('text: ',process_text(file.path))

C:/Users/apovzner/Documents/Hakluyt/Cambridge MacLehose FineReader OCR/1\01.0_pp_i_ii_Frontmatter.txt
skipped as 1st header  ﻿CAMBRIDGE LIBRARY COLLECTION

skipped as 2nd header  Books of enduring scholarly value

skipped as 3rd header  Maritime Exploration

skipped footer  Downloaded from https://www.cambridge.org/core. University of Notre Dame Law Library, on 30 Aug 2021 at 13:52:44, subject to

skipped footer  the Cambridge Core terms of use, available at https://www.cambridge.org/core/terms.

skipped footer  https://doi.org/10.1017/CBO9781107286245

skipped footer  Downloaded from https://www.cambridge.org/core. University of Notre Dame Law Library, on 30 Aug 2021 at 13:52:44, subject to

skipped footer  the Cambridge Core terms of use, available at https://www.cambridge.org/core/terms.

skipped footer  https://doi.org/10.1017/CBO9781107286245

skipped footer  Downloaded from https://www.cambridge.org/core. University of Notre Dame Law Library, on 30 Aug 2021 at 13:52:44, subject t

In [None]:
#test print short lines: list every line under 35 characters (newline
#included) in the '04.' files of volume 6, i.e. the lines process_text
#would treat as header candidates
with os.scandir('C:/Users/apovzner/Documents/Hakluyt/Cambridge MacLehose FineReader OCR/6') as filelist:
    for file in filelist:
        if file.name.startswith('04.'):
            #read as utf8, the same encoding process_text uses, so the
            #line lengths seen here match the ones it measures
            with open(file.path, 'r', encoding="utf8") as f:
                for line in f:
                    if len(line) < 35:
                        print(line)