Using pdfminer's high-level API functions, this notebook extracts text from the Cambridge ebook of the MacLehose edition and saves it into separate txt files, one per chapter, noting the volume and section number [within the MacLehose edition], the geographical region [corresponding to the volume divisions of the 1599 edition], date, title [per CambridgeCore encoding] and page range.

In [1]:
# set working directory
# NOTE: the path is relative, so this cell only succeeds when the current directory
# is the one containing 'text-data'; re-running it in the same session starts from
# the already-changed directory and will normally fail
import os
os.chdir('text-data/CambridgeCore MacLehose PDFs')
# show the resulting absolute path as a sanity check
print(os.getcwd())

C:\Users\apovzner\Documents\Hakluyt\text-data\CambridgeCore MacLehose PDFs


In [None]:
#pdf imports
# pdfminer.high_level.extract_text(pdf_file, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None)
# Parameters:
    # pdf_file – Either a file path or a file-like object for the PDF file to be worked on.
    # page_numbers – List of zero-indexed page numbers to extract.
    # maxpages – The maximum number of pages to parse
# Returns:
# a string containing all of the text extracted.

import pdfminer
from pdfminer.high_level import extract_text
from pdfminer.high_level import extract_pages

import re

In [None]:
def remove_blank_lines(text):
    '''
    remove blank lines from text given as list of lines
    parameters: text: chapter text split into lines; the list is modified in place
    returns: the same list object, now holding only the non-empty lines
    note: only exactly-empty strings ('') are dropped; whitespace-only lines are kept
    '''
    # slice assignment filters in a single pass while keeping the in-place contract
    # (any caller holding a reference to the list sees the blank lines removed)
    text[:] = [line for line in text if line != '']
    return text

def find_date(headers):
    '''
    extract date from list of chapter headers
    parameters: headers: list of lines from chapter headers
    returns: date identified as the most common numerical component of the header lines,
             zero-padded to 4 digits; 'XXXX' if there are no numbers or the most common
             number is not strictly between 300 and 1620
    note: when several numbers are equally common, the one appearing first wins
    '''
    from collections import Counter  # function-scope import keeps the block self-contained

    # join with a separator so digits ending one header line cannot fuse with
    # digits starting the next one (e.g. '... 12' + '1580 ...' -> '121580')
    numbers = re.findall(r'[0-9]+', '\n'.join(headers))
    if not numbers:
        return 'XXXX'

    # most_common orders equal counts by first appearance, so ties resolve the same
    # way on every run (iterating a set of strings does not guarantee that)
    common_number = Counter(numbers).most_common(1)[0][0]

    if 300 < int(common_number) < 1620:
        return common_number.zfill(4)
    return 'XXXX'

def empty_page(lines):
    '''
    decide whether a page holds too little text to count as content
    parameters: lines: page text as a list of lines
    returns: True if the page has fewer than 10 lines, or if its lines average
             fewer than 4 characters; False otherwise
    '''
    line_count = len(lines)
    if line_count < 10:
        return True
    # at least 10 lines here, so the average is always well defined
    total_chars = sum(map(len, lines))
    return total_chars / line_count < 4

def chapter_process(chapter):
    '''
    extract chapter text free of headers & footers and determine date
    parameters: chapter: path of pdf chapter
    returns: 
        chapter text as single string, with every run of whitespace collapsed to one space
        date with leading zeroes to 4 digits or XXXX if failed to extract
    possible enhancements:
        crop out side notes
    '''
    headers = []
    chapter_text_list = []
    # extract_pages is used only to count the pages; counting lazily avoids
    # holding every page layout in memory at once
    chapter_pages = sum(1 for _ in extract_pages(chapter))

    for pagenum in range(chapter_pages):
        # process page by page, clearing headers, footers & blank lines; storing headers for date extract
        text = extract_text(chapter, page_numbers = [pagenum])
        lines_sans_blanks = remove_blank_lines(text.splitlines())
        if empty_page(lines_sans_blanks): continue
        # the first 3 lines are treated as the running header and kept for date detection
        headers += lines_sans_blanks[:3]
        if not ('.1_pp' in chapter and pagenum == 0):  #remove header except for first page of first chapter in each volume, which has no header
            del lines_sans_blanks[:3]
        del lines_sans_blanks[-4:] # remove footer (last 4 lines)
        chapter_text_list += lines_sans_blanks

    # resolve hyphen-broken words at ends of lines, excepting the last line:
    # a line ending in '-' loses the hyphen and is glued to the following line.
    # The glued text is carried forward, so a continuation line that itself ends
    # in '-' is resolved as well.
    merged_lines = []
    pending = ''
    last_index = len(chapter_text_list) - 1
    for i, line in enumerate(chapter_text_list):
        line = pending + line
        pending = ''
        if i < last_index and line.endswith('-'):
            pending = line[:-1]
        else:
            merged_lines.append(line)

    #joining list of lines into one string and cleaning out extra spaces
    chapter_text_string = ' '.join(merged_lines)
    # raw string: '\s' in a plain string literal is an invalid escape sequence
    chapter_text_string = re.sub(r'\s+', ' ', chapter_text_string)
    return (chapter_text_string, find_date(headers))

def vol_chap_geog_prange(vol, chapter):
    '''
    build the file-name components for one chapter pdf
    parameters: vol as int; chapter as path of pdf chapter, shaped like
                '<vol>/<section>.<chap>_pp_<start>_<end>_<title>.pdf'
    returns a tuple of strings:
            volume number zero-padded to 2 digits
            chapter number zero-padded to 2 digits
            geographical code determined by volume / chapter numbers:
                01.01-04.04: NNE-
                04.05-06.17: SSE1
                06.18-07.16: SSE2
                07.17-11.43: AMER
            title as extracted from the file name (text after the 4th underscore,
                without the 4-character extension)
            page range as 'start-end', each number zero-padded to 3 digits
    '''
    def skip_field(name):
        # drop everything up to and including the first underscore
        return name[name.find('_') + 1:]

    def leading_field(name):
        # keep everything before the first underscore
        return name[:name.find('_')]

    # the chapter number sits between the first dot and the first underscore
    chap = chapter[chapter.find('.') + 1:chapter.find('_')]

    # volumes in which the region changes part-way through:
    # (volume, first chapter of the later region, earlier code, later code)
    split_volumes = (
        (4, 5, 'NNE-', 'SSE1'),
        (6, 18, 'SSE1', 'SSE2'),
        (7, 17, 'SSE2', 'AMER'),
    )
    if vol in (1, 2, 3):
        geog = 'NNE-'
    elif vol == 5:
        geog = 'SSE1'
    else:
        geog = 'AMER'
        for split_vol, first_late_chap, early_code, late_code in split_volumes:
            if vol == split_vol:
                # the chapter number is parsed only for the volumes that need it
                geog = early_code if 0 <= int(chap) < first_late_chap else late_code
                break

    # title: strip the extension, then discard the section, 'pp' and both page fields
    title = chapter[:-4]
    for _ in range(4):
        title = skip_field(title)

    # pages: the 3rd and 4th underscore-separated fields of the path
    pages = skip_field(skip_field(chapter))
    page_start = leading_field(pages)
    page_end = leading_field(skip_field(pages))
    page_range = page_start.zfill(3) + '-' + page_end.zfill(3)

    return (str(vol).zfill(2), chap.zfill(2), geog, title, page_range)



In [None]:
# iterate over volumes, then over chapter pdfs excluding front matter etc
for vol in range(1,12):
    # scandir is used as a context manager so the directory handle is released
    # as soon as the volume has been processed
    with os.scandir(os.getcwd() + '/' + str(vol)) as filelist:
        for entry in filelist:
            if not entry.is_file():
                continue
            # body chapters are the files prefixed '06' in volume 1 and '04' in later
            # volumes, whose 4th character (first digit of the chapter number) is not '0'
            if (((vol == 1 and entry.name.startswith('06'))
                or (vol > 1 and entry.name.startswith('04')))
                and entry.name[3] != '0'):
                    print(str(vol) + '_' + entry.name)
                    chapter = str(vol) + '/' + entry.name

                    #extract and save text, extract file name components and fit into file name
                    chapter_text, date = chapter_process(chapter)
                    vol_n, chap_n, geog, title, page_range = vol_chap_geog_prange(vol, chapter)
                    filename = vol_n + '_' + chap_n + '_' + geog + '_' + date + '_' + title + '_pp.' + page_range

                    #create text file; utf-8 is set explicitly because the platform default
                    #encoding may be unable to represent characters in the extracted text
                    with open(filename + '.txt', 'w', encoding='utf-8') as f:
                        f.write(chapter_text)

In [None]:
# single file creation for debugging
# vol is set explicitly: the test pdf is read from the volume-1 folder, and reusing
# whatever value an earlier cell left in vol would give the wrong volume prefix
# and geographical code in the output file name
vol = 1
pdf_to_test = '06.42_pp_327_327_A_mandate_of_King_Edward_the_first_concerning_outlandish_Merchants'
chapter = str(vol) + '/' + pdf_to_test +'.pdf'

#extract and save text, extract file name components and fit into file name
chapter_text, date = chapter_process(chapter)
vol_n, chap_n, geog, title, page_range = vol_chap_geog_prange(vol, chapter)
filename = vol_n + '_' + chap_n + '_' + geog + '_' + date + '_' + title + '_pp.' + page_range

#create text file; utf-8 is set explicitly because the platform default encoding
#may be unable to represent characters in the extracted text
with open(filename + '.txt', 'w', encoding='utf-8') as f:
    f.write(chapter_text)

# Everything from here onward is testing code; please ignore

In [None]:


# filename format: 01 01 GEOG date TITLE [pagerange]
vol = 10
chapter = str(vol)+'/04.15_pp_178_183_The_voyage_and_valiant_fight_of_The_Content.pdf'

# import pypdf2
# from pypdf2 import pdffilereader

# NOTE: this rebinding replaces the volume-10 path assigned above, so only the
# cropped test pdf is read by the print below
chapter = 'croppeddoc2.pdf'
# print the text of the first page only (page_numbers is zero-indexed)
print(extract_text(chapter, page_numbers = [0]))


# disabled: full extract-and-save pipeline for the chapter selected above
# #extract and save text, extract file name components and fit into file name
# chapter_text, date = chapter_process(chapter)
# vol_n, chap_n, geog, title, page_range = vol_chap_geog_prange(vol, chapter)
# filename = vol_n + '_' + chap_n + '_' + geog + '_' + date + '_' + title + '_pp.' + page_range

# #create text file
# with open(filename + '.txt', 'w') as f:
#     f.writelines(chapter_text)

# # print(chapter_text, date)
# # print(extract_text(chapter, page_numbers = [0]))

In [None]:
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure

# layout-analysis parameters for the page aggregator
#word_margin of 0.5 / 0.7 seems to glue together the words that otherwise separate into disparate characters.
#the high-level extract_text function also accepts a laparams argument
laparams = LAParams(line_overlap=0.5, char_margin=2.0, line_margin=0.5, word_margin=0.5, boxes_flow=-1, detect_vertical=False, all_texts=False)

# NOTE(review): this path is relative to the current working directory - confirm it
# resolves from wherever the notebook is sitting when this cell runs
# the pdf is opened in a with-block so the file handle is closed once all pages are processed
with open('text-data/archive/other editions and text sources/MacLehose/04.25_pp_114_147_Libellus.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        # interpret the page, then fetch the layout the aggregator built for it
        interpreter.process_page(page)
        layout = device.get_result()
        # NOTE(review): parse_layout is not defined in the cells shown here; it must
        # be defined before this cell is run
        parse_layout(layout)

In [None]:
# layout-analysis parameters for the high-level extraction below
laparams = LAParams(line_overlap=0.5, char_margin=2.0, line_margin=0.5, word_margin=0.5, boxes_flow=-1, detect_vertical=False, all_texts=False)

# laparams is passed explicitly; without it extract_text ignores the parameters built above
text = extract_text('text-data/archive/other editions and text sources/MacLehose/04.25_pp_114_147_Libellus.pdf', laparams=laparams)
print(text)