### identify chapter start / end and trim HK text accordingly

In [1]:
import nltk
import csv
import os
from nltk.corpus import PlaintextCorpusReader
import HK_basics as HK
corpus_root = 'text-data/Cambridge MacLehose FineReader OCR'
hakluyt = PlaintextCorpusReader(corpus_root, '.*')
hakluyt_lengths = [HK.HK_page_length(fileid) for fileid in hakluyt.fileids()]
length_fd = nltk.FreqDist(hakluyt_lengths)
print(length_fd[1]+length_fd[2]+length_fd[3],' chapters out of 589 are 3 pages or shorter and thus substantially impacted by imprecise chapter divisions based on pages rather than chapter title location')
print("~125,000 words cast off in trimming")

259  chapters out of 589 are 3 pages or shorter and thus substantially impacted by imprecise chapter divisions based on pages rather than chapter title location
~125,000 words cast off in trimming


core functions:
- title_match: ratio of stopword-filtered words from ledger title matched in the section
- page-approx: flags true once string equals/surpasses maximum text likely to fit on a single page
- user-select: print out page sections with numbers and mark user input

algo:
1. split chapter into line sections
2. assemble first_page
    1. identify highest title_match
    2. print out leger title, max title_match, collect user input
    3. if input == y, store title_match position
    4. else user-select position
3. assemble last_page
    1. identify highest title_match
    2. print out leger title, max title_match, collect user input
    3. if input == y, store title_match position **from end**
    4. else user-select position **from end**
4. new file
    1. assemble from total lines based on start and end indexes
    2. save into new folder

In [75]:
def title_match(title, section):
    """determines ratio of stopword-filtered words from ledger title matched in the section"""
    section_trim = section[:int(len(title)*2)]
    title_filtered = [word for word in title if word not in stop_words]
    matching_words = [word for word in title_filtered if word in section_trim]
    return len(matching_words) / len(title_filtered)
def page_approx(lines):
    '''flags true once lines equal/surpass maximum text likely to fit on a single page'''
    return True if len(' '.join(lines)) >= 2600 else False
def user_select(title, lines):
    '''prints title & page sections w/index and returns user input on decided match'''
    print('_____________________________________________________________________________')
    print('select best match for: ', title)
    print('-----------------------------------------------------------------------------')
    for line in lines:
        if len(line) > 3: 
            print(lines.index(line), ': ', line[:400])
    print('_____________________________________________________________________________')
    print('select best match for: ', title)
    print('-----------------------------------------------------------------------------')
    return(int(input('select line')))
def next_title(raw_title):
    '''based on current title, accesses ledger to determine the next chapter title
    return next chapter title + next chapter starting page for overlap calc'''
    with open('text-data/ledgertagged.csv') as ledgertaggedcsv:
        ledger = list(csv.DictReader(ledgertaggedcsv, delimiter = ","))
        for row in ledger:
            if int(row['vol'])==HK.HK_vol(raw_title) and int(row['chap'])==HK.HK_chap(raw_title):
                if int(row['vol']) == 11 and int(row['chap'])== 43: #checking for last entry
                    return ''
                else:
                    next_title = ledger[ledger.index(row) +1]['title']
                    next_chapter_start_page = int(ledger[ledger.index(row) +1]['pages'][:3])
                    return(" ".join(next_title.split('_')),next_chapter_start_page)

In [27]:
import pandas as pd
sw = pd.read_csv('text-data/stopwords.csv')
#print (sw)
stop_words = [w for w in (sw['nltk'].tolist() + sw['eliz'].tolist() + sw['hk'].tolist() + sw['pronouns'].tolist()) if pd.isnull(w) == False]
#print(stop_words)

In [71]:
test_title = '01_44_NNE-_1313_The_letters_of_Edward_the_second_unto_Haquinus_King_of_Norway_concerning_certain_English_Merchants_a_pp.339-343.txt'
t, p = next_title(test_title)
type(p)

int

In [80]:
filelist = os.scandir('text-data/Cambridge_MacLehose_FineReader_OCR')
newdir = r'text-data/Cambridge_MacLehose_FineReader_OCR_trimmed/'
end_title = '' #initializing flag for known title based on end_title from preceding chapter
char_block = '''||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
                ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||'''
for entry in filelist:
    #skip past completed files
    last_vol_n, last_chap_n = 11, 28
    if HK.HK_vol(entry.name) < last_vol_n or (HK.HK_vol(entry.name) == last_vol_n and HK.HK_chap(entry.name) <= last_chap_n):
        continue
    #determine start line
    with open(entry.path, 'r', encoding="utf8") as f:
        chapter_lines = f.readlines()
        chapter_lines.append(' ') #appending extra line in case I need to end the chapter after the last line when page brak matches chapter break
        chapter_title = " ".join(HK.HK_title(entry.name).split('_'))
        first_page = []
        line_index = 0
        while not (page_approx(first_page) or line_index == len(chapter_lines)):
            first_page.append(chapter_lines[line_index])
            line_index += 1
        if end_title == '':
            print(char_block*2)
            print(entry.name)
            print('find start line')
            best_match = 0
            for line in first_page:
                if title_match(chapter_title, line) > best_match:
                    best_title = line
                    best_match = title_match(chapter_title, line)
            title_approved = input(f'--target: {chapter_title}, \n--auto match: {first_page.index(best_title)} : {best_title[:400]} \n--approve? y/n')
            if title_approved == 'y':
                start_index = first_page.index(best_title)
            else:
                start_index = user_select(chapter_title, first_page)
        else:
            start_index = first_page.index(end_title)
    #determine last line
        print(char_block)
        print(entry.name)
        print('find end line')
        chapter_title, next_chap_start_page = next_title(entry.name)
        #flag if next chapter starts on new page
        _, chapter_end_page = HK.HK_pages(entry.name)
        if next_chap_start_page == chapter_end_page + 1:
            print('||| next chapter starts on new page |||')
        last_page = []
        #print('last page just after creation', last_page)
        line_index = -1
        while not (page_approx(last_page) or line_index == len(chapter_lines)*-1):
            last_page.append(chapter_lines[line_index])
            line_index -= 1
        #print('last page initially assembled', last_page)
        last_page.reverse()
        #print('last page after reversal', last_page)
        #print('last page length', len(last_page))
        best_match = 0
        for line in last_page:
            if title_match(chapter_title, line) > best_match:
                best_title = line
                best_match = title_match(chapter_title, line)
        title_approved = input(f'--target: {chapter_title}, \n--auto match: {best_title[:400]} \n--approve? y/n')
        if title_approved == 'y':
            end_index = last_page.index(best_title)
        else:
            end_index = user_select(chapter_title, last_page)
        #print('end index first established', end_index)
        end_index = end_index - len(last_page) # flipping to count from end
        #print('end index flipped', end_index)
    #write new file
    new_lines = chapter_lines[start_index : end_index]
    #no need for +1 in end_index as I don't want to include next title in the text of the chapter
    new_text = '\n'.join(new_lines)
    #print(f'writing to file, index start {start_index} end {end_index} text {new_text}')
    with open(newdir+entry.name, 'w', encoding="utf8") as f2:
        f2.write(new_text)
    # determine if there's page overlap between current and next chapter, 
    # in which case end title can be reused as start title
    if next_chap_start_page == chapter_end_page:
        end_title = last_page[end_index]
    else: end_title = ''

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
                ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
                ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
11_29_AMER_1579_A_discourse_of_the_West_Indies_and_the_South_sea_written_by_Lopez_Vaz_a_Portugall_conteining_divers__pp.227-290.txt
find start line
--target: A discourse of the West Indies and the South sea written by Lopez Vaz a Portugall conteining divers , 
--auto match: 4 : A discourse of the West Indies and South sea written by Lopez Vaz a Portugal, borne in the citie of Elvas, continued unto the yere 1587. Wherein among divers rare things not hitherto delivered by any other writer, cer-taine voyages of our Englishmen are truely reported: which was intercepted with the author thereof at the ri

ValueError: not enough values to unpack (expected 2, got 0)