# Remove Headers (for data capsule)

This code is derived from my running headers experiments (4.RunningHeadersExperiments.ipynb), which is, in turn, derived from Ted Underwood's running headers code.

Copied text from `4.RunningHeadersExperiments`:

> Relies on using Ted Underwood's code for removing running headers

>https://github.com/tedunderwood/DataMunging/blob/master/runningheaders/HeaderFinder.py

>Note that `pagelist` must be a list of pages in which each page is itself a list of line strings (not one long string), because header lines are removed with `list.pop` (for more information, look at how the pop function works).

>Also, because the function uses `.pop`, it modifies the list of pages you give it. If you run `remove_headers` several times while experimenting and then can't figure out why you get the modified text but not the list of headers, check the order in which you ran things: an earlier call may already have removed the headers from that list.

>Another aspect of pop to watch for is that if you have a page with only a few repeating elements, then the index may get out of range quickly.

>For example, if your page is Table, Reis, 100, and the script considers that a running header, then if you .pop index 1, there will no longer be an index 2.

>To get around this, I changed the sets in repeated into lists, maintaining the sublist order and the interior tuple order. However, within each sublist, I sorted the tuples so that the one with the highest index would be first in the sublist.

>Example: [('TRADE of LISBON', 1), ('Year .', 2)] becomes [('Year .', 2), ('TRADE of LISBON', 1)], so that Year is removed before TRADE of LISBON.

In [None]:
import glob
import os
import shutil

import pandas as pd

## Define HeaderFinder

In [None]:
# HeaderFinder.py
#
# Scans a list of pages for running headers, which we understand as lines, near
# the top of a page, that are repeated within the space of two pages,
# in either direction. The two-page window is necessary because headers
# are sometimes restricted to recto or verso. A very common pattern
# involves different, alternating recto and verso headers. We also use
# fuzzy matching to allow for OCR errors and other minor variation (e.g.
# page numbers that may be roman numerals).
#
# Once headers are identified, they can be treated in a range of different
# ways. The first of these functions is not concerned to *separate* the header
# from the original text but only to identify it so that it can be given extra
# weight in page classification. The second function actually removes them.

# In principle, this could all be done for footers as well. I haven't cared, because
# it wasn't a big problem in the 19c volumes I've worked with so far. That
# could change!

from difflib import SequenceMatcher

def find_headers(pagelist, romannumerals):
    '''Identify repeated page headers and return them as per-page token lists.

    A running header is a line near the top of a page that (fuzzily)
    repeats on one of the two preceding pages.

    pagelist -- list of pages; each page is a sequence of line strings.
        Cathy's note: at one point, I had .splitlines() after page; however,
        that meant that texts with no headers weren't getting added.
        Instead, splitlines when you first import the text.
    romannumerals -- container of strings; a digit-stripped line found in
        it is treated as a page number and skipped.

    Returns a list with one entry per page, in page order. Each entry is the
    list of whitespace-separated tokens of the (digit-stripped) header lines
    found on that page, in the order the lines appear on the page; the entry
    is empty when no header was found. Documents with fewer than five pages
    return an empty list, because the task is not meaningful for them.
    '''

    # For very short documents, this is not a meaningful task.
    if len(pagelist) < 5:
        return []

    # Collect the first two substantial lines of each page. We ignore short
    # lines and lines that are just numbers, and never look deeper than the
    # first five lines of a page.
    firsttwos = []
    for page in pagelist:
        thesetwo = []
        for idx, line in enumerate(page):
            if idx > 4 or len(thesetwo) >= 2:
                break

            line = line.strip()
            # Skip lines that look like markup tags.
            if line.startswith('<') and line.endswith('>'):
                continue

            # We strip all numeric chars before the length check.
            line = "".join(ch for ch in line if not ch.isdigit())

            if line in romannumerals:
                continue

            # That may not get all roman numerals, because of OCR junk, so
            # shrink likely numeral fragments; this pushes bare numerals
            # below the length limit and reduces the edit distance between
            # headers that contain roman numerals.
            for fragment in ("iii", "ii", "xx"):
                line = line.replace(fragment, "")

            if len(line) < 5:
                continue

            thesetwo.append(line)

        firsttwos.append(thesetwo)

    # Compare every page with its two predecessors. One set per page, so a
    # header that matches in both directions is only recorded once.
    repeated = [set() for _ in firsttwos]

    # Starting at index 2 is legal because short documents were filtered out.
    for index in range(2, len(firsttwos)):
        for j in (index - 2, index - 1):
            for lineA in firsttwos[index]:
                for lineB in firsttwos[j]:
                    if SequenceMatcher(None, lineA, lineB).ratio() > .8:
                        repeated[index].add(lineA)
                        repeated[j].add(lineB)

    # Convert the matched lines into token lists. We walk the candidate
    # lines in page order instead of iterating the set, because set order
    # for strings can differ from one interpreter run to the next.
    listoftokenstreams = []
    for candidates, headers in zip(firsttwos, repeated):
        thisstream = []
        for line in candidates:
            if line in headers:
                thisstream.extend(line.split())
                # Emit each distinct header only once per page.
                headers.discard(line)
        listoftokenstreams.append(thisstream)

    return listoftokenstreams



## Define remove_headers

In [None]:
from difflib import SequenceMatcher

def remove_headers(pagelist, romannumerals):
    '''Identify repeated page headers and remove them from the pages.

    A running header is a line near the top of a page that (fuzzily)
    repeats on one of the two preceding pages.

    pagelist -- list of pages; each page must be a *list* of line strings
        (not one string), because header lines are removed with list.pop.
        The pages are modified in place.
    romannumerals -- container of strings; a digit-stripped line found in
        it is treated as a page number and skipped.

    Returns a tuple (finalpages, removed):
        finalpages -- all remaining lines of all pages, flattened into a
            single list in page order.
        removed -- the original header lines that were popped.
    Documents with fewer than five pages are not scanned; they are returned
    flattened with an empty removed list, so the result has the same shape
    for every input.
    '''

    # For very short documents, this is not a meaningful task. Return the
    # same (lines, removed) shape as the normal path so that callers can
    # always index or unpack the result the same way.
    if len(pagelist) < 5:
        return [line for page in pagelist for line in page], []

    # Collect the first two substantial lines of each page. We ignore short
    # lines and lines that are just numbers, and never look deeper than the
    # first five lines. Each candidate is packaged as (transformed line,
    # original line index) so the original line can be deleted later.
    firsttwos = []
    for page in pagelist:
        thesetwo = []
        for idx, line in enumerate(page):
            if idx > 4 or len(thesetwo) >= 2:
                break

            line = line.strip()
            # Skip lines that look like markup tags.
            if line.startswith('<') and line.endswith('>'):
                continue

            # We strip all numeric chars before the length check.
            line = "".join(ch for ch in line if not ch.isdigit())

            if line in romannumerals:
                continue

            # That may not get all roman numerals, because of OCR junk, so
            # shrink likely numeral fragments; this pushes bare numerals
            # below the length limit and reduces the edit distance between
            # headers that contain roman numerals.
            for fragment in ("iii", "ii", "xx"):
                line = line.replace(fragment, "")

            if len(line) < 5:
                continue

            thesetwo.append((line, idx))

        firsttwos.append(thesetwo)

    # Compare every page with its two predecessors. One set per page, so a
    # header that matches in both directions is only recorded once.
    repeated = [set() for _ in firsttwos]

    # Starting at index 2 is legal because short documents were filtered out.
    for index in range(2, len(firsttwos)):
        for j in (index - 2, index - 1):
            for lineA in firsttwos[index]:
                for lineB in firsttwos[j]:
                    # Element 0 of each tuple is the transformed string.
                    if SequenceMatcher(None, lineA[0], lineB[0]).ratio() > .8:
                        repeated[index].add(lineA)
                        repeated[j].add(lineB)

    # Pop the header lines out of each page. Larger line indexes go first;
    # otherwise removing index 1 would shift the later lines and a following
    # pop could hit the wrong line or run out of range.
    removed = []
    for page, headers in zip(pagelist, repeated):
        for _, lineindex in sorted(headers, key=lambda h: h[1], reverse=True):
            removed.append(page.pop(lineindex))

    finalpages = [line for page in pagelist for line in page]

    return finalpages, removed


## Define romannumeralsList

In [None]:
# Lower-case roman numerals (i through xxx) that are treated as bare page
# numbers when looking for running headers. The non-standard spellings
# 'iiii', 'iix' and 'viiii' are kept as separate entries; previously they
# were fused with their neighbours (e.g. 'iviiii'), which left 'iv', 'viii'
# and 'ix' out of the list.
romannumeralsList = ['i', 'ii', 'iii', 'iv', 'iiii', 'v', 'vi', 'vii', 'viii', 'iix',
                 'ix', 'viiii', 'x',
                 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx',
                 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx']

#  Execute the Code
### reminder: change folder names, etc.
Also, I am unsure whether running this on the data capsule's operating system will cause problems (the file paths below use Windows-style separators) - keep an eye out.

I have also modified this to work on a single folder instead of doing it over a series of folders, since hopefully the virtual machine will be able to handle it...surely it is better than my poor little laptop!

In [None]:
%%time

# get list of ht unzipped folders
# (glob returns paths in arbitrary order, so sort for a reproducible run)
htFolderList = sorted(glob.glob('C:\\Users\\cathy\\Documents\\twdb_files\\origFile\\ht_unzip\\*'))

# folder that receives one header-free .txt file per volume
# reminder: change folder names, etc.
outputFolder = 'C:\\Users\\cathy\\Documents\\twdb_files\\origFile\\ht_runningheadersremoved\\'

# to track completion: a folder is recorded as soon as work on it starts, so
# after an error the last entry is the folder that was being processed
processedFolders = []
removedHeaders = []

# iterate through each folder

# note, you may want to do this in chunks if you are worried about errors or an old laptop!

for folder in htFolderList:

    folderID = os.path.basename(folder)

    processedFolders.append(folderID)

    # in each unzipped folder, create a list of the page files.
    # Header detection compares neighbouring pages and the aeu.ark branch
    # skips the *first* four pages, so the pages must be in reading order.
    # NOTE(review): sorting assumes the page file names sort in reading order
    # (e.g. zero-padded numbers) - confirm against the unzipped data.
    pageFolderList = sorted(glob.glob(os.path.join(folder, '*', '*')))

    # remove first four pages technical microfilm info from aeu.ark pages;
    # every other volume keeps all of its pages
    if 'aeu.ark' in folderID:
        pagesToRead = pageFolderList[4:]
    else:
        pagesToRead = pageFolderList

    # pageList will hold each page as a list of its lines
    pageList = []
    for page in pagesToRead:
        with open(page, 'r', encoding='utf-8') as f:
            pageList.append(f.read().splitlines())

    # run remove_headers on the series of pages, then join the returned lines into a single string
    processedText = remove_headers(pageList, romannumeralsList)
    textWithNoHeaders = '\n'.join(processedText[0])
    # removedHeaders.append(processedText[1])

    # write the text, with no headers, to a .txt file named after the text identifier;
    # the with-block closes the file even if the write fails
    with open(outputFolder + folderID + '.txt', 'w', encoding='utf-8') as tempFile:
        tempFile.write(textWithNoHeaders)


## identify and combine the LongSFiles metadata

In [None]:
# Collect the path of every longSfiles.txt metadata file.
# NOTE(review): glob.glob is called without recursive=True, so '**' matches
# exactly one directory level here (same as '*') - confirm that is intended.
longSPattern = 'C:\\Users\\cathy\\Documents\\twdb_files\\origFile\\ht_zip_ocrNorm\\**\\longSfiles.txt'
ht_LongSGlobbed = glob.glob(longSPattern)
# ht_LongSGlobbed

In [None]:
# Read every longSfiles.txt that was found and concatenate the contents,
# in the order the paths were globbed, into one string.
# NOTE(review): no encoding is given, so the platform default is used -
# confirm that matches how these files were written.
longSfiles = []

for longSPath in ht_LongSGlobbed:
    with open(longSPath, 'r') as handle:
        contents = handle.read()
    longSfiles.append(contents)

longSfilesText = ''.join(longSfiles)

In [None]:
# tempFile = open('C:\\Users\\cathy\\Documents\\twdb_files\\plaintext\\txt_ht\\longSfiles_ht.txt', "w")
# tempFile.write(longSfilesText)
# tempFile.close()

## Separate the vol metadata and plaintext

In [None]:
# Collect the path of every volume-level metadata file (*vol.tsv).
# NOTE(review): without recursive=True, '**' matches exactly one directory
# level (same as '*') - confirm that is intended.
volPattern = 'C:\\Users\\cathy\\Documents\\twdb_files\\origFile\\ht_zip_ocrNorm\\**\\*vol.tsv'
ht_volGlobbed = glob.glob(volPattern)
# show how many metadata files were found
len(ht_volGlobbed)

In [None]:
# Copy each volume metadata file into the vol_metadata folder.
volMetadataFolder = 'C:\\Users\\cathy\\Documents\\twdb_files\\plaintext\\txt_ht\\vol_metadata'
for volPath in ht_volGlobbed:
    shutil.copy(volPath, volMetadataFolder)

In [None]:
# Collect the path of every cleaned plaintext file (*clean.txt).
# NOTE(review): without recursive=True, '**' matches exactly one directory
# level (same as '*') - confirm that is intended.
cleanPattern = 'C:\\Users\\cathy\\Documents\\twdb_files\\origFile\\ht_zip_ocrNorm\\**\\*clean.txt'
ht_cleanGlobbed = glob.glob(cleanPattern)
# show how many cleaned text files were found
len(ht_cleanGlobbed)

In [None]:
# Copy each cleaned plaintext file into the text folder.
cleanTextFolder = 'C:\\Users\\cathy\\Documents\\twdb_files\\plaintext\\txt_ht\\text'
for cleanPath in ht_cleanGlobbed:
    shutil.copy(cleanPath, cleanTextFolder)