This notebook is intended to navigate the [pairtree](https://confluence.ucop.edu/display/Curation/PairTree) format in which HathiTrust full-text data is delivered, via rsync, in response to a dataset request.

The basic workflow is as follows:
1. Drill down to final directory that holds volume data, starting at the directory that holds the highest level of pairtree data (in HT, this is the folder that is named with the institutional prefix for volumes in the dataset from that institution, e.g. 'mdp' for U. Michigan)
2. Create a new directory based on the HTID of the volume, and move the textfiles to the new directory
    * `htidExtractor` is used to parse the filepath to generate the HTID used as the folder name
3. With a directory that holds folders of textfiles, one folder per volume, use `load_vol` and `clean_vol` to read in each page, find running headers and footers, remove them, and then concatenate the pages into a single, clean textfile in a new output directory

There are some variables that will need to be manually changed to make this workflow work for a given project, and these will be flagged with codes in the comments.

**Note: you need to download this GitHub repo and move it to the same folder where this Jupyter notebook is: https://github.com/htrc/HTRC-Tools-RunningHeaders-Python.** Use the green `clone or download` button on the right, then unzip the downloaded file (which will yield a folder called `htrc`) and move it where this Jupyter notebook is located.

The other libraries we are using are relatively standard, but can be downloaded using `pip` if you do not have them already. If you use Python with Anaconda, it's likely you already have them. If you do not, the `import` statement will fail.

In [24]:
import os
import glob
import re
import shutil

from collections import defaultdict
from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict

from htrc.models import Page, PageStructure, HtrcPage
from htrc.utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten 
from htrc.runningheaders import parse_page_structure

In [17]:
# A FUNCTION THAT WILL PARSE A PAIRTREE PATH TO GENERATE AN HTID

def htidExtractor(file_path:str,start:str,end:str,skip_str:str) -> str:
    '''
    Build an HTID string from a single pairtree file path.

    The text between the first occurrence of ``start`` and the last occurrence
    of ``end`` is split on '/', and its final component is dropped. The first
    remaining component becomes the prefix, followed by '.', and every other
    component except those equal to ``skip_str`` is concatenated after it.

    :param file_path: a single filepath string
    :param start: the redundant "parent folder"; literal text preceding the HTID substring
    :param end: the redundant "children folder"; literal text following the HTID substring
    :param skip_str: a folder name to leave out of the HTID
    :return: the HTID assembled from the path
    :rtype: str
    :raises ValueError: if ``start``/``end`` do not delimit anything in
        ``file_path``, or if nothing is left once the final component is dropped
    '''
    # start/end are literal path fragments, so they are escaped: pairtree names
    # can contain regex metacharacters such as '+' and '.'.
    pattern = '%s(.*)%s' % (re.escape(start), re.escape(end))
    match = re.search(pattern, file_path)
    if match is None:
        raise ValueError(
            'could not find %r ... %r in path %r' % (start, end, file_path))

    # Drop the last path component of the captured text.
    components = match.group(1).split('/')[:-1]
    if not components:
        raise ValueError(
            'no directory components between %r and %r in path %r'
            % (start, end, file_path))

    prefix = components[0]
    remainder = ''.join(part for part in components[1:] if part != skip_str)
    return prefix + '.' + remainder

MODIFY CODE:
    At the end of the pairtree, in the final directory, there is a zip file for the volume that holds the text files for all of its pages.
- Need to expand the zip, then move the resulting folder to the pages directory
    - Copy the zips and then expand them in a new directory
    - Moving is an option too, and would be quicker

Also hard-code a check to skip files ending in ' 2.zip' and look only for 'xx.zip' (no whitespace and no 2)

In [37]:
# DEFINING A PATH TO THE DIRECTORY WHERE THIS NOTEBOOK IS LOCATED IN THE NAME OF LESS TYPING
root = os.getcwd()

# UPDATE THESE VARIABLES BASED ON YOUR DIRECTORY STRUCTURE!
data_dir = root+'/data-download/' # folder that holds all pairtree top folders
end = '.txt' # extension of type of file you're looking for (not used by the loop below, which looks for .zip)
skip_string = 'pairtree_root' # a part of the filepath to disregard when generating an HTID
output_path = root+'/pages/' # the folder to which you want the volume files to be copied
metadata_out_dir = root+'/meta/'

# ITERATE THROUGH PAIRTREE STRUCTURE AND FIND AND COPY EACH VOLUME'S ZIP FILE
# The walk variables have their own names so that os.walk does not overwrite `root`.
for dirpath, _dirnames, filenames in os.walk(data_dir, topdown=False):
    for zip_name in filenames:
        # Disregarding files that start with "." because on Mac, you'll get hidden .DSstore files:
        if zip_name.startswith('.') or not zip_name.endswith('.zip'):
            continue
        # Names ending in ' 2.zip' are skipped, per the notes in this notebook
        # (presumably duplicate copies of a volume zip -- confirm).
        if zip_name.endswith(' 2.zip'):
            continue
        print(zip_name)
        final_path = os.path.join(dirpath, zip_name)
        # The folder is named after the zip file itself (name minus '.zip').
        out_dir = output_path+zip_name[:-4]
        print(out_dir)
        # exist_ok makes the cell safe to re-run; the copy happens either way.
        os.makedirs(out_dir, exist_ok=True)
        shutil.copy(final_path, out_dir)

mdp.39015007870481.zip
/Users/rdubnic2/Desktop/JupyterNotebooks/pages/mdp.39015007870481
ark+=13960=t3mw3px6k.zip
/Users/rdubnic2/Desktop/JupyterNotebooks/pages/ark+=13960=t3mw3px6k
txa.tarb004288.zip
/Users/rdubnic2/Desktop/JupyterNotebooks/pages/txa.tarb004288
ien.35556044272359.zip
/Users/rdubnic2/Desktop/JupyterNotebooks/pages/ien.35556044272359


In [39]:
# IPython help query: the trailing `?` displays the documentation for glob.glob (notebook-only syntax, not plain Python)
glob.glob?

In [38]:
import zipfile  # imported here because only this cell needs it

print(output_path)

# EXPAND EACH VOLUME ZIP THAT WAS COPIED INTO ITS OWN FOLDER UNDER output_path
for zip_path in glob.glob(output_path+'*/*.zip'):
    vol_dir = os.path.dirname(zip_path)
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.infolist():
            # Only page textfiles are wanted; directory entries and other files are ignored.
            if member.is_dir() or not member.filename.endswith('.txt'):
                continue
            # Write each page directly into the volume folder, whatever folder
            # nesting the archive uses. Using only the base name also keeps a
            # member path from escaping vol_dir.
            target = os.path.join(vol_dir, os.path.basename(member.filename))
            with archive.open(member) as src, open(target, 'wb') as dst:
                shutil.copyfileobj(src, dst)
    # Remove the zip once it has been expanded, as sketched in the original notes for this cell.
    os.remove(zip_path)

/Users/rdubnic2/Desktop/JupyterNotebooks/pages/


In [13]:
# A FUNCTION USED TO LOAD A VOLUME INTO MEMORY IN A FORMAT THAT OUR HEADER/FOOTER CLEANER TAKES AS INPUT
def load_vol(path: str, num_pages: int) -> List[HtrcPage]:
    """Read the page textfiles of one volume into a list of HtrcPage objects.

    Pages are expected at ``<path>/<page number>.txt``, with the page number
    zero-padded to eight digits and counted from 1 up to ``num_pages``.

    :param path: directory holding the volume's page textfiles
    :param num_pages: number of pages to load
    :return: one HtrcPage per page, in page order, each built from the page's
        lines with trailing whitespace stripped
    :raises FileNotFoundError: if one of the expected page files is missing
    """
    pages = []
    # Start at 1 and include num_pages, so that page 1 is read exactly once
    # and the final pages are not left out.
    for n in range(1, num_pages + 1):
        page_file = '{}/{}.txt'.format(path, str(n).zfill(8))
        with open(page_file, encoding='utf-8') as f:
            pages.append(HtrcPage([line.rstrip() for line in f]))

    return pages

In [14]:
# UPDATE DIRECTORY NAME--MAKE SURE NOTEBOOK IS IN SAME DIRECTORY AS OVERALL DATA DIRECTORY
# The pattern is relative, so it is resolved against the current working directory.
clean_page_paths = glob.glob('pages/*') # every entry directly inside pages/ (expected: one folder of page textfiles per volume)
# clean_page_paths

In [15]:
# FUNCTION THAT CLEANS RUNNING HEADERS/FOOTERS FROM EACH PAGE & CONCATENATE INTO SINGLE TEXT FILE FOR EACH VOLUME
def clean_vol(page_directory_paths: list, out_dir: str):
    """Write one cleaned textfile per volume into ``out_dir``.

    For each directory in ``page_directory_paths`` the '.txt' files are
    counted, the pages are loaded with ``load_vol`` and passed through
    ``parse_page_structure``, and each resulting page's ``body`` is written,
    followed by a newline, to ``<out_dir>/<directory name>.txt``.

    :param page_directory_paths: directories that each hold the page
        textfiles of one volume
    :param out_dir: directory to receive the cleaned volume files; created if
        it does not exist
    :return: None; a summary line with the number of volumes is printed
    """
    # Create the destination up front so the first write cannot fail on a missing folder.
    os.makedirs(out_dir, exist_ok=True)

    vol_num = 0
    for path in page_directory_paths:
        # Name the output after the volume folder itself; this works for
        # relative, absolute and trailing-slash paths alike.
        vol_name = os.path.basename(os.path.normpath(path))
        file_count = sum(1 for f in os.listdir(path) if f.endswith('.txt'))
        loaded_vol = load_vol(path, file_count)
        pages = parse_page_structure(loaded_vol)

        # Write straight into out_dir, using the same UTF-8 encoding the pages are read with.
        clean_file_path = os.path.join(out_dir, vol_name + '.txt')
        with open(clean_file_path, 'w', encoding='utf-8') as f:
            f.writelines(page.body + '\n' for page in pages)
        vol_num += 1

    print(f"Cleaned {vol_num} volume(s)")

In [None]:
# `root` is set again to the current working directory, where this notebook is located
root = os.getcwd()

# PATH TO THE DIRECTORY THAT WILL RECEIVE THE CLEAN VOLUME TEXTFILES
clean_vol_out_dir = f"{root}/clean-volumes/"

# CLEAN EVERY VOLUME FOUND IN clean_page_paths AND PLACE THE RESULT IN THE FOLDER CHOSEN ABOVE
clean_vol(page_directory_paths=clean_page_paths, out_dir=clean_vol_out_dir)