### This notebook is an exemplar which demonstrates transferring zip files between a Box folder and Savio scratch to run OCR on images using Tesseract (inside a Singularity container)

( tested with boxsdk (2.0.0a2) on python 3.5 kernel)
pip install -Iv boxsdk==2.0.0a2 


_This software is available under the terms of the Educational Community License, Version 2.0 (ECL 2.0). This software is Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")._

The text of the ECL license is reproduced below.

Educational Community License, Version 2.0
*************************************
Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")

Educational Community License, Version 2.0, April 2007

The Educational Community License version 2.0 ("ECL") consists of the
Apache 2.0 license, modified to change the scope of the patent grant in
section 3 to be specific to the needs of the education communities using
this license. The original Apache 2.0 license can be found at:[http://www.apache.org/licenses/LICENSE-2.0]

### Notebook configuration section
The target and source directories, script file names, and other values set here are used as parameters by the processing cells below.

In [None]:

# --- Box side: source folder, results folder and the archives to fetch ---
boxProjectFolder = 'Court Downloads Ayotte Ellias'
boxResultsFolder = 'OCR-zips'
boxFileList = [
    '7.zip',
    '80003.zip',
    '605.zip',
]

# --- Savio side: job name and the folder that holds scripts ---
projectname = 'chench_test3_7_80003_605'
runFolder = '/global/scratch/mmanning/chench/'

# --- Tesseract container image and the tessdata directory ---
tesseractimage = '/global/scratch/mmanning/tesseract4.img'
tesseractdatadir = '/opt/tessdata/'
pdfnamelist = []

# Host data directory and the path it is bound to in the Singularity command.
scratchDataDirectory = runFolder + 'test3/'
tesseractScratchDataDirectory = '/scratch/'

# Assembled from the values above; the two spaces before the image path are
# kept so the resulting string is unchanged.
SINGULARITYCMD = 'singularity exec -B {}:{}  {}'.format(
    scratchDataDirectory, tesseractScratchDataDirectory, tesseractimage)

# Generated script files all live in the run folder.
gsCommandScript = runFolder + 'gsCommandScript.sh'
t4CommandScript = runFolder + 't4CommandScript.sh'
slurmScript = runFolder + 'slurmscript.sh'



### Box Authorization

Function to store the OAuth2 refresh token in a local file. This can be modified to use a keychain or another secure store as required.

In [None]:
def store_tokens(access_token, refresh_token):
    """Persist the refresh token to 'apptoken.cfg' in the current directory.

    The access token is accepted (the callback receives both) but is not
    stored. The refresh token is written with surrounding whitespace
    stripped, replacing any previous file contents.
    """
    with open('apptoken.cfg', 'w') as token_file:
        cleaned_token = refresh_token.strip()
        token_file.write(cleaned_token)

Oauth2 information is read from a local file with three lines, one line per parameter. 
The client id and client secret are defined in the Box application created for this notebook.  Create the application at the Box Developers site: https://berkeley.app.box.com/developers/services/edit/

The redirect uri can be any site that requires validation. Run the bootstrap notebook to create initial 
tokens that are then continually refreshed

In [None]:
import os

# Start with empty placeholders; they are filled from app.cfg below.
CLIENT_ID = CLIENT_SECRET = REDIRECT_URI = None

# The Box config files are read relative to this folder.
os.chdir('/global/home/users/mmanning')

# app.cfg holds one parameter per line, in this order:
# client id, client secret, redirect uri.
# Lines are stored as read (including the newline); callers strip them.
with open('app.cfg', 'r') as app_cfg:
    CLIENT_ID = app_cfg.readline()
    CLIENT_SECRET = app_cfg.readline()
    REDIRECT_URI = app_cfg.readline()

The refresh token is read from a local file. This token was created by running the bootstrap notebook which requires the user to validate with CalNet Authentication Service credentials, then stores the returned auth and refresh tokens in the same config files.

In [None]:
# Placeholder until the token file has been read.
REFRESH_TOKEN = None

# The refresh token is the first line of apptoken.cfg (the file written by
# store_tokens). It is kept as read; the caller strips it.
with open('apptoken.cfg', 'r') as apptoken_cfg:
    first_line = apptoken_cfg.readline()
    REFRESH_TOKEN = first_line

__Perform authentication__ 
then create the Box client.
Verify the client is working by retrieving the name of the user's root folder in Box.

In [None]:
from boxsdk import OAuth2
from boxsdk import Client

# Build the OAuth2 session from the values read from the config files.
# Each value is stripped because it was read with readline().
credentials = {
    'client_id': CLIENT_ID.strip(),
    'client_secret': CLIENT_SECRET.strip(),
    'refresh_token': REFRESH_TOKEN.strip(),
    'store_tokens': store_tokens,
}
oauth = OAuth2(**credentials)

client = Client(oauth)

# Sanity check: fetch the root folder (id '0') and show its name.
root_folder = client.folder(folder_id='0').get()
print ("folder name: ", root_folder['name'] )

# First page of the root folder's items (printing is left disabled).
items = client.folder(folder_id='0').get_items(limit=100, offset=0)
#print ("items: ", items )

### Utility functions

__function to find folder id by folder name.__  
The current SDK does not have a 'find by name' function, so this uses the search call and accepts the result only when exactly one folder matches.

In [None]:
def find_folder_id(folder_name):
    """Return the id of the single Box folder matching folder_name.

    Searches folders via the module-level `client` (at most 10 results).
    When the search returns zero matches or more than one match, prints
    'folder not found' and returns 0 instead of an id.
    """
    matches = client.search(query=folder_name, result_type='folder', limit=10, offset=0)

    if len(matches) == 1:
        return matches[0]['id']

    print('folder not found: ', folder_name)
    return 0

In [None]:
import re

def natural_sort_key(s, _nsre=re.compile(r'([0-9]+)')):
    """Return a sort key that orders embedded numbers numerically.

    The string is split on runs of digits (as defined by ``_nsre``); digit
    runs become ints and everything else is lower-cased, so that e.g.
    'page-2' sorts before 'page-10'.

    Args:
        s: the string to build a key for.
        _nsre: pattern with one capturing group that isolates the numeric
            runs; compiled once as a default so it is not rebuilt per call.

    Returns:
        A list alternating between lower-cased text and ints.
    """
    # Only convert pieces that the splitter itself recognises as a number.
    # str.isdigit() alone also accepts characters such as superscript digits,
    # which the pattern never isolates and which int() cannot parse.
    return [int(text) if text.isdigit() and re.fullmatch(_nsre, text) else text.lower()
            for text in re.split(_nsre, s)]

__function to return all files in directory tree.__

In [None]:
import os
def scantreeForFiles(path):
    """Recursively yield the path (a string) of every non-directory entry under path.

    Directories are descended into; symlinks are not followed when deciding
    whether an entry is a directory, so a symlink is yielded as a file path.
    """
    for item in os.scandir(path):
        if not item.is_dir(follow_symlinks=False):
            yield item.path
            continue
        # Real sub-directory: hand back everything found beneath it.
        for nested_path in scantreeForFiles(item.path):
            yield nested_path


__function to return list of all folders in directory tree.__

In [None]:
import os
def scandirForFolders(path, dirlist):
    """Append the path of every directory below path to dirlist (in place).

    Each directory is appended before its own sub-directories are visited.
    Symlinks are not followed. Nothing is returned; `path` itself is not
    added to the list.
    """
    subfolders = (item.path for item in os.scandir(path)
                  if item.is_dir(follow_symlinks=False))
    for folder in subfolders:
        dirlist.append(folder)
        scandirForFolders(folder, dirlist)


__Validate all the task log files produced by ht_helper__

In [None]:
def validateTaskResults(fileroot, totalTasks):
    """Return the task numbers whose log does not end with exit code '0'.

    Each task is expected to have a log file named ``fileroot.<taskNumber>``
    (task numbers 0 .. totalTasks-1). The generated task commands end with
    ``echo $?``, so the last line of a successful task's log is '0'.

    Args:
        fileroot: log file name without the trailing '.<taskNumber>'.
        totalTasks: number of tasks (lines) in the task script.

    Returns:
        List of task numbers that failed. A task whose log file is missing
        or empty is reported as failed, because its success cannot be
        confirmed.
    """
    from collections import deque  # local import keeps this cell self-contained

    errorList = []

    # range(totalTasks) covers every task, including the last one.
    for i in range(totalTasks):
        fn = fileroot + '.' + str(i)
        if not os.path.exists(fn):
            print ('warning: log file not available: ', fn)
            errorList.append(i)
            continue

        # Keep only the final line; logs can be long. Undecodable bytes in
        # the task output must not abort the validation.
        with open(fn, 'r', errors='replace') as logfile:
            lastline = deque(logfile, maxlen=1)

        retval = lastline[0].rstrip('\n') if lastline else ''
        if retval != '0':
            errorList.append(i)

    return errorList


### Retrieve the file(s)  from the Box folder.
Currently the Box SDK does not have an option for finding a folder by name, so to locate a specific folder you need to loop through all the items in the list below and match on the name. Once you find the folder and retrieve its id, you can save that id for subsequent runs. Another option is to get the id from the URL in the web client, but the approach below is more flexible for now.

In [None]:
import os
import shutil 


# Work from the scratch data directory; later cells glob for '*.zip'
# relative to the current directory.
os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())

# List the first items of the root folder and look for the project folder.
items = client.folder(folder_id='0').get_items(limit=20, offset=0)
if isinstance(items, list):
    print ('number of files in top folder: ', len(items) )

    # None marks "not found", so the check below can actually detect it.
    targetfolderId = None
    for item in items:
        if item['type'] == 'folder':
            print('folder name: ', item['name'])
            if item['name'] == boxProjectFolder:
                targetfolderId = item['id']
                print('targetfolderId: ', targetfolderId)

    if targetfolderId is None:
        # Do not query Box with an empty folder id.
        print('folder not found: ', boxProjectFolder)
    else:
        tgtitems = client.folder(folder_id=targetfolderId).get_items(limit=200, offset=0)
        if isinstance(tgtitems, list):
            print ('number of files in target folder: ', len(tgtitems) )

        # Download only the archives named in boxFileList.
        for tgtitem in tgtitems:
            if tgtitem['type'] != 'folder' and tgtitem['name'] in boxFileList:
                print('downloading: ', tgtitem['name'])
                # The with-block closes the file even if the download fails.
                with open(scratchDataDirectory + tgtitem['name'], 'wb') as newfile:
                    client.file(file_id=tgtitem['id']).download_to(newfile)
        print('downloading completed. ')
        

__unzip the files__

In [None]:
import zipfile
def unzip(source_filename, dest_dir):
    """Extract every member of the zip archive source_filename into dest_dir.

    Prints a progress message before extracting and a completion message
    afterwards. The archive is closed whether or not extraction succeeds.
    """
    archive = zipfile.ZipFile(source_filename)
    try:
        print('extractall: ', source_filename)
        archive.extractall(dest_dir)
    finally:
        archive.close()
    print('extractall completed. ')

In [None]:
import glob

# Every zip archive in the current directory is expanded into the scratch
# data directory and then deleted.
archives = glob.glob('*.zip')
for filename in archives:
    print('unzip: ', filename)
    unzip(filename, scratchDataDirectory)
    # the archive is no longer needed once extracted
    os.remove(filename)
print('zip processing completed. ')

__SLURM job script__ normal

In [None]:
# SLURM batch script template.
# The job name is filled in from `projectname` when this string is built.
# Two '{}' placeholders remain for str.format(), in this order:
#   1. the wall clock limit (the '#SBATCH --time=' line)
#   2. the task script handed to ht_helper.sh via '-t'
# The text is a single string literal using backslash line continuations, so
# the whitespace inside it is part of the generated script.
batchtemplate = '#!/bin/bash -l  \n\
# Job name: \n\
#SBATCH --job-name=' + projectname + '\n\
# \n\
# Account: \n\
#SBATCH --account=ac_scsguest \n\
# \n\
# Partition: \n\
#SBATCH --partition=savio2 \n\
# \n\
## Scale by increasing the number of nodes \n\
#SBATCH --nodes=5  \n\
## DO NOT change ntasks-per-node setting as T4 also distributes across cores \n\
#SBATCH --ntasks-per-node=6 \n\
#SBATCH --qos=savio_normal \n\
# \n\
# Wall clock limit: \n\
#SBATCH --time={} \n\
# \n\
## Command(s) to run: \n\
module load gcc openmpi  \n\
/global/home/groups/allhands/bin/ht_helper.sh  -t {} -n1 -s1 -vL \n' 


__Remove special characters from filenames__  
not sure what characters to include here

In [None]:
import re
import os
# Remove '$' from pdf file names: the generated shell commands wrap paths in
# double quotes, where '$' would be expanded by the shell.
#
# The listing is materialised first because renaming entries while the
# directory scan is still in progress gives unspecified results.
for entry in list(scantreeForFiles(scratchDataDirectory)):
    parentdir, basename = os.path.split(entry)
    # Only the file name is rewritten. Stripping '$' from the whole path
    # would point the rename at a directory that does not exist whenever a
    # parent directory name contains '$'.
    if entry.endswith('.pdf') and '$' in basename:
        print ('sprcial characters in filename: ', entry)
        os.rename(entry, os.path.join(parentdir, basename.replace('$', '')))

### Create script to convert all pdf files in working directory to images


__need to handle dollar signs in filenames here (grumble, grumble...)__

In [None]:
import glob, os
import shutil 

# Ghostscript runs inside the container, so every task line is
# "<singularity prefix> <gs command>".
# Example gs call:
#   gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile=/scratch/test/output/test-%d.png -r300 /scratch/test/germanocr.pdf
# The trailing "echo $?" puts the exit status on the last line of the task output.
SINGULARITYCMD = 'singularity exec -B {}:/scratch/ /global/scratch/mmanning/tesseract4.img '
GHOSTSCRIPTCMD = 'gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile="{}-%d.png" -r300 "{}" ;  echo $?'

os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())

# Container prefix with the host data directory bound to /scratch/.
scmd = SINGULARITYCMD.format(scratchDataDirectory)

# Running count of ghostscript tasks (one per pdf).
gsCommandTotal = 0
hostPrefixLength = len(scratchDataDirectory)

with open(gsCommandScript, 'w') as f:
    for entry in scantreeForFiles(scratchDataDirectory):
        if not entry.endswith('.pdf'):
            continue
        filename, file_extension = os.path.splitext(entry)
        # Re-root the host paths under the container's data directory.
        containerPdf = tesseractScratchDataDirectory + entry[hostPrefixLength:]
        containerStem = tesseractScratchDataDirectory + filename[hostPrefixLength:]
        gcmd = GHOSTSCRIPTCMD.format(containerStem, containerPdf)
        f.write(scmd + gcmd + '\n')
        gsCommandTotal += 1

# Fill in the wall clock limit and the task script, then write the batch script.
outputbatchscript = batchtemplate.format('04:30:00', gsCommandScript)
with open(slurmScript, 'w') as f:
    f.write(outputbatchscript)

__Execute the task script with ht_helper__

In [None]:
# Submit from the run folder, where slurmscript.sh was written.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# IPython shell capture: `out` holds sbatch's output lines.
out = !sbatch slurmscript.sh   
    
print ('Execute ghostscript output: ', out ) 
# The job id is taken as the 4th whitespace-separated token of the first
# output line (presumably "Submitted batch job <id>" -- confirm on your cluster).
jobId =  out[0].split()[3]
print (jobId)

In [None]:
# print the users queue and the job status by id
!squeue -u $username
print('--------------------------------')
!scontrol show job $jobId

__Check all task log files for bad exit code__  
task numbers align with lines in the task script  
check the log file of tasks in the returned array of failures  

In [None]:
import glob, os
# Look for the task logs in the run folder, as the tesseract validation cell
# does (NOTE(review): assumes ht_helper writes its logs where sbatch was run).
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# jobId and gsCommandTotal come from the submit / script-generation cells
# above. They are deliberately not overwritten here with values from an old
# run. After a kernel restart, set them by hand before running this cell.
fileroot = projectname + '.' + jobId + '.log'
tasklist = validateTaskResults(fileroot, gsCommandTotal)
print ('these tasks in task script failed: ', tasklist)


__Remove task logs after any errors have been resolved__

In [None]:
 
# Glob pattern matching every task log of the current job.
# Named logFilter so the builtin filter() is not shadowed for the rest of
# the session.
logFilter = fileroot + '*'
print ('filter: ', logFilter)
# Deletion is left disabled on purpose; once all errors are resolved, remove
# the logs with:
#for f in glob.glob(logFilter):
#    os.remove(f)

### Create script to ocr all png files in working directory to text

In [None]:
import glob, os
os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())
# template: tesseract --tessdata-dir /opt/tessdata /scratch/germanocr_Page_01.png  germanout  -l deu
#
# The whole tesseract call, including "-l eng", must sit inside the quoted
# string passed to "sh -c". Arguments placed after that string are handed to
# sh as $0/$1 and never reach tesseract. "echo $?" stays outside so the exit
# status becomes the last line of the task output.
# NOTE: paths containing a single quote would break this quoting.
TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata \"{}\" \"{}\" -l eng\' ;  echo $?'

# Container prefix with the host data directory bound to /scratch/.
scmd = SINGULARITYCMD.format(scratchDataDirectory)
# total number of tesseract tasks
t4CommandTotal = 0

with open(t4CommandScript, 'w') as f:

    for entry in scantreeForFiles(scratchDataDirectory):
        if ( entry.endswith('.png')):
            filename, file_extension = os.path.splitext(entry)
            # Re-root host paths under the container's data directory:
            # relativepath1 is the image, relativepath2 the output base name.
            relativepath1 = entry[len(scratchDataDirectory):]
            relativepath2 = filename[len(scratchDataDirectory):]
            tcmd = TCMD.format(tesseractScratchDataDirectory+relativepath1, tesseractScratchDataDirectory+relativepath2 )
            f.write(scmd + tcmd + '\n')
            t4CommandTotal += 1


#set time limit for this batch run
outputbatchscript = batchtemplate.format('15:00:00',  t4CommandScript)
with open(slurmScript, 'w') as f:
    f.write(outputbatchscript)

__Execute the task script with ht_helper__

In [None]:
# Submit from the run folder, where slurmscript.sh was written.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# IPython shell capture: `out` holds sbatch's output lines.
out = !sbatch slurmscript.sh   
    
print ('Execute tesseract4 output: ', out ) 
# The job id is taken as the 4th whitespace-separated token of the first
# output line (presumably "Submitted batch job <id>" -- confirm on your cluster).
jobId =  out[0].split()[3]
print (jobId)

In [None]:
# print the users queue and the job status by id
!squeue -u $username
print('--------------------------------')
!scontrol show job $jobId

__Check all task log files for bad exit code__

In [None]:

# The task logs are looked up relative to the run folder.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# Log files are named <projectname>.<jobId>.log.<task number>.
fileroot = '.'.join((projectname, jobId, 'log'))

# Tip: pass a small count (e.g. 10) instead of t4CommandTotal to check a
# subset of the tasks first.
tasklist = validateTaskResults(fileroot, t4CommandTotal)
print ('these tasks in task script failed: ', tasklist)

# Log removal is intentionally not performed here; delete the files matching
# fileroot + '*' by hand once the failures have been resolved.

### Merge text files and upload to Box

In [None]:
from scandir import scandir
# Collect every directory below the scratch data directory; the cells that
# follow process the results one directory at a time.
dirlist = []

scandirForFolders(scratchDataDirectory, dirlist)

print("num dirs: ", len(dirlist) )

# Show a sample of at most ten directories. Slicing avoids the IndexError
# that indexing 0..9 raises when fewer than ten directories exist.
for sampledir in dirlist[:10]:
    print("dir: ", sampledir )


__check that for every .png there is a .txt in each directory__

In [None]:
# Collect every page image (.png) that has no matching .txt next to it.
missingResultList = []
for currentdir in dirlist:
    os.chdir(currentdir)

    for filename in os.listdir(os.getcwd()):
        if not (os.path.isfile(filename) and filename.endswith('.png')):
            continue
        # The expected result has the same base name with a .txt extension.
        expectedText = os.path.splitext(filename)[0] + '.txt'
        if os.path.exists(expectedText):
            continue
        missingPath = currentdir + '/' + filename
        missingResultList.append(missingPath)
        print ('missing result: ', missingPath )
print("missingResultList size: ", len(missingResultList) )

__merge text files into original documents__

In [None]:
from shutil import copyfile

import re

# For every pdf in every directory, merge its per-page text files
# (<pdf name>-<page>.txt) into <pdf name>_ALL.txt, in page order.
for currentdir in dirlist:
    os.chdir(currentdir)
    pdfnamelist = []

    # base names (without extension) of all pdfs in this directory
    for filename in os.listdir(os.getcwd()):
        if os.path.isfile(filename) and filename.endswith(".pdf"):
            fn, fe = os.path.splitext(filename)
            pdfnamelist.append(fn)

    for name in pdfnamelist:
        # Match exactly "<name>-<page>.txt". A plain prefix test would also
        # pick up the pages of "<name>0.pdf", "<name>-x.pdf", ... and an
        # existing "<name>_ALL.txt" from an earlier run.
        pagePattern = re.compile(re.escape(name) + r'-([0-9]+)\.txt$')
        pages = []
        for filename in os.listdir('.'):
            found = pagePattern.match(filename)
            if found:
                pages.append((int(found.group(1)), filename))

        # order by page number
        sortedList = [pagefile for _, pagefile in sorted(pages)]
        alltextfilename = ''.join([ currentdir,"/",name,'_ALL.txt'])

        if len(sortedList) > 1:
            print('sortedList: ', sortedList)
            with open(alltextfilename, 'w', encoding="utf-8") as outfile:
                for fname in sortedList:
                    with open(''.join([currentdir,"/", fname]), encoding="utf-8" ) as infile:
                        for line in infile:
                            outfile.write(line)
        elif len(sortedList) == 1:
            # if file is only one page, just copy to _ALL.txt so it is included in results
            print('single file: ', sortedList[0])
            copyfile(sortedList[0], alltextfilename)
        else:
            print('empty mergeList on file: ', name)


__verify counts__

In [None]:
# Both counts are taken over the whole tree below the scratch data directory.
os.chdir(scratchDataDirectory)
print("number of pdfs in set: " ) 

# count of files named *.pdf
!find . -name "*.pdf" | wc -l

print("number of merged text files in set: " ) 

# count of merged outputs (*_ALL.txt); compare with the pdf count above
!find . -name "*_ALL.txt" | wc -l


In [None]:
print("num dirs: ", len(dirlist) )

for currentdir in dirlist:
    os.chdir(currentdir)
    workdir = os.getcwd()
    print ('current working directory: ', workdir)

    # Pass 1: delete every regular file that is not a merged *_ALL.txt
    # (pdfs, page images and per-page text files alike).
    for currentFile in os.listdir(workdir):
        if not os.path.isfile(currentFile):
            continue
        if currentFile.endswith('_ALL.txt'):
            continue
        os.remove(os.path.join(currentdir, currentFile))

    # Pass 2: drop the '_ALL' marker from the files that remain.
    for currentFile in os.listdir(workdir):
        if not os.path.isfile(currentFile):
            continue
        newname = currentFile.replace('_ALL.txt', '.txt')
        os.rename(currentFile, newname)
    

In [None]:
import shutil

os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())

# The upload cell expects the archive at scratchDataDirectory/<projectname>.zip.
finalZip = os.path.join(scratchDataDirectory, projectname + '.zip')

# A zip left over from an earlier run would otherwise be packed into the new one.
if os.path.exists(finalZip):
    os.remove(finalZip)

# Build the archive outside the directory being archived, so the zip cannot
# end up containing itself, then move it into place.
stagedZip = shutil.make_archive(os.path.join(runFolder, projectname), 'zip', scratchDataDirectory)
shutil.move(stagedZip, finalZip)

print('completed zip: ', os.stat(projectname + '.zip'))
    

#### Move the resulting zip file to Box.

In [None]:
# Destination folder in Box.
# NOTE(review): 'ThisIsATest' looks like a test target; boxProjectFolder or
# boxResultsFolder from the configuration cell may be the intended destination.
folderId = find_folder_id('ThisIsATest')
print ("folderId: ", folderId )

# find_folder_id returns 0 when the lookup fails, and folder id 0 is the
# root folder (this notebook fetches it with folder_id='0'). Stop here rather
# than silently uploading the archive to the top level.
if not folderId:
    raise RuntimeError('upload folder not found in Box; nothing was uploaded')

upload_folder = client.folder(folder_id=folderId).get()
objUploaded = upload_folder.upload(scratchDataDirectory + projectname + '.zip')
print ("obj file id: ", objUploaded['id'] )