In [29]:
import cv2
import glob
import os
import time

In [30]:
# Label template images; each is read in grayscale and matched against every sheet.
SLINGSTONE_TEMPLATE_FILE = 'templates/slingstone.JPG'
RECTANGLE_TEMPLATE_FILE = 'templates/rectangle.JPG'
# Directory that is globbed (non-recursively) for input sheet photos named *.JPG.
HERBARIUM_SHEETS_DIR = 'all_sheets'
# Output directory for the extracted label crops; created at run time if missing.
ACCESSION_LABEL_DIR = 'accession_labels'

In [31]:
def extract_label(source_image, template_image):
    """Cut out the region of ``source_image`` that best matches ``template_image``.

    The template is slid over the source with normalized cross-correlation
    (``cv2.TM_CCOEFF_NORMED``) and the position with the highest score is taken
    as the top-left corner of the match.

    Args:
        source_image: image array to search in.
        template_image: image array to search for; its first two shape
            entries (height, width) determine the size of the returned crop.

    Returns:
        A ``(label_image, correlation)`` tuple: the template-sized slice of
        ``source_image`` at the best match, and the score of that match.
    """
    scores = cv2.matchTemplate(source_image, template_image, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc yields (min, max, min location, max location); only the max matters here.
    _, best_score, _, best_location = cv2.minMaxLoc(scores)
    left, top = best_location
    height, width = template_image.shape[:2]
    bottom, right = top + height, left + width
    return source_image[top:bottom, left:right], best_score

def localtime():
    """Return the current local time as text, e.g. 'Sat Dec  9 12:09:53 2017'."""
    now = time.time()
    # ctime(secs) is the documented shorthand for asctime(localtime(secs)).
    return time.ctime(now)

In [32]:
# MAIN
#
# For every herbarium sheet photo: crop to the sheet, match both label
# templates, and save whichever match has the higher correlation under the
# sheet's own file name in ACCESSION_LABEL_DIR.

def _progress_line(done, total):
    """Format the timestamped progress message; reports 0% when there are no sheets."""
    # Guard the division: an empty input directory must not raise ZeroDivisionError.
    # Floor division keeps the integer percentage on both Python 2 and Python 3.
    percent = 100 * done // total if total else 0
    return '{}:   {} of {} ({}%) images processed'.format(localtime(), done, total, percent)


print('{}:   STARTING'.format(localtime()))
sheets = glob.glob('{}/*.JPG'.format(HERBARIUM_SHEETS_DIR))
n = len(sheets)

# Create directory in which to store accession label images
if not os.path.exists(ACCESSION_LABEL_DIR):
    os.makedirs(ACCESSION_LABEL_DIR)

# Read template images in grayscale (flag 0)
rectangle_template_image = cv2.imread(RECTANGLE_TEMPLATE_FILE, 0)
slingstone_template_image = cv2.imread(SLINGSTONE_TEMPLATE_FILE, 0)

# cv2.imread signals failure by returning None instead of raising; fail fast
# here rather than with an obscure error inside the matching loop.
for template_file, template_image in ((RECTANGLE_TEMPLATE_FILE, rectangle_template_image),
                                      (SLINGSTONE_TEMPLATE_FILE, slingstone_template_image)):
    if template_image is None:
        raise IOError('Could not read template image: {}'.format(template_file))

# Match label templates and save the match with the highest correlation
i = 0        # number of sheets visited so far (drives the progress report)
skipped = 0  # number of sheets that could not be read
for sheet in sheets:
    i += 1
    filename = os.path.basename(sheet)
    source_image = cv2.imread(sheet, 0)  # load in grayscale
    if source_image is None:
        # One unreadable file should not abort a multi-hour batch run.
        skipped += 1
        print('{}:   WARNING: could not read {}; skipping'.format(localtime(), sheet))
    else:
        # Crop image so that it contains only the herbarium sheet
        source_image = source_image[1077:4743, 390:3054]
        rectangle_image, rectangle_correlation = extract_label(source_image, rectangle_template_image)
        slingstone_image, slingstone_correlation = extract_label(source_image, slingstone_template_image)
        # Ties go to the slingstone match, as a strict '>' favours rectangle only when it is better.
        if rectangle_correlation > slingstone_correlation:
            label_image = rectangle_image
        else:
            label_image = slingstone_image
        cv2.imwrite('{}/{}'.format(ACCESSION_LABEL_DIR, filename), label_image)
    if i % 100 == 0:
        print(_progress_line(i, n))
print(_progress_line(i, n))
if skipped:
    print('{}:   {} unreadable image(s) skipped'.format(localtime(), skipped))
print('FINIS')

Sat Dec  9 12:09:53 2017:   STARTING
Sat Dec  9 12:12:10 2017:   100 of 5590 (1%) images processed
Sat Dec  9 12:14:26 2017:   200 of 5590 (3%) images processed
Sat Dec  9 12:16:42 2017:   300 of 5590 (5%) images processed
Sat Dec  9 12:18:59 2017:   400 of 5590 (7%) images processed
Sat Dec  9 12:20:58 2017:   500 of 5590 (8%) images processed
Sat Dec  9 12:22:59 2017:   600 of 5590 (10%) images processed
Sat Dec  9 12:24:58 2017:   700 of 5590 (12%) images processed
Sat Dec  9 12:26:56 2017:   800 of 5590 (14%) images processed
Sat Dec  9 12:28:55 2017:   900 of 5590 (16%) images processed
Sat Dec  9 12:30:55 2017:   1000 of 5590 (17%) images processed
Sat Dec  9 12:32:55 2017:   1100 of 5590 (19%) images processed
Sat Dec  9 12:34:55 2017:   1200 of 5590 (21%) images processed
Sat Dec  9 12:36:53 2017:   1300 of 5590 (23%) images processed
Sat Dec  9 12:38:52 2017:   1400 of 5590 (25%) images processed
Sat Dec  9 12:40:50 2017:   1500 of 5590 (26%) images processed
Sat Dec  9 12:42: