Note: Run with Python 3 kernel.

# Utility Functions (called by other functions)

In [10]:
def combine_images_vertically(img1, img2):
    """
    Glues 2 images together with img2 below img1.
    Returns the new compound image.
    """
    
    import numpy as np
    
    h1, w1 = img1.shape
    h2, w2 = img2.shape

    # Create an array big enough to hold img2 below img1.
    img = np.zeros(((h1 + h2), max(w1, w2)), np.float32)

    # Paste img1 at y=0, x=0
    img[0:img1.shape[0], 0:img1.shape[1]] = img1

    # Paste img2 at y=h1, x=0
    img[h1:h1+img2.shape[0],0:img2.shape[1]] = img2    
    return img

In [11]:
def scrape_bishop_bulletins_page(bulletin):
    """
    Scrapes data from the Bishop Museum pubs online web page. 
    Bulletin is b172 for Insects of Guam I and b189 for Insects of Guam II.
    A directory named b172 or b189 is created and populated with a CSV file, named b189.csv, containing
    titles, stubs, authors, and urls for pdfs.
    If the bulletin directory already exists, this function does nothing.
    """
    import requests
    import os
    import pandas as pd
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    from slugify import slugify
    
    if bulletin not in ['b172', 'b189']:
        print("bulletin not in ['b172', 'b189']")
        return

    bulletins_url = 'http://hbs.bishopmuseum.org/pubs-online/bpbm-bulletins.html'
    result = requests.get(bulletins_url)
    soup = BeautifulSoup(result.content, features = "lxml")

    pdf_list = []
    links = soup.find_all('a')
    for link in links:
        if bulletin in link.get('href'):
            text = link.previous_sibling.previous_sibling.previous_sibling
            parts = text.split(', by ')
            if len(parts) == 2:
                title = parts[0].strip()
                slug = slugify(title)
                authors = parts[1].replace('[','').strip()          
                url = link.get('href')
                url = urljoin(bulletins_url, url)
                pdf_list.append({'title':title, 'slug':slug, 'authors':authors, 'url':url})
    
    df_pdf_list = pd.DataFrame(pdf_list)
    os.mkdir(bulletin)
    outfile = '{}/{}.csv'.format(bulletin, bulletin)
    df_pdf_list.to_csv(outfile, index=False)
    
    return

## Usage example:
#scrape_bishop_bulletins_page('b172')

# Main Functions

In [12]:
def create_file_structure(bulletin):
    """
    bulletin is 'b172' for Insects of Guam I and 'b189' for Insects of Guam II.
    Depends on scrape_bishop_bulletins_page
    Creates a data file structure in this format:
    
    b172
        anthribidae-of-guam
            anthribidae-of-guam.pdf
            anthribidae-of-guam-0.jpg
            anthribidae-of-guam-1.jpg
            ...
        formicidae-of-guam
            formicidae-of-guam.pdf
            formicidae-of-guam-0.jpg
            formicidae-of-guam-1.jpg
            ...
            
    Each directory contains a PDF file and a JPG image for each page in the PDF.
    
    """
    import requests
    import os
    import subprocess
    import pandas as pd
    
    if bulletin not in ['b172', 'b189']:
        print("bulletin not in ['b172', 'b189']")
        return
    
    if not os.path.exists(bulletin):
        print('{} directory does not exist.'.format(bulletin))
        print('Scraping Bishop Museum Bulletins web page.')
        scrape_bishop_bulletins_page(bulletin)
    
    pdf_list = pd.read_csv('{}/{}.csv'.format(bulletin, bulletin)).to_dict('records')
    
    os.chdir('b172'); print(os.getcwd())
    for d in pdf_list:
        slug = d['slug']
        if os.path.exists(slug):
            print('{} directory already exists.'.format(slug))
            continue

        # Create a new directory and move into it
        url = d['url']
        os.mkdir(slug); os.chdir(slug); print(os.getcwd())

        # Download PDF
        filename = '{}.pdf'.format(slug)
        r = requests.get(url)
        open(filename, 'wb').write(r.content)

        # Create a JPG image for each page in PDF using the Linux convert command
        jpg = filename.replace('pdf', 'jpg')
        subprocess.call(['convert', '-density', '200x200', filename, jpg])

        # Move up one directory
        os.chdir('..'); print(os.getcwd())
    os.chdir('..'); print(os.getcwd())
    return

#create_file_structure('b172')

In [13]:
def make_bounding_box_table(bulletin, section):
    """
    
    """
    from bs4 import BeautifulSoup
    import pandas as pd

    xml_file_path = '{}/{}/{}.xml'.format(bulletin, section, section)
    f = open(xml_file_path, 'r')
    contents = f.read()
    soup = BeautifulSoup(contents, features = "lxml")

    bb_list = []
    n = -1
    for image in soup.find_all('image'):
        for box in image.find_all('box'):
            n += 1
            species_name = box.text.replace('\n','')
            bb_list.append({
                'species_name':species_name, 
                'page_image':image['name'],
                'bb_image': '{}-{}.jpg'.format(species_name.replace(' ', '-'), n),
                'xtl':int(float(box['xtl'])),
                'ytl':int(float(box['ytl'])),
                'xbr':int(float(box['xbr'])),
                'ybr':int(float(box['ybr']))
            })
    df_bb_list = pd.DataFrame(bb_list)
    outfile = '{}/{}/bounding_boxes.csv'.format(bulletin, section)
    df_bb_list.to_csv(outfile, index=False)
    return

#make_bounding_box_table('b172', 'barkbeetles-of-guam')

In [14]:
def extract_bounding_box_images(bulletin, section):
    """
    
    """
    import cv2
    import os
    
    bb_list_csv_path = '{}/{}/bounding_boxes.csv'.format(bulletin, section)  
    df_bb_list = pd.read_csv(bb_list_csv_path)

    mydir = '{}/{}/bounding_box_images'.format(bulletin, section)
    if not os.path.exists(mydir):
        os.mkdir(mydir)

    # Extract bounding box images
    for i, r in df_bb_list.iterrows():
        page_image_path = '{}/{}/{}'.format(bulletin, section, r.page_image)
        im = cv2.imread(page_image_path)
        roi = im[r.ytl:r.ybr, r.xtl:r.xbr]
        roi_filename = '{}/{}'.format(mydir, r.bb_image)
        print(roi_filename)    
        cv2.imwrite(roi_filename, roi)
    return

#extract_bounding_box_images('b172', 'barkbeetles-of-guam')

In [15]:
def merge_bounding_box_images(bulletin, section):
    """
    
    """
    import pandas as pd
    import pandasql as ps
    import cv2
    import numpy as np

    
    bb_list_csv_path = '{}/{}/bounding_boxes.csv'.format(bulletin, section)  
    df_bb_list = pd.read_csv(bb_list_csv_path)

    mydir = '{}/{}/bounding_box_images'.format(bulletin, section)

    merged_image_dir = '{}/{}/merged_images'.format(bulletin, section)
    if not os.path.exists(merged_image_dir):
        os.mkdir(merged_image_dir)

    sql = '''select species_name 
    from df_bb_list 
    group by species_name 
    order by species_name'''
    species_names = ps.sqldf(sql)
    
    print('species_names: {}'.format(species_names))
    
    for species_name in species_names.values:
        species_name = species_name[0]
        print(species_name)
        sql = '''select bb_image 
        from df_bb_list 
        where species_name="{}" 
        order by bb_image'''.format(species_name)
        df = ps.sqldf(sql)
        print(df)
        print()

        rowcount = df.shape[0]
        print('rowcount: {}'.format(rowcount))
        if rowcount > 2:
            print('More than 2 images to be merged. Not implemented. Continuiing.')
            continue
        if rowcount == 1:
            f = '{}/{}'.format(mydir, df.bb_image[0])
            print('f: {}'.format(f))
            img = cv2.imread(f, 0)
            #cv2.imshow('caca',img)
        if rowcount == 2:
            f = '{}/{}'.format(mydir, df.bb_image[0])
            print('f: {}'.format(f))
            img1 = cv2.imread(f, 0)
            print(img1)
            
            f = '{}/{}'.format(mydir, df.bb_image[1])
            print('f: {}'.format(f))
            img2 = cv2.imread(f, 0)  
            #print('img1 shape: {}'.format(img1.shp))
            #print('img2 shape: {}'.format(img2.shp))
            img = combine_images_vertically(img1, img2)
        f = '{}/{}.jpg'.format(merged_image_dir, species_name)
        cv2.imwrite(f, img)
    return
    
#merge_bounding_box_images('b172', 'barkbeetles-of-guam')

In [16]:
def make_mturk_data_csv(bulletin, section):
    """
    """
    import glob
    import pytesseract

    try:
        from PIL import Image
    except ImportError:
        import Image

    prefix = 'https://github.com/aubreymoore/insects-of-guam-datamining/raw/master'

    imagefiles = glob.glob('{}/{}/merged_images/*.jpg'.format(bulletin, section))
    outfile = '{}/{}/mturk_data.csv'.format(bulletin, section)

    with open(outfile, 'w+') as out:
        out.write('image_url,text\n')
        for imagefile in imagefiles:
            image_url = '{}/{}'.format(prefix, imagefile)

            # Perform optical character recognition on the image
            #text = pytesseract.image_to_string(Image.open(fn)).encode('utf-8')    

            print('performing optical character recognition on {}'.format(imagefile))
            text = pytesseract.image_to_string(Image.open(imagefile))

            # Remove double quotes and replace line feeds with <br>
            text = text.replace('\n\n', '\n')
            text = text.replace('"', '').replace('\n', '<br>')

            # Separate collection records into individual lines of text
            text = text.replace(';', '<br>')

            # write record
            out.write('{},"{}"\n'.format(image_url, text))  
            
    return

#make_mturk_data_csv('b172', 'barkbeetles-of-guam')

# MAIN

In [17]:
bulletin = 'b172'
create_file_structure(bulletin)

/home/aubrey/insects-of-guam-test/b172
dragonflies-of-guam directory already exists.
thrips-of-guam directory already exists.
cercopidae-of-guam directory already exists.
membracidae-of-guam directory already exists.
psyllidae-from-guam directory already exists.
aphidae-and-aleurodidae-of-guam directory already exists.
neuropteroid-insects-from-guam directory already exists.
butterflies-of-guam directory already exists.
sphingidae-of-guam directory already exists.
staphylinidae-of-guam directory already exists.
rhipiceridae-of-guam directory already exists.
ciidae-of-guam directory already exists.
elaterid-and-eucnemid-beetles-of-guam directory already exists.
coleoptera-heteromera-from-guam directory already exists.
new-longicorn-beetles-from-guam-cerambycidae directory already exists.
anthribidae-of-guam directory already exists.
curculionidae-of-guam directory already exists.
barkbeetles-of-guam directory already exists.
miscellaneous-families-of-guam-coleoptera directory already ex

At this point the directory structure should look something like this:

    b172
        anthribidae-of-guam
            anthribidae-of-guam.pdf
            anthribidae-of-guam-0.jpg
            anthribidae-of-guam-1.jpg
            ...
        formicidae-of-guam
            formicidae-of-guam.pdf
            formicidae-of-guam-0.jpg
            formicidae-of-guam-1.jpg
            ...
            
The PDF contains the whole section.
There is JPG file for each page in the section.

The next step is to use CVAT to record coordinates of bounding boxes which determine the position of species
sections in the JPGs.  The resulting XML file can be downloaded into the section directory.

Then, rerun this notebook, commit changes to the git repo, and push to github.

In [18]:
import pandas as pd
import os
import cv2

f = '{}/{}.csv'.format(bulletin, bulletin)
df = pd.read_csv(f)
for section in df.slug.values:
    print(section)
    bb_xml_path = '{}/{}/{}.xml'.format(bulletin, section, section)
    bb_xml_path_exists = os.path.exists(bb_xml_path)
    print('bb_xml_path_exists: {}'.format(bb_xml_path_exists))
    bb_csv_path = '{}/{}/bounding_boxes.csv'.format(bulletin, section)
    bb_csv_path_exists = os.path.exists(bb_csv_path)
    print('bb_csv_path_exists: {}'.format(bb_csv_path_exists))
    
    if bb_xml_path_exists and not bb_csv_path_exists:
        print('Making bb table for {}'.format(section))
        make_bounding_box_table(bulletin, section)
        
        print('Extracting bounding box images.')
        extract_bounding_box_images(bulletin, section)
        
        print('Merging bounding box images.')
        merge_bounding_box_images(bulletin, section)
        
        print('Making mturk data csv.')
        make_mturk_data_csv(bulletin, section)
    print()

dragonflies-of-guam
bb_xml_path_exists: True
bb_csv_path_exists: True

thrips-of-guam
bb_xml_path_exists: True
bb_csv_path_exists: True

cercopidae-of-guam
bb_xml_path_exists: True
bb_csv_path_exists: True

membracidae-of-guam
bb_xml_path_exists: True
bb_csv_path_exists: False
Making bb table for membracidae-of-guam
Extracting bounding box images.
b172/membracidae-of-guam/bounding_box_images/Leptocentrus-taurus-0.jpg
Merging bounding box images.
species_names:           species_name
0  Leptocentrus taurus
Leptocentrus taurus
                    bb_image
0  Leptocentrus-taurus-0.jpg

rowcount: 1
f: b172/membracidae-of-guam/bounding_box_images/Leptocentrus-taurus-0.jpg
Making mturk data csv.
performing optical character recognition on b172/membracidae-of-guam/merged_images/Leptocentrus taurus.jpg

psyllidae-from-guam
bb_xml_path_exists: True
bb_csv_path_exists: True

aphidae-and-aleurodidae-of-guam
bb_xml_path_exists: True
bb_csv_path_exists: True

neuropteroid-insects-from-guam
bb_xml_p

# END