Note: Run with Python 3 kernel.

In [23]:
def scrape_bishop_bulletins_page(signature):
    """
    Scrapes data from the Bishop Museum pubs online web page. 
    Signature is b172 for Insects of Guam I and b189 for Insects of Guam II.
    A directory named b172 or b189 is created and populated with a CSV file, named b189.csv, containing
    titles, stubs, authors, and urls for pdfs.
    """
    import requests
    import os
    import pandas as pd
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    from slugify import slugify
    
    if signature not in ['b172', 'b189']:
        print("Signature not in ['b172', 'b189']")
        return
    
    if os.path.exists(signature):
        print('{} directory already exists.'.format(signature))
        return   

    bulletins_url = 'http://hbs.bishopmuseum.org/pubs-online/bpbm-bulletins.html'
    result = requests.get(bulletins_url)
    soup = BeautifulSoup(result.content, features = "lxml")

    pdf_list = []
    links = soup.find_all('a')
    for link in links:
        if signature in link.get('href'):
            text = link.previous_sibling.previous_sibling.previous_sibling
            parts = text.split(', by ')
            if len(parts) == 2:
                title = parts[0].strip()
                slug = slugify(title)
                authors = parts[1].replace('[','').strip()          
                url = link.get('href')
                url = urljoin(bulletins_url, url)
                pdf_list.append({'title':title, 'slug':slug, 'authors':authors, 'url':url})
    
    df_pdf_list = pd.DataFrame(pdf_list)
    os.mkdir(signature)
    outfile = '{}/{}.csv'.format(signature, signature)
    df_pdf_list.to_csv(outfile, index=False)
    
    return

## Usage example:
#scrape_bishop_bulletins_page('b172')

In [32]:
def create_file_structure(signature):
    """
    Signature is 'b172' for Insects of Guam I and 'b189' for Insects of Guam II.
    Depends on scrape_bishop_bulletins_page
    Creates a data file structure in this format:
    
    b172
        anthribidae-of-guam
            anthribidae-of-guam.pdf
            anthribidae-of-guam-0.jpg
            anthribidae-of-guam-1.jpg
            ...
        formicidae-of-guam
            formicidae-of-guam.pdf
            formicidae-of-guam-0.jpg
            formicidae-of-guam-1.jpg
            ...
            
    Each directory contains a PDF file and a JPG image for each page in the PDF.
    
    """
    import requests
    import os
    import subprocess
    import pandas as pd
    
    if signature not in ['b172', 'b189']:
        print("Signature not in ['b172', 'b189']")
        return
    
    if not os.path.exists(signature):
        print('{} directory does not exist.'.format(signature))
        print('Scraping Bishop Museum Bulletins web page.')
        scrape_bishop_bulletins_page(signature)
        return
    
    pdf_list = pd.read_csv('{}/{}.csv'.format(signature, signature)).to_dict('records')
    
    os.chdir('b172'); print(os.getcwd())
    for d in pdf_list:
        slug = d['slug']
        if os.path.exists(slug):
            print('{} directory already exists.'.format(slug))
            continue

        # Create a new directory and move into it
        url = d['url']
        os.mkdir(slug); os.chdir(slug); print(os.getcwd())

        # Download PDF
        filename = '{}.pdf'.format(slug)
        r = requests.get(url)
        open(filename, 'wb').write(r.content)

        # Create a JPG image for each page in PDF using the Linux convert command
        jpg = filename.replace('pdf', 'jpg')
        subprocess.call(['convert', '-density', '200x200', filename, jpg])

        # Move up one directory
        os.chdir('..'); print(os.getcwd())
    os.chdir('..'); print(os.getcwd())
    return

create_file_structure('b172')

/home/aubrey/insects-of-guam-test/b172
dragonflies-of-guam directory already exists.
thrips-of-guam directory already exists.
cercopidae-of-guam directory already exists.
membracidae-of-guam directory already exists.
psyllidae-from-guam directory already exists.
aphidae-and-aleurodidae-of-guam directory already exists.
neuropteroid-insects-from-guam directory already exists.
butterflies-of-guam directory already exists.
sphingidae-of-guam directory already exists.
staphylinidae-of-guam directory already exists.
rhipiceridae-of-guam directory already exists.
ciidae-of-guam directory already exists.
elaterid-and-eucnemid-beetles-of-guam directory already exists.
coleoptera-heteromera-from-guam directory already exists.
new-longicorn-beetles-from-guam-cerambycidae directory already exists.
anthribidae-of-guam directory already exists.
curculionidae-of-guam directory already exists.
barkbeetles-of-guam directory already exists.
miscellaneous-families-of-guam-coleoptera directory already ex

In [101]:
#def make_bounding_box_table(signature, section):
from bs4 import BeautifulSoup
import pandas as pd

bulletin = 'b172'
section = 'anthribidae-of-guam'

f = open('b172/anthribidae-of-guam/anthribidae-of-guam.xml', 'r')
contents = f.read()
soup = BeautifulSoup(contents, features = "lxml")

bb_list = []
n = -1
for image in soup.find_all('image'):
    for box in image.find_all('box'):
        n += 1
        species_name = box.text.replace('\n','')
        bb_list.append({
            'species_name':species_name, 
            'page_image':image['name'],
            'bb_image': '{}-{}.jpg'.format(species_name.replace(' ', '-'), n),
            'xtl':int(float(box['xtl'])),
            'ytl':int(float(box['ytl'])),
            'xbr':int(float(box['xbr'])),
            'ybr':int(float(box['ybr']))
        })
df_bb_list = pd.DataFrame(bb_list)
outfile = '{}/{}/bounding_boxes.csv'.format(bulletin, section)
df_bb_list.to_csv(outfile, index=False)

df_bb_list

#make_bounding_box_table('b172', 'anthribidae-of-guam')

Unnamed: 0,bb_image,page_image,species_name,xbr,xtl,ybr,ytl
0,Jordanthribus-planifascietus-0.jpg,anthribidae-of-guam-1.jpg,Jordanthribus planifascietus,1243,282,1718,1387
1,Jordanthribus-planifascietus-1.jpg,anthribidae-of-guam-2.jpg,Jordanthribus planifascietus,1162,191,932,223
2,Jordanthribus-conspersus-2.jpg,anthribidae-of-guam-2.jpg,Jordanthribus conspersus,1157,186,1692,947
3,Jordanthribus-conspersus-3.jpg,anthribidae-of-guam-3.jpg,Jordanthribus conspersus,1245,271,1009,256
4,Notioxenus-fulgidus-4.jpg,anthribidae-of-guam-3.jpg,Notioxenus fulgidus,1250,273,1727,1251
5,Notioxenus-fulgidus-5.jpg,anthribidae-of-guam-4.jpg,Notioxenus fulgidus,1167,188,1474,221
6,Melanopsacus-parvulus-6.jpg,anthribidae-of-guam-4.jpg,Melanopsacus parvulus,1167,190,1681,1489
7,Melanopsacus-parvulus-7.jpg,anthribidae-of-guam-5.jpg,Melanopsacus parvulus,1236,267,1266,259
8,Mauia-subnotatus-8.jpg,anthribidae-of-guam-5.jpg,Mauia subnotatus,1236,275,1721,1459
9,Mauia-subnotatus-9.jpg,anthribidae-of-guam-6.jpg,Mauia subnotatus,1148,181,604,218


In [96]:
import cv2

bulletin = 'b172'
section = 'anthribidae-of-guam'

# Extract bounding box images
for i, r in df_bb_list.iterrows():
    page_image_path = '{}/{}/{}'.format(bulletin, section, r.page_image)
    im = cv2.imread(page_image_path)
    roi = im[r.ytl:r.ybr, r.xtl:r.xbr]
    roi_filename = r.bb_image
    cv2.imwrite(roi_filename, roi)
    print(roi_filename)

Jordanthribus-planifascietus-0.jpg
Jordanthribus-planifascietus-1.jpg
Jordanthribus-conspersus-2.jpg
Jordanthribus-conspersus-3.jpg
Notioxenus-fulgidus-4.jpg
Notioxenus-fulgidus-5.jpg
Melanopsacus-parvulus-6.jpg
Melanopsacus-parvulus-7.jpg
Mauia-subnotatus-8.jpg
Mauia-subnotatus-9.jpg
Araecerus-fasciculatus-10.jpg
Araecerus-vieillardi-11.jpg
Araeocorynus-cumingi-12.jpg


In [70]:
import pandas as pd
import pandasql as ps

sql = 'select name from df_bb_list group by name order by name'
names = ps.sqldf(sql)
for name in names.values:
    name = name[0]
    print(name)
    sql = 'select image, xtl, ytl, xbr, ybr from df_bb_list where name="{}" order by image'.format(name)
    df = ps.sqldf(sql)
    print(df)
    
    for i, r in df.iterrows():
        bb = extract bb
        

Araecerus fasciculatus
                       image     xtl      ytl      xbr      ybr
0  anthribidae-of-guam-6.jpg  176.57  1205.03  1146.23  1673.06

Araecerus vieillardi
                       image     xtl     ytl      xbr     ybr
0  anthribidae-of-guam-7.jpg  276.38  246.42  1254.85  848.00

Araeocorynus cumingi
                       image     xtl      ytl      xbr      ybr
0  anthribidae-of-guam-7.jpg  290.87  1104.10  1247.61  1391.60

Jordanthribus conspersus
                       image     xtl     ytl      xbr      ybr
0  anthribidae-of-guam-2.jpg  186.83  947.16  1157.97  1692.34
1  anthribidae-of-guam-3.jpg  271.55  256.08  1245.19  1009.87

Jordanthribus planifascietus
                       image     xtl      ytl      xbr      ybr
0  anthribidae-of-guam-1.jpg  282.43  1387.84  1243.02  1718.49
1  anthribidae-of-guam-2.jpg  191.64   223.61  1162.78   932.73

Mauia subnotatus
                       image     xtl      ytl      xbr      ybr
0  anthribidae-of-guam-5.jpg  275.

# Data Prep

## Step 1: Get PDF copies of Insects of Guam I and II (Bulletin 172 and 189)
```
mkdir B172
cd B172
wget --convert-links http://hbs.bishopmuseum.org/pubs-online/bpbm-bulletins.html
grep -o 'hbs.bishopmuseum.org/pubs-online/pdf/b172p[1-9].*\.pdf' bpbm-bulletins.html > B172-pdfs.txt
wget -i B172-pdfs.txt
cd ..
```

In [None]:
import os

print(os.getcwd())
if not os.path.exists('caca'):
    os.mkdir('caca')
os.chdir('caca')
print(os.getcwd())
os.chdir('..')
print(os.getcwd())

In [None]:
# grep does not work

import os
import subprocess

subprocess.call(['mkdir', 'B999'])
os.chdir('B999')
subprocess.call(['wget', '--convert-links', 'http://hbs.bishopmuseum.org/pubs-online/bpbm-bulletins.html'])

outfile = open('titles-urls.htm', 'w')
subprocess.call(['grep', 'hbs.bishopmuseum.org/pubs-online/pdf/b172p[1-9].*\.pdf', 'bpbm-bulletins.html'],
                shell=True, 
                stdout=outfile)
os.chdir('..')

## Step 2: Extract species sections from each PDF

In [None]:
import os
import subprocess

os.chdir('B172')
filepath = 'titles-urls.htm'
with open(filepath) as fp:
    for cnt, line in enumerate(fp):
       #print("Line {}: {}".format(cnt, line))
       line = line.replace('&nbsp;', '').replace('[', '').replace(']', '')
       line = line.replace(', by ', '|').replace('<img', '|').replace('href="', '|').replace('pdf"', 'pdf|')
       parts = line.split('|')
       if (len(parts) == 5):
            title = parts[0].strip()
            authors = parts[1].strip()
            url = parts[3].strip()
            print('{}\n{}\n{}\n\n'.format(title, authors, url))
            
            directory = title.replace(' ','-')
            subprocess.call(['mkdir', directory])
            os.chdir(directory)
            subprocess.call(['wget', url])
            pdf = url.split('/')[-1]
            #subprocess.call(['pdftk', pdf, 'burst'])
            jpg = pdf.replace('pdf', 'jpg')
            print(jpg)
            subprocess.call(['convert', '-density', '200x200', pdf, jpg])
            os.chdir('..')
os.chdir('..')
print('FINIS')

In [None]:
%matplotlib inline

In [None]:
import xml.etree.ElementTree as ET  
import cv2
import glob
import numpy as np
import os
import sys
import pandas as pd
import json
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

In [None]:
def parse_mturk_results(results_csv_filename):
    """
    Returns a dataframe.
    """
    df = pd.read_csv(results_csv_filename)
    df = df[['Input.image_url', 'Answer.annotatedResult.boundingBoxes']]
    df.columns =['image_url','bounding_boxes']

    box_list = []
    for i, r in df.iterrows():
        image_fn = r['image_url'].split('/')[-1]
        boxes = json.loads(r['bounding_boxes'])
        for box in boxes:
            box_dict = {
                'image_fn': image_fn,
                'box_type': box['label'],
                'left': box['left'],
                'top': box['top'],
                'width': box['width'],
                'height': box['height']
            }
            box_list.append(box_dict) 
    df_boxes = pd.DataFrame(box_list)
    df_boxes.sort_values(['image_fn', 'top'], inplace=True)
    
    # Check that there is a max of one 'Species section orphan' per image_fn
    # and that this section is nearest the top
    
    
    return df_boxes

#df = parse_mturk_results('Batch_235921_batch_results.csv')
#df

In [None]:
def attach_orphans():
    """
    If the entry for a species section spans more that one page, there will be two images extracted,
    saved with file names with include 'species section' and 'species section orphan'. 
    This function combines glues the 'species section orphan' images to the bottom of the preceeding
    'species section' images.
    """
    file_list = glob.glob('boxes/*.jpg')
    file_list.sort()
    number_of_images_combined = 0
    for i, fn in enumerate(file_list):
        if i>0:
            fn_prev = file_list[i-1]
            if 'orphan' in fn:
                #print 'Combining {} and {}'.format(fn_prev, fn)
                img1 = cv2.imread(fn_prev, 0)
                img2 = cv2.imread(fn, 0)
                new_img = combine_images_vertically(img1, img2) 

                # Overwrite the original 'species section' image file with the combined file.
                cv2.imwrite(fn_prev, new_img)

                # Delete the 'species section orphan' file.
                os.remove(fn) 
                number_of_images_combined += 1
    return number_of_images_combined

In [None]:
def extract_images_mturk(df):
    """
    REWRITE THIS
    Box is a dict containing coordinates for a bounding box. Example:
    
        {'image_name': 'odonata-3.jpg',
         'species_name': 'Tramea limbata',
         'xbr': 1223,
         'xtl': 250,
         'ybr': 716,
         'ytl': 348}
         
    This information to generate an image which is written to a file. Example:
    
        'boxes/Tramea limbata-odonata-3.jpg'    
    """
    for i,r in df.iterrows():
        #print r.image_fn
        im = cv2.imread('odonata/'+r.image_fn)
        #print r.top
        #print r.height
        #print r.left
        #print r.width
        roi = im[r.top: r.top+r.height-1, r.left:r.left+r.width-1]
        roi_filename = 'boxes/{:03d}-{}-{}'.format(i, r.box_type, r.image_fn)
        cv2.imwrite(roi_filename, roi)
        #print roi_filename
    return

#extract_images_mturk(df)

In [None]:
def combine_images_vertically(img1, img2):
    """
    Glues 2 images together with img2 below img1.
    Returns the new compound image.
    """
    h1, w1 = img1.shape
    h2, w2 = img2.shape

    # Create an array big enough to hold img2 below img1.
    img = np.zeros(((h1 + h2), max(w1, w2)), np.float32)

    # Paste img1 at y=0, x=0
    img[0:img1.shape[0], 0:img1.shape[1]] = img1

    # Paste img2 at y=h1, x=0
    img[h1:h1+img2.shape[0],0:img2.shape[1]] = img2    
    return img

## Download PDF
```
wget http://hbs.bishopmuseum.org/pubs-online/pdf/b172p3-6.pdf -O odonata.pdf
```

## Convert PDF into a set of JPGs

```
convert -density 200x200 odonata.pdf odonata.jpg
```

## Place bounding boxes around ROIs using MTurk

```html
<!-- You must include this JavaScript file -->
<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>

<!-- For the full list of available Crowd HTML Elements and their input/output documentation,
      please refer to https://docs.aws.amazon.com/sagemaker/latest/dg/sms-ui-template-reference.html -->

<!-- You must include crowd-form so that your task submits answers to MTurk -->
<crowd-form answer-format="flatten-objects">

    <!-- The crowd-bounding-box element will create a tool for the Worker to draw 
           labeled boxes around the specified objects in your image.

          Your image file URLs will be substituted for the "image_url" variable below 
          when you publish a batch with a CSV input file containing multiple image file URLs.
          To preview the element with an example image, try setting the src attribute to
          "https://s3.amazonaws.com/cv-demo-images/two-birds.jpg" -->
    <crowd-bounding-box 
        src="${image_url}"
        labels="['Species section', 'Species section orphan']"
        header="Draw bounding boxes around the requested items"
        name="annotatedResult">

        <!-- Use the short-instructions section for quick instructions that the Worker
              will see while working on the task. Including some basic examples of 
              good and bad answers here can help get good results. You can include 
              any HTML here. -->
        <short-instructions>Draw boxes around the requested target of interest.</short-instructions>

        <!-- Use the full-instructions section for more detailed instructions that the 
              Worker can open while working on the task. Including more detailed 
              instructions and additional examples of good and bad answers here can
              help get good results. You can include any HTML here. -->
        <full-instructions header="Bounding Box Instructions">
            <p>Use the bounding box tool to draw boxes around the requested target of interest:</p>
            <ol>
              	<li>Draw a rectangle using your mouse over each instance of the target.</li>
                <li>Make sure the box does not cut into the target, leave a 2 - 3 pixel margin</li>
               	<li>When targets are overlapping, draw a box around each object, include all 
                      contiguous parts of the target in the box. Do not include parts that are completely 
                      overlapped by another object.</li>
               	<li>Do not include parts of the target that cannot be seen, even though you think you 
                      can interpolate the whole shape of the target.</li>
               	<li>Avoid shadows, they're not considered as a part of the target.</li>
               	<li>If the target goes off the screen, label up to the edge of the image.</li>
            </ol>
        </full-instructions>

    </crowd-bounding-box>
</crowd-form>
```

## Extract bounding boxes as a set of JPGs

In [None]:
mturk_results_csv = 'Batch_235921_batch_results.csv'

df = parse_mturk_results(mturk_results_csv)
extract_images_mturk(df)
attach_orphans();

## Update GitHub repository
```bash
git add .
git commit -m 'add species section images'
git push
```

## Extract data from species section images using MTurk

```html
<head>
  <title>My Design with Bootstrap</title>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js"></script>
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js"></script>
  <!-- You must include this JavaScript file -->
  <script src="https://assets.crowd.aws/crowd-html-elements.js"></script>
</head>
<body>
   <div class="container">
        <div class="row">
            <div class="col-sm-6">
                <p><img src="${image_url}" style="max-width: 100%" /></p>
                <p class="small">${text}</p>
            </div>
            <div class="col-sm-6">
                <crowd-form answer-format="flatten-objects">
                    <p><strong>Instructions: </strong></p>
                    <div><crowd-input label="Scientific name" name="scientific_name" required></div>
                    <div><crowd-text-area label="Collection records" name="collection_records"></div>
                    <div><crowd-checkbox name="first_record"> First record for Guam</div>
                </crowd-form>
            </div>
        </div>
    </div>
</body>

```

https://github.com/aubreymoore/insects-of-guam/raw/master/boxes/000-Species%20section-odonata-0.jpg

In [None]:
# Generate csv file containing image_urls and text (html)
# This file will be used by MTurk

prefix = 'https://github.com/aubreymoore/insects-of-guam-datamining/raw/master/'
filelist = glob.glob('boxes/*.jpg')
filelist.sort()    
with open('mturk_data.csv', 'w+') as f:
    f.write('image_url,text\n')
    for fn in filelist:
        image_url = prefix + fn
        print(image_url)
        
        # Perform optical character recognition on the image
        #text = pytesseract.image_to_string(Image.open(fn)).encode('utf-8')
        text = pytesseract.image_to_string(Image.open(fn))
        
        # Remove double quotes and replace line feeds with <br>
        text = text.replace('\n\n', '\n')
        text = text.replace('"', '').replace('\n', '<br>')
        
        # Separate collection records into individual lines of text
        text = text.replace(';', '<br>')
        
        f.write('{},"{}"\n'.format(image_url, text))

In [None]:
filelist = glob.glob('boxes/*.jpg')
filelist

fn = filelist[5]
text = pytesseract.image_to_string(Image.open(fn))
text = text.replace('\n\n', '\n')

# Separate collection records into individual lines of text
text = text.replace('; ', '\n')

print(text)