In [1]:
%matplotlib inline

In [1]:
import xml.etree.ElementTree as ET  
import cv2
import glob
import numpy as np
import os
import sys
import pandas as pd
import json

In [3]:
def parse_mturk_results(results_csv_filename):
    """
    Parse an MTurk bounding-box batch results file into one row per box.

    Only two columns of the CSV file are used: 'Input.image_url' and
    'Answer.annotatedResult.boundingBoxes'. The latter holds a JSON list of
    boxes, each with 'label', 'left', 'top', 'width' and 'height' keys.

    Args:
        results_csv_filename: path of the batch results CSV file.

    Returns:
        A dataframe with the columns image_fn, box_type, left, top, width and
        height, sorted by image_fn and then by top. image_fn is the last path
        component of the image URL. The index is not reset after sorting, so
        each label is still the position in which the box was parsed.
        When the file contains no boxes at all, an empty dataframe with the
        same columns is returned.
    """
    box_columns = ['image_fn', 'box_type', 'left', 'top', 'width', 'height']

    df = pd.read_csv(results_csv_filename)
    df = df[['Input.image_url', 'Answer.annotatedResult.boundingBoxes']]
    df.columns = ['image_url', 'bounding_boxes']

    box_list = []
    for _, r in df.iterrows():
        image_fn = r['image_url'].split('/')[-1]
        for box in json.loads(r['bounding_boxes']):
            box_list.append({
                'image_fn': image_fn,
                'box_type': box['label'],
                'left': box['left'],
                'top': box['top'],
                'width': box['width'],
                'height': box['height']
            })

    # Naming the columns explicitly keeps the sort below valid even when no
    # boxes were found (an empty list would otherwise give a column-less frame).
    df_boxes = pd.DataFrame(box_list, columns=box_columns)
    df_boxes = df_boxes.sort_values(['image_fn', 'top'])

    # TODO: Check that there is a max of one 'Species section orphan' per image_fn
    # and that this section is nearest the top

    return df_boxes

#df = parse_mturk_results('Batch_235921_batch_results.csv')
#df

In [4]:
def attach_orphans():
    """
    Merge each 'species section orphan' image into the section it continues.

    If the entry for a species section spans more than one page, two images are
    extracted: one whose file name includes 'species section' and one whose file
    name includes 'species section orphan'. This function walks the sorted list
    of 'boxes/*.jpg' files and glues every file with 'orphan' in its path to the
    bottom of the file that immediately precedes it in that list. The preceding
    file is overwritten with the combined image and the orphan file is deleted.

    Returns the number of orphan images that were merged.
    """
    jpg_paths = sorted(glob.glob('boxes/*.jpg'))
    merged_count = 0

    # Pair every file with its predecessor in sorted order. The first file has
    # no predecessor, so it is never treated as an orphan.
    for previous_path, current_path in zip(jpg_paths, jpg_paths[1:]):
        if 'orphan' not in current_path:
            continue

        # Flag 0 makes OpenCV load the images as single-channel grayscale,
        # which is the 2-D shape combine_images_vertically unpacks.
        upper = cv2.imread(previous_path, 0)
        lower = cv2.imread(current_path, 0)
        stacked = combine_images_vertically(upper, lower)

        # The combined image replaces the 'species section' file ...
        cv2.imwrite(previous_path, stacked)
        # ... and the now redundant orphan file is removed.
        os.remove(current_path)
        merged_count += 1

    return merged_count

In [5]:
def extract_images_mturk(df):
    """
    Crop every bounding box in df out of its page image and save it as a file.

    Args:
        df: dataframe with one row per bounding box and the columns image_fn,
            box_type, left, top, width and height (the layout produced by
            parse_mturk_results).

    For each row the page image 'odonata/<image_fn>' is read, the rectangle
    starting at (left, top) and measuring width x height pixels is cropped,
    and the crop is written to 'boxes/<index>-<box_type>-<image_fn>', where
    <index> is the row's index label zero-padded to 3 digits. Example:

        'boxes/000-Species section-odonata-0.jpg'

    The 'boxes' directory is created if it does not exist.

    Raises:
        IOError: if a page image cannot be read or a crop cannot be written.
    """
    if not os.path.isdir('boxes'):
        os.makedirs('boxes')

    for i, r in df.iterrows():
        page_path = 'odonata/' + r['image_fn']
        im = cv2.imread(page_path)
        # cv2.imread signals failure by returning None instead of raising.
        if im is None:
            raise IOError('Could not read page image: {}'.format(page_path))

        top, left = int(r['top']), int(r['left'])
        height, width = int(r['height']), int(r['width'])
        # Slice ends are exclusive, so top+height / left+width already stop at
        # the last row / column of the box; subtracting 1 would drop them.
        roi = im[top:top + height, left:left + width]

        roi_filename = 'boxes/{:03d}-{}-{}'.format(i, r['box_type'], r['image_fn'])
        # cv2.imwrite returns False on failure rather than raising.
        if not cv2.imwrite(roi_filename, roi):
            raise IOError('Could not write box image: {}'.format(roi_filename))

#extract_images_mturk(df)

In [6]:
def combine_images_vertically(img1, img2):
    """
    Stack two single-channel images, placing img2 directly below img1.

    Both images are left-aligned. The result is as wide as the wider of the
    two; the area to the right of the narrower image is filled with zeros.
    Returns the compound image as a new float32 array.
    """
    rows_top, cols_top = img1.shape
    rows_bottom, cols_bottom = img2.shape

    # Zero-filled canvas tall enough for both images and as wide as the widest.
    canvas_shape = (rows_top + rows_bottom, max(cols_top, cols_bottom))
    canvas = np.zeros(canvas_shape, dtype=np.float32)

    # img1 occupies the top rows, img2 everything below it; both start at x=0.
    canvas[:rows_top, :cols_top] = img1
    canvas[rows_top:, :cols_bottom] = img2
    return canvas

## Download PDF
```
wget http://hbs.bishopmuseum.org/pubs-online/pdf/b172p3-6.pdf -O odonata.pdf
```

## Convert PDF into a set of JPGs

```
convert -density 200x200 odonata.pdf odonata.jpg
```

## Place bounding boxes around ROIs using MTurk

```html
<!-- You must include this JavaScript file -->
<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>

<!-- For the full list of available Crowd HTML Elements and their input/output documentation,
      please refer to https://docs.aws.amazon.com/sagemaker/latest/dg/sms-ui-template-reference.html -->

<!-- You must include crowd-form so that your task submits answers to MTurk -->
<crowd-form answer-format="flatten-objects">

    <!-- The crowd-bounding-box element will create a tool for the Worker to draw 
           labeled boxes around the specified objects in your image.

          Your image file URLs will be substituted for the "image_url" variable below 
          when you publish a batch with a CSV input file containing multiple image file URLs.
          To preview the element with an example image, try setting the src attribute to
          "https://s3.amazonaws.com/cv-demo-images/two-birds.jpg" -->
    <crowd-bounding-box 
        src="${image_url}"
        labels="['Species section', 'Species section orphan']"
        header="Draw bounding boxes around the requested items"
        name="annotatedResult">

        <!-- Use the short-instructions section for quick instructions that the Worker
              will see while working on the task. Including some basic examples of 
              good and bad answers here can help get good results. You can include 
              any HTML here. -->
        <short-instructions>Draw boxes around the requested target of interest.</short-instructions>

        <!-- Use the full-instructions section for more detailed instructions that the 
              Worker can open while working on the task. Including more detailed 
              instructions and additional examples of good and bad answers here can
              help get good results. You can include any HTML here. -->
        <full-instructions header="Bounding Box Instructions">
            <p>Use the bounding box tool to draw boxes around the requested target of interest:</p>
            <ol>
              	<li>Draw a rectangle using your mouse over each instance of the target.</li>
                <li>Make sure the box does not cut into the target, leave a 2 - 3 pixel margin</li>
               	<li>When targets are overlapping, draw a box around each object, include all 
                      contiguous parts of the target in the box. Do not include parts that are completely 
                      overlapped by another object.</li>
               	<li>Do not include parts of the target that cannot be seen, even though you think you 
                      can interpolate the whole shape of the target.</li>
               	<li>Avoid shadows, they're not considered as a part of the target.</li>
               	<li>If the target goes off the screen, label up to the edge of the image.</li>
            </ol>
        </full-instructions>

    </crowd-bounding-box>
</crowd-form>
```

## Extract bounding boxes as a set of JPGs

In [7]:
# MTurk batch results CSV holding the bounding boxes drawn by the workers.
mturk_results_csv = 'Batch_235921_batch_results.csv'

# 1. Parse the results into a dataframe with one row per bounding box.
df = parse_mturk_results(mturk_results_csv)
# 2. Crop every box out of its page image and save it under boxes/.
extract_images_mturk(df)
# 3. Glue each 'orphan' image to the bottom of the image preceding it.
#    The trailing ';' keeps the notebook from echoing the returned count.
attach_orphans();

## Update GitHub repository
```bash
git add .
git commit -m 'add species section images'
git push
```

## Extract data from species section images using MTurk

Example URL of a species section image (spaces in the file name are percent-encoded as `%20`):

https://github.com/aubreymoore/insects-of-guam/raw/master/boxes/000-Species%20section-odonata-0.jpg

In [5]:
# Build 'species-section-urls.csv', the input file for the MTurk batch.
# Each row pairs the URL of a species section image with the URL of the
# text file that has the same name under ocr/ (extension .txt).
prefix = 'https://github.com/aubreymoore/insects-of-guam/raw/master/'
filelist = sorted(glob.glob('boxes/*.jpg'))

# Assemble the whole file in memory first: header line, then one line per image.
csv_lines = ['image_url,text_url\n']
for fn in filelist:
    image_url = prefix + fn
    text_url = image_url.replace('boxes/', 'ocr/').replace('.jpg', '.txt')
    csv_lines.append('{},{}\n'.format(image_url, text_url))

with open('species-section-urls.csv', 'w+') as f:
    f.writelines(csv_lines)

In [23]:
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

"""
def ocr_core(filename):
    '''
    This function will handle the core OCR processing of images.
    '''
    text = pytesseract.image_to_string(Image.open(filename))  # We'll use Pillow's Image class to open the image and pytesseract to detect the string in the image
    return text  # Then we will print the text in the image
"""
    
# Run Tesseract OCR on every species section image in boxes/ and save the
# recognised text, UTF-8 encoded, as ocr/<image name>.txt.
if not os.path.exists('ocr'):
    os.makedirs('ocr')
filelist = sorted(glob.glob('boxes/*.jpg'))
for fn in filelist:
    text = pytesseract.image_to_string(Image.open(fn))
    # Replace each double newline with a single one to drop blank lines.
    text = text.replace('\n\n', '\n')
    # Derive the output path from the file name only, so that 'boxes' or
    # '.jpg' appearing elsewhere in the name cannot be rewritten by accident.
    text_fn = os.path.join('ocr', os.path.splitext(os.path.basename(fn))[0] + '.txt')
    # Binary mode: the text is encoded to UTF-8 bytes explicitly, and a
    # text-mode handle rejects bytes on Python 3.
    with open(text_fn, 'wb') as f:
        f.write(text.encode('utf-8'))
    # print() with a single argument behaves the same on Python 2 and 3.
    print(text_fn)

ocr/000-Species section-odonata-0.txt
ocr/001-Species section-odonata-0.txt
ocr/003-Species section-odonata-1.txt
ocr/004-Species section-odonata-1.txt
ocr/005-Species section-odonata-1.txt
ocr/006-Species section-odonata-1.txt
ocr/008-Species section-odonata-2.txt
ocr/009-Species section-odonata-2.txt
ocr/010-Species section-odonata-2.txt
ocr/011-Species section-odonata-2.txt
ocr/013-Species section-odonata-3.txt
