In [1]:
import os.path
import urllib
import csv
import time
import glob

from models import Page
from open import build_local_url
from order import get_test_lists
from selenium import webdriver

# Data Structures and Methods

## Classes

In [2]:
class Region(object):
    """A span of text on a page, addressed relative to one of the page's elements.

    Attributes:
        url: URL of the page containing the region.
        element: selector string identifying the element the offsets are relative to.
        start_offset: start of the region within the element's text.
        end_offset: end of the region within the element's text.

    The attributes are stored exactly as given (no type conversion), so a
    region built with integer offsets is not equal to one built with the
    same offsets as strings.
    """

    def __init__(self, url, element, start_offset, end_offset):
        self.url = url
        self.element = element
        self.start_offset = start_offset
        self.end_offset = end_offset

    def __eq__(self, other):
        # Equal only to instances of the same class (or a subclass) whose
        # attributes all match.
        return (isinstance(other, self.__class__)
            and self.__dict__ == other.__dict__)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Keep hashing consistent with __eq__: regions that compare equal share
        # these four fields, so they hash alike and can be used in sets and as
        # dictionary keys.
        return hash((self.url, self.element, self.start_offset, self.end_offset))

    def __repr__(self):
        return "{{URL: {url}, Element: {el}, Offsets: ({so}, {eo})}}".format(
            url=self.url, el=self.element, so=self.start_offset, eo=self.end_offset)

## Methods for Computing Accuracy

In [3]:
def is_detected_region_true(region, truth_regions):
    """Return True if `region` equals any candidate in `truth_regions`.

    `truth_regions` maps keys to lists of candidate regions; only the lists
    are inspected, the keys are ignored.
    """
    return any(region in candidates for candidates in truth_regions.values())

In [4]:
def is_true_region_detected(region_candidates, detected_regions):
    '''
    Return True if at least one of `region_candidates` is in `detected_regions`.

    A single "true" region can be expressed relative to any of its parent
    elements; `region_candidates` holds all of those equivalent descriptions,
    and a match on any one of them counts as a detection.
    '''
    return any(candidate in detected_regions for candidate in region_candidates)

In [5]:
def precision(detected_regions, truth_regions):
    """Compute the fraction of detected regions that match a truth region.

    Args:
        detected_regions: sequence of detected regions.
        truth_regions: dict mapping a key to the list of candidate
            descriptions of one true region.

    Returns:
        A tuple `(precision, false_regions)`: `precision` is a float in
        [0, 1] and `false_regions` lists the detections that matched no
        truth candidate. With no detections at all, precision is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    false_regions = []
    correct = 0
    for detected in detected_regions:
        if is_detected_region_true(detected, truth_regions):
            correct += 1
        else:
            false_regions.append(detected)

    total = len(detected_regions)
    if total == 0:
        # Nothing was detected, so the ratio is undefined; report 0.0.
        return 0.0, false_regions
    return correct / float(total), false_regions

In [6]:
def recall(detected_regions, truth_regions):
    """Compute the fraction of true regions that were detected.

    Args:
        detected_regions: sequence of detected regions.
        truth_regions: dict mapping a key to the list of candidate
            descriptions of one true region; each key counts as one region.

    Returns:
        A tuple `(recall, missing)`: `recall` is a float in [0, 1] and
        `missing` maps the key of every undetected true region to its
        candidate list. With no truth regions at all, recall is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    missing = {}
    found = 0
    for key, true_region_candidates in truth_regions.items():
        if is_true_region_detected(true_region_candidates, detected_regions):
            found += 1
        else:
            missing[key] = true_region_candidates

    total = len(truth_regions)
    if total == 0:
        # There is nothing to find, so the ratio is undefined; report 0.0.
        return 0.0, missing
    return found / float(total), missing

### Tests for accuracy methods

In [7]:
# Toy fixture for the checks below: each key is a (url, start, end) tuple and
# each value lists the candidate Region descriptions grouped under that key.
truth_regions = {
    ('url0', 0, 1): [Region('url0', 'el0', 0, 1), Region('url0', 'el1', 3, 4)],
    ('url1', 1, 2): [Region('url1', 'el0', 1, 2), Region('url1', 'el1', 2, 3)],
}

In [8]:
# Smoke check against the toy fixture; the expected value is noted at the end of each line.
print is_detected_region_true(Region('url0', 'el1', 3, 4), truth_regions)  # True
print is_detected_region_true(Region('url0', 'el2', 0, 1), truth_regions)  # False

True
False


In [9]:
# Smoke check: candidates0 shares no region with the detections, candidates1 shares one.
# NOTE: this `detected_regions` is only a toy value; it is reassigned when the real data is loaded.
detected_regions = [Region('url0', 'el0', 0, 1), Region('url1', 'el1', 1, 2)]
region_candidates0 = [Region('url1', 'el1', 0, 1), Region('url0', 'el0', 1, 2)]
region_candidates1 = [Region('url0', 'el0', 0, 1), Region('url0', 'el1', 1, 2)]
print is_true_region_detected(region_candidates0, detected_regions)  # False
print is_true_region_detected(region_candidates1, detected_regions)  # True

False
True


# Data Loading and Preparation

In [10]:
# Tab-separated inputs, given as paths relative to the working directory.
DETECTED_FILE = os.path.join('regions', 'detected', 'wget.tsv')  # detector output
TRUTH_FILE = os.path.join('regions', 'extracted', 'wget_validation.tsv')  # manually extracted regions

For now, we limit ourselves to considering only the first 50 cross-validation results.  Eventually we should be able to boost this number up to around 100.

In [11]:
# Number of pages taken from the front of the validation list.
VALIDATION_SIZE = 50

In [18]:
# First VALIDATION_SIZE pages of the 'wget' validation list.
wget_pages = get_test_lists('wget')['validation'][:VALIDATION_SIZE]
# Percent-encode each local URL, leaving '/' and ':' unescaped.
validation_urls = [urllib.quote(build_local_url(p), safe='/:') for p in wget_pages]

Load in the detections from the stored TSV files

In [46]:
# Collect the detections whose percent-encoded URL is one of the validation URLs.
# Offsets are kept as the strings read from the TSV.
detected_regions = []
with open(DETECTED_FILE) as detected_tsv:
    detected_columns = ['timestamp', 'url', 'element', 'start_offset', 'end_offset']
    detected_rows = csv.DictReader(
        detected_tsv, delimiter='\t', fieldnames=detected_columns)
    for row in detected_rows:
        quoted_url = urllib.quote(row['url'], safe='/:')
        if quoted_url not in validation_urls:
            continue
        detected_regions.append(
            Region(quoted_url, row['element'], row['start_offset'], row['end_offset']))

For the manually extracted regions, index each one by the URL of the page and its absolute offsets within the page.  This is because we extract every possible relative position of the region within the page, but all of these describe the same region.

In [47]:
# Group the manually extracted regions by (url, absolute start, absolute end);
# every row sharing that key is another description of the same region.
with open(TRUTH_FILE, 'rU') as truth_tsv:
    truth_columns = [
        'rel_start_offset', 'rel_end_offset', 'element',
        'url', 'abs_start_offset', 'abs_end_offset',
    ]
    truth_rows = csv.DictReader(truth_tsv, delimiter='\t', fieldnames=truth_columns)
    truth_regions = {}

    for row in truth_rows:
        if row['url'] not in validation_urls:
            continue
        key = (row['url'], row['abs_start_offset'], row['abs_end_offset'])
        truth_regions.setdefault(key, []).append(
            Region(row['url'], row['element'],
                   row['rel_start_offset'], row['rel_end_offset']))

## Verify that the regions have been properly loaded

In [48]:
# Distinct page URLs appearing on each side of the comparison.
detected_urls = set(region.url for region in detected_regions)
truth_urls = set(key[0] for key in truth_regions)

In [49]:
# Number of distinct pages with detections vs. with truth regions.
print len(detected_urls), len(truth_urls)

30 37


In [50]:
# Spot-check the first loaded detection.
print detected_regions[0]

{URL: http://127.0.0.1:8000/pages/wget/linux%20scripting%20wget%20tutorial/7/askubuntu.com/questions/25599/get-wget-output-to-a-variable.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(5) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > DIV:nth-of-type(2) > DIV:nth-of-type(2) > TABLE:nth-of-type(1) > TBODY:nth-of-type(1) > TR:nth-of-type(1) > TD:nth-of-type(2) > DIV:nth-of-type(1) > PRE:nth-of-type(1) > CODE:nth-of-type(1), Offsets: (9, 36)}


In [51]:
# Spot-check one truth candidate (which entry comes first depends on dict ordering).
print truth_regions.values()[0][0]

{URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(13) > DT:nth-of-type(1) > P:nth-of-type(3), Offsets: (101, 161)}


# Compute Accuracy!

In [52]:
# false_regions: detections that matched no truth candidate (reused in later cells).
prec, false_regions = precision(detected_regions, truth_regions)
print "Precision: ", prec

Precision:  0.573333333333


In [53]:
# missing_regions: truth key -> candidate list for each undetected region (reused in later cells).
rec, missing_regions = recall(detected_regions, truth_regions)
print "Recall: ", rec

Recall:  0.595744680851


## Save Results for posterity

In [54]:
def get_regions_filename(basename):
    """Build a fresh, timestamped history filename for `basename`.

    Looks in regions/detected/history for the lowest record index N such that
    no existing path starts with '<basename>-<N>-', and returns
    '<history dir>/<basename>-<N>-<timestamp>.tsv'. The file itself is not
    created here.
    """
    history_dir = os.path.join('regions', 'detected', 'history')
    record_index = 0
    while True:
        record_prefix_path = os.path.join(
            history_dir,
            '{basename}-{record_index}-'.format(
                basename=basename, record_index=record_index))
        if glob.glob(record_prefix_path + '*'):
            # This index is already taken by an earlier record; try the next one.
            record_index += 1
            continue
        return record_prefix_path + time.strftime('%Y%m%d-%H:%M:%S') + '.tsv'

In [55]:
# Output paths for this run's false detections and missed regions.
false_fn = get_regions_filename('false_detections')
missing_fn = get_regions_filename('missing_regions')

In [56]:
def write_region(file_, r):
    """Write region `r` to `file_` as one tab-separated line.

    Columns are url, element, start_offset, end_offset. Each field is
    formatted with '%s', so offsets may be integers as well as the strings
    read from the TSV inputs ('\\t'.join would raise TypeError on integers).

    Args:
        file_: object with a `write` method.
        r: object with `url`, `element`, `start_offset` and `end_offset`.
    """
    file_.write('%s\t%s\t%s\t%s\n' % (r.url, r.element, r.start_offset, r.end_offset))

In [57]:
# Persist every false detection, one region per line.
with open(false_fn, 'w') as false_file:
    for r in false_regions:
        write_region(false_file, r)

In [58]:
# Persist one line per missed region, using the candidate with the longest element selector.
with open(missing_fn, 'w') as missing_file:
    for _, region_list in missing_regions.items():
        region = max(region_list, key=lambda r: len(r.element))
        write_region(missing_file, region)

## Examine False Positives

In [59]:
# Firefox WebDriver session shared by the inspection helpers below.
browser = webdriver.Firefox()

In [60]:
# JavaScript that selects and scrolls to the element matching a CSS selector.
# The selector arrives as arguments[0] rather than being spliced into the
# source, so quotes or backslashes in it cannot break the script.
SHOW_REGION_SCRIPT = """
var node = document.querySelector(arguments[0]);
var range = document.createRange();
range.selectNode(node);
var selection = window.getSelection();
selection.addRange(range);
node.scrollIntoView();
"""

def show_region(browser, region):
    """Open the region's page and highlight the element that contains it.

    Args:
        browser: WebDriver-like object providing `get` and `execute_script`.
        region: object with `url` and `element` (a CSS selector string).

    The page is always loaded; the highlight script runs only when
    `region.element` is not the empty string.
    """
    browser.get(region.url)
    if region.element != '':
        browser.execute_script(SHOW_REGION_SCRIPT, region.element)

The following scripts let us extract the text and HTML for a region when we want to debug the detection.  Note that they must be run after the browser has already been directed to the page where the region appears.

In [61]:
# JavaScript that returns the region's text: the element's textContent from
# the start offset through the end offset, inclusive. The selector and offsets
# arrive as arguments[0..2] rather than being spliced into the source.
GET_TEXT_SCRIPT = """
var node = document.querySelector(arguments[0]);
return node.textContent.substring(arguments[1], arguments[2] + 1);
"""

def get_text(browser, region):
    """Return the text of `region` from the page currently loaded in `browser`.

    Args:
        browser: WebDriver-like object providing `execute_script`; it must
            already be on the region's page.
        region: object with `element` (a CSS selector string) and
            `start_offset` / `end_offset` (integers or numeric strings).

    Raises:
        ValueError: if an offset cannot be converted to an integer.
    """
    # Offsets loaded from TSV are strings. They must be sent as integers,
    # otherwise `arguments[2] + 1` would concatenate strings in JavaScript.
    return browser.execute_script(
        GET_TEXT_SCRIPT, region.element,
        int(region.start_offset), int(region.end_offset))

In [62]:
# JavaScript that returns the outer HTML of the element matching a CSS
# selector. The selector arrives as arguments[0] rather than being spliced
# into the source, so quotes or backslashes in it cannot break the script.
GET_HTML_SCRIPT = """
var node = document.querySelector(arguments[0]);
return node.outerHTML;
"""

def get_html(browser, region):
    """Return the outer HTML of the element that contains `region`.

    Args:
        browser: WebDriver-like object providing `execute_script`; it must
            already be on the region's page.
        region: object with `element` (a CSS selector string).
    """
    return browser.execute_script(GET_HTML_SCRIPT, region.element)

### Regions not Detected

In [63]:
def open_missing_detections(start_index=0, debug=True):
    """Interactively step through the true regions that were not detected.

    Reads the notebook-level `missing_regions` dict and `browser`. Before each
    region the user is prompted; answering 'n' (any case) stops the walk, and
    so does a KeyboardInterrupt. Any other answer prints the region and shows
    it in the browser.

    Args:
        start_index: position in `missing_regions.items()` to start from.
        debug: when True, also print the region's text and its element's HTML
            (the HTML is replaced by a notice when longer than 1000 characters).
    """
    # Passing start_index to enumerate keeps the printed number equal to the
    # region's position in missing_regions.items(), so a walk can be resumed.
    for i, (_, region_list) in enumerate(missing_regions.items()[start_index:], start_index):
        ''' Most specific selector will be the longest one. '''
        r = max(region_list, key=lambda r: len(r.element))
        print ""
        print "?? Open next example of missed detection? ",
        try:
            again = raw_input()
            if again.lower() == 'n':
                break
            print ""
            print "===== REGION %d =====" % i
            print r
            show_region(browser, r)
            if debug:
                # get_text/get_html query the page that show_region just loaded.
                print "* Text: ", get_text(browser, r)
                html = get_html(browser,r)
                print "* HTML: ",
                if len(html) > 1000:
                    print "too long to render"
                else:
                    print html
        except KeyboardInterrupt:
            # Interrupting the prompt or the page load ends the walk quietly.
            return

In [None]:
# Start the interactive walk from the first missed region, with debug output on.
open_missing_detections()


?? Open next example of missed detection? 
 
===== REGION 0 =====
{URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(5) > DD:nth-of-type(4), Offsets: (99, 118)}
* Text:  wget -r http://host 
* HTML:  <dd>Use the protocol name as a directory component of local file names. For example, with this option, <b>wget -r http://</b><i>host</i> will save to
<b>http/</b><i>host</i><b>/...</b> rather than just to <i>host</i><b>/...</b>.
</dd>

?? Open next example of missed detection? 
 
===== REGION 1 =====
{URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(13) > DT:nth-of-type(1) > P:nth-of-type(3), Offsets: (101, 161)}
* Text:  wget -rl0 -kKE -t5 --no-proxy http://yoyodyne.com -o
/tmp/log
* HTML: 

### False Detections

In [None]:
def open_false_detections(start_index=0, debug=True):
    """Interactively step through the detections that matched no true region.

    Reads the notebook-level `false_regions` list and `browser`. Before each
    region the user is prompted; answering 'n' (any case) stops the walk, and
    so does a KeyboardInterrupt. Any other answer prints the region and shows
    it in the browser.

    Args:
        start_index: index in `false_regions` to start from.
        debug: when True, also print the region's text and its element's HTML
            (the HTML is replaced by a notice when longer than 1000 characters).
    """
    # Passing start_index to enumerate keeps the printed number equal to the
    # region's index in false_regions, so a walk can be resumed.
    for i, r in enumerate(false_regions[start_index:], start_index):
        print ""
        print "?? Open next example of false detection? ",
        try:
            again = raw_input()
            if again.lower() == 'n':
                break
            print ""
            print "===== REGION %d =====" % i
            print r
            show_region(browser, r)
            if debug:
                # get_text/get_html query the page that show_region just loaded.
                print "* Text: ", get_text(browser, r)
                html = get_html(browser,r)
                print "* HTML: ",
                if len(html) > 1000:
                    print "too long to render"
                else:
                    print html
        except KeyboardInterrupt:
            # Interrupting the prompt or the page load ends the walk quietly.
            return

In [None]:
# Start the interactive walk from the first false detection, with debug output on.
open_false_detections()


?? Open next example of false detection? 
 
===== REGION 0 =====
{URL: http://127.0.0.1:8000/pages/wget/download%20http%20wget%20tutorial/4/www.gnu.org/software/wget/manual/html_node/Recursive-Retrieval-Options.html, Element: HTML > BODY:nth-of-type(1) > DL:nth-of-type(1) > DD:nth-of-type(7) > DIV:nth-of-type(3) > PRE:nth-of-type(1), Offsets: (0, 33)}
* Text:  wget -r -l 1 -p http://site/1.html
* HTML:  <pre class="example">wget -r -l 1 -p http://<var>site</var>/1.html
</pre>

?? Open next example of false detection? 
 
===== REGION 1 =====
{URL: http://127.0.0.1:8000/pages/wget/download%20http%20wget%20tutorial/4/www.gnu.org/software/wget/manual/html_node/Recursive-Retrieval-Options.html, Element: HTML > BODY:nth-of-type(1) > DL:nth-of-type(1) > DD:nth-of-type(7) > DIV:nth-of-type(4) > PRE:nth-of-type(1), Offsets: (0, 33)}
* Text:  wget -r -l 0 -p http://site/1.html
* HTML:  <pre class="example">wget -r -l 0 -p http://<var>site</var>/1.html
</pre>

?? Open next example of false detec