In [1]:
import os.path
import urllib
import csv
import time
import glob
import random
from selenium import webdriver

from models import Page
from open import build_local_url
from order import get_test_lists

# Data Structures and Methods

## Classes

In [2]:
class Region(object):
    '''
    A span of text on a page, described relative to one containing element.

    Attributes:
        url: address of the page the region appears on.
        element: selector of the element the offsets are measured within.
        start_offset, end_offset: bounds of the region inside that element.
        text: optional text of the region.  It is carried along for display
            only and takes no part in equality or hashing.
    '''

    def __init__(self, url, element, start_offset, end_offset, text=None):
        self.url = url
        self.element = element
        self.start_offset = start_offset
        self.end_offset = end_offset
        self.text = text

    def _identity(self):
        ''' The fields that decide whether two regions are the same region. '''
        return (self.element, self.url, self.start_offset, self.end_offset)

    def __eq__(self, other):
        return (isinstance(other, self.__class__)
            and self._identity() == other._identity())

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Must agree with __eq__: equal regions have to hash equally so that
        # regions behave correctly in sets and as dictionary keys.
        return hash(self._identity())

    def __repr__(self):
        return "{{Text: {text} URL: {url}, Element: {el}, Offsets: ({so}, {eo})}}".format(
            text=self.text, url=self.url, el=self.element, so=self.start_offset, eo=self.end_offset)

## Methods for Computing Accuracy

In [3]:
def is_detected_region_true(region, truth_regions):
    '''
    Tell whether a detected region matches any description of any true region.

    'truth_regions' maps each true region to the list of its equivalent
    descriptions; the detection is correct if it equals one of them.
    '''
    return any(region in candidates for candidates in truth_regions.values())

In [4]:
def is_true_region_detected(region_candidates, detected_regions):
    '''
    Tell whether a single "true" region was picked up by the detector.

    The same true region can be expressed relative to any of its parent
    elements, so 'region_candidates' holds every such description.  The region
    counts as detected as soon as one of them appears in 'detected_regions'.
    '''
    return any(candidate in detected_regions for candidate in region_candidates)

In [5]:
def precision(detected_regions, truth_regions):
    '''
    Compute the fraction of detected regions that are true regions.

    Returns a tuple (precision, true_regions, false_regions), where the two
    lists partition 'detected_regions' into correct and incorrect detections,
    in their original order.

    With no detections at all the ratio is undefined; 0.0 is returned in that
    case instead of raising ZeroDivisionError.
    '''
    true_regions = []
    false_regions = []
    for detected in detected_regions:
        if is_detected_region_true(detected, truth_regions):
            true_regions.append(detected)
        else:
            false_regions.append(detected)
    total = len(detected_regions)
    if total == 0:
        return 0.0, true_regions, false_regions
    return len(true_regions) / float(total), true_regions, false_regions

In [6]:
def recall(detected_regions, truth_regions):
    '''
    Compute the fraction of true regions that were detected.

    'truth_regions' maps each true region to the list of its equivalent
    descriptions.  Returns a tuple (recall, found, missing), where the two
    dictionaries partition 'truth_regions' into detected and undetected entries.

    With no true regions at all the ratio is undefined; 0.0 is returned in that
    case instead of raising ZeroDivisionError.
    '''
    found = {}
    missing = {}
    for key, true_region_candidates in truth_regions.items():
        if is_true_region_detected(true_region_candidates, detected_regions):
            found[key] = true_region_candidates
        else:
            missing[key] = true_region_candidates
    total = len(truth_regions)
    if total == 0:
        return 0.0, found, missing
    # Keys are unique, so the number of found entries is the hit count.
    return len(found) / float(total), found, missing

### Tests for accuracy methods

In [7]:
# Fixture for the checks below: two true regions, each listed with two
# equivalent descriptions (same region, expressed relative to different elements).
truth_regions = {
    ('url0', 0, 1): [Region('url0', 'el0', 0, 1), Region('url0', 'el1', 3, 4)],
    ('url1', 1, 2): [Region('url1', 'el0', 1, 2), Region('url1', 'el1', 2, 3)],
}

In [8]:
# A detection is true only when URL, element and both offsets all match one description.
print is_detected_region_true(Region('url0', 'el1', 3, 4), truth_regions)  # True
print is_detected_region_true(Region('url0', 'el2', 0, 1), truth_regions)  # False

True
False


In [9]:
# A true region is detected when at least one of its candidate descriptions was detected.
detected_regions = [Region('url0', 'el0', 0, 1), Region('url1', 'el1', 1, 2)]
region_candidates0 = [Region('url1', 'el1', 0, 1), Region('url0', 'el0', 1, 2)]
region_candidates1 = [Region('url0', 'el0', 0, 1), Region('url0', 'el1', 1, 2)]
print is_true_region_detected(region_candidates0, detected_regions)  # False
print is_true_region_detected(region_candidates1, detected_regions)  # True

False
True


# Data Loading and Preparation

In [10]:
# Tab-separated detector output: timestamp, url, element, start_offset, end_offset, text.
DETECTED_FILE = os.path.join('regions', 'detections.tsv')
# Manually extracted ground truth: one region description per line, fields separated by ',,,'.
TRUTH_FILE = os.path.join('regions', 'groundtruth_validation.txt')

For now, we limit ourselves to considering only the first 50 cross-validation results.  Eventually we should be able to boost this number up to around 100.

In [14]:
# Number of pages taken from the front of the validation list.
VALIDATION_SIZE = 50

In [15]:
# Keep only the first VALIDATION_SIZE pages of the 'regex' validation list.
wget_pages = get_test_lists('regex')['validation'][:VALIDATION_SIZE]
# Percent-encode each local URL, leaving '/' and ':' untouched
# (presumably so they match the URLs stored in the region files -- TODO confirm).
validation_urls = [urllib.quote(build_local_url(p), safe='/:') for p in wget_pages]

Load in the detections from the stored TSV files

In [16]:
# Read every detection row and keep those that belong to a validation page.
# Offsets are kept exactly as read from the file (strings).
detected_regions = []
with open(DETECTED_FILE) as detected_tsv:
    reader = csv.DictReader(
        detected_tsv,
        delimiter='\t',
        fieldnames=['timestamp', 'url', 'element', 'start_offset', 'end_offset', 'text'])
    detected_regions.extend(
        Region(row['url'], row['element'], row['start_offset'], row['end_offset'], row['text'])
        for row in reader
        if row['url'] in validation_urls
    )

For the manually extracted regions, we index each one by the URL of its page and its absolute offsets within that page.  This is because we extract every possible relative position of a region within the page (one per enclosing element), but all of these descriptions refer to the same region.

In [17]:
# Group the ground-truth descriptions by (url, absolute start, absolute end):
# every description sharing that key refers to the same region on the page.
truth_regions = {}
with open(TRUTH_FILE, 'rU') as truth_tsv:
    for line in truth_tsv:
        # Drop the line terminator so it does not end up inside the region text.
        line = line.rstrip('\n')
        if not line:
            continue
        # The text is the last field; cap the split at 6 so a ',,,' inside the
        # text stays part of the text instead of breaking the unpacking.
        (rel_start_offset, rel_end_offset, element, url,
         abs_start_offset, abs_end_offset, text) = line.split(',,,', 6)
        if url in validation_urls:
            key = (url, abs_start_offset, abs_end_offset)
            truth_regions.setdefault(key, []).append(
                Region(url, element, rel_start_offset, rel_end_offset, text)
            )

## Verify that the regions have been properly loaded

In [18]:
# Distinct pages that have at least one detection / at least one true region.
detected_urls = set(region.url for region in detected_regions)
truth_urls = set(key[0] for key in truth_regions)

In [19]:
# Number of pages with detections vs. number of pages with ground truth.
print len(detected_urls), len(truth_urls)

17 44


In [20]:
# Spot-check one loaded detection.
print detected_regions[0]

{Text: /opt/omni/lbin URL: http://127.0.0.1:8000/pages/regex/linux%20regex%20sed%20tutorial/8/www.thegeekstuff.com/2009/10/unix-sed-tutorial-advanced-sed-substitution-examples/, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > PRE:nth-of-type(2), Offsets: (9, 22)}


In [21]:
# Spot-check one loaded ground-truth description (first description of an arbitrary region).
print truth_regions.values()[0][0]

{Text: ^(\d+)\.(\d+)\.(\d+)\.(\d+)$
 URL: http://127.0.0.1:8000/pages/regex/regex%20split%20string%20tutorial/4/wiki.tcl.tk/989.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > PRE:nth-of-type(22), Offsets: (43, 70)}


# Compute Accuracy!

In [22]:
# Keep the correct and incorrect detections; both lists are examined further down.
prec, true_regions, false_regions = precision(detected_regions, truth_regions)
print "Precision: ", prec

Precision:  0.77519379845


In [23]:
# Only the undetected true regions are kept; the found ones are discarded.
rec, _, missing_regions = recall(detected_regions, truth_regions)
print "Recall: ", rec

Recall:  0.139325842697


## Save Results for posterity

In [24]:
# Directory that accumulates the error listings written by each run.
history_dir = os.path.join('regions', 'detected', 'history')

In [25]:
# Create the history directory (including missing parents) the first time through.
if not os.path.isdir(history_dir):
    os.makedirs(history_dir)

In [26]:
def get_regions_filename(basename):
    '''
    Build the path of the next unused history file for 'basename'.

    Files are named '<basename>-<index>-<timestamp>.tsv' inside 'history_dir'.
    The index is the smallest non-negative integer for which no existing file
    starts with '<basename>-<index>-'.  The file itself is not created here.

    NOTE(review): the timestamp contains ':' characters, which some file
    systems do not accept in file names.
    '''
    def prefix_path_for(index):
        prefix = '{basename}-{record_index}-'.format(
            basename=basename, record_index=index)
        return os.path.join(history_dir, prefix)

    record_index = 0
    # Skip past every index that already has at least one file on disk.
    while glob.glob(prefix_path_for(record_index) + '*'):
        record_index += 1
    timestamp = time.strftime('%Y%m%d-%H:%M:%S')
    return prefix_path_for(record_index) + timestamp + '.tsv'

In [27]:
# One fresh history file name per kind of error.
false_fn = get_regions_filename('false_detections')
missing_fn = get_regions_filename('missing_regions')

In [28]:
def write_region(file_, r):
    '''
    Append one region to 'file_' as a tab-separated line.

    The columns are url, element, start_offset and end_offset; the region's
    text is not written.  Every field is converted with str() first, so regions
    whose offsets are integers can be written as well as those whose offsets
    were loaded as strings.
    '''
    fields = [r.url, r.element, r.start_offset, r.end_offset]
    file_.write('\t'.join(str(field) for field in fields) + '\n')

In [29]:
# Archive every detection that matched no ground-truth description.
with open(false_fn, 'w') as false_file:
    for r in false_regions:
        write_region(false_file, r)

In [30]:
# Archive one description per undetected true region: the one with the longest
# element selector.
with open(missing_fn, 'w') as missing_file:
    for region_list in missing_regions.values():
        region = max(region_list, key=lambda candidate: len(candidate.element))
        write_region(missing_file, region)

## Examine True Positives

Print one random region from each of the URLs where a selector was found

In [31]:
# Group the correct detections by the page they were found on.
url_map = {}
for r in true_regions:
    url_map.setdefault(r.url, []).append(r)

In [32]:
# random.choice is not seeded in the cells shown here, so the sample changes from run to run.
for url, regions in url_map.items():
    print random.choice(regions).text

username
^([A-Za-z0-9_-\s]+)/([A-Za-z0-9_-\s]+)/?$
^author/(.+)$
^/products/(.?)/(.?)/5702/(.*).html
^products/([0-9][0-9])$
\$\$(.*?)\$\$
^(\d)_([a-z])\.html$
^gallery/index.php
[A-Z]
yyy
\<br\>
Linux
/opt/omni/lbin


## Examine False Positives

In [34]:
# Starts a Selenium-driven Firefox session; the inspection helpers reuse this one browser.
browser = webdriver.Firefox()

In [35]:
# JavaScript that selects the whole element matched by the selector and scrolls
# it into view.  '{element}' is filled in with the region's selector.
SHOW_REGION_SCRIPT = """
var node = document.querySelector('{element}');
var range = document.createRange();
range.selectNode(node);
var selection = window.getSelection();
selection.addRange(range);
node.scrollIntoView();
"""

def show_region(browser, region):
    '''
    Open the region's page in 'browser' and highlight its containing element.

    The page is always loaded.  The highlight step is skipped when the region
    has an empty element selector.  The whole element is selected; the region's
    offsets are not used here.
    '''
    browser.get(region.url)
    if region.element == '':
        return
    script = SHOW_REGION_SCRIPT.format(element=region.element)
    browser.execute_script(script)

The following scripts let us extract the text and HTML for a region when we want to debug a detection.  Note that they must be run after the browser has already been directed to the page where the region appears.

In [36]:
# JavaScript returning the characters of the element's text from start_offset
# through end_offset inclusive (hence the '+ 1' on the end bound).
GET_TEXT_SCRIPT = """
var node = document.querySelector('{element}');
return node.textContent.substring({start_offset}, {end_offset} + 1);
"""

def get_text(browser, region):
    '''
    Return the text of 'region' as read from the page currently loaded in 'browser'.

    The browser is not navigated here; it must already be showing the region's page.
    '''
    script = GET_TEXT_SCRIPT.format(
        element=region.element,
        start_offset=region.start_offset,
        end_offset=region.end_offset)
    return browser.execute_script(script)

In [37]:
# JavaScript returning the full markup of the element matched by the selector.
GET_HTML_SCRIPT = """
var node = document.querySelector('{element}');
return node.outerHTML;
"""

def get_html(browser, region):
    '''
    Return the outer HTML of the region's element from the page currently loaded in 'browser'.

    The browser is not navigated here; it must already be showing the region's page.
    '''
    script = GET_HTML_SCRIPT.format(element=region.element)
    return browser.execute_script(script)

### Regions not Detected

In [38]:
def open_missing_detections(start_index=0, debug=True):
    for i, (_, region_list) in enumerate(missing_regions.items()[start_index:], start_index):
        ''' Most specific selector will be the longest one. '''
        r = max(region_list, key=lambda r: len(r.element))
        print ""
        print "?? Open next example of missed detection? ",
        try:
            again = raw_input()
            if again.lower() == 'n':
                break
            print ""
            print "===== REGION %d =====" % i
            print r
            show_region(browser, r)
            if debug:
                print "* Text: ", get_text(browser, r)
                html = get_html(browser,r)
                print "* HTML: ",
                if len(html) > 1000:
                    print "too long to render"
                else:
                    print html
        except KeyboardInterrupt:
            return

In [39]:
# Interactive: waits for input before each region; answer 'n' or interrupt to stop.
open_missing_detections()


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: ^(\d+)\.(\d+)\.(\d+)\.(\d+)$
 URL: http://127.0.0.1:8000/pages/regex/regex%20split%20string%20tutorial/4/wiki.tcl.tk/989.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > PRE:nth-of-type(22), Offsets: (43, 70)}
* Text:  ^(\d+)\.(\d+)\.(\d+)\.(\d+)$
* HTML:  <pre class="sh_tcl">set string "0377.255.255.255"
if {[regexp {^(\d+)\.(\d+)\.(\d+)\.(\d+)$} $string _ a b c d]
 &amp;&amp; [string is integer $a] &amp;&amp; [scan $a %d v1] &amp;&amp; 0 &lt;= $v1 &amp;&amp; $v1 &lt;= 255
 &amp;&amp; [string is integer $b] &amp;&amp; [scan $b %d v2] &amp;&amp; 0 &lt;= $v2 &amp;&amp; $v2 &lt;= 255
 &amp;&amp; [string is integer $c] &amp;&amp; [scan $c %d v3] &amp;&amp; 0 &lt;= $v3 &amp;&amp; $v3 &lt;= 255
 &amp;&amp; [string is integer $d] &amp;&amp; [scan $d %d v4] &amp;&amp; 0 &lt;= $v4 &amp;&amp; $v4 &lt;= 255} {puts $v1.$v2.$v3.$v4} else {puts none}</pre>

?? Open nex

### False Detections

In [40]:
def open_false_detections(start_index=0, debug=True):
    for i, r in enumerate(false_regions[start_index:], start_index):
        print ""
        print "?? Open next example of false detection? ",
        try:
            again = raw_input()
            if again.lower() == 'n':
                break
            print ""
            print "===== REGION %d =====" % i
            print r
            show_region(browser, r)
            if debug:
                print "* Text: ", get_text(browser, r)
                html = get_html(browser,r)
                print "* HTML: ",
                if len(html) > 1000:
                    print "too long to render"
                else:
                    print html
        except KeyboardInterrupt:
            return

In [None]:
# Interactive: waits for input before each region; answer 'n' or interrupt to stop.
open_false_detections()


?? Open next example of false detection? 
 
===== REGION 0 =====
{Text: (\a+)(\b+) URL: http://127.0.0.1:8000/pages/regex/javascript%20regex%20replace%20tutorial/10/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace.html, Element: HTML > BODY:nth-of-type(1) > MAIN:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > DIV:nth-of-type(2) > DIV:nth-of-type(2) > ARTICLE:nth-of-type(1) > TABLE:nth-of-type(2) > TBODY:nth-of-type(1) > TR:nth-of-type(3) > TD:nth-of-type(2) > CODE:nth-of-type(4), Offsets: (1, 10)}
* Text:  (\a+)(\b+)
* HTML:  <code>/(\a+)(\b+)/</code>

?? Open next example of false detection? 
 
===== REGION 1 =====
{Text: (?<=finalNumber=")(.*?)(?=") URL: http://127.0.0.1:8000/pages/regex/linux%20regex%20sed%20tutorial/9/stackoverflow.com/questions/14479535/print-regex-matches-using-sed-in-bash.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(5) > DIV:nth-of-type(2) > DIV:nth-of-typ