In [1]:
import sys
import os.path
# Make the ../../common directory importable. The membership check keeps the
# cell idempotent: re-running it does not append the same entry a second time.
COMMON_DIR = os.path.join(os.pardir, os.pardir, 'common')
if COMMON_DIR not in sys.path:
    sys.path.append(COMMON_DIR)

In [2]:
import urllib

from evaluate import load_detected_regions, load_groundtruth_regions, save_results, print_stats, RegionInspector
from open import build_local_url
from order import get_test_lists

# Data Loading and Preparation

In [3]:
# Input files for the evaluation; both live under the 'regions' directory.
_REGIONS_DIR = 'regions'
# Passed to load_detected_regions.
DETECTED_FILE = os.path.join(_REGIONS_DIR, 'detections.tsv')
# Passed to load_groundtruth_regions.
TRUTH_FILE = os.path.join(_REGIONS_DIR, 'groundtruth_validation.txt')

For now, we limit ourselves to the first 50 validation pages. Eventually we should be able to raise this number to around 100.

In [4]:
# Number of pages kept from the front of the 'validation' test list.
VALIDATION_SIZE = 50

In [5]:
# Take the first VALIDATION_SIZE pages of the 'regex' validation list and
# percent-encode each page's local URL, leaving '/' and ':' unescaped.
wget_pages = get_test_lists('regex')['validation'][:VALIDATION_SIZE]
validation_urls = []
for page in wget_pages:
    local_url = build_local_url(page)
    validation_urls.append(urllib.quote(local_url, safe='/:'))

Load the detected regions from the stored TSV file.

In [6]:
# Read the detected regions from DETECTED_FILE. The validation URLs are passed
# along as well (presumably to restrict the result to those pages; confirm in
# evaluate.load_detected_regions).
detected_regions = load_detected_regions(
    DETECTED_FILE,
    validation_urls,
)

In [7]:
# Read the ground-truth regions from TRUTH_FILE, using ',,,' as the delimiter
# and passing the validation URLs as valid_urls.
truth_regions = load_groundtruth_regions(
    TRUTH_FILE,
    delimiter=',,,',
    valid_urls=validation_urls,
)

## Verify that the regions have been properly loaded

In [8]:
# Distinct values collected from each data set: the .url attribute of every
# detected region, and the first item of every key of the ground-truth dict.
detected_urls = set(region.url for region in detected_regions)
truth_urls = set(region_key[0] for region_key in truth_regions)

In [9]:
print len(detected_urls), len(truth_urls)

17 44


In [10]:
print unicode(detected_regions[0])

{Text: /opt/omni/lbin URL: http://127.0.0.1:8000/pages/regex/linux%20regex%20sed%20tutorial/8/www.thegeekstuff.com/2009/10/unix-sed-tutorial-advanced-sed-substitution-examples/, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > PRE:nth-of-type(2), Offsets: (9, 22)}


In [11]:
print unicode(truth_regions.values()[0][0])

{Text: ^(\d+)\.(\d+)\.(\d+)\.(\d+)$ URL: http://127.0.0.1:8000/pages/regex/regex%20split%20string%20tutorial/4/wiki.tcl.tk/989.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > PRE:nth-of-type(22), Offsets: (43, 70)}


# Compute Accuracy!

In [12]:
# print_stats returns three values, unpacked in this order:
# true_regions, false_regions, missing_regions.
stats = print_stats(detected_regions, truth_regions)
true_regions, false_regions, missing_regions = stats

Matches: 2
^(\d)\.(html|asp)$
Matches: 2
[A-Z]
Matches: 2
^([^\.\?/]+)/([A-Za-z_0-9\-]+)$
Matches: 1
^index\.php$
Matches: 2
(\w+)\s(\w+)
Matches: 2
(\d+(?:\.\d*)?)F\b
Matches: 2
[A-Z]
Matches: 2
^alice.html$
Matches: 1
^author/(.+)$
Matches: 2
^de/(.*)$
Matches: 2
\${2}(.*?)\${2}
Matches: 1
^products/([0-9][0-9])$
Matches: 2
\$\$(.+?)\$\$
Matches: 2
(x_*)|(-)
Matches: 1
^products/([0-9]+)$
Matches: 1
<br[^>]*>
Matches: 1
^css(/)?$
Matches: 1
^page/([^/\.]+)/?$
Matches: 2
^([^\.\?/]+)$
Matches: 1
username
Matches: 1
^products/([0-9][0-9])$
Matches: 2
^1_a\.html$
Matches: 1
^products/([0-9][0-9])/$
Matches: 2
^en/(.*)$
Matches: 1
Linux
Matches: 2
^something/(.*)
Matches: 2
^example\.com$
Matches: 2
^([^\.\?/]+)/([0-9]+)$
Matches: 2
\${2}(.*?)\${2}
Matches: 2
^([A-Za-z0-9_-\s]+)/([A-Za-z0-9_-\s]+)/?$
Matches: 1
\.(gif|jpg|png)$
Matches: 2
^gallery/index.php
Matches: 2
xmas
Matches: 2
^www\.example\.com$
Matches: 1
\<br\>
Matches: 2
^(4_([a-b]))\.html$
Matches: 2
^([^\.\?/]+)/$
Matches: 1

In [19]:
# Dump every false detection to followup.txt, one tab-separated line per
# region: url, element, text.
# NOTE(review): a text value containing a tab or newline would break the
# one-record-per-line layout; confirm whether that can occur.
with open("followup.txt", 'w') as followup_file:
    for region in false_regions:
        fields = [region.url, region.element, region.text]
        followup_file.write('\t'.join(fields) + '\n')

In [21]:
# First matched region. It is not referenced again in the cells visible here
# (presumably kept for interactive inspection; confirm before removing).
r = true_regions[0]

In [20]:
print len(set([(_.abs_start_offset, _.abs_end_offset, _.url) for _ in true_regions]))

AttributeError: 'Region' object has no attribute 'abs_start_offset'

In [13]:
# Directory passed to save_results: regions/detected/history.
_history_parts = ['regions', 'detected', 'history']
history_dir = os.path.join(*_history_parts)

In [14]:
# Hand the false and missing regions to save_results together with the
# history directory.
save_results(
    history_dir,
    false_regions,
    missing_regions,
)

## Examine Detection Faults

In [13]:
# RegionInspector comes from the evaluate module; the cells below use it to
# step through missing and false detections.
inspector = RegionInspector()

In [None]:
# Step through the missing regions with the inspector.
# NOTE(review): the recorded output shows an "Open next example" prompt, so
# this call appears to wait for user input and will likely stall an unattended
# Run All.
inspector.open_missing_detections(missing_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: ^(\d+)\.(\d+)\.(\d+)\.(\d+)$ URL: http://127.0.0.1:8000/pages/regex/regex%20split%20string%20tutorial/4/wiki.tcl.tk/989.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > PRE:nth-of-type(22), Offsets: (43, 70)}
* Text:  ^(\d+)\.(\d+)\.(\d+)\.(\d+)$
* HTML:  too long to render


In [17]:
# Step through the false detections with the inspector.
# NOTE(review): the recorded run of this cell ended in a Selenium-style
# NoSuchWindowException ("Window not found. The browser window may have been
# closed."), so the inspector's browser window apparently has to stay open
# while stepping through the regions.
inspector.open_false_detections(false_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: (\a+)(\b+) URL: http://127.0.0.1:8000/pages/regex/javascript%20regex%20replace%20tutorial/10/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace.html, Element: HTML > BODY:nth-of-type(1) > MAIN:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > DIV:nth-of-type(2) > DIV:nth-of-type(2) > ARTICLE:nth-of-type(1) > TABLE:nth-of-type(2) > TBODY:nth-of-type(1) > TR:nth-of-type(3) > TD:nth-of-type(2) > CODE:nth-of-type(4), Offsets: (1, 10)}
* Text:  (\a+)(\b+)
* HTML:  <code>/(\a+)(\b+)/</code>


NoSuchWindowException: Message: Window not found. The browser window may have been closed.
Stacktrace:
    at nsCommandProcessor.prototype.execute (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/command-processor.js:12711)
    at Dispatcher.executeAs/< (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/driver-component.js:9456)
    at Resource.prototype.handle (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/driver-component.js:9603)
    at Dispatcher.prototype.dispatch (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/driver-component.js:9550)
    at WebDriverServer/<.handle (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/driver-component.js:12497)
    at createHandlerFunc/< (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/httpd.js:2054)
    at ServerHandler.prototype.handleResponse (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/httpd.js:2387)
    at Connection.prototype.process (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/httpd.js:1223)
    at RequestReader.prototype._handleResponse (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/httpd.js:1677)
    at RequestReader.prototype._processBody (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/httpd.js:1525)
    at RequestReader.prototype.onInputStreamReady (file:///var/folders/05/6w_fy5m96dj6w_wdd89v0y0m0000gp/T/tmpuNzk5U/extensions/fxdriver@googlecode.com/components/httpd.js:1393)