In [1]:
import sys
import os.path
# Make the shared 'common' directory (two levels up, relative to the current
# working directory) importable. The membership check keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
_common_dir = os.path.join(os.pardir, os.pardir, 'common')
if _common_dir not in sys.path:
    sys.path.append(_common_dir)

In [2]:
import urllib

from evaluate import load_detected_regions, load_groundtruth_regions, save_results, print_stats, RegionInspector
from open import build_local_url
from order import get_test_lists

# Data Loading and Preparation

In [3]:
# Input TSV files, both rooted at the local 'regions' directory.
#   DETECTED_FILE is read by load_detected_regions.
#   TRUTH_FILE is read by load_groundtruth_regions.
_REGIONS_ROOT = 'regions'
DETECTED_FILE = os.path.join(_REGIONS_ROOT, 'detected', 'wget.tsv')
TRUTH_FILE = os.path.join(_REGIONS_ROOT, 'extracted', 'wget_validation.tsv')

For now, we limit ourselves to the first 50 pages of the validation set (`VALIDATION_SIZE` below).  Eventually we should be able to raise this number to around 100.

In [4]:
# Upper bound on how many validation pages are evaluated; used to slice the
# validation page list.
VALIDATION_SIZE = 50

In [5]:
# Keep only the leading VALIDATION_SIZE entries of the 'validation' list for wget.
wget_pages = get_test_lists('wget')['validation'][:VALIDATION_SIZE]

# Build each page's local URL, then percent-encode it while leaving '/' and ':'
# untouched (so e.g. spaces become %20 but the scheme and path separators survive).
local_urls = map(build_local_url, wget_pages)
validation_urls = [urllib.quote(url, safe='/:') for url in local_urls]

Load the detected regions and the ground-truth regions from the stored TSV files.

In [6]:
# Read the detector's regions from DETECTED_FILE. The validation URLs are passed
# as the second positional argument -- presumably a filter on which pages to
# keep; confirm against load_detected_regions. The result is indexable and its
# items expose a `.url` attribute (see the cells below).
detected_regions = load_detected_regions(DETECTED_FILE, validation_urls)

In [7]:
# Read the ground-truth regions from TRUTH_FILE, passing the validation URLs as
# `valid_urls`. The result is used below as a dict whose keys are indexable
# (key[0] is treated as the page URL) and whose values are indexable groups of
# regions.
truth_regions = load_groundtruth_regions(TRUTH_FILE, valid_urls=validation_urls)

## Verify that the regions have been properly loaded

In [8]:
# Distinct page URLs that appear on each side: detections carry the URL as an
# attribute, while ground-truth entries carry it as the first element of the key.
detected_urls = {region.url for region in detected_regions}
truth_urls = {key[0] for key in truth_regions}

In [9]:
# Show how many distinct pages have detections vs. ground-truth regions.
print('%d %d' % (len(detected_urls), len(truth_urls)))

29 37


In [10]:
# Spot-check: display the first detected region.
print(detected_regions[0])

{Text: wget -qO- http://example.com URL: http://127.0.0.1:8000/pages/wget/linux%20scripting%20wget%20tutorial/7/askubuntu.com/questions/25599/get-wget-output-to-a-variable.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(5) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > DIV:nth-of-type(2) > DIV:nth-of-type(2) > TABLE:nth-of-type(1) > TBODY:nth-of-type(1) > TR:nth-of-type(1) > TD:nth-of-type(2) > DIV:nth-of-type(1) > PRE:nth-of-type(1) > CODE:nth-of-type(1), Offsets: (9, 36)}


In [11]:
# Spot-check: take the first group of ground-truth regions (dict order is
# arbitrary, so "first" is just whichever entry comes out first) and display
# its first region.
sample_truth_group = truth_regions.values()[0]
print(sample_truth_group[0])

{Text: None URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(13) > DT:nth-of-type(1) > P:nth-of-type(3), Offsets: (101, 161)}


# Compute Precision and Recall

In [12]:
# print_stats prints overall and per-page precision/recall (see the output
# below) and returns three collections, unpacked here. Judging by the names and
# the printed counts, these are presumably: detections that match ground truth,
# detections with no ground-truth match, and ground-truth regions that were not
# detected -- confirm against print_stats.
true_regions, false_regions, missing_regions = print_stats(detected_regions, truth_regions)

Precision: 0.8382 (114/136), Recall 0.6064 (114/188)
Per-page Precision: 0.8655, Recall 0.6206


## Save Results for Posterity

In [13]:
# Directory handed to save_results in the next cell; it sits next to the
# detected-regions TSV under regions/detected.
history_dir = os.path.join('regions', 'detected', 'history')

In [14]:
# Persist this run's false and missing regions under history_dir. The exact
# file naming and format are decided by save_results (not visible here).
save_results(history_dir, false_regions, missing_regions)

## Examine Detection Faults

In [15]:
# Helper object used by the cells below to step through individual faulty regions.
inspector = RegionInspector()

In [17]:
# Interactively walk through ground-truth regions that were not detected. As the
# output below shows, it prompts before each example and prints the region,
# its text and its surrounding HTML.
inspector.open_missing_detections(missing_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: None URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(5) > DD:nth-of-type(4), Offsets: (99, 118)}
* Text:  wget -r http://host 
* HTML:  <dd>Use the protocol name as a directory component of local file names. For example, with this option, <b>wget -r http://</b><i>host</i> will save to
<b>http/</b><i>host</i><b>/...</b> rather than just to <i>host</i><b>/...</b>.
</dd>


===== REGION 1 =====
{Text: None URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(13) > DT:nth-of-type(1) > P:nth-of-type(3), Offsets: (101, 161)}
* Text:  wget -rl0 -kKE -t5 --no-proxy http://yoyodyne.com -o
/tmp/log
* HTML:  <p>Try to repeat the 

In [18]:
# Interactively walk through detections that were counted as false.
# NOTE(review): the prompt printed below still reads "missed detection" even
# though this is the false-detection viewer -- the wording comes from
# RegionInspector and looks like a copy-paste slip; confirm and fix there.
inspector.open_false_detections(false_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: wget -r -l 1 -p http://site/1.html URL: http://127.0.0.1:8000/pages/wget/download%20http%20wget%20tutorial/4/www.gnu.org/software/wget/manual/html_node/Recursive-Retrieval-Options.html, Element: HTML > BODY:nth-of-type(1) > DL:nth-of-type(1) > DD:nth-of-type(7) > DIV:nth-of-type(3) > PRE:nth-of-type(1), Offsets: (0, 33)}
* Text:  wget -r -l 1 -p http://site/1.html
* HTML:  <pre class="example">wget -r -l 1 -p http://<var>site</var>/1.html
</pre>


===== REGION 1 =====
{Text: wget -r -l 0 -p http://site/1.html URL: http://127.0.0.1:8000/pages/wget/download%20http%20wget%20tutorial/4/www.gnu.org/software/wget/manual/html_node/Recursive-Retrieval-Options.html, Element: HTML > BODY:nth-of-type(1) > DL:nth-of-type(1) > DD:nth-of-type(7) > DIV:nth-of-type(4) > PRE:nth-of-type(1), Offsets: (0, 33)}
* Text:  wget -r -l 0 -p http://site/1.html
* HTML:  <pre class="example">wget -r -l 0 -p http://<var>site</var>/1.html
</p