In [1]:
import sys
import os.path
# Add ../../common to the import search path. The next cell imports from
# `evaluate`, `open` and `order`, which presumably live there (verify).
# NOTE(review): the path is relative, so it is resolved against the kernel's
# current working directory, not against this notebook's location.
sys.path.append(os.path.join(os.pardir, os.pardir, 'common'))

In [2]:
import urllib

from evaluate import load_detected_regions, load_groundtruth_regions, save_results, print_stats, RegionInspector
from open import build_local_url
from order import get_test_lists

# Data Loading and Preparation

In [3]:
# Both inputs live under a common 'regions' folder (relative path).
REGIONS_DIR = 'regions'
# TSV file with the detector's output for wget pages.
DETECTED_FILE = os.path.join(REGIONS_DIR, 'detected', 'wget.tsv')
# Ground-truth regions for the wget validation pages.
TRUTH_FILE = os.path.join(REGIONS_DIR, 'extracted', 'wget_validation.txt')

For now, we limit ourselves to the first 50 pages of the validation set. Eventually we should be able to raise this number to around 100.

In [4]:
# Upper bound on how many validation pages are evaluated; the next cell uses
# it to slice the validation page list.
VALIDATION_SIZE = 50

In [5]:
# Keep only the first VALIDATION_SIZE entries of the 'validation' list for wget.
wget_pages = get_test_lists('wget')['validation'][:VALIDATION_SIZE]

# Build the percent-encoded local URL of every selected page; '/' and ':' are
# declared safe so they are left unescaped.
validation_urls = []
for page in wget_pages:
    validation_urls.append(urllib.quote(build_local_url(page), safe='/:'))

Load the detections from the stored TSV file, then the ground-truth regions from the validation file.

In [6]:
# Read the detected regions from DETECTED_FILE. validation_urls is passed
# along, presumably so only regions on those pages are kept (confirm in
# evaluate.py). The result is indexable and its items expose a `.url`
# attribute, as the verification cells below rely on.
detected_regions = load_detected_regions(DETECTED_FILE, validation_urls)

In [7]:
# Read the ground-truth regions from TRUTH_FILE, passing the same URL list as
# `valid_urls`. The ',,,' delimiter presumably separates fields in that file
# (confirm in evaluate.py). The result is used below as a mapping whose keys
# are indexable (key[0] is treated as a URL) and whose values are sequences.
truth_regions = load_groundtruth_regions(TRUTH_FILE, valid_urls=validation_urls, delimiter=',,,')

## Verify that the regions have been properly loaded

In [8]:
# Distinct URLs that appear on at least one detected region.
detected_urls = {region.url for region in detected_regions}
# Distinct URLs in the ground truth; the first element of each key is the URL.
truth_urls = {key[0] for key in truth_regions.keys()}

In [9]:
# Number of distinct URLs with detections, then with ground truth, separated
# by a single space. The parenthesised single-argument form prints the same
# text under the Python 2 print statement.
print('%d %d' % (len(detected_urls), len(truth_urls)))

29 37


In [10]:
# Sanity check: show the first loaded detection.
print(detected_regions[0])

<evaluate.Region object at 0x10bb63ed0>


In [11]:
# Sanity check: show the first region of the first ground-truth entry.
# list(...) copies the values so indexing does not depend on values()
# returning a list.
print(list(truth_regions.values())[0][0])

<evaluate.Region object at 0x10bcc55d0>


# Compute Accuracy!

In [12]:
# Compare detections against the ground truth. print_stats writes its report
# to the cell output and returns three values, unpacked here; false_regions
# and missing_regions are saved and inspected in the cells further down.
# Presumably these are correct, spurious and undetected regions respectively
# (confirm in evaluate.py).
true_regions, false_regions, missing_regions = print_stats(detected_regions, truth_regions)

Matches: 1
wget ftp://somedom.com/pub/downloads/*.pdf
Matches: 1
None
Matches: 1
wget -q --spider mysite.com
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
wget mysite.com > /dev/null 2>&1
Matches: 1
None
Matches: 1
wget -i list.txt
Matches: 1
wget -r -l 0 -p http://site/1.html
Matches: 1
None
Matches: 1
wget --spider --force-html -r -l1 http://somesite.com 2>&1
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
None
Matches: 1
wget --ignore-tags=a,area -H -k -K -r http://<site>/<document>
Matches: 1
None
Matches: 1
None
Match

## Save Results for posterity

In [34]:
# Relative path of the history folder handed to save_results in the next cell.
history_dir = os.path.join('regions', 'detected', 'history')

In [35]:
# Pass the false and missing detections to save_results together with
# history_dir, presumably the folder they are written to (confirm in
# evaluate.py).
save_results(history_dir, false_regions, missing_regions)

## Examine Detection Faults

In [13]:
# Helper object used by the following cells to step through missed and false
# detections.
inspector = RegionInspector()

In [None]:
# Review the regions in missing_regions. Interactive: the recorded output
# shows a prompt before an example is opened, so this cell waits for user
# input and will block an unattended "Run All".
inspector.open_missing_detections(missing_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: None URL: http://127.0.0.1:8000/pages/wget/linux%20redirect%20wget%20tutorial/6/linux.die.net/man/1/wget.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DL:nth-of-type(5) > DD:nth-of-type(4), Offsets: (99, 118)}
* Text:  wget -r http://host 
* HTML:  <dd>Use the protocol name as a directory component of local file names. For example, with this option, <b>wget -r http://</b><i>host</i> will save to
<b>http/</b><i>host</i><b>/...</b> rather than just to <i>host</i><b>/...</b>.
</dd>


In [15]:
# Review the regions in false_regions. Interactive: the recorded output shows
# a prompt before an example is opened, so this cell waits for user input.
inspector.open_false_detections(false_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: wget command - how to download files URL: http://127.0.0.1:8000/pages/wget/shell%20unix%20wget%20tutorial/2/alvinalexander.com/linux-unix/wget-command-shell-script-example-download-url.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1), Offsets: (6, 41)}
* Text:  wget command - how to download files
* HTML:  too long to render
