In [1]:
import os.path
import urllib
import random

from evaluate import load_detected_regions, load_groundtruth_regions, save_results, print_stats, RegionInspector
from models import Page
from open import build_local_url
from order import get_test_lists

# Data Loading and Preparation

In [2]:
# Input files for this evaluation, given as relative paths:
# the TSV of detected regions and the ground-truth regions file for the jQuery pages.
DETECTED_FILE = os.path.join('regions', 'detected', 'jquery.tsv')
TRUTH_FILE = os.path.join('regions', 'extracted', 'jquery_validation_44.txt')

For now, we limit ourselves to the first 45 cross-validation pages (the value of `VALIDATION_SIZE` below).  Eventually we should be able to boost this number to around 100.

In [3]:
# Number of pages taken from the front of the 'validation' test list.
VALIDATION_SIZE = 45

In [4]:
# Take the leading VALIDATION_SIZE entries of the jQuery 'validation' list, then
# turn each one into a percent-encoded local URL ('/' and ':' are left unescaped).
wget_pages = get_test_lists('jquery')['validation'][:VALIDATION_SIZE]
validation_urls = []
for p in wget_pages:
    validation_urls.append(urllib.quote(build_local_url(p), safe='/:'))

Load the detections from the stored TSV file, then the ground-truth regions from the extracted-regions file.

In [5]:
# Read the detected regions from DETECTED_FILE; validation_urls is passed along
# (presumably to keep only regions on those pages -- confirm in the evaluate module).
detected_regions = load_detected_regions(DETECTED_FILE, validation_urls)

In [6]:
# Read the ground-truth regions from TRUTH_FILE; valid_urls receives the same
# validation URL list (presumably as a page filter -- confirm in the evaluate module).
truth_regions = load_groundtruth_regions(TRUTH_FILE, valid_urls=validation_urls)

## Verify that the regions have been properly loaded

In [7]:
# Distinct URLs seen on each side. Detected regions expose the URL as `.url`;
# for the ground truth, the first element of each key is used (assumed to be the page URL).
detected_urls = {region.url for region in detected_regions}
truth_urls = {region_key[0] for region_key in truth_regions.keys()}

In [8]:
print len(detected_urls), len(truth_urls)

23 43


In [9]:
# Spot-check: show the first detected region.
print detected_regions[0]

{Text: #submitButton URL: http://127.0.0.1:8000/pages/jquery/iframe%20javascript%20jquery%20tutorial/6/www.startutorial.com/articles/view/jquery-file-posting-using-iframe.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > PRE:nth-of-type(2), Offsets: (133, 145)}


In [10]:
# Spot-check: show the first region of the first value in truth_regions
# (each value is indexed like a sequence of regions). Which value comes
# first depends on the mapping's ordering, so this may vary between runs.
print truth_regions.values()[0][0]

{Text: None URL: http://127.0.0.1:8000/pages/jquery/javascript%20jquery%20jquery-plugins%20tutorial/5/www.smashingmagazine.com/2011/10/essential-jquery-plugin-patterns/, Element: HTML > BODY:nth-of-type(1) > MAIN:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > ARTICLE:nth-of-type(1) > PRE:nth-of-type(13) > CODE:nth-of-type(1) > SPAN:nth-of-type(14), Offsets: (1, 7)}


# Compute Accuracy!

In [11]:
# print_stats prints the precision/recall summary shown below and returns three
# collections, unpacked here as the true, false and missing detections.
true_regions, false_regions, missing_regions = print_stats(detected_regions, truth_regions)

Precision: 0.6891 (164/238), Recall 0.4781 (164/343)
Per-page Precision: 0.6173, Recall 0.4087


## Save Results for posterity

In [12]:
# Relative directory handed to save_results in the next cell.
history_dir = os.path.join('regions', 'detected', 'history')

In [13]:
# Hand this run's false and missing detections to save_results together with
# history_dir (presumably written there for later comparison -- confirm in the evaluate module).
save_results(history_dir, false_regions, missing_regions)

## Examine Detection Faults

In [15]:
# Helper used by the following cells to step through individual faulty regions.
inspector = RegionInspector()

In [16]:
# Interactive: prompts before each example and prints the region with its text and HTML
# (see the captured output below). Needs a user at the keyboard, so it blocks "Run All".
inspector.open_missing_detections(missing_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: None URL: http://127.0.0.1:8000/pages/jquery/javascript%20jquery%20jquery-plugins%20tutorial/5/www.smashingmagazine.com/2011/10/essential-jquery-plugin-patterns/, Element: HTML > BODY:nth-of-type(1) > MAIN:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > ARTICLE:nth-of-type(1) > PRE:nth-of-type(13) > CODE:nth-of-type(1) > SPAN:nth-of-type(14), Offsets: (1, 7)}
* Text:  .item-b
* HTML:  <span class="token string">'.item-b'</span>


===== REGION 1 =====
{Text: None URL: http://127.0.0.1:8000/pages/jquery/javascript%20jquery%20jquery-plugins%20tutorial/9/devheart.org/articles/tutorial-creating-a-jquery-plugin/, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(3) > DIV:nth-of-type(13) > DIV:nth-o

In [17]:
# Interactive walk through the false detections, same flow as the missing-detection cell.
# NOTE(review): the prompt captured below still says "missed detection" for this call;
# the wording likely comes from RegionInspector -- confirm and fix it there.
inspector.open_false_detections(false_regions)


?? Open next example of missed detection? 
 
===== REGION 0 =====
{Text: iframe URL: http://127.0.0.1:8000/pages/jquery/iframe%20javascript%20jquery%20tutorial/6/www.startutorial.com/articles/view/jquery-file-posting-using-iframe.html, Element: HTML > BODY:nth-of-type(1) > DIV:nth-of-type(2) > DIV:nth-of-type(1) > DIV:nth-of-type(1) > DIV:nth-of-type(2) > PRE:nth-of-type(2), Offsets: (258, 263)}
* Text:  iframe
* HTML:  <pre class="prettyprint">&lt;header&gt;

***other codes ****


&lt;!-- document javascripts --&gt;	 
&lt;script type="text/javascript"&gt;
$(document).ready(function () {
  $('#submitButton').click(function(){
     if($('iframe[name=iframeTarget]').length&lt;1){
		    var iframe=document.createElement('iframe');
			$(iframe).css('display','none');
			$(iframe).attr('src','#');
				
			$(iframe).attr('name','iframeTarget');
			$('body').append(iframe);
				
			$(this).attr('target','iframeTarget');
			}          
     });
    });
&lt;/script&gt;



***other codes ****

