In [1]:
from csv import DictReader
import random
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
from evaluate import RegexAffinityEvaluator, CssAffinityEvaluator

# Load Examples

In [3]:
REGEX_VALIDATION_FILENAME = 'regex_validation_20.csv'

# Maps each link (4th field of a row) to the list of texts (last field)
# recorded for it, in file order.
link_texts = {}

with open(REGEX_VALIDATION_FILENAME) as regex_val_file:
    # Iterate the file object directly; readlines() would load every line
    # into a list first for no benefit.
    for line in regex_val_file:
        stripped = line.strip()
        if not stripped:
            # A blank line yields a single empty token, so tokens[3] below
            # would raise IndexError. Skip it instead.
            continue

        # Fields are separated by a literal ',,,' rather than a single comma
        # (presumably because the regex text itself may contain commas).
        tokens = stripped.split(',,,')
        link = tokens[3]
        text = tokens[-1]

        # Create the per-link list on first sight, then append.
        link_texts.setdefault(link, []).append(text)

Pick a random regular expression from each regex page

In [4]:
# Fixed seed so that re-running this cell draws the same examples.
random.seed(1891014916634710808)

# One random.choice() call per link, in link_texts iteration order.
regex_examples = [random.choice(candidates) for candidates in link_texts.values()]

The CSS examples were selected ahead of time (in E45): for each page on which at least one CSS selector was properly detected, one of the properly detected selectors was chosen at random.

In [5]:
# CSS selector examples. These were chosen ahead of time rather than sampled
# in this notebook, so the list is hard-coded.
css_examples = [
    "#infolist",
    "#message",
    "#other",
    "#frameDemo",
    "#txt",
    "div:last",
    "iframe[name=iframeTarget]",
    "#result",
    ".window",
    "#my-element",
    "div.foo",
    "#birthday",
    "#input",
    ".myeditable",
    "select.form-control",
    "#nav a",
    "#my-div img",
    "#followercnt",
    ".footable",
    "#submit4",
]

# Arbiter Function

In [13]:
def decide(css_score, regex_score):
    """Pick a label by comparing the two affinity scores.

    Returns "Regex" when regex_score is strictly greater, "CSS selector"
    when css_score is strictly greater, and "Tie" otherwise (including
    equal scores).
    """
    if regex_score > css_score:
        return "Regex"
    if css_score > regex_score:
        return "CSS selector"
    return "Tie"

In [14]:
# Module-level evaluator instances, one per affinity type. display_scores()
# reads these globals and calls .evaluate(text) on each of them.
regex_evaluator = RegexAffinityEvaluator()
css_evaluator = CssAffinityEvaluator()

In [21]:
def display_scores(examples):
    """Score every example with both evaluators and display the results.

    For each text in `examples`, the module-level `css_evaluator` and
    `regex_evaluator` are called (CSS first), each score is rounded to two
    decimals, and `decide` is applied to the rounded scores. The rows are
    shown as a DataFrame with columns "Text", "CSS Affinity",
    "Regex Affinity" and "Class". Nothing is returned.

    NOTE(review): the label is derived from the *rounded* scores, so two raw
    scores that round to the same value are reported as "Tie".
    """
    rows = []
    for text in examples:
        css_affinity = round(css_evaluator.evaluate(text), 2)
        regex_affinity = round(regex_evaluator.evaluate(text), 2)
        verdict = decide(css_affinity, regex_affinity)
        rows.append([text, css_affinity, regex_affinity, verdict])

    column_names = ["Text", "CSS Affinity", "Regex Affinity", "Class"]
    display(pd.DataFrame(rows, columns=column_names))

## CSS Selectors

In [22]:
# Score the hard-coded CSS selector examples with both evaluators and show the table.
display_scores(css_examples)

Unnamed: 0,Text,CSS Affinity,Regex Affinity,Class
0,#infolist,1.0,0.0,CSS selector
1,#message,1.0,0.0,CSS selector
2,#other,1.0,0.0,CSS selector
3,#frameDemo,1.0,0.0,CSS selector
4,#txt,1.0,0.0,CSS selector
5,div:last,0.5,0.0,CSS selector
6,iframe[name=iframeTarget],0.75,0.14,CSS selector
7,#result,1.0,0.0,CSS selector
8,.window,1.0,0.14,CSS selector
9,#my-element,1.0,0.0,CSS selector


## Regular expressions

In [23]:
# Score the randomly drawn regex examples with both evaluators and show the table.
display_scores(regex_examples)

Unnamed: 0,Text,CSS Affinity,Regex Affinity,Class
0,<username><!\[CDATA\[name\]\]></username>,0,0.0,Tie
1,"(?<=finalNumber="")(.*?)(?="")",0,0.22,Regex
2,search,0,0.0,Tie
3,"""^[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9-]+)*@"" + ""[...",0,0.57,Regex
4,(..)_(..),0,0.86,Regex
5,"^[A-Z]{3,9}\ /index\.php\ HTTP/",0,0.11,Regex
6,(<b>) #match opening <b> tag (.*?) #match any...,0,0.05,Regex
7,\\s,0,0.0,Tie
8,^([^/]+)$,0,1.0,Regex
9,[^a-zA-Z0-9],0,1.0,Regex
