In [1]:
from csv import DictReader
import random
import pandas as pd
from pandas import ExcelWriter
import numpy as np
import os.path
from IPython.display import display

In [2]:
from evaluate import RegexAffinityEvaluator, CssAffinityEvaluator

# Load Examples

In [3]:
REGEX_VALIDATION_FILENAME = 'regex_validation_20.csv'

# Groups the last field of every record (the text) under the record's
# fourth field (the link), preserving file order within each group.
link_texts = {}

with open(REGEX_VALIDATION_FILENAME) as regex_val_file:
    # Each record is one line whose fields are separated by a triple comma.
    for line in regex_val_file:
        tokens = line.strip().split(',,,')
        link, text = tokens[3], tokens[-1]
        link_texts.setdefault(link, []).append(text)

Pick a random regular expression from each regex page

In [4]:
# Fixed seed so that the same example is drawn for every link on each run.
random.seed(1891014916634710808)

# One randomly chosen text per link, in the dictionary's iteration order.
regex_examples = [random.choice(texts) for texts in link_texts.values()]

CSS examples were selected ahead of time (in E45): from each page where at least one CSS selector was properly detected, one properly detected selector was chosen at random.

In [5]:
# Hand-entered list of CSS selector examples, scored later by
# display_scores() in the order given here.
css_examples = [
    '#infolist',
    '#message',
    '#other',
    '#frameDemo',
    '#txt',
    'div:last',
    'iframe[name=iframeTarget]',
    '#result',
    '.window',
    '#my-element',
    'div.foo',
    '#birthday',
    '#input',
    '.myeditable',
    'select.form-control',
    '#nav a',
    '#my-div img',
    '#followercnt',
    '.footable',
    '#submit4',
]

# Arbiter Function

In [6]:
def decide(css_score, regex_score):
    """Name the language whose affinity score is strictly higher.

    Returns "Regex" when the regex score wins, "CSS selector" when the CSS
    score wins, and "Tie" when neither score is strictly greater than the
    other.
    """
    if regex_score > css_score:
        return "Regex"
    if css_score > regex_score:
        return "CSS selector"
    return "Tie"

# Predict Language of Strings

In [7]:
# One affinity evaluator per language. display_scores() reads these
# module-level objects and calls .evaluate(text) on each of them.
regex_evaluator = RegexAffinityEvaluator()
css_evaluator = CssAffinityEvaluator()

In [8]:
def display_scores(examples, language):
    """Score every example with both evaluators, show and return the table.

    Args:
        examples: iterable of strings to score.
        language: label stored in the "Expected Language" column of every row.

    Returns:
        A DataFrame with one row per example, holding the text, the expected
        language, the predicted language and both affinity scores.

    Both scores are rounded to two decimals before the prediction is made,
    so scores that are equal after rounding are reported as a tie.
    """
    column_names = [
        "Text",
        "Expected Language",
        "Predicted Language",
        "CSS Affinity",
        "Regex Affinity",
    ]

    rows = []
    for text in examples:
        css_affinity = round(css_evaluator.evaluate(text), 2)
        regex_affinity = round(regex_evaluator.evaluate(text), 2)
        predicted = decide(css_affinity, regex_affinity)
        rows.append([text, language, predicted, css_affinity, regex_affinity])

    score_frame = pd.DataFrame(rows, columns=column_names)
    display(score_frame)
    return score_frame

Initialize the writers for dumping data to Excel files, for post-processing

In [9]:
# Create the output directory up front: ExcelWriter does not create missing
# parent directories, so on a fresh checkout the workbooks could not be
# opened or saved.
os.makedirs('output', exist_ok=True)

# Workbook for the score tables as produced by display_scores().
excel_writer = ExcelWriter(os.path.join('output', 'affinities.xlsx'))
# Workbook for the same tables after reshape_dataframe() has melted them.
excel_writer_long = ExcelWriter(os.path.join('output', 'affinities_long.xlsx'))

# Next free row in each workbook; advanced by write_scores() after every dump.
rows_written = 0
long_rows_written = 0

In [10]:
def reshape_dataframe(df):
    """Melt a score table into long form.

    The "Text", "Expected Language" and "Predicted Language" columns are kept
    as identifiers. Every other column becomes a pair of values: its name in
    "Evaluator" and its cell value in "Affinity".
    """
    identifier_columns = ["Text", "Expected Language", "Predicted Language"]
    return df.melt(
        id_vars=identifier_columns,
        var_name="Evaluator",
        value_name="Affinity",
    )

In [11]:
def write_scores(df):
    """Append a score table to both Excel workbooks.

    The frame is written as-is through ``excel_writer`` and in melted form
    (see ``reshape_dataframe``) through ``excel_writer_long``. Rows are placed
    below those written by earlier calls, without header or index, and the
    module-level row counters are advanced accordingly.

    Nothing is saved here: the workbooks are written to disk when the writers
    are closed in the Cleanup cell.
    """
    global rows_written
    global long_rows_written

    # No per-call ExcelWriter.save(): it is deprecated since pandas 1.5 and
    # gone in 2.0, and with some engines saving finalizes the workbook so
    # that later writes are lost. close() performs the single save.
    df.to_excel(excel_writer, startrow=rows_written, header=False, index=False)
    rows_written += len(df)

    df_long = reshape_dataframe(df)
    df_long.to_excel(excel_writer_long, startrow=long_rows_written, header=False, index=False)
    long_rows_written += len(df_long)

## CSS Selectors

In [12]:
# Score the pre-selected CSS selectors, show the resulting table, and
# append its rows to both Excel workbooks.
css_frame = display_scores(css_examples, "CSS selector")
write_scores(css_frame)

Unnamed: 0,Text,Expected Language,Predicted Language,CSS Affinity,Regex Affinity
0,#infolist,CSS selector,CSS selector,1.0,0.0
1,#message,CSS selector,CSS selector,1.0,0.0
2,#other,CSS selector,CSS selector,1.0,0.0
3,#frameDemo,CSS selector,CSS selector,1.0,0.0
4,#txt,CSS selector,CSS selector,1.0,0.0
5,div:last,CSS selector,CSS selector,0.5,0.0
6,iframe[name=iframeTarget],CSS selector,CSS selector,0.75,0.14
7,#result,CSS selector,CSS selector,1.0,0.0
8,.window,CSS selector,CSS selector,1.0,0.14
9,#my-element,CSS selector,CSS selector,1.0,0.0


## Regular expressions

In [13]:
# Score the randomly picked regular expressions, show the resulting table,
# and append its rows to both Excel workbooks.
regex_frame = display_scores(regex_examples, "Regex")
write_scores(regex_frame)

Unnamed: 0,Text,Expected Language,Predicted Language,CSS Affinity,Regex Affinity
0,<username><!\[CDATA\[name\]\]></username>,Regex,Tie,0,0.0
1,"(?<=finalNumber="")(.*?)(?="")",Regex,Regex,0,0.22
2,search,Regex,Tie,0,0.0
3,"""^[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9-]+)*@"" + ""[...",Regex,Regex,0,0.57
4,(..)_(..),Regex,Regex,0,0.86
5,"^[A-Z]{3,9}\ /index\.php\ HTTP/",Regex,Regex,0,0.11
6,(<b>) #match opening <b> tag (.*?) #match any...,Regex,Regex,0,0.05
7,\\s,Regex,Tie,0,0.0
8,^([^/]+)$,Regex,Regex,0,1.0
9,[^a-zA-Z0-9],Regex,Regex,0,1.0


# Cleanup

In [14]:
# Close both Excel writers now that all score tables have been written.
excel_writer.close()
excel_writer_long.close()