Skip to content
This repository
branch: master
executable file 210 lines (159 sloc) 6.244 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
#!/usr/bin/env python
"""
This module runs the performance tests to compare the ``re`` module with the
``re2`` module. You can just run it from the command line, assuming you have re2
installed, and it will output a table in ReST format comparing everything.

To add a test, you can add a function to the bottom of this page that uses the
@register_test() decorator. Alternatively, you can create a module that uses it and
import it.
"""
from timeit import Timer
import simplejson

import re2
import re
try:
    import regex
except ImportError:
    regex = None

import os
import gzip

re2.set_fallback_notification(re2.FALLBACK_EXCEPTION)

os.chdir(os.path.dirname(__file__) or '.')

tests = {}

setup_code = """\
import re2
import re
from __main__ import tests, current_re
test = tests[%r]
"""

current_re = [None]




def main():
    benchmarks = {}
    # Run all of the performance comparisons.
    for testname, method in tests.items():
        benchmarks[testname] = {}
        if regex is not None:
            modules = (re, re2, regex)
        else:
            modules = (re, re2)
        results = [None for module in modules]
        for i, module in enumerate(modules):
            # We pre-compile the pattern, because that's
            # what people do.
            current_re[0] = module.compile(method.pattern)

            results[i] = method(current_re[0], **method.data)

            # Run a test.
            t = Timer("test(current_re[0],**test.data)",
                      setup_code % testname)
            benchmarks[testname][module.__name__] = (t.timeit(method.num_runs),
                                                     method.__doc__.strip(),
                                                     method.pattern,
                                                     method.num_runs)
        for i in range(len(results) - 1):
            if results[i] != results[i + 1]:
                raise ValueError("re2 output is not the same as re output: %s" % testname)

    benchmarks_to_ReST(benchmarks)


def benchmarks_to_ReST(benchmarks):
    """
Convert dictionary to a nice table for ReST.
"""
    if regex is not None:
        headers = ('Test', 'Description', '# total runs', '``re`` time(s)', '``re2`` time(s)', '% ``re`` time', '``regex`` time(s)', '% ``regex`` time')
    else:
        headers = ('Test', 'Description', '# total runs', '``re`` time(s)', '``re2`` time(s)', '% ``regex`` time')
    table = [headers]
    f = lambda x: "%0.3f" % x
    p = lambda x: "%0.2f%%" % (x * 100)

    for test, data in benchmarks.items():
        row = [test, data["re"][1], str(data["re"][3]), f(data["re"][0]), f(data["re2"][0])]
        
        row.append(p(data["re2"][0] / data["re"][0]))
        if regex is not None:
            row.extend((f(data["regex"][0]), p(data["re2"][0] / data["regex"][0])))
        table.append(row)
    col_sizes = [0] * len(table[0])
    for col in range(len(table[0])):
        col_sizes[col] = max(len(row[col]) for row in table)

    def print_divider(symbol='-'):
        print '+' + '+'.join(symbol*col_size for col_size in col_sizes) + '+'
    def print_row(row):
        print '|' + '|'.join(item.ljust(col_sizes[i]) for i, item in enumerate(row)) + '|'

    print_divider()
    print_row(table[0])
    print_divider('=')
    for row in table[1:]:
        print_row(row)
        print_divider()





###############################################
# Tests for performance
###############################################


# Convenient decorator for registering a new test.
def register_test(name, pattern, num_runs = 100, **data):
    def decorator(method):
        tests[name] = method
        method.pattern = pattern
        method.num_runs = num_runs
        method.data = data

        return method
    return decorator


# This is the only function to get data right now,
# but I could imagine other functions as well.
_wikidata = None
def getwikidata():
    global _wikidata
    if _wikidata is None:
        _wikidata = gzip.open('wikipages.xml.gz').read()
    return _wikidata



#register_test("Findall URI|Email",
# r'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)',
# 2,
# data=getwikidata())
def findall_uriemail(pattern, data):
    """
Find list of '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)'
"""
    return len(pattern.findall(data))



#register_test("Replace WikiLinks",
# r'(\[\[(^\|)+.*?\]\])',
# data=getwikidata())
def replace_wikilinks(pattern, data):
    """
This test replaces links of the form [[Obama|Barack_Obama]] to Obama.
"""
    return len(pattern.sub(r'\1', data))



#register_test("Remove WikiLinks",
# r'(\[\[(^\|)+.*?\]\])',
# data=getwikidata())
def remove_wikilinks(pattern, data):
    """
This test replaces links of the form [[Obama|Barack_Obama]] to the empty string
"""
    return len(pattern.sub(r'', data))





#register_test("Remove WikiLinks",
# r'(<page[^>]*>)',
# data=getwikidata())
def split_pages(pattern, data):
    """
This test splits the data by the <page> tag.
"""
    return len(pattern.split(data))


def getweblogdata():
    return open(os.path.join(os.path.dirname(__file__), 'access.log'))

@register_test("weblog scan",
               #r'^(\S+) (\S+) (\S+) \[(\d{1,2})/(\w{3})/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -(\d{4})\] "(\S+) (\S+) (\S+)" (\d+) (\d+|-) "([^"]+)" "([^"]+)"\n',
# '(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ? (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (".*?"|-) (\S+) (\S+) (\S+) (\S+)',
               '(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ? (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+)',
               data=getweblogdata())
def weblog_matches(pattern, data):
    """
Match weblog data line by line.
"""
    total=0
    for line in data.read()[:20000].splitlines():
        p = pattern.search(line)
        #for p in pattern.finditer(data.read()[:20000]):
        if p:
            total += len(p.groups())
    data.seek(0)

    return 0

if __name__ == '__main__':
    main()
Something went wrong with that request. Please try again.