Skip to content

Commit

Permalink
Major performance improvement for a very common function!
Browse files Browse the repository at this point in the history
4.5 seconds to 0.17! w0000t!
  • Loading branch information
andresriancho authored and Andres Riancho committed Feb 9, 2020
1 parent 850893e commit bb56922
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,36 @@
from w3af.core.controllers.misc.fuzzy_string_cmp import fuzzy_equal


NOT_HASH = {'\t', '\n', '\r', ' ',
'!', '"', '#', '$', "'",
'(', ')', '*', ',', '.',
'/', ':', ';', '<',
'=', '>', '?', '@',
'[', '\\', ']', '^',
'`', '{', '|', '}',
NOT_HASH = {'\t',
'\n',
'\r',
' ',
'!',
'"',
'#',
'$',
"'",
'(',
')',
'*',
',',
'.',
'/',
':',
';',
'<',
'=',
'>',
'?',
'@',
'[',
'\\',
']',
'^',
'`',
'{',
'|',
'}',
'~'}


Expand All @@ -52,6 +75,9 @@ def fuzzy_equal_for_diff(diff_x, diff_y, is_equal_ratio):
:return: True if the two results of applying the diff() function are
fuzzy equal (applying split_by_sep technique)
"""
if diff_x == diff_y:
return True

split_x = split_by_sep(diff_x)
split_y = split_by_sep(diff_y)

Expand Down
43 changes: 27 additions & 16 deletions w3af/core/controllers/misc/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import string
import difflib
import diff_match_patch as dmp_module

Expand All @@ -29,6 +30,16 @@
# is reached, a (partial) result is returned
MAX_DIFF_TIME = 20

#
# Translation table to split strings by multiple chars
#
# The only issue with this method is that it will yield "false positives" when
# the string to split already contains null bytes (they are treated as
# separators too), but that is acceptable given the performance gain
#
TRANSLATION_TABLE = string.maketrans('\n\t\r"\'<',
'\0\0\0\0\0\0')


def diff_dmp(a, b):
"""
Expand Down Expand Up @@ -170,19 +181,19 @@ def split_by_sep(seq):
:param seq: A string
:return: A list of strings (chunks) for the input string
"""
chunk = []
chunks = []
append = chunks.append
empty_string_join = ''.join
separators = {'\n', '\t', '\r', '"', "'", '<'}

for c in seq:
if c in separators:
append(empty_string_join(chunk))
chunk = []
else:
chunk.append(c)

append(empty_string_join(chunk))

return chunks
#
# There was a previous version of this algorithm which used python code
# and a few performance tricks [0], but this is MUCH faster and easier to
# read.
#
# This code with translate and split runs 1000 loops of test_split_by_sep_perf
# in 0.17 seconds, while the older code [0] ran the same test in 4.5 seconds.
#
# Just when you think it is impossible to improve the performance of a simple
# algorithm... a new idea appears and reduces the time from 4.5 to 0.17...
# amazing!
#
# [0] https://github.com/andresriancho/w3af/blob/2ded693c959c91dc3e4daca276460d6c64ada479/w3af/core/controllers/misc/diff.py#L173
#
translated_seq = string.translate(seq, TRANSLATION_TABLE)
return translated_seq.split('\0')
8 changes: 8 additions & 0 deletions w3af/core/controllers/misc/tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ def test_special_chars(self):


class TestSplitBySep(unittest.TestCase):
def test_split_by_sep_1(self):
result = split_by_sep('hello world<bye bye!')
self.assertEqual(result, ['hello world', 'bye bye!'])

def test_split_by_sep_2(self):
result = split_by_sep('hello world<bye\nbye!')
self.assertEqual(result, ['hello world', 'bye', 'bye!'])

def test_split_by_sep_perf(self):
loops = 1000
inputs = [unittest.__doc__,
Expand Down

0 comments on commit bb56922

Please sign in to comment.