Major performance improvement for a very common function!

4.5 seconds to 0.17! w0000t!
andresriancho · Feb 9, 2020 · bb56922 · bb56922
1 parent 850893e
commit bb56922
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 23 deletions.
diff --git a/w3af/core/controllers/core_helpers/not_found/fuzzy_equal_for_diff.py b/w3af/core/controllers/core_helpers/not_found/fuzzy_equal_for_diff.py
@@ -24,13 +24,36 @@
 from w3af.core.controllers.misc.fuzzy_string_cmp import fuzzy_equal
 
 
-NOT_HASH = {'\t', '\n', '\r', ' ',
-            '!', '"', '#', '$', "'",
-            '(', ')', '*', ',', '.',
-            '/', ':', ';', '<',
-            '=', '>', '?', '@',
-            '[', '\\', ']', '^',
-            '`', '{', '|', '}',
+NOT_HASH = {'\t',
+            '\n',
+            '\r',
+            ' ',
+            '!',
+            '"',
+            '#',
+            '$',
+            "'",
+            '(',
+            ')',
+            '*',
+            ',',
+            '.',
+            '/',
+            ':',
+            ';',
+            '<',
+            '=',
+            '>',
+            '?',
+            '@',
+            '[',
+            '\\',
+            ']',
+            '^',
+            '`',
+            '{',
+            '|',
+            '}',
             '~'}
 
 
@@ -52,6 +75,9 @@ def fuzzy_equal_for_diff(diff_x, diff_y, is_equal_ratio):
     :return: True if the two results of applying the diff() function are
              fuzzy equal (applying split_by_sep technique)
     """
+    if diff_x == diff_y:
+        return True
+
     split_x = split_by_sep(diff_x)
     split_y = split_by_sep(diff_y)
 

diff --git a/w3af/core/controllers/misc/diff.py b/w3af/core/controllers/misc/diff.py
@@ -19,6 +19,7 @@
 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
 """
+import string
 import difflib
 import diff_match_patch as dmp_module
 
@@ -29,6 +30,16 @@
 # is reached, a (partial) result is returned
 MAX_DIFF_TIME = 20
 
+#
+# Translation table to split strings by multiple chars
+#
+# The only issue with this method is that a string which already contains null
+# bytes is also split at those bytes ("false positives"), but that is acceptable
+# given the performance gain
+#
+TRANSLATION_TABLE = string.maketrans('\n\t\r"\'<',
+                                     '\0\0\0\0\0\0')
+
 
 def diff_dmp(a, b):
     """
@@ -170,19 +181,19 @@ def split_by_sep(seq):
     :param seq: A string
     :return: A list of strings (chunks) for the input string
     """
-    chunk = []
-    chunks = []
-    append = chunks.append
-    empty_string_join = ''.join
-    separators = {'\n', '\t', '\r', '"', "'", '<'}
-
-    for c in seq:
-        if c in separators:
-            append(empty_string_join(chunk))
-            chunk = []
-        else:
-            chunk.append(c)
-
-    append(empty_string_join(chunk))
-
-    return chunks
+    #
+    # There was a previous version of this algorithm which used a pure-Python
+    # character loop and a few performance tricks [0], but this is MUCH faster
+    # and easier to read.
+    #
+    # This code with translate and split runs 1000 loops of test_split_by_sep_perf
+    # in 0.17 seconds, while the older code [0] ran the same test in 4.5 seconds.
+    #
+    # Just when you think it is impossible to improve the performance of a simple
+    # algorithm... a new idea appears and reduces the time from 4.5 to 0.17...
+    # amazing!
+    #
+    # [0] https://github.com/andresriancho/w3af/blob/2ded693c959c91dc3e4daca276460d6c64ada479/w3af/core/controllers/misc/diff.py#L173
+    #
+    translated_seq = string.translate(seq, TRANSLATION_TABLE)
+    return translated_seq.split('\0')
diff --git a/w3af/core/controllers/misc/tests/test_diff.py b/w3af/core/controllers/misc/tests/test_diff.py
@@ -127,6 +127,14 @@ def test_special_chars(self):
 
 
 class TestSplitBySep(unittest.TestCase):
+    def test_split_by_sep_1(self):
+        result = split_by_sep('hello world<bye bye!')
+        self.assertEqual(result, ['hello world', 'bye bye!'])
+
+    def test_split_by_sep_2(self):
+        result = split_by_sep('hello world<bye\nbye!')
+        self.assertEqual(result, ['hello world', 'bye', 'bye!'])
+
     def test_split_by_sep_perf(self):
         loops = 1000
         inputs = [unittest.__doc__,