Skip to content

Commit

Permalink
Unittests, better documentation and minor refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
andresriancho committed Nov 28, 2019
1 parent 11831e1 commit 8bf5853
Show file tree
Hide file tree
Showing 7 changed files with 220 additions and 29 deletions.
10 changes: 0 additions & 10 deletions w3af/core/data/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +0,0 @@
import re

# NOTE(review): older, stricter URL pattern kept as commented-out reference;
# superseded by the URL_RE below. Dead code — candidate for removal.
#URL_RE = ('((http|https):[A-Za-z0-9/](([A-Za-z0-9$_.+!*(),;/?:@&~=-])|%'
#          '[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:@&~=%-]*))?)')

# Absolute http/https URLs: scheme, '://', then any run of characters that
# are not whitespace, NUL, a quote or an angle bracket.
URL_RE = re.compile(r'((http|https)://([\w:@\-./]*?)[^ \0\n\r\t"\'<>]*)', re.U)

# Relative URLs: one or more path segments ending in a 2-4 character file
# extension, plus an optional ?key=value&key=value query string.
RELATIVE_URL_RE = re.compile(
    r'((:?[/]{1,2}[\w\-~.%]+)+\.\w{2,4}(((\?)([\w\-~.%]*=[\w\-~.%]*))'
    r'((&)([\w\-~.%]*=[\w\-~.%]*))*)?)', re.U)
4 changes: 2 additions & 2 deletions w3af/core/data/parsers/pynarcissus/link_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"""
from w3af.core.data.parsers.pynarcissus.string_extractor import StringExtractor
from w3af.core.data.parsers import URL_RE
from w3af.core.data.parsers.utils.url_regex import URL_RE, RELATIVE_URL_RE
from w3af.core.data.parsers.doc.url import URL


Expand All @@ -41,4 +41,4 @@ def extract_full_urls(self):
except ValueError:
pass

return urls
return urls
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"""
import unittest

from w3af.core.data.parsers import URL_RE
from w3af.core.data.parsers.utils.url_regex import URL_RE
from w3af.core.data.parsers.doc.url import URL
from w3af.core.data.parsers.pynarcissus.link_extractor import JSLinkExtractor
from w3af.core.data.parsers.pynarcissus.tests.test_string_extractor import JSParserMixin
Expand All @@ -47,4 +47,4 @@ def test_jquery_re(self):
except ValueError:
pass

return urls
return urls
48 changes: 33 additions & 15 deletions w3af/core/data/parsers/utils/re_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,21 @@

from w3af.core.data.parsers.doc.baseparser import BaseParser
from w3af.core.data.parsers.doc.url import URL
from w3af.core.data.parsers import URL_RE, RELATIVE_URL_RE
from w3af.core.data.parsers.utils.url_regex import URL_RE, RELATIVE_URL_RE


class ReExtract(BaseParser):
"""
A helper that extracts URLs from a string using regular expressions.
A helper class that extracts URLs from a string using regular expressions.
THIS CODE IS SLOW! USE WITH CARE!
THIS CODE IS SLOW!
DO NOT APPLY THESE REGULAR EXPRESSIONS TO ALL HTTP RESPONSES!
You have been warned.
:author: Andres Riancho (andres.riancho@gmail.com)
"""

# Matches strings like:
#
# PHP/5.2.4-2ubuntu5.7
Expand All @@ -45,7 +49,11 @@ class ReExtract(BaseParser):

QUOTES = {"'", '"'}

def __init__(self, doc_string, base_url, encoding, relative=True,
def __init__(self,
doc_string,
base_url,
encoding,
relative=True,
require_quotes=False):
self._re_urls = set()

Expand All @@ -65,13 +73,17 @@ def parse(self):
self._extract_relative_urls(self._doc_string)

def _is_quoted(self, url_mo, doc_string):
"""
:return: True if the URL extracted using regular expressions has quotes
around it.
"""
start, end = url_mo.span()
doc_string_len = len(doc_string)

if end == doc_string_len:
return False

if doc_string[start-1] not in self.QUOTES:
if doc_string[start - 1] not in self.QUOTES:
return False

if doc_string[end] not in self.QUOTES:
Expand All @@ -97,14 +109,20 @@ def _extract_full_urls(self, doc_string):

def _extract_relative_urls(self, doc_string):
"""
Now detect some relative URL's (also using regexs)
Extract relative URL's using regular expressions
"""
# TODO: Also matches //foo/bar.txt and http://host.tld/foo/bar.txt
# I'm removing those matches with the filter
#
# The RELATIVE_URL_RE is very complex and (for some cases) dumb.
#
# The regular expression results need to be filtered to make sure
# they are actually URLs. Some of the URLs that will be dropped by
# `_filter_false_urls` will be caught by `URL_RE`.
#
# Take a look at `test_url_regex.py` for examples
#
relative_urls = RELATIVE_URL_RE.finditer(doc_string)
filter_false_urls = self._filter_false_urls

for url_mo in filter(filter_false_urls, relative_urls):
for url_mo in filter(self._filter_false_urls, relative_urls):
if self._require_quotes:
if not self._is_quoted(url_mo, doc_string):
continue
Expand All @@ -113,16 +131,16 @@ def _extract_relative_urls(self, doc_string):
url = self._base_url.url_join(url_mo.group(0)).url_string
url = URL(self._decode_url(url), encoding=self._encoding)
except ValueError:
# In some cases, the relative URL is invalid and triggers an
#
# In some cases, the relative URL is invalid and triggers a
# ValueError: Invalid URL "%s" exception. All we can do at this
# point is to ignore this "fake relative URL".
# point is to ignore it.
#
pass
else:
url_lower = url.url_string.lower()

if url_lower.startswith('http://') or \
url_lower.startswith('https://'):

if url_lower.startswith('http://') or url_lower.startswith('https://'):
self._re_urls.add(url)

def _filter_false_urls(self, potential_url_mo):
Expand Down
39 changes: 39 additions & 0 deletions w3af/core/data/parsers/utils/tests/test_re_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
"""
test_re_extract.py
Copyright 2019 Andres Riancho
This file is part of w3af, http://w3af.org/ .
w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.
w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import unittest

from w3af.core.data.parsers.utils.re_extract import ReExtract
from w3af.core.data.parsers.doc.url import URL


class TestReExtract(unittest.TestCase):
    """Tests for ReExtract, the regex-based URL extractor."""

    def test_relative_regex(self):
        # A relative URL surrounded by noise must be url_join()'ed against
        # the base URL; the ../../ climbs two levels up from /abc/def/.
        document = '123 ../../foobar/uploads/foo.png 465'
        base = URL('https://w3af.org/abc/def/')

        extractor = ReExtract(document, base, 'utf-8')
        extractor.parse()

        expected = [URL('https://w3af.org/foobar/uploads/foo.png')]
        self.assertEqual(extractor.get_references(), expected)
130 changes: 130 additions & 0 deletions w3af/core/data/parsers/utils/tests/test_url_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
"""
test_url_regex.py
Copyright 2019 Andres Riancho
This file is part of w3af, http://w3af.org/ .
w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.
w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import unittest


from w3af.core.data.parsers.utils.url_regex import URL_RE, RELATIVE_URL_RE


class TestURLRegex(unittest.TestCase):
    """Tests for URL_RE, the absolute http/https URL regular expression."""

    def _first_match(self, document):
        # findall() returns one tuple per hit (one entry per regex group);
        # the outermost group wraps the whole pattern, so [0][0] is the
        # complete URL of the first hit.
        return URL_RE.findall(document)[0][0]

    def test_simple_domain(self):
        self.assertEqual(self._first_match('http://w3af.org/'),
                         'http://w3af.org/')

    def test_case_insensitive(self):
        self.assertEqual(self._first_match('hTTp://w3af.org/'),
                         'hTTp://w3af.org/')

    def test_simple_domain_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/ 456'),
                         'http://w3af.org/')

    def test_domain_filename_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/scanner 456'),
                         'http://w3af.org/scanner')

    def test_domain_filename_query_string_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/scanner?id=1 456'),
                         'http://w3af.org/scanner?id=1')

    def test_domain_filename_query_string_multiple_params_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/scanner?id=1&foo=bar 456'),
                         'http://w3af.org/scanner?id=1&foo=bar')

    def test_no_match_1(self):
        # Only the http and https schemes are matched
        self.assertEqual(URL_RE.findall('ftp://w3af.org'), [])

    def test_no_match_2(self):
        self.assertEqual(URL_RE.findall('httt://w3af.org'), [])

    def test_no_match_3(self):
        self.assertEqual(URL_RE.findall('http!://w3af.org'), [])

    def test_no_match_4(self):
        self.assertEqual(URL_RE.findall('http:--w3af.org'), [])


class TestRelativeURLRegex(unittest.TestCase):
    """Tests for RELATIVE_URL_RE, the relative URL regular expression."""

    def test_simple_filename(self):
        matches = RELATIVE_URL_RE.findall('/abc.html')
        self.assertEqual(matches[0][0], '/abc.html')

    # FIX: this was decorated with @unittest.SkipTest, which is an exception
    # class, not a decorator. Applying it replaces the method with a
    # non-callable exception instance, so the loader silently drops the test
    # instead of reporting it as skipped. @unittest.skip() is the correct API
    # and makes the known bug visible in the test report.
    @unittest.skip('Known bug: matches must start with /, so /def/123.html '
                   'is matched instead of the expected abc/def/123.html')
    def test_starts_without_slash(self):
        #
        # TODO: This is a bug!
        #
        # Removing the skip will show that the test is matching
        # /def/123.html instead of the expected abc/def/123.html
        #
        # The regular expression matches start with /
        #
        matches = RELATIVE_URL_RE.findall('abc/def/123.html')
        self.assertEqual(matches[0][0], 'abc/def/123.html')

    def test_with_padding(self):
        matches = RELATIVE_URL_RE.findall('123 /abc/def/123.html 456')
        self.assertEqual(matches[0][0], '/abc/def/123.html')

    def test_two_slashes(self):
        # This is filtered by ReExtract._filter_false_urls
        matches = RELATIVE_URL_RE.findall('//foo.123.html')
        self.assertEqual(matches[0][0], '//foo.123.html')

    def test_relative(self):
        # The first '..' is not part of the match: matches start at a slash
        matches = RELATIVE_URL_RE.findall('../../foobar/uploads/bar.html')
        self.assertEqual(matches[0][0], '/../foobar/uploads/bar.html')

    def test_query_string(self):
        matches = RELATIVE_URL_RE.findall('/foo.html?id=1')
        self.assertEqual(matches[0][0], '/foo.html?id=1')

    def test_path_query_string(self):
        matches = RELATIVE_URL_RE.findall('/abc/foo.html?id=1')
        self.assertEqual(matches[0][0], '/abc/foo.html?id=1')

    def test_path_query_string_multi(self):
        matches = RELATIVE_URL_RE.findall('/abc/foo.html?id=1&foo=1')
        self.assertEqual(matches[0][0], '/abc/foo.html?id=1&foo=1')

    def test_full_url(self):
        # This is filtered by ReExtract._filter_false_urls
        matches = RELATIVE_URL_RE.findall('http://w3af.org/foo.html')
        self.assertEqual(matches[0][0], '://w3af.org/foo.html')

    def test_with_fake_start(self):
        matches = RELATIVE_URL_RE.findall('</abc> /def.html')
        self.assertEqual(matches[0][0], '/def.html')

    def test_no_match_1(self):
        # No file extension, no match
        matches = RELATIVE_URL_RE.findall('/abc')
        self.assertEqual(matches, [])

    def test_no_match_2(self):
        # No leading slash, no match
        matches = RELATIVE_URL_RE.findall('abc.html')
        self.assertEqual(matches, [])

14 changes: 14 additions & 0 deletions w3af/core/data/parsers/utils/url_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Regular expressions used to extract URLs from raw strings.

These patterns are intentionally loose; consumers (see ReExtract and its
_filter_false_urls) are expected to filter out false positives.
"""
import re

# Absolute http/https URLs. Case-insensitive (re.I) so `hTTp://...` also
# matches; the match runs until whitespace, NUL, a quote or an angle bracket.
URL_RE = re.compile(r'((http|https)://([\w:@\-./]*?)[^ \0\n\r\t"\'<>]*)', re.U | re.I)

# Relative URLs: one or more /-separated path segments ending in a short
# file extension, plus an optional ?key=value&key=value query string.
#
# NOTE(review): `(:?` (an optional literal colon) looks like a typo for the
# non-capturing group `(?:`. As written it also matches the `://host/...`
# tail of absolute URLs — the tests pin that behavior and ReExtract filters
# those matches out, so confirm callers before "fixing" it.
RELATIVE_URL_RE = re.compile(
    r'((:?[/]{1,2}[\w\-~.%]+)+'
    # extension with two to four characters
    r'\.\w{2,4}'
    # query string
    r'(((\?)'
    # query string parameter
    r'([\w\-~.%]*=[\w\-~.%]*))'
    # ampersand and more parameters
    r'((&)([\w\-~.%]*=[\w\-~.%]*))*)?)', re.U | re.I)

0 comments on commit 8bf5853

Please sign in to comment.