Skip to content

Commit

Permalink
Unittests, better documentation and minor refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
andresriancho committed Nov 28, 2019
1 parent 11831e1 commit 8bf5853
Show file tree
Hide file tree
Showing 7 changed files with 220 additions and 29 deletions.
10 changes: 0 additions & 10 deletions w3af/core/data/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +0,0 @@
import re

# NOTE(review): older, stricter URL pattern kept as commented-out reference;
# superseded by the URL_RE below. Dead code — candidate for removal.
#URL_RE = ('((http|https):[A-Za-z0-9/](([A-Za-z0-9$_.+!*(),;/?:@&~=-])|%'
#          '[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:@&~=%-]*))?)')

# Absolute http/https URLs: scheme, '://', then any run of characters that
# are not whitespace, NUL, a quote or an angle bracket.
URL_RE = re.compile(r'((http|https)://([\w:@\-./]*?)[^ \0\n\r\t"\'<>]*)', re.U)

# Relative URLs: one or more path segments ending in a 2-4 character file
# extension, plus an optional ?key=value&key=value query string.
RELATIVE_URL_RE = re.compile(
    r'((:?[/]{1,2}[\w\-~.%]+)+\.\w{2,4}(((\?)([\w\-~.%]*=[\w\-~.%]*))'
    r'((&)([\w\-~.%]*=[\w\-~.%]*))*)?)', re.U)
4 changes: 2 additions & 2 deletions w3af/core/data/parsers/pynarcissus/link_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"""
from w3af.core.data.parsers.pynarcissus.string_extractor import StringExtractor
from w3af.core.data.parsers import URL_RE
from w3af.core.data.parsers.utils.url_regex import URL_RE, RELATIVE_URL_RE
from w3af.core.data.parsers.doc.url import URL


Expand All @@ -41,4 +41,4 @@ def extract_full_urls(self):
except ValueError:
pass

return urls
return urls
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"""
import unittest

from w3af.core.data.parsers import URL_RE
from w3af.core.data.parsers.utils.url_regex import URL_RE
from w3af.core.data.parsers.doc.url import URL
from w3af.core.data.parsers.pynarcissus.link_extractor import JSLinkExtractor
from w3af.core.data.parsers.pynarcissus.tests.test_string_extractor import JSParserMixin
Expand All @@ -47,4 +47,4 @@ def test_jquery_re(self):
except ValueError:
pass

return urls
return urls
48 changes: 33 additions & 15 deletions w3af/core/data/parsers/utils/re_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,21 @@

from w3af.core.data.parsers.doc.baseparser import BaseParser
from w3af.core.data.parsers.doc.url import URL
from w3af.core.data.parsers import URL_RE, RELATIVE_URL_RE
from w3af.core.data.parsers.utils.url_regex import URL_RE, RELATIVE_URL_RE


class ReExtract(BaseParser):
"""
A helper that extracts URLs from a string using regular expressions.
A helper class that extracts URLs from a string using regular expressions.
THIS CODE IS SLOW! USE WITH CARE!
THIS CODE IS SLOW!
DO NOT APPLY THESE REGULAR EXPRESSIONS TO ALL HTTP RESPONSES!
You have been warned.
:author: Andres Riancho (andres.riancho@gmail.com)
"""

# Matches strings like:
#
# PHP/5.2.4-2ubuntu5.7
Expand All @@ -45,7 +49,11 @@ class ReExtract(BaseParser):

QUOTES = {"'", '"'}

def __init__(self, doc_string, base_url, encoding, relative=True,
def __init__(self,
doc_string,
base_url,
encoding,
relative=True,
require_quotes=False):
self._re_urls = set()

Expand All @@ -65,13 +73,17 @@ def parse(self):
self._extract_relative_urls(self._doc_string)

def _is_quoted(self, url_mo, doc_string):
"""
:return: True if the URL extracted using regular expressions has quotes
around it.
"""
start, end = url_mo.span()
doc_string_len = len(doc_string)

if end == doc_string_len:
return False

if doc_string[start-1] not in self.QUOTES:
if doc_string[start - 1] not in self.QUOTES:
return False

if doc_string[end] not in self.QUOTES:
Expand All @@ -97,14 +109,20 @@ def _extract_full_urls(self, doc_string):

def _extract_relative_urls(self, doc_string):
"""
Now detect some relative URL's (also using regexs)
Extract relative URL's using regular expressions
"""
# TODO: Also matches //foo/bar.txt and http://host.tld/foo/bar.txt
# I'm removing those matches with the filter
#
# The RELATIVE_URL_RE is very complex and (for some cases) dumb.
#
# The regular expression results need to be filtered to make sure
# they are actually URLs. Some of the URLs that will be dropped by
# `_filter_false_urls` will be caught by `URL_RE`.
#
# Take a look at `test_url_regex.py` for examples
#
relative_urls = RELATIVE_URL_RE.finditer(doc_string)
filter_false_urls = self._filter_false_urls

for url_mo in filter(filter_false_urls, relative_urls):
for url_mo in filter(self._filter_false_urls, relative_urls):
if self._require_quotes:
if not self._is_quoted(url_mo, doc_string):
continue
Expand All @@ -113,16 +131,16 @@ def _extract_relative_urls(self, doc_string):
url = self._base_url.url_join(url_mo.group(0)).url_string
url = URL(self._decode_url(url), encoding=self._encoding)
except ValueError:
# In some cases, the relative URL is invalid and triggers an
#
# In some cases, the relative URL is invalid and triggers a
# ValueError: Invalid URL "%s" exception. All we can do at this
# point is to ignore this "fake relative URL".
# point is to ignore it.
#
pass
else:
url_lower = url.url_string.lower()

if url_lower.startswith('http://') or \
url_lower.startswith('https://'):

if url_lower.startswith('http://') or url_lower.startswith('https://'):
self._re_urls.add(url)

def _filter_false_urls(self, potential_url_mo):
Expand Down
39 changes: 39 additions & 0 deletions w3af/core/data/parsers/utils/tests/test_re_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
"""
test_re_extract.py
Copyright 2019 Andres Riancho
This file is part of w3af, http://w3af.org/ .
w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.
w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import unittest

from w3af.core.data.parsers.utils.re_extract import ReExtract
from w3af.core.data.parsers.doc.url import URL


class TestReExtract(unittest.TestCase):
    """Tests for ReExtract, the regex-based URL extractor."""

    def test_relative_regex(self):
        # A relative URL surrounded by noise must be url_join()'ed against
        # the base URL; the ../../ climbs two levels up from /abc/def/.
        document = '123 ../../foobar/uploads/foo.png 465'
        base = URL('https://w3af.org/abc/def/')

        extractor = ReExtract(document, base, 'utf-8')
        extractor.parse()

        expected = [URL('https://w3af.org/foobar/uploads/foo.png')]
        self.assertEqual(extractor.get_references(), expected)
130 changes: 130 additions & 0 deletions w3af/core/data/parsers/utils/tests/test_url_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
"""
test_url_regex.py
Copyright 2019 Andres Riancho
This file is part of w3af, http://w3af.org/ .
w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.
w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import unittest


from w3af.core.data.parsers.utils.url_regex import URL_RE, RELATIVE_URL_RE


class TestURLRegex(unittest.TestCase):
    """Tests for URL_RE, the absolute http/https URL regular expression."""

    def _first_match(self, document):
        # findall() returns one tuple per hit (one entry per regex group);
        # the outermost group wraps the whole pattern, so [0][0] is the
        # complete URL of the first hit.
        return URL_RE.findall(document)[0][0]

    def test_simple_domain(self):
        self.assertEqual(self._first_match('http://w3af.org/'),
                         'http://w3af.org/')

    def test_case_insensitive(self):
        self.assertEqual(self._first_match('hTTp://w3af.org/'),
                         'hTTp://w3af.org/')

    def test_simple_domain_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/ 456'),
                         'http://w3af.org/')

    def test_domain_filename_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/scanner 456'),
                         'http://w3af.org/scanner')

    def test_domain_filename_query_string_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/scanner?id=1 456'),
                         'http://w3af.org/scanner?id=1')

    def test_domain_filename_query_string_multiple_params_padding(self):
        self.assertEqual(self._first_match('123 http://w3af.org/scanner?id=1&foo=bar 456'),
                         'http://w3af.org/scanner?id=1&foo=bar')

    def test_no_match_1(self):
        # Only the http and https schemes are matched
        self.assertEqual(URL_RE.findall('ftp://w3af.org'), [])

    def test_no_match_2(self):
        self.assertEqual(URL_RE.findall('httt://w3af.org'), [])

    def test_no_match_3(self):
        self.assertEqual(URL_RE.findall('http!://w3af.org'), [])

    def test_no_match_4(self):
        self.assertEqual(URL_RE.findall('http:--w3af.org'), [])


class TestRelativeURLRegex(unittest.TestCase):
    """Tests for RELATIVE_URL_RE, the relative URL regular expression."""

    def test_simple_filename(self):
        matches = RELATIVE_URL_RE.findall('/abc.html')
        self.assertEqual(matches[0][0], '/abc.html')

    # FIX: this was decorated with @unittest.SkipTest, which is an exception
    # class, not a decorator. Applying it replaces the method with a
    # non-callable exception instance, so the loader silently drops the test
    # instead of reporting it as skipped. @unittest.skip() is the correct API
    # and makes the known bug visible in the test report.
    @unittest.skip('Known bug: matches must start with /, so /def/123.html '
                   'is matched instead of the expected abc/def/123.html')
    def test_starts_without_slash(self):
        #
        # TODO: This is a bug!
        #
        # Removing the skip will show that the test is matching
        # /def/123.html instead of the expected abc/def/123.html
        #
        # The regular expression matches start with /
        #
        matches = RELATIVE_URL_RE.findall('abc/def/123.html')
        self.assertEqual(matches[0][0], 'abc/def/123.html')

    def test_with_padding(self):
        matches = RELATIVE_URL_RE.findall('123 /abc/def/123.html 456')
        self.assertEqual(matches[0][0], '/abc/def/123.html')

    def test_two_slashes(self):
        # This is filtered by ReExtract._filter_false_urls
        matches = RELATIVE_URL_RE.findall('//foo.123.html')
        self.assertEqual(matches[0][0], '//foo.123.html')

    def test_relative(self):
        # The first '..' is not part of the match: matches start at a slash
        matches = RELATIVE_URL_RE.findall('../../foobar/uploads/bar.html')
        self.assertEqual(matches[0][0], '/../foobar/uploads/bar.html')

    def test_query_string(self):
        matches = RELATIVE_URL_RE.findall('/foo.html?id=1')
        self.assertEqual(matches[0][0], '/foo.html?id=1')

    def test_path_query_string(self):
        matches = RELATIVE_URL_RE.findall('/abc/foo.html?id=1')
        self.assertEqual(matches[0][0], '/abc/foo.html?id=1')

    def test_path_query_string_multi(self):
        matches = RELATIVE_URL_RE.findall('/abc/foo.html?id=1&foo=1')
        self.assertEqual(matches[0][0], '/abc/foo.html?id=1&foo=1')

    def test_full_url(self):
        # This is filtered by ReExtract._filter_false_urls
        matches = RELATIVE_URL_RE.findall('http://w3af.org/foo.html')
        self.assertEqual(matches[0][0], '://w3af.org/foo.html')

    def test_with_fake_start(self):
        matches = RELATIVE_URL_RE.findall('</abc> /def.html')
        self.assertEqual(matches[0][0], '/def.html')

    def test_no_match_1(self):
        # No file extension, no match
        matches = RELATIVE_URL_RE.findall('/abc')
        self.assertEqual(matches, [])

    def test_no_match_2(self):
        # No leading slash, no match
        matches = RELATIVE_URL_RE.findall('abc.html')
        self.assertEqual(matches, [])

14 changes: 14 additions & 0 deletions w3af/core/data/parsers/utils/url_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Regular expressions used to extract URLs from raw strings.

These patterns are intentionally loose; consumers (see ReExtract and its
_filter_false_urls) are expected to filter out false positives.
"""
import re

# Absolute http/https URLs. Case-insensitive (re.I) so `hTTp://...` also
# matches; the match runs until whitespace, NUL, a quote or an angle bracket.
URL_RE = re.compile(r'((http|https)://([\w:@\-./]*?)[^ \0\n\r\t"\'<>]*)', re.U | re.I)

# Relative URLs: one or more /-separated path segments ending in a short
# file extension, plus an optional ?key=value&key=value query string.
#
# NOTE(review): `(:?` (an optional literal colon) looks like a typo for the
# non-capturing group `(?:`. As written it also matches the `://host/...`
# tail of absolute URLs — the tests pin that behavior and ReExtract filters
# those matches out, so confirm callers before "fixing" it.
RELATIVE_URL_RE = re.compile(
    r'((:?[/]{1,2}[\w\-~.%]+)+'
    # extension with two to four characters
    r'\.\w{2,4}'
    # query string
    r'(((\?)'
    # query string parameter
    r'([\w\-~.%]*=[\w\-~.%]*))'
    # ampersand and more parameters
    r'((&)([\w\-~.%]*=[\w\-~.%]*))*)?)', re.U | re.I)

0 comments on commit 8bf5853

Please sign in to comment.