Add ignore extensions
andresriancho committed Apr 24, 2019
1 parent ac6c6dc commit bbc2414
Showing 1 changed file with 43 additions and 19 deletions.
62 changes: 43 additions & 19 deletions w3af/plugins/crawl/web_spider.py
@@ -41,7 +41,7 @@
 from w3af.core.data.dc.generic.form import Form
 from w3af.core.data.dc.cookie import Cookie
 from w3af.core.data.options.opt_factory import opt_factory
-from w3af.core.data.options.option_types import BOOL, REGEX
+from w3af.core.data.options.option_types import BOOL, REGEX, LIST
 from w3af.core.data.options.option_list import OptionList
 from w3af.core.data.request.fuzzable_request import FuzzableRequest

@@ -71,6 +71,7 @@ def __init__(self):
         self._ignore_regex = ''
         self._follow_regex = '.*'
         self._only_forward = False
+        self._ignore_extensions = []
         self._compile_re()
 
     def crawl(self, fuzzable_req):
@@ -282,6 +283,12 @@ def _should_analyze_url(self, ref):
             om.out.debug(msg % args)
             return False
 
+        if self._has_ignored_extension(ref):
+            msg = 'web_spider will ignore %s (match ignore extensions)'
+            args = (ref.url_string,)
+            om.out.debug(msg % args)
+            return False
+
         # Implementing only forward
         if self._only_forward and not self._is_forward(ref):
             msg = 'web_spider will ignore %s (is not forward)'
@@ -291,6 +298,12 @@ def _should_analyze_url(self, ref):
 
         return True
 
+    def _has_ignored_extension(self, new_url):
+        if not self._ignore_extensions:
+            return False
+
+        return new_url.get_extension().lower() in self._ignore_extensions
+
     def _should_verify_extracted_url(self, ref, resp):
         """
         :param ref: A newly found URL
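
The check added above is a case-insensitive membership test on the URL's
file extension. A minimal sketch of the behavior, assuming w3af's
URL.get_extension() returns the last path segment's extension without the
leading dot (FakeURL is a hypothetical stand-in, not the real w3af class):

    class FakeURL:
        """Hypothetical stand-in for w3af's URL class."""
        def __init__(self, url_string):
            self.url_string = url_string

        def get_extension(self):
            # Extension of the path's last segment, without the dot;
            # '' when there is none (assumed to mirror w3af's behavior).
            path = self.url_string.split('?', 1)[0]
            last_segment = path.rsplit('/', 1)[-1]
            return last_segment.rsplit('.', 1)[-1] if '.' in last_segment else ''

    # Already lower-cased, as set_options() guarantees further down.
    ignore_extensions = ['zip', 'pdf', 'jpeg']

    def has_ignored_extension(new_url):
        # Mirrors _has_ignored_extension(): an empty list disables the
        # filter, and lower-casing the URL side makes the match case
        # insensitive.
        if not ignore_extensions:
            return False
        return new_url.get_extension().lower() in ignore_extensions

    assert has_ignored_extension(FakeURL('http://target/backup.ZIP'))
    assert not has_ignored_extension(FakeURL('http://target/index.html'))
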
@@ -460,23 +473,29 @@ def get_options(self):
         """
         ol = OptionList()
 
-        d = ('When crawling only follow links to paths inside the one given'
-             ' as target.')
+        d = 'Only crawl links to paths inside the URL given as target.'
         o = opt_factory('only_forward', self._only_forward, d, BOOL)
         ol.add(o)
 
-        d = ('When crawling only follow which that match this regular'
-             ' expression. Please note that ignore_regex has precedence over'
-             ' follow_regex.')
+        d = ('Only crawl links that match this regular expression.'
+             ' Note that ignore_regex has precedence over follow_regex.')
         o = opt_factory('follow_regex', self._follow_regex, d, REGEX)
         ol.add(o)
 
-        d = ('When crawling, DO NOT follow links that match this regular'
-             ' expression. Please note that ignore_regex has precedence over'
-             ' follow_regex.')
+        d = ('DO NOT crawl links that match this regular expression.'
+             ' Note that ignore_regex has precedence over follow_regex.')
         o = opt_factory('ignore_regex', self._ignore_regex, d, REGEX)
         ol.add(o)
 
+        d = 'DO NOT crawl links that use these extensions.'
+        h = ('This configuration parameter is commonly used to ignore'
+             ' static files such as zip, pdf, jpeg, etc. It is possible to'
+             ' ignore these files using `ignore_regex`, but configuring'
+             ' this parameter is easier and performs case insensitive'
+             ' matching.')
+        o = opt_factory('ignore_extensions', self._ignore_extensions, d, LIST, help=h)
+        ol.add(o)
+
         return ol

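The help text above notes that ignore_regex can emulate this filter but is
harder to get right. A hedged comparison (the pattern and extension values
are illustrative, not from the commit); remember that, per the plugin's
long description, the regular expressions are applied to the full URL,
query string included:

    import re

    # The regex must handle case and trailing query strings itself; a bare
    # r'.*\.(zip|pdf|jpe?g)$' would miss http://target/report.PDF?page=2.
    ignore_regex = re.compile(r'.*\.(zip|pdf|jpe?g)(\?.*)?$', re.IGNORECASE)

    # The new option expresses the same intent directly and is matched
    # case-insensitively by _has_ignored_extension().
    ignore_extensions = ['zip', 'pdf', 'jpeg', 'jpg']

    print(bool(ignore_regex.match('http://target/report.PDF?page=2')))  # True
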
def set_options(self, options_list):
Expand All @@ -488,10 +507,14 @@ def set_options(self, options_list):
:return: No value is returned.
"""
self._only_forward = options_list['only_forward'].get_value()

self._ignore_regex = options_list['ignore_regex'].get_value()
self._follow_regex = options_list['follow_regex'].get_value()
self._compile_re()

self._ignore_extensions = options_list['ignore_extensions'].get_value()
self._ignore_extensions = [ext.lower() for ext in self._ignore_extensions]

def _compile_re(self):
"""
Compile the regular expressions that are going to be used to ignore
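
Lower-casing the configured extensions once in set_options(), at
configuration time, keeps the per-URL check in _has_ignored_extension() a
plain membership test. A tiny illustration, assuming the LIST option
type's get_value() yields a list of strings (the user input is
hypothetical):

    configured = ['ZIP', 'Pdf', 'jpeg']   # e.g. parsed from 'ZIP,Pdf,jpeg'

    # Normalize once; the crawl hot path then only lower-cases the URL side.
    ignore_extensions = [ext.lower() for ext in configured]
    assert ignore_extensions == ['zip', 'pdf', 'jpeg']
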
@@ -520,21 +543,22 @@ def get_long_desc(self):
         This plugin is a classic web spider, it will request a URL and extract
         all links and forms from the response.
 
-        Three configurable parameter exist:
+        Four configurable parameter exist:
             - only_forward
-            - ignore_regex
             - follow_regex
+            - ignore_regex
+            - ignore_extensions
 
         ignore_regex and follow_regex are commonly used to configure the
-        web_spider to spider all URLs except the "logout" or some other more
-        exciting link like "Reboot Appliance" that would make the w3af run
-        finish without the expected result.
+        web_spider to crawl all URLs except "/logout" or some more
+        exciting link like "Reboot Appliance" that would greatly reduce
+        the scan test coverage.
 
         By default ignore_regex is an empty string (nothing is ignored) and
         follow_regex is '.*' (everything is followed). Both regular expressions
-        are normal regular expressions that are compiled with Python's re
-        module.
-
-        The regular expressions are applied to the URLs that are found using the
-        match function.
+        are compiled with Python's re module and applied to URLs (with query
+        string included).
+
+        The ignore_extensions configuration parameter is commonly used to ignore
+        static files such as zip, jpeg, pdf, etc.
         """

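A usage sketch for the new parameter in the w3af console (the command
grammar follows the w3af documentation; the prompt paths and the extension
list are illustrative):

    w3af>>> plugins
    w3af/plugins>>> crawl web_spider
    w3af/plugins>>> crawl config web_spider
    w3af/plugins/crawl/config:web_spider>>> set ignore_extensions zip,pdf,jpeg
    w3af/plugins/crawl/config:web_spider>>> back
    w3af/plugins>>> back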