Add ignore extensions
andresriancho committed Apr 24, 2019
1 parent ac6c6dc commit bbc2414
Showing 1 changed file with 43 additions and 19 deletions.
62 changes: 43 additions & 19 deletions w3af/plugins/crawl/web_spider.py
@@ -41,7 +41,7 @@
 from w3af.core.data.dc.generic.form import Form
 from w3af.core.data.dc.cookie import Cookie
 from w3af.core.data.options.opt_factory import opt_factory
-from w3af.core.data.options.option_types import BOOL, REGEX
+from w3af.core.data.options.option_types import BOOL, REGEX, LIST
 from w3af.core.data.options.option_list import OptionList
 from w3af.core.data.request.fuzzable_request import FuzzableRequest

@@ -71,6 +71,7 @@ def __init__(self):
         self._ignore_regex = ''
         self._follow_regex = '.*'
         self._only_forward = False
+        self._ignore_extensions = []
         self._compile_re()
 
     def crawl(self, fuzzable_req):
@@ -282,6 +283,12 @@ def _should_analyze_url(self, ref):
             om.out.debug(msg % args)
             return False
 
+        if self._has_ignored_extension(ref):
+            msg = 'web_spider will ignore %s (match ignore extensions)'
+            args = (ref.url_string,)
+            om.out.debug(msg % args)
+            return False
+
         # Implementing only forward
         if self._only_forward and not self._is_forward(ref):
             msg = 'web_spider will ignore %s (is not forward)'
@@ -291,6 +298,12 @@ def _should_analyze_url(self, ref):
 
         return True
 
+    def _has_ignored_extension(self, new_url):
+        if not self._ignore_extensions:
+            return False
+
+        return new_url.get_extension().lower() in self._ignore_extensions
+
     def _should_verify_extracted_url(self, ref, resp):
         """
         :param ref: A newly found URL
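
The check added above is a case-insensitive membership test on the URL's
file extension. A minimal sketch of the behavior, assuming w3af's
URL.get_extension() returns the last path segment's extension without the
leading dot (FakeURL is a hypothetical stand-in, not the real w3af class):

    class FakeURL:
        """Hypothetical stand-in for w3af's URL class."""
        def __init__(self, url_string):
            self.url_string = url_string

        def get_extension(self):
            # Extension of the path's last segment, without the dot;
            # '' when there is none (assumed to mirror w3af's behavior).
            path = self.url_string.split('?', 1)[0]
            last_segment = path.rsplit('/', 1)[-1]
            return last_segment.rsplit('.', 1)[-1] if '.' in last_segment else ''

    # Already lower-cased, as set_options() guarantees further down.
    ignore_extensions = ['zip', 'pdf', 'jpeg']

    def has_ignored_extension(new_url):
        # Mirrors _has_ignored_extension(): an empty list disables the
        # filter, and lower-casing the URL side makes the match case
        # insensitive.
        if not ignore_extensions:
            return False
        return new_url.get_extension().lower() in ignore_extensions

    assert has_ignored_extension(FakeURL('http://target/backup.ZIP'))
    assert not has_ignored_extension(FakeURL('http://target/index.html'))
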
@@ -460,23 +473,29 @@ def get_options(self):
         """
         ol = OptionList()
 
-        d = ('When crawling only follow links to paths inside the one given'
-             ' as target.')
+        d = 'Only crawl links to paths inside the URL given as target.'
         o = opt_factory('only_forward', self._only_forward, d, BOOL)
         ol.add(o)
 
-        d = ('When crawling only follow which that match this regular'
-             ' expression. Please note that ignore_regex has precedence over'
-             ' follow_regex.')
+        d = ('Only crawl links that match this regular expression.'
+             ' Note that ignore_regex has precedence over follow_regex.')
         o = opt_factory('follow_regex', self._follow_regex, d, REGEX)
         ol.add(o)
 
-        d = ('When crawling, DO NOT follow links that match this regular'
-             ' expression. Please note that ignore_regex has precedence over'
-             ' follow_regex.')
+        d = ('DO NOT crawl links that match this regular expression.'
+             ' Note that ignore_regex has precedence over follow_regex.')
         o = opt_factory('ignore_regex', self._ignore_regex, d, REGEX)
         ol.add(o)
 
+        d = 'DO NOT crawl links that use these extensions.'
+        h = ('This configuration parameter is commonly used to ignore'
+             ' static files such as zip, pdf, jpeg, etc. It is possible to'
+             ' ignore these files using `ignore_regex`, but configuring'
+             ' this parameter is easier and performs case insensitive'
+             ' matching.')
+        o = opt_factory('ignore_extensions', self._ignore_extensions, d, LIST, help=h)
+        ol.add(o)
+
         return ol

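The help text above notes that ignore_regex can emulate this filter but is
harder to get right. A hedged comparison (the pattern and extension values
are illustrative, not from the commit); remember that, per the plugin's
long description, the regular expressions are applied to the full URL,
query string included:

    import re

    # The regex must handle case and trailing query strings itself; a bare
    # r'.*\.(zip|pdf|jpe?g)$' would miss http://target/report.PDF?page=2.
    ignore_regex = re.compile(r'.*\.(zip|pdf|jpe?g)(\?.*)?$', re.IGNORECASE)

    # The new option expresses the same intent directly and is matched
    # case-insensitively by _has_ignored_extension().
    ignore_extensions = ['zip', 'pdf', 'jpeg', 'jpg']

    print(bool(ignore_regex.match('http://target/report.PDF?page=2')))  # True
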
def set_options(self, options_list):
Expand All @@ -488,10 +507,14 @@ def set_options(self, options_list):
:return: No value is returned.
"""
self._only_forward = options_list['only_forward'].get_value()

self._ignore_regex = options_list['ignore_regex'].get_value()
self._follow_regex = options_list['follow_regex'].get_value()
self._compile_re()

self._ignore_extensions = options_list['ignore_extensions'].get_value()
self._ignore_extensions = [ext.lower() for ext in self._ignore_extensions]

def _compile_re(self):
"""
Compile the regular expressions that are going to be used to ignore
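
Lower-casing the configured extensions once in set_options(), at
configuration time, keeps the per-URL check in _has_ignored_extension() a
plain membership test. A tiny illustration, assuming the LIST option
type's get_value() yields a list of strings (the user input is
hypothetical):

    configured = ['ZIP', 'Pdf', 'jpeg']   # e.g. parsed from 'ZIP,Pdf,jpeg'

    # Normalize once; the crawl hot path then only lower-cases the URL side.
    ignore_extensions = [ext.lower() for ext in configured]
    assert ignore_extensions == ['zip', 'pdf', 'jpeg']
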
@@ -520,21 +543,22 @@ def get_long_desc(self):
         This plugin is a classic web spider, it will request a URL and extract
         all links and forms from the response.
 
-        Three configurable parameter exist:
+        Four configurable parameter exist:
             - only_forward
-            - ignore_regex
             - follow_regex
+            - ignore_regex
+            - ignore_extensions
 
         ignore_regex and follow_regex are commonly used to configure the
-        web_spider to spider all URLs except the "logout" or some other more
-        exciting link like "Reboot Appliance" that would make the w3af run
-        finish without the expected result.
+        web_spider to crawl all URLs except "/logout" or some more
+        exciting link like "Reboot Appliance" that would greatly reduce
+        the scan test coverage.
 
         By default ignore_regex is an empty string (nothing is ignored) and
         follow_regex is '.*' (everything is followed). Both regular expressions
-        are normal regular expressions that are compiled with Python's re
-        module.
-
-        The regular expressions are applied to the URLs that are found using the
-        match function.
+        are compiled with Python's re module and applied to URLs (with query
+        string included).
+
+        The ignore_extensions configuration parameter is commonly used to ignore
+        static files such as zip, jpeg, pdf, etc.
         """

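A usage sketch for the new parameter in the w3af console (the command
grammar follows the w3af documentation; the prompt paths and the extension
list are illustrative):

    w3af>>> plugins
    w3af/plugins>>> crawl web_spider
    w3af/plugins>>> crawl config web_spider
    w3af/plugins/crawl/config:web_spider>>> set ignore_extensions zip,pdf,jpeg
    w3af/plugins/crawl/config:web_spider>>> back
    w3af/plugins>>> back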