diff --git a/Allura/allura/lib/helpers.py b/Allura/allura/lib/helpers.py index 9e9f8b294..c822d9e00 100644 --- a/Allura/allura/lib/helpers.py +++ b/Allura/allura/lib/helpers.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import base64 import sys import os @@ -21,9 +23,9 @@ import difflib import jinja2 -import six.moves.urllib.request -import six.moves.urllib.parse -import six.moves.urllib.error +import urllib.request +import urllib.parse +import urllib.error import re import unicodedata import json @@ -65,6 +67,7 @@ from allura.lib import exceptions as exc from allura.lib import utils +from allura.lib import validators import urllib.parse as urlparse from urllib.parse import urlencode import math @@ -201,16 +204,16 @@ def patchem(func): def urlquote(url, safe=b"/"): try: - return six.moves.urllib.parse.quote(str(url), safe=safe) + return urllib.parse.quote(str(url), safe=safe) except UnicodeEncodeError: - return six.moves.urllib.parse.quote(url.encode('utf-8'), safe=safe) + return urllib.parse.quote(url.encode('utf-8'), safe=safe) def urlquoteplus(url, safe=b""): try: - return six.moves.urllib.parse.quote_plus(str(url), safe=safe) + return urllib.parse.quote_plus(str(url), safe=safe) except UnicodeEncodeError: - return six.moves.urllib.parse.quote_plus(url.encode('utf-8'), safe=safe) + return urllib.parse.quote_plus(url.encode('utf-8'), safe=safe) def urlquote_path_only(url): @@ -1027,7 +1030,18 @@ def inner(*args, **kwargs): return inner -def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=None): +class NoRedirectToInternal(urllib.request.HTTPRedirectHandler): + def redirect_request(self, req: urllib.request.Request, fp, code, msg, headers, newurl: str): + if not asbool(tg.config.get('urlopen_allow_internal_hostnames', 'false')): + validators.URLIsPrivate().to_python(newurl, None) + return super().redirect_request(req, fp, code, msg, headers, newurl) + + +opener = urllib.request.build_opener(NoRedirectToInternal) +urllib.request.install_opener(opener) + + +def urlopen(url: str | urllib.request.Request, retries=3, codes=(408, 500, 502, 503, 504), timeout=None): """Open url, optionally retrying if an error is encountered. Socket and other IO errors will always be retried if retries > 0. @@ -1037,12 +1051,22 @@ def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=None): :param codes: HTTP error codes that should be retried. """ + if isinstance(url, urllib.request.Request): + url_str = url.full_url + else: + url_str = url + if not url_str.startswith(('http://', 'https://')): + raise ValueError(f'URL must be http(s), got {url_str}') + if not asbool(tg.config.get('urlopen_allow_internal_hostnames', 'false')): + # will raise error if hostname resolves to private address space: + validators.URLIsPrivate().to_python(url_str, None) + attempts = 0 while True: try: - return six.moves.urllib.request.urlopen(url, timeout=timeout) + return urllib.request.urlopen(url, timeout=timeout) except OSError as e: - no_retry = isinstance(e, six.moves.urllib.error.HTTPError) and e.code not in codes + no_retry = isinstance(e, urllib.error.HTTPError) and e.code not in codes if attempts < retries and not no_retry: attempts += 1 continue diff --git a/Allura/allura/tests/test_helpers.py b/Allura/allura/tests/test_helpers.py index 9db093a81..9a12062bd 100644 --- a/Allura/allura/tests/test_helpers.py +++ b/Allura/allura/tests/test_helpers.py @@ -454,13 +454,13 @@ def test_plain2markdown(): class TestUrlOpen(TestCase): - @patch('six.moves.urllib.request.urlopen') + @patch('urllib.request.urlopen') def test_no_error(self, urlopen): - r = h.urlopen('myurl') + r = h.urlopen('http://example.com') self.assertEqual(r, urlopen.return_value) - urlopen.assert_called_once_with('myurl', timeout=None) + urlopen.assert_called_once_with('http://example.com', timeout=None) - @patch('six.moves.urllib.request.urlopen') + @patch('urllib.request.urlopen') def test_socket_timeout(self, urlopen): import socket @@ -468,10 +468,10 @@ def side_effect(url, timeout=None): raise socket.timeout() urlopen.side_effect = side_effect - self.assertRaises(socket.timeout, h.urlopen, 'myurl') + self.assertRaises(socket.timeout, h.urlopen, 'http://example.com') self.assertEqual(urlopen.call_count, 4) - @patch('six.moves.urllib.request.urlopen') + @patch('urllib.request.urlopen') def test_socket_reset(self, urlopen): import socket import errno @@ -480,10 +480,10 @@ def side_effect(url, timeout=None): raise OSError(errno.ECONNRESET, 'Connection reset by peer') urlopen.side_effect = side_effect - self.assertRaises(socket.error, h.urlopen, 'myurl') + self.assertRaises(socket.error, h.urlopen, 'http://example.com') self.assertEqual(urlopen.call_count, 4) - @patch('six.moves.urllib.request.urlopen') + @patch('urllib.request.urlopen') def test_handled_http_error(self, urlopen): from urllib.error import HTTPError @@ -491,10 +491,10 @@ def side_effect(url, timeout=None): raise HTTPError('url', 408, 'timeout', None, io.BytesIO()) urlopen.side_effect = side_effect - self.assertRaises(HTTPError, h.urlopen, 'myurl') + self.assertRaises(HTTPError, h.urlopen, 'http://example.com') self.assertEqual(urlopen.call_count, 4) - @patch('six.moves.urllib.request.urlopen') + @patch('urllib.request.urlopen') def test_unhandled_http_error(self, urlopen): from urllib.error import HTTPError @@ -502,7 +502,7 @@ def side_effect(url, timeout=None): raise HTTPError('url', 404, 'timeout', None, io.BytesIO()) urlopen.side_effect = side_effect - self.assertRaises(HTTPError, h.urlopen, 'myurl') + self.assertRaises(HTTPError, h.urlopen, 'http://example.com') self.assertEqual(urlopen.call_count, 1) diff --git a/Allura/development.ini b/Allura/development.ini index 7e3850d50..0f93fcc41 100644 --- a/Allura/development.ini +++ b/Allura/development.ini @@ -471,6 +471,9 @@ bulk_export_download_instructions = Sample instructions for {project} importer_upload_path = /tmp/importer_upload/{nbhd}/{project} +; If you want to allow importing resources from internally resolving hostnames, enable this: +;urlopen_allow_internal_hostnames = false + ; To disable any plugin, tool, importer, etc from being available, you can use the disable_entry_points config option. ; Specify the keys and values as they are declared in the tool's "setup.py" file. ; Examples: diff --git a/ForgeImporters/forgeimporters/base.py b/ForgeImporters/forgeimporters/base.py index f21539c29..bd07200c5 100644 --- a/ForgeImporters/forgeimporters/base.py +++ b/ForgeImporters/forgeimporters/base.py @@ -20,15 +20,13 @@ import logging from io import BytesIO -import six.moves.urllib.request -import six.moves.urllib.parse -import six.moves.urllib.error +import urllib.request +import urllib.parse from collections import defaultdict import traceback from urllib.parse import urlparse from urllib.parse import unquote from datetime import datetime -import six from bs4 import BeautifulSoup from tg import expose, validate, flash, redirect, config @@ -159,17 +157,17 @@ class ProjectExtractor: PAGE_MAP = {} - def __init__(self, project_name, page_name=None, **kw): + def __init__(self, project_name, page_name_or_url=None, **kw): self.project_name = project_name self._page_cache = {} self.url = None self.page = None - if page_name: - self.get_page(page_name, **kw) + if page_name_or_url: + self.get_page(page_name_or_url, **kw) @staticmethod def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=120, unredirected_hdrs=None, **kw): - req = six.moves.urllib.request.Request(url, **kw) + req = urllib.request.Request(url, **kw) if unredirected_hdrs: for key, val in unredirected_hdrs.items(): req.add_unredirected_header(key, val) @@ -211,7 +209,7 @@ def get_page_url(self, page_name, **kw): """ return self.PAGE_MAP[page_name].format( - project_name=six.moves.urllib.parse.quote(self.project_name), **kw) + project_name=urllib.parse.quote(self.project_name), **kw) def parse_page(self, page): """Transforms the result of a `urlopen` call before returning it from diff --git a/ForgeImporters/forgeimporters/tests/test_base.py b/ForgeImporters/forgeimporters/tests/test_base.py index fe3d18d36..07672023c 100644 --- a/ForgeImporters/forgeimporters/tests/test_base.py +++ b/ForgeImporters/forgeimporters/tests/test_base.py @@ -34,16 +34,44 @@ class TestProjectExtractor(TestCase): @mock.patch('forgeimporters.base.h.urlopen') - @mock.patch('six.moves.urllib.request.Request') + @mock.patch('urllib.request.Request') def test_urlopen(self, Request, urlopen): - r = base.ProjectExtractor.urlopen('myurl', data='foo') - Request.assert_called_once_with('myurl', data='foo') + r = base.ProjectExtractor.urlopen('https://example.com/asdf', data='foo') + Request.assert_called_once_with('https://example.com/asdf', data='foo') req = Request.return_value req.add_header.assert_called_once_with( 'User-Agent', 'Allura Data Importer (https://allura.apache.org/)') urlopen.assert_called_once_with(req, retries=3, codes=(408, 500, 502, 503, 504), timeout=120) self.assertEqual(r, urlopen.return_value) + def test_urlopen_invalid(self): + with pytest.raises(ValueError): + base.ProjectExtractor.urlopen('myurl', data='foo') + with pytest.raises(ValueError): + base.ProjectExtractor.urlopen('file:///etc/passwd', data='foo') + with pytest.raises(ValueError): + base.ProjectExtractor.urlopen('ftp://something.com', data='foo') + + def test_urlopen_internal_blocked(self): + # by default this is invalid + with pytest.raises(Invalid): + base.ProjectExtractor.urlopen('http://localhost:1234/blah', data='foo') + + # redirect to external site ok + base.ProjectExtractor.urlopen('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.com%2F') + + # redirect to internal is blocked + with pytest.raises(Invalid): + base.ProjectExtractor.urlopen('https://httpbin.org/redirect-to?url=http%3A%2F%2Flocalhost%2F') + + @mock.patch('forgeimporters.base.h.urlopen') + @mock.patch('urllib.request.Request') + def test_urlopen_internal_ok(self, Request, urlopen): + # does NOT raise Invalid + with mock.patch.dict(base.config, {'urlopen_allow_internal_hostnames': 'true'}): + base.ProjectExtractor.urlopen('http://localhost:1234/blah', data='foo') + Request.assert_called_once_with('http://localhost:1234/blah', data='foo') + @mock.patch.object(base, 'datetime') @mock.patch.object(base, 'M')