Commit a484f50
[#8525] urlopen cleanup
brondsem committed Nov 3, 2023
1 parent 07b084c commit a484f50
Showing 5 changed files with 86 additions and 33 deletions.
44 changes: 34 additions & 10 deletions Allura/allura/lib/helpers.py
@@ -14,16 +14,18 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from __future__ import annotations
+
 import base64
 import sys
 import os
 import os.path
 import difflib
 import jinja2

-import six.moves.urllib.request
-import six.moves.urllib.parse
-import six.moves.urllib.error
+import urllib.request
+import urllib.parse
+import urllib.error
 import re
 import unicodedata
 import json
@@ -65,6 +67,7 @@

 from allura.lib import exceptions as exc
 from allura.lib import utils
+from allura.lib import validators
 import urllib.parse as urlparse
 from urllib.parse import urlencode
 import math
@@ -201,16 +204,16 @@ def patchem(func):

 def urlquote(url, safe=b"/"):
     try:
-        return six.moves.urllib.parse.quote(str(url), safe=safe)
+        return urllib.parse.quote(str(url), safe=safe)
     except UnicodeEncodeError:
-        return six.moves.urllib.parse.quote(url.encode('utf-8'), safe=safe)
+        return urllib.parse.quote(url.encode('utf-8'), safe=safe)


 def urlquoteplus(url, safe=b""):
     try:
-        return six.moves.urllib.parse.quote_plus(str(url), safe=safe)
+        return urllib.parse.quote_plus(str(url), safe=safe)
     except UnicodeEncodeError:
-        return six.moves.urllib.parse.quote_plus(url.encode('utf-8'), safe=safe)
+        return urllib.parse.quote_plus(url.encode('utf-8'), safe=safe)


 def urlquote_path_only(url):
@@ -1027,7 +1030,18 @@ def inner(*args, **kwargs):
     return inner


-def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=None):
+class NoRedirectToInternal(urllib.request.HTTPRedirectHandler):
+    def redirect_request(self, req: urllib.request.Request, fp, code, msg, headers, newurl: str):
+        if not asbool(tg.config.get('urlopen_allow_internal_hostnames', 'false')):
+            validators.URLIsPrivate().to_python(newurl, None)
+        return super().redirect_request(req, fp, code, msg, headers, newurl)
+
+
+opener = urllib.request.build_opener(NoRedirectToInternal)
+urllib.request.install_opener(opener)
+
+
+def urlopen(url: str | urllib.request.Request, retries=3, codes=(408, 500, 502, 503, 504), timeout=None):
     """Open url, optionally retrying if an error is encountered.

     Socket and other IO errors will always be retried if retries > 0.
@@ -1037,12 +1051,22 @@ def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=None):
     :param codes: HTTP error codes that should be retried.
     """
+    if isinstance(url, urllib.request.Request):
+        url_str = url.full_url
+    else:
+        url_str = url
+    if not url_str.startswith(('http://', 'https://')):
+        raise ValueError(f'URL must be http(s), got {url_str}')
+    if not asbool(tg.config.get('urlopen_allow_internal_hostnames', 'false')):
+        # will raise error if hostname resolves to private address space:
+        validators.URLIsPrivate().to_python(url_str, None)
+
     attempts = 0
     while True:
         try:
-            return six.moves.urllib.request.urlopen(url, timeout=timeout)
+            return urllib.request.urlopen(url, timeout=timeout)
         except OSError as e:
-            no_retry = isinstance(e, six.moves.urllib.error.HTTPError) and e.code not in codes
+            no_retry = isinstance(e, urllib.error.HTTPError) and e.code not in codes
             if attempts < retries and not no_retry:
                 attempts += 1
                 continue
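The net effect of the helpers.py changes: urlopen() now rejects non-http(s) schemes with a ValueError, rejects hostnames that resolve to private address space with a formencode Invalid error (unless urlopen_allow_internal_hostnames is enabled), and, through the globally installed NoRedirectToInternal opener, applies the same private-address check to every redirect target. A minimal sketch of the resulting behavior, assuming an importable Allura environment; the URLs are illustrative:

from formencode import Invalid

from allura.lib import helpers as h

try:
    h.urlopen('ftp://something.com')
except ValueError:
    pass  # only http:// and https:// schemes are accepted

try:
    h.urlopen('http://localhost:1234/blah')
except Invalid:
    pass  # hostname resolves to private address space; blocked by URLIsPrivate

# A public URL proceeds to urllib.request.urlopen, with up to 3 retries on
# IO errors and on HTTP 408/500/502/503/504:
resp = h.urlopen('https://example.com/', timeout=30)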
22 changes: 11 additions & 11 deletions Allura/allura/tests/test_helpers.py
@@ -454,24 +454,24 @@ def test_plain2markdown():

 class TestUrlOpen(TestCase):

-    @patch('six.moves.urllib.request.urlopen')
+    @patch('urllib.request.urlopen')
     def test_no_error(self, urlopen):
-        r = h.urlopen('myurl')
+        r = h.urlopen('http://example.com')
         self.assertEqual(r, urlopen.return_value)
-        urlopen.assert_called_once_with('myurl', timeout=None)
+        urlopen.assert_called_once_with('http://example.com', timeout=None)

-    @patch('six.moves.urllib.request.urlopen')
+    @patch('urllib.request.urlopen')
     def test_socket_timeout(self, urlopen):
         import socket

         def side_effect(url, timeout=None):
             raise socket.timeout()

         urlopen.side_effect = side_effect
-        self.assertRaises(socket.timeout, h.urlopen, 'myurl')
+        self.assertRaises(socket.timeout, h.urlopen, 'http://example.com')
         self.assertEqual(urlopen.call_count, 4)

-    @patch('six.moves.urllib.request.urlopen')
+    @patch('urllib.request.urlopen')
     def test_socket_reset(self, urlopen):
         import socket
         import errno
@@ -480,29 +480,29 @@ def side_effect(url, timeout=None):
             raise OSError(errno.ECONNRESET, 'Connection reset by peer')

         urlopen.side_effect = side_effect
-        self.assertRaises(socket.error, h.urlopen, 'myurl')
+        self.assertRaises(socket.error, h.urlopen, 'http://example.com')
         self.assertEqual(urlopen.call_count, 4)

-    @patch('six.moves.urllib.request.urlopen')
+    @patch('urllib.request.urlopen')
     def test_handled_http_error(self, urlopen):
         from urllib.error import HTTPError

         def side_effect(url, timeout=None):
             raise HTTPError('url', 408, 'timeout', None, io.BytesIO())

         urlopen.side_effect = side_effect
-        self.assertRaises(HTTPError, h.urlopen, 'myurl')
+        self.assertRaises(HTTPError, h.urlopen, 'http://example.com')
         self.assertEqual(urlopen.call_count, 4)

-    @patch('six.moves.urllib.request.urlopen')
+    @patch('urllib.request.urlopen')
     def test_unhandled_http_error(self, urlopen):
         from urllib.error import HTTPError

         def side_effect(url, timeout=None):
             raise HTTPError('url', 404, 'timeout', None, io.BytesIO())

         urlopen.side_effect = side_effect
-        self.assertRaises(HTTPError, h.urlopen, 'myurl')
+        self.assertRaises(HTTPError, h.urlopen, 'http://example.com')
         self.assertEqual(urlopen.call_count, 1)
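The call_count assertions above encode the retry arithmetic: retries=3 means one initial attempt plus up to three retries, so a persistently failing request is attempted four times, while an HTTP code outside the retryable set (the 404 case) is attempted once. The same arithmetic as a standalone sketch, assuming a test environment where allura.lib.helpers imports:

from unittest import mock

from allura.lib import helpers as h

# Assumption: example.com resolves to a public address, so the private-host
# check in h.urlopen passes before the patched urllib.request.urlopen is hit.
with mock.patch('urllib.request.urlopen', side_effect=OSError('down')) as fake:
    try:
        h.urlopen('http://example.com')  # default retries=3
    except OSError:
        pass
assert fake.call_count == 4  # 1 initial attempt + 3 retries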
3 changes: 3 additions & 0 deletions Allura/development.ini
@@ -471,6 +471,9 @@ bulk_export_download_instructions = Sample instructions for {project}

 importer_upload_path = /tmp/importer_upload/{nbhd}/{project}

+; If you want to allow importing resources from internally resolving hostnames, enable this:
+;urlopen_allow_internal_hostnames = false
+
 ; To disable any plugin, tool, importer, etc from being available, you can use the disable_entry_points config option.
 ; Specify the keys and values as they are declared in the tool's "setup.py" file.
 ; Examples:
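To opt in, uncomment the option and set it to true (urlopen_allow_internal_hostnames = true). It can also be toggled at runtime for a single block, as test_urlopen_internal_ok below does with mock.patch.dict; a sketch assuming an app context where tg.config is populated and a service is actually listening on the illustrative URL:

from unittest import mock

import tg

from allura.lib import helpers as h

# While patched, private hostnames pass the URLIsPrivate check in h.urlopen
# and in the NoRedirectToInternal redirect handler.
with mock.patch.dict(tg.config, {'urlopen_allow_internal_hostnames': 'true'}):
    resp = h.urlopen('http://localhost:8080/status')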
16 changes: 7 additions & 9 deletions ForgeImporters/forgeimporters/base.py
@@ -20,15 +20,13 @@
 import logging
 from io import BytesIO

-import six.moves.urllib.request
-import six.moves.urllib.parse
-import six.moves.urllib.error
+import urllib.request
+import urllib.parse
 from collections import defaultdict
 import traceback
 from urllib.parse import urlparse
 from urllib.parse import unquote
 from datetime import datetime
-import six

 from bs4 import BeautifulSoup
 from tg import expose, validate, flash, redirect, config
@@ -159,17 +157,17 @@ class ProjectExtractor:

     PAGE_MAP = {}

-    def __init__(self, project_name, page_name=None, **kw):
+    def __init__(self, project_name, page_name_or_url=None, **kw):
         self.project_name = project_name
         self._page_cache = {}
         self.url = None
         self.page = None
-        if page_name:
-            self.get_page(page_name, **kw)
+        if page_name_or_url:
+            self.get_page(page_name_or_url, **kw)

     @staticmethod
     def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=120, unredirected_hdrs=None, **kw):
-        req = six.moves.urllib.request.Request(url, **kw)
+        req = urllib.request.Request(url, **kw)
         if unredirected_hdrs:
             for key, val in unredirected_hdrs.items():
                 req.add_unredirected_header(key, val)
@@ -211,7 +209,7 @@ def get_page_url(self, page_name, **kw):
         """
         return self.PAGE_MAP[page_name].format(
-            project_name=six.moves.urllib.parse.quote(self.project_name), **kw)
+            project_name=urllib.parse.quote(self.project_name), **kw)

     def parse_page(self, page):
         """Transforms the result of a `urlopen` call before returning it from
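The unredirected_hdrs parameter is routed through Request.add_unredirected_header(), so those headers are sent on the initial request only and are not copied onto any redirected request, which pairs sensibly with the new redirect validation when sending credentials. A hedged usage sketch; the URL and token are illustrative:

from forgeimporters.base import ProjectExtractor

# The Authorization header is attached as an unredirected header: urllib
# will not re-send it on the (now validated) redirected request.
resp = ProjectExtractor.urlopen(
    'https://api.example.com/project/export',
    unredirected_hdrs={'Authorization': 'Bearer s3cr3t'},
    timeout=60,
)
payload = resp.read()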
34 changes: 31 additions & 3 deletions ForgeImporters/forgeimporters/tests/test_base.py
@@ -34,16 +34,44 @@
 class TestProjectExtractor(TestCase):

     @mock.patch('forgeimporters.base.h.urlopen')
-    @mock.patch('six.moves.urllib.request.Request')
+    @mock.patch('urllib.request.Request')
     def test_urlopen(self, Request, urlopen):
-        r = base.ProjectExtractor.urlopen('myurl', data='foo')
-        Request.assert_called_once_with('myurl', data='foo')
+        r = base.ProjectExtractor.urlopen('https://example.com/asdf', data='foo')
+        Request.assert_called_once_with('https://example.com/asdf', data='foo')
         req = Request.return_value
         req.add_header.assert_called_once_with(
             'User-Agent', 'Allura Data Importer (https://allura.apache.org/)')
         urlopen.assert_called_once_with(req, retries=3, codes=(408, 500, 502, 503, 504), timeout=120)
         self.assertEqual(r, urlopen.return_value)

+    def test_urlopen_invalid(self):
+        with pytest.raises(ValueError):
+            base.ProjectExtractor.urlopen('myurl', data='foo')
+        with pytest.raises(ValueError):
+            base.ProjectExtractor.urlopen('file:///etc/passwd', data='foo')
+        with pytest.raises(ValueError):
+            base.ProjectExtractor.urlopen('ftp://something.com', data='foo')
+
+    def test_urlopen_internal_blocked(self):
+        # by default this is invalid
+        with pytest.raises(Invalid):
+            base.ProjectExtractor.urlopen('http://localhost:1234/blah', data='foo')
+
+        # redirect to external site ok
+        base.ProjectExtractor.urlopen('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.com%2F')
+
+        # redirect to internal is blocked
+        with pytest.raises(Invalid):
+            base.ProjectExtractor.urlopen('https://httpbin.org/redirect-to?url=http%3A%2F%2Flocalhost%2F')
+
+    @mock.patch('forgeimporters.base.h.urlopen')
+    @mock.patch('urllib.request.Request')
+    def test_urlopen_internal_ok(self, Request, urlopen):
+        # does NOT raise Invalid
+        with mock.patch.dict(base.config, {'urlopen_allow_internal_hostnames': 'true'}):
+            base.ProjectExtractor.urlopen('http://localhost:1234/blah', data='foo')
+        Request.assert_called_once_with('http://localhost:1234/blah', data='foo')


 @mock.patch.object(base, 'datetime')
 @mock.patch.object(base, 'M')
