Skip to content

Commit

Permalink
Added encode_url function to encode unicode URLs as per IDNA and RFC 3986 (sebix#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
hupf committed Feb 20, 2012
1 parent de5a16b commit a92a821
Showing 1 changed file with 53 additions and 3 deletions.
56 changes: 53 additions & 3 deletions textile/functions.py
Expand Up @@ -21,7 +21,8 @@
import re
import uuid
import string
from urlparse import urlparse
import urlparse
import urllib

from textile.tools import sanitizer, imagesize

Expand Down Expand Up @@ -673,7 +674,7 @@ def isRelURL(self, url):
True
"""
(scheme, netloc) = urlparse(url)[0:2]
(scheme, netloc) = urlparse.urlparse(url)[0:2]
return not scheme and not netloc

def relURL(self, url):
Expand All @@ -686,7 +687,7 @@ def relURL(self, url):
'#'
"""
scheme = urlparse(url)[0]
scheme = urlparse.urlparse(url)[0]
if self.restricted and scheme and scheme not in self.url_schemes:
return '#'
return url
Expand Down Expand Up @@ -801,6 +802,7 @@ def fLink(self, match):
url = url[:-1]

url = self.checkRefs(url)
url = self.encode_url(url)

atts = self.pba(atts)
if title:
Expand All @@ -818,6 +820,54 @@ def fLink(self, match):
out = self.shelve(out)
return ''.join([pre, out, post])

def encode_url(self, url):
"""
Converts a (unicode) URL to an ASCII URL, with the domain part
IDNA-encoded and the path part %-encoded (as per RFC 3986).
Fixed version of the following code fragment from Stack Overflow:
http://stackoverflow.com/questions/804336/best-way-to-convert-a-unicode-url-to-ascii-utf-8-percent-escaped-in-python/804380#804380
"""
# turn string into unicode
if not isinstance(url, unicode):
url = url.decode('utf8')

# parse it
parsed = urlparse.urlsplit(url)

# divide the netloc further
netloc_pattern = re.compile(r"""
(?:(?P<user>[^:@]+)(?::(?P<password>[^:@]+))?@)?
(?P<host>[^:]+)
(?::(?P<port>[0-9]+))?
""", re.X | re.U)
netloc_parsed = netloc_pattern.match(parsed.netloc).groupdict()

# encode each component
scheme = parsed.scheme.encode('utf8')
user = netloc_parsed['user'] and urllib.quote(netloc_parsed['user'].encode('utf8'))
password = netloc_parsed['password'] and urllib.quote(netloc_parsed['password'].encode('utf8'))
host = netloc_parsed['host'].encode('idna')
port = netloc_parsed['port'] and netloc_parsed['port'].encode('utf8')
path = '/'.join( # could be encoded slashes!
urllib.quote(urllib.unquote(pce).encode('utf8'),'')
for pce in parsed.path.split('/')
)
query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'), '=&?/')
fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))

# put it back together
netloc = ''
if user:
netloc += user
if password:
netloc += '@'+password
netloc += ':'
netloc += host
if port:
netloc += ':'+port
return urlparse.urlunsplit((scheme, netloc, path, query, fragment))

def span(self, text):
"""
>>> t = Textile()
Expand Down

0 comments on commit a92a821

Please sign in to comment.