Initial commit.

aljungberg · Aug 1, 2013 · ada3a48 · ada3a48
commit ada3a48
Show file tree

Hide file tree

Showing 7 changed files with 375 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+/dist/
+*.pyc
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,26 @@
+Copyright (c) 2010-2013 Alexander Ljungberg.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer. Redistributions in binary
+form must reproduce the above copyright notice, this list of conditions and
+the following disclaimer in the documentation and/or other materials provided
+with the distribution. Neither the name of Alexander Ljungberg nor the names
+of its contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST b/MANIFEST
@@ -0,0 +1,4 @@
+# file GENERATED by distutils, do NOT edit
+setup.py
+hexahexacontadecimal/__init__.py
+hexahexacontadecimal/num_encode_base64.py
diff --git a/README.md b/README.md
@@ -0,0 +1,101 @@
+Hexahexacontadecimal
+====================
+
+*Hexahexacontadecimal is the most compact way to encode a number into a URL.*
+
+Hexahexacontadecimal is a compact format to express a number in a URL. It uses all characters allowed in
+a URL without escaping -- the [unreserved characters](http://tools.ietf.org/html/rfc3986#section-2.3) --
+making it the shortest possible way to express an integer in a URL.
+
+## Usage
+
+    from hexahexacontadecimal import hexahexacontadecimal_encode_int, hexahexacontadecimal_decode_int
+
+    print hexahexacontadecimal_encode_int(302231454903657293676544)  # 'iFsGUkO.0tsxw'
+    print hexahexacontadecimal_decode_int('iFsGUkO.0tsxw')           # 302231454903657293676544L
+
+Note that Python's urllib.quote escapes the tilde (~) (see http://bugs.python.org/issue16285), even though
+RFC3986 lists it as an unreserved character that does not need to be escaped.
+
+### Hexahexacontadecimal vs Base 64 in URLs
+
+    >>> n = 292231454903657293676544
+    >>> import base64
+    >>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+    'PeHmHzZFTcAAAA%3D%3D'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    'gpE4Xoy7fw5AO'
+
+Worst case scenario for plain Base 64:
+
+    >>> n = 64 ** 5 + 1
+    >>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+    'QAAAAQ%3D%3D'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    'ucrDZ'
+
+Worst case for hexahexacontadecimal:
+
+    >>> n = 66 ** 5 + 1
+    >>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+    'SqUUIQ%3D%3D'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    '100001'
+
+That big SHA-512 you always wanted to write in a URL:
+
+    >>> n = 2 ** 512
+    >>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+    'AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA%3D'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    'JK84xqGD9FMXPNubPghADlRhBUzlqRscC2h~8xmi99PvuQsUCIB2CHGhMUQR8FLm72.Hbbctkqi89xspay~y4'
+
+### Are the savings really significant?
+
+If you're currently doing your Base64 encoding the naive way, then yes:
+
+    >>> sum(len(urlquote(base64.urlsafe_b64encode(long_to_binary(n)))) for n in xrange(10 ** 5))
+    531584
+    >>> sum(len(urlquote(hexahexacontadecimal_encode_int(n))) for n in xrange(10 ** 5))
+    295578
+
+### But what if I use Base64 without padding?
+
+Then the savings are not as significant. But it's still an improvement. Using the code from http://stackoverflow.com/a/561704/76900:
+
+    >>> from hexahexacontadecimal.num_encode_base64 import num_encode as num_encode_base64
+    >>> n = 64 ** 5 + 1
+    >>> urlquote(num_encode_base64(n))
+    'BAAAAB'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    'ucrDZ'
+    >>> n = 66 ** 5 + 1
+    >>> urlquote(num_encode_base64(n))
+    'BKpRQh'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    '100001'
+    >>> n = 2 ** 512
+    >>> urlquote(num_encode_base64(n))
+    'EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
+    >>> urlquote(hexahexacontadecimal_encode_int(n))
+    'JK84xqGD9FMXPNubPghADlRhBUzlqRscC2h~8xmi99PvuQsUCIB2CHGhMUQR8FLm72.Hbbctkqi89xspay~y4'
+    >>> sum(len(urlquote(num_encode_base64(n))) for n in xrange(10 ** 5))
+    295840
+    >>> sum(len(urlquote(hexahexacontadecimal_encode_int(n))) for n in xrange(10 ** 5))
+    295578
+
+## Installation
+
+    pip install hexahexacontadecimal
+
+## Documentation
+
+See this README and the docstrings in the source code.
+
+## License
+
+Free to use and modify under the terms of the BSD open source license.
+
+## Author
+
+Alexander Ljungberg
diff --git a/hexahexacontadecimal/__init__.py b/hexahexacontadecimal/__init__.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+
+"""Encode and decode hexahexacontadecimal numbers.
+
+Hexahexacontadecimal is a compact format to express a number in a URL. It uses all characters allowed in
+a URL without escaping -- the [unreserved characters](http://tools.ietf.org/html/rfc3986#section-2.3) --
+making it the shortest possible way to express an integer in a URL.
+
+Note that Python's urllib.quote escapes the tilde (~) (see http://bugs.python.org/issue16285), even though
+RFC3986 lists it as an unreserved character that does not need to be escaped.
+
+## Hexahexacontadecimal vs Base 64 in URLs
+
+>>> n = 292231454903657293676544
+>>> import base64
+>>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+'PeHmHzZFTcAAAA%3D%3D'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'gpE4Xoy7fw5AO'
+
+Worst case scenario for plain Base 64:
+
+>>> n = 64 ** 5 + 1
+>>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+'QAAAAQ%3D%3D'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'ucrDZ'
+
+Worst case for hexahexacontadecimal:
+
+>>> n = 66 ** 5 + 1
+>>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+'SqUUIQ%3D%3D'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'100001'
+
+That big SHA-512 you always wanted to write in a URL:
+
+>>> n = 2 ** 512
+>>> urlquote(base64.urlsafe_b64encode(long_to_binary(n)))
+'AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA%3D'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'JK84xqGD9FMXPNubPghADlRhBUzlqRscC2h~8xmi99PvuQsUCIB2CHGhMUQR8FLm72.Hbbctkqi89xspay~y4'
+
+## Are the savings really significant?
+
+If you're currently doing your Base64 encoding the naive way, then yes:
+
+>>> sum(len(urlquote(base64.urlsafe_b64encode(long_to_binary(n)))) for n in xrange(10 ** 5))
+531584
+>>> sum(len(urlquote(hexahexacontadecimal_encode_int(n))) for n in xrange(10 ** 5))
+295578
+
+## But what if I use Base64 without padding?
+
+Then the savings are not as significant. But it's still an improvement. Using the code from http://stackoverflow.com/a/561704/76900:
+
+>>> from hexahexacontadecimal.num_encode_base64 import num_encode as num_encode_base64
+>>> n = 64 ** 5 + 1
+>>> urlquote(num_encode_base64(n))
+'BAAAAB'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'ucrDZ'
+>>> n = 66 ** 5 + 1
+>>> urlquote(num_encode_base64(n))
+'BKpRQh'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'100001'
+>>> n = 2 ** 512
+>>> urlquote(num_encode_base64(n))
+'EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
+>>> urlquote(hexahexacontadecimal_encode_int(n))
+'JK84xqGD9FMXPNubPghADlRhBUzlqRscC2h~8xmi99PvuQsUCIB2CHGhMUQR8FLm72.Hbbctkqi89xspay~y4'
+>>> sum(len(urlquote(num_encode_base64(n))) for n in xrange(10 ** 5))
+295840
+>>> sum(len(urlquote(hexahexacontadecimal_encode_int(n))) for n in xrange(10 ** 5))
+295578
+
+Why settle for less?
+
+"""
+
+from io import StringIO
+import urllib
+
+# The digits of the hexahexacontadecimal system: a character's position in this string is its
+# numeric value (see the encode/decode functions below). Per the module docstring these are the
+# unreserved URL characters, which need no escaping.
+BASE66_ALPHABET = u"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_.~"
+# The radix is derived from the alphabet so the two cannot drift apart.
+BASE = len(BASE66_ALPHABET)
+
+
+def urlquote(s, safe=None):
+    """Like urllib.quote() but don't escape ~, in accordance with RFC3986."""
+
+    return urllib.quote(s, safe='~' + (safe or ''))
+
+
+def long_to_binary(n):
+    """Take an integer and write it as a binary string.
+
+    >>> long_to_binary(0)
+    '\\x00'
+    >>> long_to_binary(255)
+    '\\xff'
+    >>> long_to_binary(512 + 3)
+    '\\x02\\x03'
+    """
+
+    h = '%x' % n
+    return ('0' * (len(h) % 2) + h).decode('hex')
+
+
+def binary_to_long(b):
+    """Take a binary string and read it as an integer.
+
+    >>> binary_to_long('\\x00')
+    0
+    >>> binary_to_long('\\xff')
+    255
+    >>> binary_to_long('\\x02\\x03')
+    515
+    """
+
+    return int(b.encode('hex'), 16)
+
+
+def hexahexacontadecimal_encode_int(n):
+    """Represent a number in hexahexacontadecimal, a compact format of unreserved URL characters.
+
+    >>> hexahexacontadecimal_encode_int(0)
+    '0'
+    >>> hexahexacontadecimal_encode_int(1)
+    '1'
+    >>> hexahexacontadecimal_encode_int(65)
+    '~'
+    >>> hexahexacontadecimal_encode_int(66)
+    '10'
+    >>> hexahexacontadecimal_encode_int(67)
+    '11'
+    >>> hexahexacontadecimal_encode_int(302231454903657293676544)
+    'iFsGUkO.0tsxw'
+
+    """
+
+    if n == 0:
+        return BASE66_ALPHABET[0].encode('ascii')
+
+    r = StringIO()
+    while n:
+        n, t = divmod(n, BASE)
+        r.write(BASE66_ALPHABET[t])
+    return r.getvalue().encode('ascii')[::-1]
+
+
+def hexahexacontadecimal_decode_int(s):
+    """Parse a number expressed in hexahexacontadecimal as an integer (or long).
+
+    >>> hexahexacontadecimal_decode_int('0')
+    0
+    >>> hexahexacontadecimal_decode_int('1')
+    1
+    >>> hexahexacontadecimal_decode_int('~')
+    65
+    >>> hexahexacontadecimal_decode_int('10')
+    66
+    >>> hexahexacontadecimal_decode_int('11')
+    67
+    >>> hexahexacontadecimal_decode_int('iFsGUkO.0tsxw')
+    302231454903657293676544L
+
+    """
+
+    n = 0
+    for c in s:
+        n = n * BASE + BASE66_ALPHABET.index(c)
+
+    return n
diff --git a/hexahexacontadecimal/num_encode_base64.py b/hexahexacontadecimal/num_encode_base64.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+
+"""
+
+An alternative to hexahexacontadecimal for comparison and testing purposes, by Miles at http://stackoverflow.com/a/561704/76900.
+
+"""
+
+import string
+
+ALPHABET = string.ascii_uppercase + string.ascii_lowercase + \
+           string.digits + '-_'
+ALPHABET_REVERSE = dict((c, i) for (i, c) in enumerate(ALPHABET))
+BASE = len(ALPHABET)
+SIGN_CHARACTER = '$'
+
+
+def num_encode(n):
+    if n < 0:
+        return SIGN_CHARACTER + num_encode(-n)
+    s = []
+    while True:
+        n, r = divmod(n, BASE)
+        s.append(ALPHABET[r])
+        if n == 0:
+            break
+    return ''.join(reversed(s))
+
+
+def num_decode(s):
+    if s[0] == SIGN_CHARACTER:
+        return -num_decode(s[1:])
+    n = 0
+    for c in s:
+        n = n * BASE + ALPHABET_REVERSE[c]
+    return n
diff --git a/setup.py b/setup.py
@@ -0,0 +1,29 @@
+from distutils.core import setup
+
+try:
+    import pypandoc
+    # This bit requires pandoc. On Mac OS X:
+    #   brew install haskell-platform && cabal update && cabal install pandoc
+    description = pypandoc.convert('README.md', 'rst', format='markdown')
+except (IOError, OSError, ImportError):
+    description = ''
+
+setup(
+    name='hexahexacontadecimal',
+    version='1.0',
+    description='Encode and decode hexahexacontadecimal numbers, a compact number representation for URLs.',
+    author='Alexander Ljungberg',
+    author_email='aljungberg@slevenbits.com',
+    url='https://github.com/aljungberg/hexahexacontadecimal',
+    packages=['hexahexacontadecimal'],
+    keywords=["base64", "hexahexacontadecimal", "base66", "url"],
+    classifiers=[
+        "Programming Language :: Python",
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Topic :: Software Development :: Libraries",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: POSIX",
+    ],
+    long_description=description
+)