diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..4302b37 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include *.py +include *.jpg +include README.rst +include LICENSE.txt diff --git a/README.rst b/README.rst index 4b60fa9..9ec7e95 100644 --- a/README.rst +++ b/README.rst @@ -5,33 +5,35 @@ dhash is a Python library that generates a "difference hash" for a given image -- a `perceptual hash`_ based on Neal Krawetz's dHash algorithm in `this "Hacker Factor" blog entry`_. -The library is `on the Python Package Index (PyPI)`_, so to install it, fire -up a command prompt, activate your virtualenv if you're using one, and type: +The library is `on the Python Package Index (PyPI)`_ and works on both Python +3 and Python 2.7. To install it, fire up a command prompt, activate your +virtual environment if you're using one, and type: :: - pip install graphyte + pip install dhash The algorithm to create a difference hash is very simple: * Convert the image to grayscale -* Downsize it to a 9x9 thumbnail (size=8 means 8+1 by 8+1) -* Produce a 64-bit "row hash", with 1 meaning pixel intensity is increasing in - the x direction, 0 meaning it's decreasing +* Downsize it to a 9x9 thumbnail (size=8 means an 8+1 by 8+1 image) +* Produce a 64-bit "row hash": a 1 bit means the pixel intensity is increasing + in the x direction, 0 means it's decreasing * Do the same to produce a 64-bit "column hash" in the y direction * Combine the two values to produce the final 128-bit hash value -The library defaults to producing a "size 8" dhash, but you can override this -easily, for example, to produce a more accurate (but slower to work with) -"size 16" dhash of 512 bits. 
- -I've found that the dhash is great for detecting near duplicates (at Jetsetter -we find dupes using a size=8 dhash with a maximum delta of 2 bits), but -because of the simplicity of the algorithm, it's not great at finding similar -images or duplicate-but-cropped images -- you'd need a more sophisticated -image fingerprint if you want that. However, the dhash is good for finding -exact duplicates and near duplicates, for example, the same image with -slightly altered lighting, a few pixels of cropping, or very light +The library defaults to producing a size 8 dhash, but you can override this +easily by passing ``size=N`` as a keyword argument to most functions. For +example, you can produce a more accurate (but slower to work with) dhash of +512 bits by specifying ``size=16``. + +I've found that the dhash is great for detecting near duplicates (at +`Jetsetter`_ we find dupes using a size 8 dhash with a maximum delta of 2 +bits). But because of the simplicity of the algorithm, it's not great at +finding similar images or duplicate-but-cropped images -- you'd need a more +sophisticated image fingerprint if you want that. However, the dhash is good +for finding exact duplicates and near duplicates, for example, the same image +with slightly altered lighting, a few pixels of cropping, or very light photoshopping. To use the dhash library, you need either the `wand`_ ImageMagick binding or @@ -39,6 +41,10 @@ the `Pillow (PIL)`_ library installed. Pick one and stick with it -- they will produce slightly different dhash values due to differences in their grayscale conversion and resizing algorithms. +If you have both libraries installed, dhash will use wand by default. To +override this and force use of Pillow/PIL, call ``dhash.force_pil()`` before +using the library. + To produce a dhash value using wand: ..
code:: python @@ -46,7 +52,7 @@ To produce a dhash value using wand: import dhash from wand.image import Image - with Image(filename='test.jpg') as image: + with Image(filename='dhash-test.jpg') as image: row, col = dhash.dhash_row_col(image) print(dhash.format_hex(row, col)) @@ -57,7 +63,7 @@ To produce a dhash value using Pillow: import dhash from PIL import Image - image = Image.open('test.jpg') + image = Image.open('dhash-test.jpg') row, col = dhash.dhash_row_col(image) print(dhash.format_hex(row, col)) @@ -67,22 +73,69 @@ integer pixel intensities (for example, from 0 to 255). For example: .. code:: python - >>> row, col = dhash_row_col([0,0,1,1,1, 0,1,1,3,4, 0,1,6,6,7, 7,7,7,7,9, 8,7,7,8,9], size=4) + >>> import dhash + >>> row, col = dhash.dhash_row_col([0,0,1,1,1, 0,1,1,3,4, 0,1,6,6,7, 7,7,7,7,9, 8,7,7,8,9], size=4) >>> format(row, '016b') '0100101111010001' >>> format(col, '016b') '0101001111111001' +To produce the hash value as a 128-bit integer directly, use +``dhash_int(image, size=N)``. To format the hash value in various ways, use +the ``format_*`` functions: + +.. code:: python + + >>> row, col = (13962536140006260880, 9510476289765573406) + >>> dhash.format_bytes(row, col) + b'\xc1\xc4\xe4\xa4\x84\xa0\x80\x90\x83\xfb\xff\xcc\x00@\x83\x1e' + >>> dhash.format_hex(row, col) + 'c1c4e4a484a0809083fbffcc0040831e' + +To compute the number of differing bits (Hamming distance) between two +hashes, you can use the ``get_num_bits_different(hash1, hash2)`` helper +function: + +.. code:: python + + >>> import dhash + >>> dhash.get_num_bits_different(0x4bd1, 0x5bd2) + 3 + You can also use dhash to generate the difference hash for a specific image from the command line: :: - python -m dhash TODO - -There are command line arguments to format the output in various ways, and to -produce the bit delta (hamming distance) between two images. Type -``python -m dhash --help`` for help.
+ $ python -m dhash dhash-test.jpg + c1c4e4a484a0809083fbffcc0040831e + + $ python -m dhash --format=decimal dhash-test.jpg + 13962536140006260880 9510476289765573406 + + # show the 8x8 row and column grids + $ python -m dhash --format=matrix dhash-test.jpg + * * . . . . . * + * * . . . * . . + * * * . . * . . + * . * . . * . . + * . . . . * . . + * . * . . . . . + * . . . . . . . + * . . * . . . . + + * . . . . . * * + * * * * * . * * + * * * * * * * * + * * . . * * . . + . . . . . . . . + . * . . . . . . + * . . . . . * * + . . . * * * * . + + # compute the bit delta between two images + $ python -m dhash dhash-test.jpg similar.jpg + 1 bit differs out of 128 (0.8%) Read the code in `dhash.py`_ for more details – it's pretty small! diff --git a/dhash-test.jpg b/dhash-test.jpg new file mode 100644 index 0000000..6b0d536 Binary files /dev/null and b/dhash-test.jpg differ diff --git a/dhash.py b/dhash.py index 4646563..7ca8cf8 100644 --- a/dhash.py +++ b/dhash.py @@ -31,13 +31,13 @@ def get_grays(image, width, height): """Convert image to grayscale, downsize to width*height, and return list - of grayscale pixel values (0 to 255). + of grayscale integer pixel values (for example, 0 to 255). >>> get_grays([0,0,1,1,1, 0,1,1,3,4, 0,1,6,6,7, 7,7,7,7,9, 8,7,7,8,9], 5, 5) [0, 0, 1, 1, 1, 0, 1, 1, 3, 4, 0, 1, 6, 6, 7, 7, 7, 7, 7, 9, 8, 7, 7, 8, 9] >>> import os - >>> test_filename = os.path.join(os.path.dirname(__file__), 'test', 'test-30x20.jpg') + >>> test_filename = os.path.join(os.path.dirname(__file__), 'dhash-test.jpg') >>> with wand.image.Image(filename=test_filename) as image: ... 
get_grays(image, 9, 9)[:18] [95, 157, 211, 123, 94, 79, 75, 75, 78, 96, 116, 122, 113, 93, 75, 82, 81, 79] @@ -49,7 +49,7 @@ def get_grays(image, width, height): return image if wand is None and PIL is None: - raise ImportError('must have wand or PIL/Pillow installed to use dhash on images') + raise ImportError('must have wand or Pillow/PIL installed to use dhash on images') if wand is not None and isinstance(image, wand.image.Image): with image.clone() as small_image: @@ -82,7 +82,7 @@ def dhash_row_col(image, size=8): '0101001111111001' >>> import os - >>> test_filename = os.path.join(os.path.dirname(__file__), 'test', 'test-30x20.jpg') + >>> test_filename = os.path.join(os.path.dirname(__file__), 'dhash-test.jpg') >>> with wand.image.Image(filename=test_filename) as image: ... row, col = dhash_row_col(image) >>> (row, col) == (13962536140006260880, 9510476289765573406) @@ -204,6 +204,14 @@ def format_grays(grays, size=8): return '\n'.join(lines) +def force_pil(): + """If both wand and Pillow/PIL are installed, force the use of Pillow/PIL.""" + global wand + if PIL is None: + raise ValueError('Pillow/PIL library must be installed to use force_pil()') + wand = None + + if __name__ == '__main__': import argparse @@ -212,20 +220,30 @@ def format_grays(grays, size=8): help='width and height of dhash image size, default %(default)d') parser.add_argument('-f', '--format', default='hex', choices=['hex', 'decimal', 'matrix', 'grays'], help='hash output format, default %(default)s') + parser.add_argument('-p', '--pil', action='store_true', + help='if both wand and Pillow/PIL installed, force use of Pillow/PIL') parser.add_argument('filename', nargs='*', help='name of image file to hash (or two to calculate the delta)') args = parser.parse_args() + if args.pil: + try: + force_pil() + except ValueError: + sys.stderr.write('You must have Pillow/PIL installed to use --pil\n') + sys.exit(1) + def load_image(filename): if wand is not None: return 
wand.image.Image(filename=filename) elif PIL is not None: return PIL.Image.open(filename) else: - sys.stderr.write('You must have wand or PIL/Pillow installed to use the dhash command\n') + sys.stderr.write('You must have wand or Pillow/PIL installed to use the dhash command\n') sys.exit(1) if len(args.filename) == 0: + # NOTE: doctests require "wand" to be installed import doctest doctest.testmod() @@ -252,8 +270,10 @@ def load_image(filename): hash1 = dhash_int(image1, size=args.size) hash2 = dhash_int(image2, size=args.size) num_bits_different = get_num_bits_different(hash1, hash2) - print('{} bits differ out of {} ({:.1f}%)'.format( - num_bits_different, args.size * args.size * 2, + print('{} {} out of {} ({:.1f}%)'.format( + num_bits_different, + 'bit differs' if num_bits_different == 1 else 'bits differ', + args.size * args.size * 2, 100 * num_bits_different / (args.size * args.size * 2))) else: