the local simhash pooled index tester

In [13]:
# a shallow fork of simhashindex (https://github.com/liangsun/simhash/blob/master/simhash/__init__.py)
# what we need to do, in a db-free test env, is retain more than the string and the hash of
# the string (we have the sha of the source, the string, and the simhash of the string)
# and a way to exclude a sha from the result set - find equivalent objects in another response

# we are now trying pathos

from simhash import Simhash
# from multiprocessing import Pool
# import multiprocessing
# from collections import defaultdict
import pathos.multiprocessing as mp

from itertools import chain
from operator import itemgetter
import os

# import copy_reg
# import types
# from copy_reg import pickle
# from types import MethodType


class IndexBucket(object):
    def __init__(self, f=64, k=2):
        self.bucket = {}
        self.k = k
        self.f = f

    def get_near_dups(self, simhash):
        """
        `simhash` is an instance of Simhash
        return a list of obj_id (pipe-delimited string of sha|text|distance)
        """
        assert simhash.f == self.f

        ans = set()

        for key in self.get_keys(simhash):
            dups = self.bucket.get(key, set())

            for dup in dups:
                sim2, obj_blob = dup.split(',', 1)
                sim2 = Simhash(long(sim2, 16), self.f)

                d = simhash.distance(sim2)
                if d <= self.k:
                    ans.add('{0}|{1}'.format(obj_blob, d))
        return list(ans)
    
    def add(self, obj_id, obj_str, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s|%s' % (simhash.value, obj_id, obj_str)

            self.bucket.setdefault(key, set())
            self.bucket[key].add(v)

    def delete(self, obj_id, obj_str, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s|%s' % (simhash.value, obj_id, obj_str)

            if v in self.bucket.get(key, set()):
                self.bucket[key].remove(v)
    
    @property
    def offsets(self):
        """
        You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf>
        """
        return [self.f // (self.k + 1) * i for i in range(self.k + 1)]

    def get_keys(self, simhash):
        for i, offset in enumerate(self.offsets):
            m = (i == len(self.offsets) - 1 and 2 ** (self.f - offset) - 1 or 2 ** (self.offsets[i + 1] - offset) - 1)
            c = simhash.value >> offset & m
            yield '%x:%x' % (c, i)

    def bucket_size(self):
        return len(self.bucket)
    

class HashIndex(object):
    # this is now a wrapper for the pooled indexing process
    def mapper(hashindex, arr):
        hashindex._map(arr)
    
    def _partition(self):
        # chunk out our store into smaller bits
        breakpoint = int(round(len(self.store) / self.workers + 0.5))
        i = 0
        while i < len(self.store):
            yield self.store[i:i+breakpoint]
            i += breakpoint

    def _map(arr):
        # make a bucket and compare the simhash to
        # that local set, return if distance < k
        bucket = IndexBucket(k=self.k, f=self.f)

        for i, q in enumerate(arr):
            bucket.add(*q)

        # do the comparison
        near_dupes = bucket.get_near_dups(self.simhash)

        # return tuples of object strings, distance scores
        return [tuple(n.split('|')) for n in near_dupes]

    def _reduce(mappings):
        # join and sort asc
        near_dupes = list(chain.from_iterable(mappings))

        return sorted(near_dupes, key=lambda x: itemgetter(1))
    
    def get_near_dupes(self, simhash):
        # run the pooled process against the store
        pool = mp.Pool(processes=self.workers,)
        
        # pool.map args workaround #1
        self.simhash = simhash
        
        partitions = self._partition()
        
        # with pathos
        mapped_buckets = pool.map(_map, partitions)
        
#         # won't pickle
#         #mapped_buckets = pool.map(_map, partitions)
        
# #         mapped_buckets = pool.map(_calling, args=(self, '_map', partitions))
# #         try the async i guess
#         async_results = [pool.apply_async(mapper, args=(self, (p,))) for p in partitions]
#         pool.close()
#         map(multiprocessing.pool.ApplyResult.wait, async_results)
        
#         print async_results
        
#         mapped_buckets = [r.get() for r in async_results]       
        
        # for multiple args and a helper that's gone, didn't work (??)
        #mapped_buckets = pool.map(_map_helper, izip(partitions, repeat((simhash, self.k, self.f))))
        near_dupes = self._reduce(mapped_buckets)
        
        return near_dupes
        
    def __init__(self, objs, f=64, k=2):
        """
        `objs` is a list of (sha, source obj (str), simhash)
        obj_id is a string, simhash is an instance of Simhash
        `f` is the same with the one for Simhash
        `k` is the tolerance
        """
        self.k = k
        self.f = f
        
        self.workers= 10
        #count = len(objs)
        
        self.store = objs 
    
def _calling(instance, name, args=(), kwargs=None):
    if kwargs is None:
        kwargs = {}
    return getattr(instance, name)(*args, **kwargs)
    
    
# # pickling things
# def _pickle_method(method):
#     func_name = method.im_func.__name__
#     obj = method.im_self
#     cls = method.im_class
#     return _unpickle_method, (func_name, obj, cls)

# def _unpickle_method(func_name, obj, cls):
#     for cls in cls.mro():
#         try:
#             func = cls.__dict__[func_name]
#         except KeyError:
#             pass
#         else:
#             break
            
# copy_reg.pickle(types.MethodType, _pickle_method, _unpickle_method)

In [14]:
from uuid import uuid4  

# the test is uuid similarity (which is junk but auto-generated junk)
big_list = [str(uuid4()).split('-')[0]for i in xrange(0, 1000)]
big_list = [(b, Simhash(b)) for b in big_list]

index = HashIndex(big_list)
test_item = big_list[1]
print 'test item = ', test_item[0], test_item[1].value

near_dupes = index.get_near_dupes(test_item[1])
near_dupes


test item =  649e490e 13504959716208044071


NameError: global name '_map' is not defined