In [1]:
import numpy as np
import math

In [2]:
# Python port of the Go implementation at https://github.com/pantsing/flylsh
class FlyLSH(object):
    """Locality-sensitive hash for strings built from byte n-gram bit counts.

    Pipeline implemented by ``get_hash``:

    1. ``vector``: slide an ``n_grams``-byte window over the encoded string and
       count, per window position and per bit, how often that bit is set. This
       yields ``n_grams * 8`` features, so ``n_orns`` is expected to equal
       ``n_grams * 8`` (the defaults, 4 and 32, do).
    2. Mean-centre the features (stored in ``pns``).
    3. ``random_project``: every Kenyon cell (KC) averages the features of the
       ORNs it is connected to; the connections are drawn once in ``__init__``.
    4. ``apl_wta``: keep only the ``n_apl`` largest positive KC activations
       (ties at the cut-off are all kept).
    5. ``apl_indices``: pack the surviving KC indices into a ``uint8`` bitmap.

    Parameters
    ----------
    n_grams : int
        Window length, in bytes, used by ``vector``.
    n_orns : int
        Number of input features (ORNs).
    n_kcs : int
        Number of Kenyon cells, i.e. number of bits in the hash.
    seed : int or None
        Seed for the connection sampling; the same seed gives the same hasher.
    proj_ratio : float
        Fraction of ORNs sampled per Kenyon cell (rounded up).
    apl_ratio : float
        Fraction of Kenyon cells allowed to stay active (rounded up).
    """

    def __init__(self, n_grams=4, n_orns=32, n_kcs=320, seed=1, proj_ratio=0.1, apl_ratio=0.05):
        self.n_grams = n_grams
        self.n_orns = n_orns
        self.n_kcs = n_kcs
        self.seed = seed
        self.proj_ratio = proj_ratio
        self.apl_ratio = apl_ratio
        self.byte_size = 8
        # mask[b] isolates bit b of a byte (least-significant bit first).
        self.mask = np.array([1 << i for i in range(self.byte_size)], dtype=np.int64)

        # Private generator: yields the same stream as np.random.seed(seed)
        # followed by np.random.randint(...), but leaves the global NumPy RNG
        # untouched so that building a hasher does not disturb other code.
        rng = np.random.RandomState(seed)

        # Working buffers reused by every get_hash call.
        self.pns = np.zeros(n_orns, dtype=np.float64)
        # Bit i % 8 of m[j, i // 8] is set when ORN i feeds Kenyon cell j.
        self.m = np.zeros((n_kcs, n_orns // self.byte_size + 1), dtype=np.uint8)
        self.kcs = np.zeros(n_kcs, dtype=np.float64)
        self.n_apl = int(math.ceil(n_kcs * apl_ratio))

        # Number of ORNs sampled per Kenyon cell.
        self.orn_samples = int(math.ceil(proj_ratio * n_orns))

        # Draw the ORN -> KC connections. Sampling is with replacement, so a
        # cell may end up with fewer than orn_samples distinct inputs.
        for j in range(n_kcs):
            for _ in range(self.orn_samples):
                i = rng.randint(0, n_orns)
                self.m[j, i // self.byte_size] |= np.uint8(1 << (i % self.byte_size))

    def get_hash(self, s):
        """Return the hash of ``s`` as a ``uint8`` array of ``ceil(n_kcs / 8)`` bytes.

        ``s`` may be a ``str`` (encoded with ``str.encode()``) or a bytes-like
        object. Inputs shorter than ``n_grams`` bytes hash to all zeros.
        """
        counts = self.vector(s)
        self.pns = counts - np.mean(counts)
        try:
            self.random_project()
            self.apl_wta()
            return self.apl_indices()
        finally:
            # Always leave the working buffers zeroed, even if a step failed.
            self.clear()

    def vector(self, s):
        """Count set bits per (window position, bit) over all n-gram windows.

        Returns a flat ``int64`` array of length ``n_grams * 8`` whose entry
        ``g * 8 + b`` is the number of windows in which bit ``b`` of the
        ``g``-th byte is set.
        """
        raw = s if isinstance(s, (bytes, bytearray)) else s.encode()
        codes = np.array(list(raw), dtype=np.int64)
        windows = self.rolling(codes, self.n_grams)
        # Shape (windows, n_grams, 8): True where the given bit is set.
        bit_set = (windows[:, :, np.newaxis] & self.mask) > 0
        return bit_set.sum(axis=0, dtype=np.int64).flatten()

    def rolling(self, s, n):
        """Return all length-``n`` windows along the last axis of ``s``.

        The result has shape ``s.shape[:-1] + (L - n + 1, n)`` and is a
        read-only view of ``s``. When ``s`` has fewer than ``n`` entries along
        the last axis there are no windows and an empty array is returned.
        """
        if s.shape[-1] < n:
            return np.empty(s.shape[:-1] + (0, n), dtype=s.dtype)
        return np.lib.stride_tricks.sliding_window_view(s, n, axis=-1)

    def random_project(self):
        """Set ``kcs[j]`` to the sum of ``pns`` over KC ``j``'s ORNs, divided by ``orn_samples``."""
        orn = np.arange(self.n_orns)
        # Unpack the bit matrix into a (n_kcs, n_orns) boolean connectivity table.
        connected = ((self.m[:, orn // self.byte_size] >> (orn % self.byte_size)) & 1).astype(bool)
        totals = np.zeros(self.n_kcs, dtype=np.float64)
        # Accumulate ORN by ORN so each cell sums its inputs in ascending ORN
        # order, which keeps the floating-point result independent of BLAS or
        # pairwise-summation details.
        for i in range(self.n_orns):
            totals[connected[:, i]] += self.pns[i]
        self.kcs[:] = totals / self.orn_samples

    def apl_wta(self):
        """Winner-take-all: zero every activation below the ``n_apl``-th largest.

        The cut-off is never below zero, so negative activations are always
        removed; activations equal to the cut-off are kept.
        """
        if self.n_apl <= 0:
            self.kcs.fill(0)
            return
        threshold = 0.0
        if self.n_apl <= self.kcs.size:
            cut = self.kcs.size - self.n_apl
            # After partitioning, position `cut` holds the n_apl-th largest value.
            threshold = max(0.0, float(np.partition(self.kcs, cut)[cut]))
        self.kcs[self.kcs < threshold] = 0

    def apl_indices(self):
        """Pack ``kcs > 0`` into bytes; bit ``j % 8`` of byte ``j // 8`` is KC ``j``."""
        n_bytes = -(-self.n_kcs // self.byte_size)  # ceiling division
        # Pad to a whole number of bytes so a partial last byte is still emitted.
        active = np.zeros(n_bytes * self.byte_size, dtype=np.int64)
        active[:self.n_kcs] = self.kcs > 0
        packed = (active.reshape(n_bytes, self.byte_size) * self.mask).sum(axis=1)
        return packed.astype(np.uint8)

    def clear(self):
        """Zero the ``pns`` and ``kcs`` working buffers in place."""
        self.pns.fill(0)
        self.kcs.fill(0)

    def compare(self, h_1, h_2):
        """Return the number of differing bits (Hamming distance) between two hashes."""
        return sum(bin(int(x)).count('1') for x in np.bitwise_xor(h_1, h_2))

In [3]:
fly_lsh = FlyLSH()

# The first phrase is the reference that every other phrase is compared against.
docs = [
    'this is a test phrase',
    'this is a test phrass',
    'this is one of test phrases',
    'different test phrase'
]

# Hash each phrase and show its digest as upper-case hex.
hashes = []
for doc in docs:
    digest = fly_lsh.get_hash(doc)
    hashes.append(digest)
    hex_str = ''.join('{:02X}'.format(x) for x in digest)
    print(f'flylsh of "{doc}": {hex_str}')

# Number of differing hash bits between the reference phrase and each other phrase.
for other_doc, other_hash in zip(docs[1:], hashes[1:]):
    distance = fly_lsh.compare(hashes[0], other_hash)
    print(f'Comparison of "{docs[0]}" and "{other_doc}": {distance}')

flylsh of "this is a test phrase": 00000000010000012000000080102082000200000000440008001040020100000008000000000000
flylsh of "this is a test phrass": 00000000010000012000000080102082000200000000440008001040020100000008000000000000
flylsh of "this is one of test phrases": 00000000010008002000000080102042000200000000440008001040020000000008000000010000
flylsh of "different test phrase": 200000000380080000000000C0002042000000000000440008002040020000000028000000000000
Comparison of "this is a test phrase" and "this is a test phrass": 0
Comparison of "this is a test phrase" and "this is one of test phrases": 6
Comparison of "this is a test phrase" and "different test phrase": 15
