In [1]:
import re
import hashlib
from collections import defaultdict

In [2]:


def getTokens(text):
    
    text = re.sub(r'[^\w\s]', '', text).lower()
    
    tokens = text.split()
    return tokens

In [3]:
def getTf(tokens):
    
    tf = defaultdict(int)
    for token in tokens:
        tf[token] += 1
    return tf

In [4]:
def hashToken(token):
    
    return hashlib.sha1(token.encode('utf-8')).hexdigest()

In [5]:
def binHash(bin_hash):
    
    vec = []
    for bit in bin_hash:
        if bit == '1':
            vec.append(1)
        else:
            vec.append(-1)
    return vec

In [6]:
def simhash(text, hash_bits=64):
    tokens = getTokens(text)
    print("tokens:",tokens)
    tf = getTf(tokens)
    print("weights:",tf)

    
    final_vec = [0] * hash_bits

    for token, weight in tf.items():
        
        token_hash = hashToken(token)
        bin_hash = bin(int(token_hash, 16))[2:].zfill(hash_bits)
        vec = binHash(bin_hash)

        
        for i in range(hash_bits):
            final_vec[i] += vec[i] * weight

    
    final_hash = ''
    for v in final_vec:
        if v > 0:
            final_hash += '1'
        else:
            final_hash += '0'

    return final_hash

In [7]:
def hamming_distance(hash1, hash2):
   
    return sum(ch1 != ch2 for ch1, ch2 in zip(hash1, hash2))

In [13]:
text1 = "This is a sample document to test"
text2 = "This is a sample document to test simhash"
hash1 = simhash(text1)
hash2 = simhash(text2)

print(f"SimHash 1: {hash1}")
print(f"SimHash 2: {hash2}")
print(f"Hamming Distance: {hamming_distance(hash1, hash2)}")

tokens: ['this', 'is', 'a', 'sample', 'document', 'to', 'test']
weights: defaultdict(<class 'int'>, {'this': 1, 'is': 1, 'a': 1, 'sample': 1, 'document': 1, 'to': 1, 'test': 1})
tokens: ['this', 'is', 'a', 'sample', 'document', 'to', 'test', 'simhash']
weights: defaultdict(<class 'int'>, {'this': 1, 'is': 1, 'a': 1, 'sample': 1, 'document': 1, 'to': 1, 'test': 1, 'simhash': 1})
SimHash 1: 1000011001010101001101111111110101101001101100111110111101100100
SimHash 2: 1000000001000100001101001011010101001000101000101010110101100100
Hamming Distance: 14


In [10]:
hash1 = simhash(text1)
hash2 = simhash(text1)

print(f"SimHash 1: {hash1}")
print(f"SimHash 2: {hash2}")
print(f"Hamming Distance: {hamming_distance(hash1, hash2)}")

tokens: ['this', 'is', 'a', 'sample', 'document', 'to', 'test']
weights: defaultdict(<class 'int'>, {'this': 1, 'is': 1, 'a': 1, 'sample': 1, 'document': 1, 'to': 1, 'test': 1})
tokens: ['this', 'is', 'a', 'sample', 'document', 'to', 'test']
weights: defaultdict(<class 'int'>, {'this': 1, 'is': 1, 'a': 1, 'sample': 1, 'document': 1, 'to': 1, 'test': 1})
SimHash 1: 1000011001010101001101111111110101101001101100111110111101100100
SimHash 2: 1000011001010101001101111111110101101001101100111110111101100100
Hamming Distance: 0


In [12]:
text1 = "This is a sample document to test"
text2 = "i am sooo diffrent from last text yohooo"
hash1 = simhash(text1)
hash2 = simhash(text2)
print(f"SimHash 1: {hash1}")
print(f"SimHash 2: {hash2}")
print(f"Hamming Distance: {hamming_distance(hash1, hash2)}")

tokens: ['this', 'is', 'a', 'sample', 'document', 'to', 'test']
weights: defaultdict(<class 'int'>, {'this': 1, 'is': 1, 'a': 1, 'sample': 1, 'document': 1, 'to': 1, 'test': 1})
tokens: ['i', 'am', 'sooo', 'diffrent', 'from', 'last', 'text', 'yohooo']
weights: defaultdict(<class 'int'>, {'i': 1, 'am': 1, 'sooo': 1, 'diffrent': 1, 'from': 1, 'last': 1, 'text': 1, 'yohooo': 1})
SimHash 1: 1000011001010101001101111111110101101001101100111110111101100100
SimHash 2: 1001000011111010000000100011001010100100111011010001100000000000
Hamming Distance: 39
