# A step-by-step implementation of MinHash and LSH for finding similar documents

In [35]:
# Load the listings and keep only a document id (the row index) and the
# short description, renamed to `text`.
import pandas as pd

dataset = (
    pd.read_csv('../data/rent_rome_text.csv')
    .assign(
        doc_id=lambda frame: frame.index,
        text=lambda frame: frame['Short Description'],
    )
    .loc[:, ['doc_id', 'text']]
    .copy()
)
dataset.head()

Unnamed: 0,doc_id,text
0,0,Affitto studio a professionisti preferibilment...
1,1,"Privato affitta negozio 169 mq, al piano terra..."
2,2,Negozio c/1 roma tiburtina centro via eugenio ...
3,3,"Studio medico avviato, composto da tre studi c..."
4,4,"Donna lavoratrice, non residente, con reddito ..."


In [36]:
# Keep three documents only: two similar rows (911, 1923) and one different row (133).
keep_mask = dataset['doc_id'].isin([911, 1923, 133])
dataset = dataset.loc[keep_mask].copy()
# Truncate every text to its first 16 characters so the walkthrough stays small.
dataset['text'] = dataset['text'].map(lambda text: text[:16])
dataset.head()

Unnamed: 0,doc_id,text
133,133,Privato affitta
911,911,Avviato studio d
1923,1923,Avviato studio d


In [59]:
import hashlib
def get_hashed_shingles(shingle, salt_len=20, hash_len=8):
    """Hash one shingle to an integer.

    The shingle is UTF-8 encoded, a constant all-zero salt is appended, and
    the SHA-1 hex digest of the result is computed. Only the last
    ``hash_len`` hex characters of that digest are kept and parsed as a
    base-16 integer.

    Args:
        shingle: The shingle text to hash.
        salt_len: Number of ``"0"`` characters in the salt (for
            ``salt_len >= 1``).
        hash_len: Number of trailing hex characters of the digest to keep.

    Returns:
        The integer value of the truncated hex digest.
    """
    # The salt never varies: it is the digit "0" left-padded with zeros.
    zero_salt = "0".zfill(salt_len)[-salt_len:].encode("utf-8")
    digest_hex = hashlib.sha1(shingle.encode("utf-8") + zero_salt).hexdigest()
    truncated_hex = digest_hex[-hash_len:]
    return int(truncated_hex, 16)

In [72]:
import re

# Parameters for shingling and hashing (later cells read these names too).
shingle_size = 10  # number of characters in each shingle
resultSize = 8  # number of trailing hex digits kept from each SHA-1 digest
salt_len = 20  # length of the salt appended to each shingle before hashing

# Shingle and hash every document, printing each intermediate step.
for index, row in dataset.iterrows():
    # Normalise the text: each run of spaces, or each run of newlines,
    # becomes a single space; then everything is lowercased.
    document = re.sub("( )+|(\n)+", " ", row["text"]).lower()
    print("\nThe processed document: \n")
    print(document)
    print("\nNumber of characters in the document: ", len(document))
    print("Shingling size: ", shingle_size)

    # Slide a window of shingle_size characters over the document,
    # one character at a time.
    last_start = len(document) - shingle_size
    shingles = [
        document[start:start + shingle_size]
        for start in range(last_start + 1)
    ]
    print("Number of shingles: ", len(shingles))
    print(shingles)

    # Map every shingle to its integer hash, preserving shingle order.
    hashed_shingles = [
        get_hashed_shingles(piece, salt_len, resultSize) for piece in shingles
    ]
    print("hashed shingles:")
    print(hashed_shingles)

    # Order the hashes ascending.
    hashed_shingles.sort()
    print("sorted hashed shingles:")
    print(hashed_shingles)

    # Store the sorted hashes on the document's row, serialised as a string.
    dataset.loc[index, "hashed_shingles"] = str(hashed_shingles)

dataset.head()


The processed document: 

privato affitta 

Number of characters in the document:  16
Shingling size:  10
Number of shingles:  7
['privato af', 'rivato aff', 'ivato affi', 'vato affit', 'ato affitt', 'to affitta', 'o affitta ']
hashed shingles:
[2729853409, 1978696595, 2833477683, 847371075, 4158715906, 1993375736, 171310880]
sorted hashed shingles:
[171310880, 847371075, 1978696595, 1993375736, 2729853409, 2833477683, 4158715906]

The processed document: 

avviato studio d

Number of characters in the document:  16
Shingling size:  10
Number of shingles:  7
['avviato st', 'vviato stu', 'viato stud', 'iato studi', 'ato studio', 'to studio ', 'o studio d']
hashed shingles:
[365799992, 2977033780, 2537022184, 602976845, 498592907, 4190303751, 1869341980]
sorted hashed shingles:
[365799992, 498592907, 602976845, 1869341980, 2537022184, 2977033780, 4190303751]

The processed document: 

avviato studio d

Number of characters in the document:  16
Shingling size:  10
Number of shingles:  7


Unnamed: 0,doc_id,text,hashed_shingles
133,133,Privato affitta,"[171310880, 847371075, 1978696595, 1993375736,..."
911,911,Avviato studio d,"[365799992, 498592907, 602976845, 1869341980, ..."
1923,1923,Avviato studio d,"[365799992, 498592907, 602976845, 1869341980, ..."


## Creating the hashed shingles

In [65]:
# Walk through the hashing of one shingle, one step at a time.
# The salt appended to the shingle is a constant run of zeros.
salt = "0".zfill(salt_len)[-salt_len:].encode("utf-8")
print("Salt: ", salt)
# SHA-1 of the UTF-8 shingle followed by the salt, as a hex string
salted_shingle = "avviato st".encode("utf-8") + salt
hash_value = hashlib.sha1(salted_shingle).hexdigest()
print("Hash value: ", hash_value)
# keep only the last resultSize hex digits of the digest
short_hash = hash_value[len(hash_value) - resultSize:] if resultSize else hash_value[-resultSize:]
print("Short hash: ", short_hash)
# interpret the truncated hex string as a base-16 integer
int_hash = int(short_hash, base=16)
print("Integer hash: ", int_hash)

Salt:  b'00000000000000000000'
Hash value:  f990c24df45ea7c1fb50d44073514f3715cdaa38
Short hash:  15cdaa38
Integer hash:  365799992


## Compute the signature matrix

In [None]:
# Number of hash functions, i.e. the number of entries in each signature.
sig_size = 50

# NOTE(review): hashFamily, randint and set_ are not defined anywhere in the
# visible notebook; they must be provided before this cell can run.
# Build one hash function per signature position, each from a random integer.
hash_functions = [hashFamily(randint(0, 10000000000)) for _ in range(sig_size)]

# For each hash function, keep the smallest hash value over the elements of
# set_. The original sketch was missing the closing bracket above and read
# `self.hash_functions` outside of any class.
[
    min(h_funct.get_hash_value(el) for el in set_)
    for h_funct in hash_functions
]
