# LSH algorithm implementation
Let us first import relevant packages.

In [None]:
import pandas as pd
from datasketch import MinHash, MinHashLSH
import numpy as np

We will first see how it is used in an example from the documentation, and then we can implement it on our own.

In [None]:
# Example taken from https://ekzhu.com/datasketch/lsh.html
# Three word sets that differ from one another by only a word or two.
set1 = {'minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets'}
set2 = {'minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents'}
set3 = {'minhash', 'is', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents'}

# One MinHash signature per set, all built with the same num_perm as the index.
m1, m2, m3 = (MinHash(num_perm=128) for _ in range(3))
for signature, words in ((m1, set1), (m2, set2), (m3, set3)):
    for word in words:
        # MinHash.update() is fed bytes, so each word is encoded first.
        signature.update(word.encode('utf8'))

# Create LSH index holding m2 and m3, then look up candidates for m1.
lsh = MinHashLSH(threshold=0.5, num_perm=128)
for key, signature in (("m2", m2), ("m3", m3)):
    lsh.insert(key, signature)
result = lsh.query(m1)
print("Approximate neighbours with Jaccard similarity > 0.5", result)

Approximate neighbours with Jaccard similarity > 0.5 ['m2', 'm3']


Thus every set/vector needs to have a MinHash of its own, which is then inserted into the index. Moreover, `merge()` is a useful tool for merging two indexes, which allows us to build them in parallel.

In [None]:
# Same experiment with integer elements. A set keeps each value once, so
# only the distinct values are listed here.
set1 = {1, 2, 3, 4, 5, 6}
set2 = {1, 2, 3, 4, 5}
set3 = {1, 2, 3, 4, 5, 9}

# One MinHash signature per set, all built with the same num_perm as the index.
m1, m2, m3 = (MinHash(num_perm=128) for _ in range(3))
for signature, values in ((m1, set1), (m2, set2), (m3, set3)):
    for value in values:
        # Integers are stringified and encoded because update() is fed bytes.
        signature.update(str(value).encode('utf8'))

# Create LSH index holding m2 and m3, then look up candidates for m1.
lsh = MinHashLSH(threshold=0.5, num_perm=128)
for key, signature in (("m2", m2), ("m3", m3)):
    lsh.insert(key, signature)
result = lsh.query(m1)
print("Approximate neighbours with Jaccard similarity > 0.5", result)

Approximate neighbours with Jaccard similarity > 0.5 ['m2', 'm3']


Now we will write some generic code for doing the same, keeping in mind that our data will have to be gleaned from another source.

In [None]:
# Read the .csv file
df = pd.read_csv('file.csv')

# Create LSH index
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Iterate over each row of the dataframe and index it under the key
# "row = <index>". Each row gets its own MinHash, built from the distinct
# values found in that row.
for index, row in df.iterrows():
    my_set = set(row)
    m1 = MinHash(num_perm=128)
    # Hash this row's own values. Iterating `set1` (left over from the
    # example cells) would give every row the same signature.
    for d in my_set:
        # Values are stringified and encoded because update() is fed bytes.
        m1.update(str(d).encode('utf8'))
    lsh.insert(f"row = {index}", m1)