In [5]:
import random
from collections import defaultdict
import dill

In [42]:
from collections import defaultdict
import random

class LSH:
    def __init__(self, num_hashes, num_bands, shingle_size):
        self.num_hashes = num_hashes
        self.num_bands = num_bands
        self.shingle_size = shingle_size
        self.rows_per_band = num_hashes // num_bands
        self.buckets = [defaultdict(list) for _ in range(num_bands)]
        self.documents = {}
        self.max_shingle_id = 2**32 - 1
        self.hash_functions = self._generate_hash_functions()

    def _generate_hash_functions(self):
        """Generate a list of hash functions."""
        hash_functions = []
        for _ in range(self.num_hashes):
            a = random.randint(1, self.max_shingle_id)
            b = random.randint(0, self.max_shingle_id)
            hash_functions.append(lambda x, a=a, b=b: (a * x + b) % self.max_shingle_id)
        return hash_functions

    def _shingle_document(self, document):
        """Generate k-shingles for a given document."""
        shingles = set()
        for i in range(len(document) - self.shingle_size + 1):
            shingle = document[i:i + self.shingle_size]
            shingles.add(shingle)
        return shingles

    def _minhash_signature(self, shingles):
        """Generate a minhash signature for a set of shingles."""
        signature = []
        for func in self.hash_functions:
            min_hash = min(func(hash(shingle)) for shingle in shingles)
            signature.append(min_hash)
        return signature

    def add(self, doc_id, document):
        """Add a document to the LSH data structure."""
        shingles = self._shingle_document(document)
        signature = self._minhash_signature(shingles)
        self.documents[doc_id] = document
        
        for i in range(self.num_bands):
            start_index = i * self.rows_per_band
            end_index = (i + 1) * self.rows_per_band
            band = tuple(signature[start_index:end_index])
            self.buckets[i][band].append(doc_id)

    def query(self, document):
        """Query the LSH data structure for approximate neighbors."""
        shingles = self._shingle_document(document)
        signature = self._minhash_signature(shingles)
        candidates = set()
        
        for i in range(self.num_bands):
            start_index = i * self.rows_per_band
            end_index = (i + 1) * self.rows_per_band
            band = tuple(signature[start_index:end_index])
            if band in self.buckets[i]:
                candidates.update(self.buckets[i][band])

        return {self.documents[doc_id] for doc_id in candidates}

# Example usage
num_hashes = 100
num_bands = 30
shingle_size = 5

lsh = LSH(num_hashes, num_bands, shingle_size)

# Adding documents
# documents = ["hello world", "hello", "world", "hello there", "world of warcraft"]
# for idx, doc in enumerate(documents):
    # lsh.add(idx, doc)

# Query for approximate neighbors of a new document
# query_document = "war craft"
# neighbors = lsh.query(query_document)
# print("Approximate neighbors of the document:", neighbors)

In [None]:
!pip install datasets
!pip install mwparserfromhell

In [9]:
from datasets import load_dataset

In [10]:
dataset = load_dataset("wikipedia", "20220301.simple")

In [101]:
len(dataset['train']['text'])

205328

In [32]:
def generate_similar_string(original):
    words = original.split()
    new_words = words.copy()

    modification_type = random.choice(['add', 'remove', 'swap'])
    for _ in range(1000):
        if modification_type == 'add':
            new_words.insert(random.randint(0, len(new_words)), random.choice(words))
        elif modification_type == 'remove' and len(new_words) > 1:
            new_words.pop(random.randint(0, len(new_words) - 1))
        elif modification_type == 'swap' and len(new_words) > 1:
            idx1, idx2 = random.sample(range(len(new_words)), 2)
            new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]

    return ' '.join(new_words)

In [12]:
test_set = dataset['train']['text'][0:50]

In [33]:
test_set = dataset['train']['text'][0:50]
test_set_altered = map(generate_similar_string, test_set)
test_set_altered = list(test_set_altered)
test_set_more_altered = map(generate_similar_string, test_set_altered)
test_set_more_altered = list(test_set_more_altered)
data = [test_set, test_set_altered, test_set_more_altered]

In [49]:
def add_to_lsh(data):
    id=0
    for s in data:
        for item in s:
            print(id)
            if len(item) >= lsh.shingle_size:
                lsh.add(id, item)
            else: print('too small to index')
            id+=1

In [52]:
len(data)

3

In [53]:
for item in data:
    print(item[0])
    print('===')

April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.

April always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.

April's flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.

The Month 

April comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.

April begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.

In common years, April starts on the same day of the week as October of the previous year, and in leap years, M

In [None]:
num_hashes = 100
num_bands = 30
shingle_size = 5

lsh = LSH(num_hashes, num_bands, shingle_size)
add_to_lsh(data)

In [44]:
(lsh.query(test_set[0]))

{'April is 7 the fourth Year month of the falls year in the Julian and Gregorian calendars, and comes between March and May. It is one independence of four months Battle to have land 30 days. declares April always same Sweden April begins on April the same day of of week Maundy a as July, and additionally, January in leap years. April of always ends on the autumn/fall same day Luxembourg of take the week as 9 December. April\'s flowers are the Sweet Pea and Daisy. the Its birthstone is the diamond. The meaning of the diamond is innocence. The Month April in comes the April between March and June May, making it the fourth month of the year. Inc. It also comes first in the year British out Kaczynski. of the April four months in that have 30 days, as could - June, September and November are later in the year. April begins on originally the same day territory of Flag the week - as In July Netherlands every The year the and on the - same day of the independence. week as January in leap year

In [None]:
num_hashes = 100
num_bands = 35
shingle_size = 5

lsh = LSH(num_hashes, num_bands, shingle_size)
add_to_lsh(data)

In [51]:
(lsh.query(test_set[0]))

{"Apple is the edible fruit of a number of trees, known for this juicy, green or red fruits. The tree (Malus spp.) is grown worldwide. Its fruit is low-cost and popular, and is harvested all over the world. \n\nApplewood is a type of wood that comes from this tree.\n\nThe apple tree comes from southern Kazakhstan, Kyrgyzstan, Uzbekistan, and northwestern part of China. Apples have been grown for thousands of years in Asia and Europe. They were brought to North America by European settlers. Apples have religious and mythological significance in many cultures.\n\nApples are generally propagated by grafting, although wild apples grow readily from seed. Apple trees are large if grown from seed, but small if grafted onto roots (rootstock). There are more than 10000 known cultivars of apples, with a range of desired characteristics. Different cultivars are bred for various tastes and uses: cooking, eating raw and cider production are the most common uses. \n\nTrees and fruit are attacked by 

In [208]:
import dill 

In [209]:
with open('lsh.pkl', 'wb+') as f:
    dill.dump(lsh, f)

In [6]:
with open('lsh.pkl', 'rb') as f:
    lsh = dill.load(f)