In [68]:
from collections import Counter
from random import choices
from typing import Iterable, List, TypeVar
import numpy as np

FeatureVector = TypeVar('FeatureVector', List[int], List[float])

In [69]:
class SimHash:
    def __init__(self, bits_per_hash: int, input_dims: int):
        self.bits_per_hash = bits_per_hash
        self.input_dims = input_dims
        self.hash_buckets = {}
        self.projection_vectors = np.array(
            [[2 * choices([1, 0, -1], [1/8, 3/4, 1/8])[0] for i in range(self.input_dims) #dimension of input-N
              ] for i in range(self.bits_per_hash) #bits per hash-M
            ]
        )

    def create_binary_hash(self, input_vector: FeatureVector):
        projection_results = np.dot(input_vector, self.projection_vectors.T)
        hash_bits = (projection_results > 0).astype('int')
        return ''.join(hash_bits.astype('str'))

    def __setitem__(self, input_vector: FeatureVector, content: str):
        hash_signature = self.create_binary_hash(input_vector)
        self.hash_buckets[hash_signature] = self.hash_buckets.get(hash_signature, []) + [content]

    def __getitem__(self, input_vector: FeatureVector):
        hash_signature = self.create_binary_hash(input_vector)
        return self.hash_buckets.get(hash_signature, [])


In [70]:
class LSH:
    def __init__(self, num_tables: int, bits_per_hash: int, input_dims: int):
        self.num_tables = num_tables
        self.bits_per_hash = bits_per_hash
        self.input_dims = input_dims
        self.simhash_tables = []
        for _ in range(self.num_tables):
            self.simhash_tables.append(SimHash(self.bits_per_hash, self.input_dims))

    def __setitem__(self, input_vector: FeatureVector, content: str):
        for table in self.simhash_tables:
            table[input_vector] = content

    def __getitem__(self, input_vector: FeatureVector):
        candidate_matches = []
        for table in self.simhash_tables:
            candidate_matches.extend(table[input_vector])
        match_counts = Counter(candidate_matches)
        return [(entry, match_counts.get(entry)) for entry in set(candidate_matches)]


In [71]:
def compute_cosine_similarity(vector_a: FeatureVector, vector_b: FeatureVector):
    return np.dot(vector_a, vector_b) / (np.linalg.norm(vector_a) * np.linalg.norm(vector_b))

In [72]:
def text_to_feature_vector(word_index_map: dict, text: str):
    feature_vector = []
    tokens = text.split()
    for word, index in word_index_map.items():
        feature_vector.insert(index, 1 if word in tokens else 0)
    return feature_vector

def extract_unique_terms(text: Iterable[str]) :
    unique_terms = set()
    for doc in text:
        for term in doc.split():
            unique_terms.add(term)
    return unique_terms

def create_term_index_map(text: Iterable[str]) :
    term_indices = {}
    current_index = 1
    for term in text:
        term_indices[term] = current_index
        current_index += 1
    return term_indices


In [73]:
if __name__ == '__main__':
    similarity_threshold = 0.0
    num_hash_tables = 64
    hash_bit_length = 11
    document_collection = [
    "I think we should be able to play in a sport if we have a grade C. I think i would be not fear for student that have a good grade like c to play in a sport.",
    "If we had a D or an F i would understand that but a C i nothing.",
    "Not a lot of kid get A or Bs and if we do.",
    "Some of those kids don't like to play a sport they like to do all there homework not that i am saying that a bad C grade people do there homework to.",
    "If there is only 1 out of 4 percent of student that get A and B, They all don't like the same sports and some don't like to do sports so it wouldn't be a hole team in that sport.",
    "that means you would have to cancel all the sport teams in the school.",
    "That why you should let C student play an a sport.",
    "Some schools require summer projects for students; these projects need to be student designed.",
    "When teachers plan a project some students don't enjoy completing the assigned task.",
    "For students to learn they have to be interested in what they are doing.",
    "... There are many ways to assign a school project, it could be hands on, writing, individually, in groups etc.",
    "Each student has different skills, they should do a project that best fits that skill to succeed.",
    "Summer projects need to be student designed so that they are more motivated to complete the assignment, to have fun with the assignment and to have interest in the task.",
    "Summer projects need to be student designed to enhance motivation.",
    "When students are motivated this encodes the topic in their memory.",
    "When students are motivated they get better grades, the project is better quality and students do the project to their best ability.",
    "As a student when a project gets assigned and I am not interested in the assignment I try to get it done as quickly as possible and put no effort into it.",
    "I end up getting a lower grade.",
    "Students will also want to keep working on the project if they are motivated to complete it.",
    "... With more time spent on the project this equals better quality and higher grades.",
    "Students won't procrastinate if the project interests them and will be motivated to start the project early.",
    "As a student in High School lots of students procrastinate when given a project.",
    "If projects were student designed this would motivate them to learn about the topic.",
    "Summer projects need to be student designed so that they have fun with it.",
    "If they had fun with the assignment then they would have a memory of their experience.",
    "If schools made learning fun they are more likely to remember the topic of the project.",
    "If students were in a group for the project this would enhance team work skills if they had fun.",
    "As a student I found out that if I had fun learning about a certain topic I remember it more clearly and can connect it to other things in life.",
    "Summer projects need to be student designed to have students have interest in the topic.",
    "... This would help students explore their interests that they never knew they were interested in.",
    "This may help students what they want to do after graduation.",
    "Chances are if a student is really interested in what they learned in class then they might explore that topic outside of school.",
    "When I had a project assigned to me that I was interested in I enjoyed completing the project and continued to learn more about it outside of school.",
    "Summer projects that have students interested is a key to success in their academic career.",
    "In conclusion, summer projects need to be student designed so that they look forward to working on the assignment, enjoy completing the task and continue learning about the topic outside of school.",
    "Some people say that projects should be teacher designed they are wrong because students then dont get the freedom of doing what they enjoy most.",
    "It also isn't equal to all students.",
    "... For example, a teacher assigns a research paper on a certain topic, some students have a love for writing and it is their strength and others don't like to write.",
    "This is why student designed projects are important.",
    "Driverless cars have been argued and talked about for years.",
    "There are many aspects that make the driverless car seem irresistable.",
    "However, there are a few concerns about driverless cars that are completely understandable.",
    "Driverless cars are a positive asset for this country.",
    "These cars will shape our future and change our world for the better.",
    "There have been many bumps in the road for the developement of driverless cars.",
    "This will not deter the developers though and many people are still working on them today.",
    "Not only will they help form our future, they will help improve the present.",
    "Currently, only a few companies are working on driverless cars.",
    "Some of these companies include, General Motors, Google, and BMW.",
    "... The birth of driverless cars started with special tracks that the cars could drive on.",
    "In the late 1950s, General Motors worked on a car that would run on a certain road.",
    "This road sent radio signals to the car itself and this is how the car would drive.",
    "This supported the idea that we didn't need a smarter car; we needed a smarted road.",
    "This system worked incredibly well; however, this idea was too expensive.",
    "After the reality of special tracks failed, engineers started working on smarter cars.",
    "This created a problem at first because the technology was not yet advanced enough to create driverless cars.",
    "In 2013, BMW had developed a car that could handle driving up to twenty-five miles per hour.",
    "The driver still had to pay attention to drive occasionally and the car wasn't perfect.",
    "Google had been working on a car that could drive itself since 2009.",
    "In the reading it talks about how the technology to create driverless cars was not there.",
    "... Sebastian Thrun, founder of the Google Car project mentions, \"There was no way, before 2000, to make something intersting\".",
    "He is refering to the driverless cars.",
    "The cars made in 2009 were suprisingly successful.",
    "However, the cars were not one hundred percent driverless.",
    "The google cars still needed a driver that could take over in certain conditions.",
    "This brought up the arguemtn, would people want a driverless car that they had to drive.",
    "However, in a few years the technology should be able to make these cars almost completely driverless.",
    "The argument of allowing driverless cars is on-going.",
    "There are many facts that make using driverless car seem like a good idea  One of the facts that support the driverless car is that by using driverless cars, we would use half the fuel of today's taxi system.",
    "This means we would save more money, and save the planet at the exact same time.",
    "The driverless car would have more flexibility than a bus as well.",
    "... You would call the car to your house and then let it take you wherever you needed to go.",
    "A driverless car would allow for shorter travel times because the driver would be able to take breaks.",
    "This would result in the driver arriving places quicker.",
    "Driverless cars would decrease the amount of crashes as well.",
    "These cars would eliminate human error while driving.",
    "Stop lights would not be run, and police wouldn't have to chase after cars because they would have to go the speed limit.",
    "This would cause the crime rate to go down as well.",
    "The last thing needed to have driverless cars on the road is making in legal in more states.",
    "A passage from the reading states, \"California, Nevada, Florida, and the District of Colimbia have led the country in allowing limited use of semi-autonomous car\".",
    "The cars driven in these states have been relatively sucessful.",
    "The rest of the country should use this as an example and consider allowing driverless cars.",
    "... Driverless cars are an idea of the future.",
    "However, that future is closer than many believe.",
    "These cars will limit gas, reduce crime rate, and decrese travel time.",
    "Driverless cars are an incredibly brilliant idea for America and soon the whole planet.",
    "The technology will soon allow for completely driverless cars.",
    "This will improve the world.",
    "Hopefully, driverless cars will be incorporated in daily life soon.",
    "In conclusion, instead of stressing out about driving, people will be able to sit back and let the car drive.",
    "The author of \"The Challenge of Exploring Venus\", uses many scientific facts, data, and explanations to support their idea of exploring Venus.",
    "In the passage, many examples are given of why scientists would want to explore the possibilities of the contents of Venus despite all the dangers.",
    "The text gives some support like declaring it the most Earth-like planet, the possibility of supporting life forms, and multiple explorations to Venus that have been successful.... Venus' relation to Earth is significant because it is the closest example of another planet like Earth.",
    "The author claims that Venus is even called Earth's twin and that not only is it like Earth in its density and size, but also in the distance between the two planets.",
    "Even though the planet is very similar to Earth there are challenges to get accurate data on the planet because of the many spacecrafts that were unable to withstand the harshness of the planet.",
    "As technology advanced, the author claims that more and more missions to Venus occured, and even around the time of World War II there was a spacecraft that survived in Venus conditions for about three weeks.",
    "Venus' conditions are very harmful to the life on Earth but there may have been a possibility that life could sustain on the planet.",
    "The author describes Venus as a planet mostly covered in large oceans that could have suppported life growing there.",
    "... All of these examples that the author has given supports his idea of the need to explore Venus tremedously and gives very convincing evidence and data that supports his idea.",
    "Although Venus is a very harsh planet and there may not be any possibility of life, it would still be worth it to explore this planet and how it works."
    ]

    unique_terms = extract_unique_terms(document_collection)
    term_indices = create_term_index_map(unique_terms)
    vocab_size = len(unique_terms)

    lsh_index = LSH(num_hash_tables, hash_bit_length, vocab_size)

    for doc in document_collection:
        doc_vector = text_to_feature_vector(term_indices, doc)
        lsh_index[doc_vector] = doc

    query_text = 'car crashes in recent years'
    query_vector = text_to_feature_vector(term_indices, query_text)

    candidate_matches = lsh_index[query_vector]

    print(f'Query: {query_text}')
    for candidate, match_count in candidate_matches:
        similarity = compute_cosine_similarity(query_vector, text_to_feature_vector(term_indices, candidate))
        if similarity > similarity_threshold:
            print(f'Similar document (score: {similarity:.2f}, matches: {match_count}): {candidate}')

Query: car crashes in recent years
Similar document (score: 0.13, matches: 2): This road sent radio signals to the car itself and this is how the car would drive.
Similar document (score: 0.24, matches: 1): However, in a few years the technology should be able to make these cars almost completely driverless.
Similar document (score: 0.15, matches: 2): When students are motivated this encodes the topic in their memory.
Similar document (score: 0.11, matches: 1): A passage from the reading states, "California, Nevada, Florida, and the District of Colimbia have led the country in allowing limited use of semi-autonomous car".
Similar document (score: 0.11, matches: 1): The author describes Venus as a planet mostly covered in large oceans that could have suppported life growing there.
Similar document (score: 0.11, matches: 1): In conclusion, instead of stressing out about driving, people will be able to sit back and let the car drive.
Similar document (score: 0.13, matches: 1): In the late

In [83]:
if __name__ == '__main__':
    similarity_threshold = 0.0
    num_hash_tables = 64
    hash_bit_length = 11
    with open('2.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    import re
    document_collection = [para.strip() for para in re.split(r'\n\s*\n', text) if para.strip()]

    unique_terms = extract_unique_terms(document_collection)
    term_indices = create_term_index_map(unique_terms)
    vocab_size = len(unique_terms)

    lsh_index = LSH(num_hash_tables, hash_bit_length, vocab_size)

    for doc in document_collection:
        doc_vector = text_to_feature_vector(term_indices, doc)
        lsh_index[doc_vector] = doc

    query_text = 'Books read in past or future'
    query_vector = text_to_feature_vector(term_indices, query_text)

    candidate_matches = lsh_index[query_vector]

    print(f'Query: {query_text}')
    for candidate, match_count in candidate_matches:
        similarity = compute_cosine_similarity(query_vector, text_to_feature_vector(term_indices, candidate))
        if similarity > similarity_threshold:
            print(f'Similar document (score: {similarity:.2f}, matches: {match_count}): {candidate}')

Query: Books read in past or future
Similar document (score: 0.09, matches: 1): In suits at common law, where the value in controversy shall exceed
twenty dollars, the right of trial by jury shall be preserved,
and no fact tried by a jury shall be otherwise re-examined in any court
of the United States, than according to the rules of the common law.
Similar document (score: 0.18, matches: 1): The right of the people to be secure in their persons, houses, papers,
and effects, against unreasonable searches and seizures, shall not be violated,
and no Warrants shall issue, but upon probable cause, supported by oath
or affirmation, and particularly describing the place to be searched,
and the persons or things to be seized.
Similar document (score: 0.15, matches: 1): [1]  Only give exact copies of it.  Among other things, this
     requires that you do not remove, alter or modify the
     etext or this "small print!" statement.  You may however,
     if you wish, distribute this etext in ma

In [85]:

with open('2.txt', 'r', encoding='utf-8') as f:
    text = f.read()


import re
document_collection = [para.strip() for para in re.split(r'\n\s*\n', text) if para.strip()]


print(document_collection[:5])


['**Welcome To The World of Free Plain Vanilla Electronic Texts**', '**Etexts Readable By Both Humans and By Computers, Since 1971**', '*These Etexts Prepared By Hundreds of Volunteers and Donations*', 'December, 1972  [Etext #2]', '****The Project Gutenberg Etext of The U. S. Bill of Rights****\n*******This file should be named 2.txt or 2.zip******']
