In [669]:
import random
import re
import zlib
from math import sqrt
from operator import itemgetter
from os import listdir
from os.path import isfile, join

import numpy as np

## Read documents

In [670]:
# Directory holding the corpus: every regular file directly inside it is
# treated as one document (sub-directories are skipped).
input_dir = "./docs"
onlyfiles = list(filter(lambda name: isfile(join(input_dir, name)), listdir(input_dir)))

# Load the full text of each document; docs[i] is the content of onlyfiles[i].
docs = []
for fname in onlyfiles:
    with open(join(input_dir, fname), "r") as file:
        docs.append(file.read())

## Clean documents (optional)

In [671]:
# Collapse every run of non-word characters (punctuation, whitespace, ...)
# into a single space. The pattern is written as a raw string so that `\W`
# reaches the regex engine verbatim: in a plain string literal "\W" is an
# invalid escape sequence, which recent Python versions warn about.
docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

## Compute shingles' hash values

In [672]:
class Shingling():
    """Turns a document into the sorted, de-duplicated hashes of its k-shingles.

    A k-shingle is any run of ``k`` consecutive characters of the document.
    """

    def __init__(self, k, hash_fn=None):
        """
            k       -> int, shingle length in characters (must be >= 1)
            hash_fn -> optional callable mapping a shingle (str) to an int.
                       Defaults to CRC-32 of the shingle's UTF-8 bytes, which
                       is stable across interpreter runs. Python's built-in
                       hash() is salted per process for str, so it would make
                       shingle ids change on every kernel restart; pass
                       hash_fn=hash explicitly to get that behaviour back.
        Raises ValueError if k < 1.
        """
        if k < 1:
            raise ValueError("Shingle length k must be at least 1.")
        self.k = k
        self.hash = hash_fn if hash_fn is not None else self._stable_hash

    @staticmethod
    def _stable_hash(shingle):
        """Returns CRC-32 of the shingle's UTF-8 encoding, an int in [0, 2**32)."""
        # surrogatepass keeps lone surrogates hashable instead of raising.
        return zlib.crc32(shingle.encode("utf-8", "surrogatepass"))

    def transform(self, doc):
        """Returns the sorted list of distinct shingle hashes of ``doc``.

            doc -> str

        A document shorter than k characters has no shingles and yields [].
        """
        # De-duplicate the shingles themselves first so each distinct
        # substring is hashed only once.
        shingles = {doc[i:i + self.k] for i in range(len(doc) - self.k + 1)}
        # Filter out duplicate hash values and sort
        return sorted({self.hash(shingle) for shingle in shingles})

In [673]:
# Shingle every document with 9-character shingles.
# sets[i] is the result of sh.transform for docs[i].
sh = Shingling(9)
sets = list(map(sh.transform, docs))

## Compare shingle hashes with Jaccard similarity

In [674]:
def compare_shingles(a, b):
    """Returns the jaccard similarity |a & b| / |a | b| between two sets.
        a -> set (any iterable of hashable items is accepted)
        b -> set (any iterable of hashable items is accepted)

    Two empty inputs are treated as identical and give 1.0, instead of
    dividing by zero.
    """
    set_a, set_b = set(a), set(b)
    if not set_a and not set_b:
        return 1.0
    shared = len(set_a & set_b)
    # |a | b| = |a| + |b| - |a & b|, which avoids materialising the union.
    return shared / (len(set_a) + len(set_b) - shared)

In [675]:
# Exact Jaccard similarity for every unordered pair of documents, keyed by the
# pair of file names (earlier index first).
similarities = {
    (onlyfiles[first], onlyfiles[second]): compare_shingles(sets[first], sets[second])
    for first in range(len(docs))
    for second in range(first + 1, len(docs))
}
# Show similarities greater than a threshold, most similar pair first
sorted(
    filter(lambda item: item[1] > 0.4, similarities.items()),
    key=itemgetter(1),
    reverse=True,
)

[(('B1.txt', 'B2.txt'), 1.0),
 (('A1.txt', 'A1small.txt'), 0.9586281981491562),
 (('B1.txt', 'B1small.txt'), 0.45179335307666996),
 (('B2.txt', 'B1small.txt'), 0.45179335307666996)]

## Compute minHash signatures

In [676]:
class MinHashing():
    """Builds minHash signatures from sets of integer shingle hashes.

    Uses k hash functions of the form h_i(x) = (a_i * x + b_i) mod c, where
    the a_i / b_i are drawn at random and c is a fixed prime.
    """

    def __init__(self, k, seed=None):
        """
            k    -> int, number of hash functions (= signature length, >= 1)
            seed -> optional seed for drawing the coefficients. With the
                    default None the module-level `random` generator is used,
                    so coefficients follow whatever global seeding is in
                    effect; pass a value to get the same coefficients every
                    time regardless of global state.
        Raises ValueError if k < 1.
        """
        if k < 1:
            raise ValueError("Number of hash functions k must be at least 1.")
        self.k = k
        self.c = 4294967311 # Big prime
        rng = random if seed is None else random.Random(seed)
        coeff_range = range(1, 2**31)
        # sample() draws without replacement, so the a's are pairwise distinct
        # (and likewise the b's).
        self.a_coeffs = rng.sample(coeff_range, k)
        self.b_coeffs = rng.sample(coeff_range, k)

    def _hash(self, x, i):
        """Applies the i-th hash function to the integer x."""
        # int() forces arbitrary-precision arithmetic: with a numpy integer
        # the product a*x could silently wrap around at 64 bits.
        return (self.a_coeffs[i] * int(x) + self.b_coeffs[i]) % self.c

    def transform(self, shingles):
        """Returns the minHash signature of a collection of shingle hashes.

            shingles -> sized iterable of ints (list, set, numpy array, ...)

        The result is a numpy array of length k whose i-th entry is the
        smallest value of the i-th hash function over all shingles.
        Raises ValueError if `shingles` is empty (no minimum exists).
        """
        if len(shingles) == 0:
            raise ValueError("Cannot compute a minHash signature of an empty shingle set.")
        # Take each row's minimum on the fly instead of first materialising
        # the full k x len(shingles) matrix of hash values.
        return np.array([min(self._hash(shingle, i) for shingle in shingles)
                         for i in range(self.k)])


In [677]:
# Seed the global random generator before the hash coefficients are drawn, so
# that this stochastic step yields the same coefficients on every
# Restart & Run All.
random.seed(42)
# 100 hash functions -> signatures of length 100.
mh = MinHashing(100)
# signatures[i] is the minHash signature of sets[i].
signatures = [mh.transform(s) for s in sets]

## Compare minHash signatures

In [678]:
def compare_signatures(a, b):
    """Computes the similarity between 2 signatures.
        a: numpy array (other sequences are converted)
        b: numpy array (other sequences are converted)

    Returns the fraction of positions at which the two signatures agree.
    Raises ValueError if the signatures do not have the same shape.
    """
    # Convert up front: with plain lists `a == b` is a single bool, which
    # would silently turn the result into 0.0 or 1.0.
    a = np.asarray(a)
    b = np.asarray(b)
    # Comparing shapes (not just len) also rejects inputs such as (n,) vs
    # (n, 1), which would otherwise broadcast into an n x n comparison.
    if a.shape != b.shape:
        raise ValueError("Signatures lengths differ.")
    return np.mean(a == b)

In [679]:
# Estimated similarity (fraction of matching signature entries) for every
# unordered pair of documents, keyed by the pair of file names.
estimations = {
    (onlyfiles[first], onlyfiles[second]): compare_signatures(signatures[first], signatures[second])
    for first in range(len(signatures))
    for second in range(first + 1, len(signatures))
}
# Show similarity estimations greater than a threshold, highest first
sorted(
    filter(lambda item: item[1] > 0.4, estimations.items()),
    key=itemgetter(1),
    reverse=True,
)

[(('B1.txt', 'B2.txt'), 1.0),
 (('A1.txt', 'A1small.txt'), 0.96999999999999997),
 (('B1.txt', 'B1small.txt'), 0.56000000000000005),
 (('B2.txt', 'B1small.txt'), 0.56000000000000005)]

## Compare minHash estimations against real similarities

In [680]:
# Absolute gap between the minHash estimate and the exact Jaccard similarity
# for every unordered pair of documents.
errors = {
    pair: abs(estimations[pair] - similarities[pair])
    for first in range(len(docs))
    for second in range(first + 1, len(docs))
    # one-element loop: binds the key tuple to a name inside the comprehension
    for pair in [(onlyfiles[first], onlyfiles[second])]
}
# Show errors greater than a threshold (e.g. 5%), largest first
sorted(
    filter(lambda item: item[1] > 0.05, errors.items()),
    key=itemgetter(1),
    reverse=True,
)

[(('B1.txt', 'B1small.txt'), 0.1082066469233301),
 (('B2.txt', 'B1small.txt'), 0.1082066469233301),
 (('A1small.txt', 'a0200029.txt'), 0.069539628563955),
 (('A1.txt', 'a0200029.txt'), 0.054129032258064508),
 (('B1.txt', 'a0200021.txt'), 0.053495934959349588),
 (('B2.txt', 'a0200021.txt'), 0.053495934959349588)]