In [338]:
from math import sqrt
import random
import numpy as np
from os import listdir
from os.path import isfile, join

## Read documents

In [339]:
input_dir = "./docs"
onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
docs = []
for fname in onlyfiles:
    with open(join(input_dir, fname), "r") as file:
        docs += [file.read()]

## Compute shingles' hash values

In [340]:
class Shingling():
    def __init__(self, k):
        self.k = k
        self.hash = hash # Use python built-in hashing function
    def transform(self, doc):
        # Compute shingles
        shingles = [doc[i:i+self.k] for i in range(0, len(doc) - self.k + 1)]
        hashes = sorted(set([abs(self.hash(shingle)) for shingle in shingles]))
        
        return hashes

In [341]:
sh = Shingling(5)
sets = [sh.transform(doc) for doc in docs]

## Compare shingle hashes with Jaccard similarity

In [342]:
def compare_shingles(a, b):
        return len(set(a) & set(b)) / len(set(a) | set(b))

In [343]:
similarities = {(onlyfiles[i],onlyfiles[j]): compare_shingles(sets[i], sets[j]) 
                for i in range(0, len(docs)) 
                for j in range(i+1, len(docs))}
[(k,v) for k,v in similarities.items() if v > 0.3]

[(('A1.txt', 'A1small.txt'), 0.9565720903300521),
 (('B1.txt', 'B2.txt'), 1.0),
 (('B1.txt', 'B1small.txt'), 0.4695366654505655),
 (('B2.txt', 'B1small.txt'), 0.4695366654505655)]

## Compute minHash signatures

In [344]:
class MinHashing():
    def __init__(self, k):
        c = 4294967311 # Big prime
        a_coeffs = random.sample(range(1, 2**31), k)
        b_coeffs = random.sample(range(1, 2**31), k)
        self.hashes = [lambda x: (a_coeffs[i]*x + b_coeffs[i])%c for i in range(k)]
        
    def transform(self, shingles):
        signatures = [[f(shingle) for shingle in shingles] for f in self.hashes]
        return [shingles[np.argmin(l)] for l in signatures]


In [345]:
mh = MinHashing(100)
signatures = [mh.transform(s) for s in sets]

## Compare minHash signatures

In [346]:
def compare_signatures(a, b):
    if len(a) != len(b):
        raise ValueError("Signatures lengths differ.")
    #return sum([int(a[i] == b[i]) for i in range(len(a))]) / (len(a)*1.0)
    return np.mean(a == b)

In [347]:
estimations = {(onlyfiles[i],onlyfiles[j]):compare_signatures(signatures[i], signatures[j]) 
                for i in range(0, len(signatures)) 
                for j in range(i+1, len(signatures))}
[(k,v) for k,v in estimations.items() if v > 0.3]

[(('A1.txt', 'A1small.txt'), 1.0),
 (('B1.txt', 'B2.txt'), 1.0),
 (('B1.txt', 'B1small.txt'), 1.0),
 (('a0200021.txt', 'a0200045.txt'), 1.0),
 (('B2.txt', 'B1small.txt'), 1.0),
 (('a0200019.txt', 'a0200047.txt'), 1.0)]